Skip to content

Commit

Permalink
Remove split.
Browse files (browse the repository at this point in the history)
  • Loading branch information
Narsil committed Mar 29, 2024
1 parent b08b6bc commit bc51e3f
Show file tree
Hide file tree
Showing 11 changed files with 60 additions and 114 deletions.
7 changes: 1 addition & 6 deletions bindings/node/index.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,7 @@ export function ctcDecoder(
cleanup?: boolean | undefined | null,
): Decoder
export function fuseDecoder(): Decoder
export function metaspaceDecoder(
replacement?: string = '▁',
prependScheme?: prepend_scheme = 'always',
split?: split = true,
): Decoder
export function metaspaceDecoder(replacement?: string = '▁', prependScheme?: prepend_scheme = 'always'): Decoder
export function replaceDecoder(pattern: string, content: string): Decoder
export function sequenceDecoder(decoders: Array<Decoder>): Decoder
export function stripDecoder(content: string, left: number, right: number): Decoder
Expand Down Expand Up @@ -96,7 +92,6 @@ export function bertPreTokenizer(): PreTokenizer
export function metaspacePreTokenizer(
replacement?: string = '▁',
prependScheme?: prepend_scheme = 'always',
split?: split = true,
): PreTokenizer
export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer
export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer
Expand Down
4 changes: 1 addition & 3 deletions bindings/node/src/decoders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,10 +91,8 @@ pub fn fuse_decoder() -> Decoder {
pub fn metaspace_decoder(
#[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
#[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
#[napi(ts_arg_type = "split = true")] split: Option<bool>,
) -> Result<Decoder> {
use tk::pre_tokenizers::metaspace::PrependScheme;
let split = split.unwrap_or(true);
let replacement = replacement.unwrap_or("▁".to_string());
if replacement.chars().count() != 1 {
return Err(Error::from_reason(
Expand All @@ -115,7 +113,7 @@ pub fn metaspace_decoder(
};
Ok(Decoder {
decoder: Some(Arc::new(RwLock::new(
tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme).into(),
))),
})
}
Expand Down
4 changes: 1 addition & 3 deletions bindings/node/src/pre_tokenizers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,8 @@ pub fn bert_pre_tokenizer() -> PreTokenizer {
pub fn metaspace_pre_tokenizer(
#[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
#[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
#[napi(ts_arg_type = "split = true")] split: Option<bool>,
) -> Result<PreTokenizer> {
use tk::pre_tokenizers::metaspace::PrependScheme;
let split = split.unwrap_or(true);
let replacement = replacement.unwrap_or("▁".to_string());
if replacement.chars().count() != 1 {
return Err(Error::from_reason(
Expand All @@ -181,7 +179,7 @@ pub fn metaspace_pre_tokenizer(

Ok(PreTokenizer {
pretok: Some(Arc::new(RwLock::new(
tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme).into(),
))),
})
}
Expand Down
2 changes: 1 addition & 1 deletion bindings/python/py_src/tokenizers/decoders/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ class Metaspace(Decoder):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="▁", prepend_scheme="always", split=True):
def __init__(self, replacement="▁", prepend_scheme="always"):
pass

def decode(self, tokens):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ class Metaspace(PreTokenizer):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="_", prepend_scheme="always", split=True):
def __init__(self, replacement="_", prepend_scheme="always"):
pass

def pre_tokenize(self, pretok):
Expand Down
20 changes: 3 additions & 17 deletions bindings/python/src/decoders.rs
Original file line number Diff line number Diff line change
Expand Up @@ -322,16 +322,6 @@ impl PyMetaspaceDec {
setter!(self_, Metaspace, @set_replacement, replacement.0);
}

#[getter]
fn get_split(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, get_split())
}

#[setter]
fn set_split(self_: PyRef<Self>, split: bool) {
setter!(self_, Metaspace, @set_split, split);
}

#[getter]
fn get_prepend_scheme(self_: PyRef<Self>) -> String {
// Assuming Metaspace has a method to get the prepend_scheme as a string
Expand All @@ -352,16 +342,12 @@ impl PyMetaspaceDec {
}

#[new]
#[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"\", prepend_scheme = \"always\", split = True)")]
fn new(
replacement: PyChar,
prepend_scheme: String,
split: bool,
) -> PyResult<(Self, PyDecoder)> {
#[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always")), text_signature = "(self, replacement = \"\", prepend_scheme = \"always\")")]
fn new(replacement: PyChar, prepend_scheme: String) -> PyResult<(Self, PyDecoder)> {
let prepend_scheme = from_string(prepend_scheme)?;
Ok((
PyMetaspaceDec {},
Metaspace::new(replacement.0, prepend_scheme, split).into(),
Metaspace::new(replacement.0, prepend_scheme).into(),
))
}
}
Expand Down
20 changes: 3 additions & 17 deletions bindings/python/src/pre_tokenizers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -494,16 +494,6 @@ impl PyMetaspace {
setter!(self_, Metaspace, @set_replacement, replacement.0);
}

#[getter]
fn get_split(self_: PyRef<Self>) -> bool {
getter!(self_, Metaspace, get_split())
}

#[setter]
fn set_split(self_: PyRef<Self>, split: bool) {
setter!(self_, Metaspace, @set_split, split);
}

#[getter]
fn get_prepend_scheme(self_: PyRef<Self>) -> String {
// Assuming Metaspace has a method to get the prepend_scheme as a string
Expand All @@ -524,15 +514,11 @@ impl PyMetaspace {
}

#[new]
#[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
fn new(
replacement: PyChar,
prepend_scheme: String,
split: bool,
) -> PyResult<(Self, PyPreTokenizer)> {
#[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always")), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\")")]
fn new(replacement: PyChar, prepend_scheme: String) -> PyResult<(Self, PyPreTokenizer)> {
// Create a new Metaspace instance
let prepend_scheme = from_string(prepend_scheme)?;
let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split);
let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme);
Ok((PyMetaspace {}, new_instance.into()))
}
}
Expand Down
12 changes: 5 additions & 7 deletions bindings/python/tests/bindings/test_pre_tokenizers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import pickle
import re

import pytest

Expand Down Expand Up @@ -94,26 +95,23 @@ def test_instantiate(self):
assert Metaspace(replacement="-") is not None
with pytest.raises(ValueError, match="expected a string of length 1"):
Metaspace(replacement="")
assert Metaspace(add_prefix_space=True) is not None
with pytest.raises(ValueError, match=re.escape("toto is an unknown variant, should be one of ['first', 'never', 'always']")):
assert Metaspace(prepend_scheme="toto") is not None
assert Metaspace(prepend_scheme="always") is not None
assert isinstance(Metaspace(), PreTokenizer)
assert isinstance(Metaspace(), Metaspace)
assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)

def test_can_modify(self):
pretok = Metaspace(replacement="$", add_prefix_space=False)
pretok = Metaspace(replacement="$")

assert pretok.replacement == "$"
assert pretok.add_prefix_space == False

# Modify these
pretok.replacement = "%"
assert pretok.replacement == "%"
pretok.add_prefix_space = True
assert pretok.add_prefix_space == True
pretok.prepend_scheme = "never"
assert pretok.prepend_scheme == "never"
pretok.split = False
assert pretok.split == False


class TestCharDelimiterSplit:
Expand Down
4 changes: 2 additions & 2 deletions tokenizers/src/decoders/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ mod tests {
let oldjson = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}]}"#;
let olddecoder: DecoderWrapper = serde_json::from_str(oldjson).unwrap();
let oldserialized = serde_json::to_string(&olddecoder).unwrap();
let json = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always","split":true}]}"#;
let json = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}]}"#;
assert_eq!(oldserialized, json);

let decoder: DecoderWrapper = serde_json::from_str(json).unwrap();
Expand All @@ -85,7 +85,7 @@ mod tests {
}
#[test]
fn decoder_serialization_other_no_arg() {
let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always","split":true}]}"#;
let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}]}"#;
let decoder: DecoderWrapper = serde_json::from_str(json).unwrap();
let serialized = serde_json::to_string(&decoder).unwrap();
assert_eq!(serialized, json);
Expand Down
Loading

0 comments on commit bc51e3f

Please sign in to comment.