diff --git a/bindings/node/index.d.ts b/bindings/node/index.d.ts
index 1252a2623..b3b3322f6 100644
--- a/bindings/node/index.d.ts
+++ b/bindings/node/index.d.ts
@@ -11,11 +11,7 @@ export function ctcDecoder(
   cleanup?: boolean | undefined | null,
 ): Decoder
 export function fuseDecoder(): Decoder
-export function metaspaceDecoder(
-  replacement?: string = '▁',
-  prependScheme?: prepend_scheme = 'always',
-  split?: split = true,
-): Decoder
+export function metaspaceDecoder(replacement?: string = '▁', prependScheme?: prepend_scheme = 'always'): Decoder
 export function replaceDecoder(pattern: string, content: string): Decoder
 export function sequenceDecoder(decoders: Array<Decoder>): Decoder
 export function stripDecoder(content: string, left: number, right: number): Decoder
@@ -96,7 +92,6 @@ export function bertPreTokenizer(): PreTokenizer
 export function metaspacePreTokenizer(
   replacement?: string = '▁',
   prependScheme?: prepend_scheme = 'always',
-  split?: split = true,
 ): PreTokenizer
 export function splitPreTokenizer(pattern: string, behavior: string, invert?: boolean | undefined | null): PreTokenizer
 export function punctuationPreTokenizer(behavior?: string | undefined | null): PreTokenizer
diff --git a/bindings/node/src/decoders.rs b/bindings/node/src/decoders.rs
index 51123887d..3d972124d 100644
--- a/bindings/node/src/decoders.rs
+++ b/bindings/node/src/decoders.rs
@@ -91,10 +91,8 @@ pub fn fuse_decoder() -> Decoder {
 pub fn metaspace_decoder(
   #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
   #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
-  #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<Decoder> {
   use tk::pre_tokenizers::metaspace::PrependScheme;
-  let split = split.unwrap_or(true);
   let replacement = replacement.unwrap_or("▁".to_string());
   if replacement.chars().count() != 1 {
     return Err(Error::from_reason(
@@ -115,7 +113,7 @@ pub fn metaspace_decoder(
   };
   Ok(Decoder {
     decoder: Some(Arc::new(RwLock::new(
-      tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
+      tk::decoders::metaspace::Metaspace::new(replacement, prepend_scheme).into(),
     ))),
   })
 }
diff --git a/bindings/node/src/pre_tokenizers.rs b/bindings/node/src/pre_tokenizers.rs
index b3da46dbf..6e827be32 100644
--- a/bindings/node/src/pre_tokenizers.rs
+++ b/bindings/node/src/pre_tokenizers.rs
@@ -156,10 +156,8 @@ pub fn bert_pre_tokenizer() -> PreTokenizer {
 pub fn metaspace_pre_tokenizer(
   #[napi(ts_arg_type = "string = '▁'")] replacement: Option<String>,
   #[napi(ts_arg_type = "prepend_scheme = 'always'")] prepend_scheme: Option<String>,
-  #[napi(ts_arg_type = "split = true")] split: Option<bool>,
 ) -> Result<PreTokenizer> {
   use tk::pre_tokenizers::metaspace::PrependScheme;
-  let split = split.unwrap_or(true);
   let replacement = replacement.unwrap_or("▁".to_string());
   if replacement.chars().count() != 1 {
     return Err(Error::from_reason(
@@ -181,7 +179,7 @@ pub fn metaspace_pre_tokenizer(
 
   Ok(PreTokenizer {
     pretok: Some(Arc::new(RwLock::new(
-      tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme, split).into(),
+      tk::pre_tokenizers::metaspace::Metaspace::new(replacement, prepend_scheme).into(),
     ))),
   })
 }
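Note on the Node half: with `split` gone, `metaspaceDecoder` and `metaspacePreTokenizer` both reduce to the same two-argument core constructor. The sketch below mirrors the argument handling visible in the hunks above; `build_metaspace` and the plain `String` error are illustrative stand-ins for the napi plumbing, not code from this patch.

    use tokenizers::pre_tokenizers::metaspace::{Metaspace, PrependScheme};

    // Default the replacement to '▁', reject multi-char replacements, and map
    // the scheme string onto the enum, as the bindings above do.
    fn build_metaspace(
        replacement: Option<String>,
        prepend_scheme: Option<String>,
    ) -> Result<Metaspace, String> {
        let replacement = replacement.unwrap_or_else(|| "▁".to_string());
        if replacement.chars().count() != 1 {
            return Err("replacement is supposed to be a single char".to_string());
        }
        let replacement = replacement.chars().next().unwrap();
        let prepend_scheme = match prepend_scheme.as_deref().unwrap_or("always") {
            "always" => PrependScheme::Always,
            "first" => PrependScheme::First,
            "never" => PrependScheme::Never,
            other => return Err(format!("unknown prepend_scheme: {other}")),
        };
        // No third argument any more: splitting is unconditional in the core type.
        Ok(Metaspace::new(replacement, prepend_scheme))
    }

    fn main() {
        assert!(build_metaspace(None, None).is_ok());
        assert!(build_metaspace(Some("▁▁".into()), None).is_err());
    }

Keeping the validation in the bindings lets the core constructor stay infallible.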
diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
index 7d8e2334a..084b15c50 100644
--- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi
@@ -156,7 +156,7 @@ class Metaspace(Decoder):
             Whether to add a space to the first word if there isn't already one. This
             lets us treat `hello` exactly like `say hello`.
     """
-    def __init__(self, replacement="▁", prepend_scheme="always", split=True):
+    def __init__(self, replacement="▁", prepend_scheme="always"):
         pass

     def decode(self, tokens):
diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
index d2019e6fb..9c99daa01 100644
--- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
+++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi
@@ -274,7 +274,7 @@ class Metaspace(PreTokenizer):
             Whether to add a space to the first word if there isn't already one. This
             lets us treat `hello` exactly like `say hello`.
     """
-    def __init__(self, replacement="_", prepend_scheme="always", split=True):
+    def __init__(self, replacement="_", prepend_scheme="always"):
         pass

     def pre_tokenize(self, pretok):
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index f3d36532a..a2d62ff26 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -322,16 +322,6 @@ impl PyMetaspaceDec {
         setter!(self_, Metaspace, @set_replacement, replacement.0);
     }
 
-    #[getter]
-    fn get_split(self_: PyRef<Self>) -> bool {
-        getter!(self_, Metaspace, get_split())
-    }
-
-    #[setter]
-    fn set_split(self_: PyRef<Self>, split: bool) {
-        setter!(self_, Metaspace, @set_split, split);
-    }
-
     #[getter]
     fn get_prepend_scheme(self_: PyRef<Self>) -> String {
         // Assuming Metaspace has a method to get the prepend_scheme as a string
@@ -352,16 +342,12 @@ impl PyMetaspaceDec {
     }
 
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always"), split = true), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\", split = True)")]
-    fn new(
-        replacement: PyChar,
-        prepend_scheme: String,
-        split: bool,
-    ) -> PyResult<(Self, PyDecoder)> {
+    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme = String::from("always")), text_signature = "(self, replacement = \"▁\", prepend_scheme = \"always\")")]
+    fn new(replacement: PyChar, prepend_scheme: String) -> PyResult<(Self, PyDecoder)> {
         let prepend_scheme = from_string(prepend_scheme)?;
         Ok((
             PyMetaspaceDec {},
-            Metaspace::new(replacement.0, prepend_scheme, split).into(),
+            Metaspace::new(replacement.0, prepend_scheme).into(),
         ))
     }
 }
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index 59cc394da..bd0e32885 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -494,16 +494,6 @@ impl PyMetaspace {
         setter!(self_, Metaspace, @set_replacement, replacement.0);
     }
 
-    #[getter]
-    fn get_split(self_: PyRef<Self>) -> bool {
-        getter!(self_, Metaspace, get_split())
-    }
-
-    #[setter]
-    fn set_split(self_: PyRef<Self>, split: bool) {
-        setter!(self_, Metaspace, @set_split, split);
-    }
-
     #[getter]
     fn get_prepend_scheme(self_: PyRef<Self>) -> String {
         // Assuming Metaspace has a method to get the prepend_scheme as a string
@@ -524,15 +514,11 @@ impl PyMetaspace {
     }
 
     #[new]
-    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always"), split=true), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\", split=True)")]
-    fn new(
-        replacement: PyChar,
-        prepend_scheme: String,
-        split: bool,
-    ) -> PyResult<(Self, PyPreTokenizer)> {
+    #[pyo3(signature = (replacement = PyChar('▁'), prepend_scheme=String::from("always")), text_signature = "(self, replacement=\"_\", prepend_scheme=\"always\")")]
+    fn new(replacement: PyChar, prepend_scheme: String) -> PyResult<(Self, PyPreTokenizer)> {
         // Create a new Metaspace instance
         let prepend_scheme = from_string(prepend_scheme)?;
-        let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme, split);
+        let new_instance: Metaspace = Metaspace::new(replacement.0, prepend_scheme);
         Ok((PyMetaspace {}, new_instance.into()))
     }
 }
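On the Python side only `split` and its getter/setter disappear; `prepend_scheme` still goes through the bindings' `from_string` helper. A minimal sketch of that mapping, with a plain `String` error standing in for the `ValueError` the binding raises (the message is the one the updated test below matches via `re.escape`):

    use tokenizers::pre_tokenizers::metaspace::PrependScheme;

    // String -> enum conversion; unknown variants yield the error message the
    // Python test asserts on.
    fn from_string(string: String) -> Result<PrependScheme, String> {
        match string.as_str() {
            "first" => Ok(PrependScheme::First),
            "never" => Ok(PrependScheme::Never),
            "always" => Ok(PrependScheme::Always),
            _ => Err(format!(
                "{string} is an unknown variant, should be one of ['first', 'never', 'always']"
            )),
        }
    }

    fn main() {
        assert!(from_string("always".to_string()).is_ok());
        assert!(from_string("toto".to_string()).is_err());
    }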
diff --git a/bindings/python/tests/bindings/test_pre_tokenizers.py b/bindings/python/tests/bindings/test_pre_tokenizers.py
index 4226df125..88b01a39d 100644
--- a/bindings/python/tests/bindings/test_pre_tokenizers.py
+++ b/bindings/python/tests/bindings/test_pre_tokenizers.py
@@ -1,5 +1,6 @@
 import json
 import pickle
+import re
 
 import pytest
 
@@ -94,26 +95,23 @@ def test_instantiate(self):
         assert Metaspace(replacement="-") is not None
         with pytest.raises(ValueError, match="expected a string of length 1"):
             Metaspace(replacement="")
-        assert Metaspace(add_prefix_space=True) is not None
+        with pytest.raises(ValueError, match=re.escape("toto is an unknown variant, should be one of ['first', 'never', 'always']")):
+            assert Metaspace(prepend_scheme="toto") is not None
+        assert Metaspace(prepend_scheme="always") is not None
         assert isinstance(Metaspace(), PreTokenizer)
         assert isinstance(Metaspace(), Metaspace)
         assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
 
     def test_can_modify(self):
-        pretok = Metaspace(replacement="$", add_prefix_space=False)
+        pretok = Metaspace(replacement="$")
 
         assert pretok.replacement == "$"
-        assert pretok.add_prefix_space == False
 
         # Modify these
         pretok.replacement = "%"
         assert pretok.replacement == "%"
-        pretok.add_prefix_space = True
-        assert pretok.add_prefix_space == True
         pretok.prepend_scheme = "never"
         assert pretok.prepend_scheme == "never"
-        pretok.split = False
-        assert pretok.split == False
 
 
 class TestCharDelimiterSplit:
diff --git a/tokenizers/src/decoders/mod.rs b/tokenizers/src/decoders/mod.rs
index 682e63b50..9b3b99838 100644
--- a/tokenizers/src/decoders/mod.rs
+++ b/tokenizers/src/decoders/mod.rs
@@ -76,7 +76,7 @@ mod tests {
         let oldjson = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}]}"#;
         let olddecoder: DecoderWrapper = serde_json::from_str(oldjson).unwrap();
         let oldserialized = serde_json::to_string(&olddecoder).unwrap();
-        let json = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always","split":true}]}"#;
+        let json = r#"{"type":"Sequence","decoders":[{"type":"ByteFallback"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}]}"#;
         assert_eq!(oldserialized, json);
 
         let decoder: DecoderWrapper = serde_json::from_str(json).unwrap();
@@ -85,7 +85,7 @@ mod tests {
     }
     #[test]
     fn decoder_serialization_other_no_arg() {
-        let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always","split":true}]}"#;
+        let json = r#"{"type":"Sequence","decoders":[{"type":"Fuse"},{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}]}"#;
         let decoder: DecoderWrapper = serde_json::from_str(json).unwrap();
         let serialized = serde_json::to_string(&decoder).unwrap();
         assert_eq!(serialized, json);
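The two serialization tests above pin down backward compatibility. The same guarantee as a standalone round trip (a sketch assuming `tokenizers` and `serde_json` as dependencies, not part of the patch): a legacy payload carrying `add_prefix_space` still deserializes, and re-serialization emits neither that field nor `split`.

    use tokenizers::pre_tokenizers::metaspace::Metaspace;

    fn main() {
        // Legacy field `add_prefix_space` is still accepted on the way in...
        let legacy =
            r#"{"type":"Metaspace","replacement":"▁","add_prefix_space":true,"prepend_scheme":"always"}"#;
        let metaspace: Metaspace = serde_json::from_str(legacy).unwrap();

        // ...and the trimmed-down schema comes back out.
        let out = serde_json::to_string(&metaspace).unwrap();
        assert_eq!(
            out,
            r#"{"type":"Metaspace","replacement":"▁","prepend_scheme":"always"}"#
        );
    }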
diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs
index 52b415c9b..2b46471a7 100644
--- a/tokenizers/src/pre_tokenizers/metaspace.rs
+++ b/tokenizers/src/pre_tokenizers/metaspace.rs
@@ -20,7 +20,6 @@ pub enum PrependScheme {
 pub struct Metaspace {
     replacement: char,
     pub prepend_scheme: PrependScheme,
-    pub split: bool,
     #[serde(skip)]
     str_rep: String,
 }
@@ -48,7 +47,6 @@ impl<'de> Deserialize<'de> for Metaspace {
             pub add_prefix_space: Option<bool>,
             #[serde(default = "default_prepend_scheme_value")]
             pub prepend_scheme: PrependScheme,
-            pub split: Option<bool>,
             #[serde(rename = "str_rep")]
             _str_rep: Option<String>,
         }
@@ -62,22 +60,17 @@ impl<'de> Deserialize<'de> for Metaspace {
             }
             helper.prepend_scheme = PrependScheme::Never;
         }
-        let instance = Self::new(
-            helper.replacement,
-            helper.prepend_scheme,
-            helper.split.unwrap_or(true),
-        );
+        let instance = Self::new(helper.replacement, helper.prepend_scheme);
         Ok(instance)
     }
 }
 
 impl Metaspace {
-    pub fn new(replacement: char, prepend_scheme: PrependScheme, split: bool) -> Self {
+    pub fn new(replacement: char, prepend_scheme: PrependScheme) -> Self {
         Self {
             replacement,
             str_rep: replacement.to_string(),
             prepend_scheme,
-            split,
         }
     }
 
@@ -90,14 +83,6 @@ impl Metaspace {
         self.str_rep = replacement.to_string();
     }
 
-    pub fn get_split(&self) -> bool {
-        self.split
-    }
-
-    pub fn set_split(&mut self, split: bool) {
-        self.split = split;
-    }
-
     pub fn get_prepend_scheme(&self) -> PrependScheme {
         self.prepend_scheme
     }
@@ -109,7 +94,7 @@ impl Metaspace {
 
 impl Default for Metaspace {
     fn default() -> Self {
-        Self::new('▁', PrependScheme::Always, true)
+        Self::new('▁', PrependScheme::Always)
     }
 }
 
@@ -132,11 +117,7 @@ impl PreTokenizer for Metaspace {
             }
             PrependScheme::Never => {}
         };
-        if self.split {
-            normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext)
-        } else {
-            Ok(vec![normalized])
-        }
+        normalized.split(self.replacement, SplitDelimiterBehavior::MergedWithNext)
         })
     }
 }
@@ -175,9 +156,8 @@ mod tests {
 
     #[test]
    fn serialization() {
-        let metaspace = Metaspace::new('_', PrependScheme::Always, true);
-        let metaspace_s =
-            r#"{"type":"Metaspace","replacement":"_","prepend_scheme":"always","split":true}"#;
+        let metaspace = Metaspace::new('_', PrependScheme::Always);
+        let metaspace_s = r#"{"type":"Metaspace","replacement":"_","prepend_scheme":"always"}"#;
         assert_eq!(serde_json::to_string(&metaspace).unwrap(), metaspace_s);
         assert_eq!(
             serde_json::from_str::<Metaspace>(metaspace_s).unwrap(),
@@ -188,7 +168,7 @@ mod tests {
         let metaspace_s = r#"{"type":"Metaspace","replacement":"_","add_prefix_space":false,"prepend_scheme":"always"}"#;
         assert!(serde_json::from_str::<Metaspace>(metaspace_s).is_err(),);
 
-        let metaspace = Metaspace::new('_', PrependScheme::Always, true);
+        let metaspace = Metaspace::new('_', PrependScheme::Always);
         let metaspace_s = r#"{"type":"Metaspace","str_rep":"_","replacement":"_","add_prefix_space":true,"prepend_scheme":"always"}"#;
         assert_eq!(
             serde_json::from_str::<Metaspace>(metaspace_s).unwrap(),
@@ -204,7 +184,7 @@ mod tests {
 
     #[test]
     fn basic() {
-        let pretok = Metaspace::new('▁', PrependScheme::Always, true);
+        let pretok = Metaspace::new('▁', PrependScheme::Always);
         let mut pretokenized = PreTokenizedString::from("Hey friend!");
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
@@ -227,7 +207,7 @@ mod tests {
 
     #[test]
     fn multiple_spaces() {
-        let pretok = Metaspace::new('▁', PrependScheme::Always, true);
+        let pretok = Metaspace::new('▁', PrependScheme::Always);
         let mut pretokenized = PreTokenizedString::from("Hey   friend!");
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
@@ -260,17 +240,17 @@ mod tests {
 
     #[test]
     fn non_legacy_meta_space() {
-        let mut pretok = Metaspace::new('▁', PrependScheme::Always, true);
+        let mut pretok = Metaspace::new('▁', PrependScheme::Always);
         pretok.set_prepend_scheme(PrependScheme::Always);
-        assert_eq!(pretok, Metaspace::new('▁', PrependScheme::Always, true));
+        assert_eq!(pretok, Metaspace::new('▁', PrependScheme::Always));
 
         pretok.set_prepend_scheme(PrependScheme::Never);
-        assert_eq!(pretok, Metaspace::new('▁', PrependScheme::Never, true));
+        assert_eq!(pretok, Metaspace::new('▁', PrependScheme::Never));
 
         pretok.set_prepend_scheme(PrependScheme::First);
-        assert_eq!(pretok, Metaspace::new('▁', PrependScheme::First, true));
+        assert_eq!(pretok, Metaspace::new('▁', PrependScheme::First));
 
-        let pretok = Metaspace::new('▁', PrependScheme::First, false);
+        let pretok = Metaspace::new('▁', PrependScheme::First);
         let mut pretokenized = PreTokenizedString::from("Hey my friend <s>how▁are you");
         let re_ref = Regex::new(r"(<s>)").unwrap();
         pretokenized
@@ -285,12 +265,17 @@ mod tests {
             .map(|(s, o, _)| (s, o))
             .collect::<Vec<_>>(),
             vec![
-                ("▁Hey▁my▁friend▁", (0, 23)),
+                ("▁Hey", (0, 6)),
+                ("▁my", (6, 11)),
+                ("▁friend", (11, 20)),
+                ("▁", (20, 23)),
                 ("<s>", (23, 26)),
-                ("how▁are▁you", (26, 41))
+                ("how", (26, 29)),
+                ("▁are", (29, 35)),
+                ("▁you", (35, 41))
             ]
         );
-        let pretok = Metaspace::new('▁', PrependScheme::Always, true);
+        let pretok = Metaspace::new('▁', PrependScheme::Always);
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
             pretokenized
@@ -310,7 +295,7 @@ mod tests {
             ]
         );
 
-        let pretok = Metaspace::new('▁', PrependScheme::First, false);
+        let pretok = Metaspace::new('▁', PrependScheme::First);
         let mut pretokenized = PreTokenizedString::from(" Hey <s>how"); // test with prefix
         pretokenized
             .split(|_, sequence| sequence.split(&re_ref, SplitDelimiterBehavior::Isolated))
@@ -322,7 +307,12 @@ mod tests {
             .into_iter()
             .map(|(s, o, _)| (s, o))
             .collect::<Vec<_>>(),
-            vec![("▁Hey▁", (0, 9)), ("<s>", (9, 12)), ("how", (12, 15))]
+            vec![
+                ("▁Hey", (0, 6)),
+                ("▁", (6, 9)),
+                ("<s>", (9, 12)),
+                ("how", (12, 15))
+            ]
         );
 
         let mut pretokenized = PreTokenizedString::from(" Hey <s>how <s>are <s> you"); // test with many splits
@@ -337,11 +327,14 @@ mod tests {
             .map(|(s, o, _)| (s, o))
             .collect::<Vec<_>>(),
             vec![
-                ("▁Hey▁", (0, 9)),
+                ("▁Hey", (0, 6)),
+                ("▁", (6, 9)),
                 ("<s>", (9, 12)),
-                ("how▁", (12, 18)),
+                ("how", (12, 15)),
+                ("▁", (15, 18)),
                 ("<s>", (18, 21)),
-                ("are▁", (21, 27)),
+                ("are", (21, 24)),
+                ("▁", (24, 27)),
                 ("<s>", (27, 30)),
                 ("▁you", (30, 36))
             ]
@@ -349,13 +342,13 @@ mod tests {
     }
     #[test]
     fn decode() {
-        let decoder = Metaspace::new('▁', PrependScheme::Always, true);
+        let decoder = Metaspace::new('▁', PrependScheme::Always);
         let res = decoder
             .decode_chain(vec!["▁Hey".into(), "▁friend!".into()])
             .unwrap();
         assert_eq!(res, vec!["Hey", " friend!"]);
 
-        let decoder = Metaspace::new('▁', PrependScheme::Never, true);
+        let decoder = Metaspace::new('▁', PrependScheme::Never);
         let res = decoder
             .decode_chain(vec!["▁Hey".into(), "▁friend!".into()])
             .unwrap();
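With the `if self.split` branch removed, `pre_tokenize` always splits on the replacement character using `SplitDelimiterBehavior::MergedWithNext`, so each fragment keeps its leading marker. A sketch equivalent to the `basic` test above (assuming the usual crate-root re-exports):

    use tokenizers::pre_tokenizers::metaspace::{Metaspace, PrependScheme};
    use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};

    fn main() -> tokenizers::Result<()> {
        let pretok = Metaspace::new('▁', PrependScheme::Always);
        let mut pretokenized = PreTokenizedString::from("Hey friend!");
        pretok.pre_tokenize(&mut pretokenized)?;

        // Byte offsets are reported against the normalized string, where each
        // space became the three-byte '▁'.
        let splits = pretokenized
            .get_splits(OffsetReferential::Normalized, OffsetType::Byte)
            .into_iter()
            .map(|(s, o, _)| (s, o))
            .collect::<Vec<_>>();
        assert_eq!(splits, vec![("▁Hey", (0, 6)), ("▁friend!", (6, 16))]);
        Ok(())
    }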
diff --git a/tokenizers/src/pre_tokenizers/mod.rs b/tokenizers/src/pre_tokenizers/mod.rs
index cf64fb876..34792a9f9 100644
--- a/tokenizers/src/pre_tokenizers/mod.rs
+++ b/tokenizers/src/pre_tokenizers/mod.rs
@@ -82,7 +82,7 @@ mod tests {
             pre_tokenizer,
             PreTokenizerWrapper::Sequence(Sequence::new(vec![
                 PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit {}),
-                PreTokenizerWrapper::Metaspace(Metaspace::new('▁', PrependScheme::Always, true))
+                PreTokenizerWrapper::Metaspace(Metaspace::new('▁', PrependScheme::Always))
             ]))
         );
 
@@ -93,7 +93,7 @@ mod tests {
 
         assert_eq!(
             pre_tokenizer,
-            PreTokenizerWrapper::Metaspace(Metaspace::new('▁', PrependScheme::Always, true))
+            PreTokenizerWrapper::Metaspace(Metaspace::new('▁', PrependScheme::Always))
         );
 
         let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str(r#"{"type":"Sequence","pretokenizers":[{"type":"WhitespaceSplit"},{"type":"Metaspace","replacement":"▁","add_prefix_space":true}]}"#).unwrap();
@@ -102,7 +102,7 @@ mod tests {
             pre_tokenizer,
             PreTokenizerWrapper::Sequence(Sequence::new(vec![
                 PreTokenizerWrapper::WhitespaceSplit(WhitespaceSplit {}),
-                PreTokenizerWrapper::Metaspace(Metaspace::new('▁', PrependScheme::Always, true))
+                PreTokenizerWrapper::Metaspace(Metaspace::new('▁', PrependScheme::Always))
             ]))
         );
 
@@ -113,11 +113,7 @@ mod tests {
 
         assert_eq!(
             pre_tokenizer,
-            PreTokenizerWrapper::Metaspace(Metaspace::new(
-                '▁',
-                metaspace::PrependScheme::First,
-                true
-            ))
+            PreTokenizerWrapper::Metaspace(Metaspace::new('▁', metaspace::PrependScheme::First))
         );
 
         let pre_tokenizer: PreTokenizerWrapper = serde_json::from_str(
@@ -127,11 +123,7 @@ mod tests {
 
         assert_eq!(
             pre_tokenizer,
-            PreTokenizerWrapper::Metaspace(Metaspace::new(
-                '▁',
-                metaspace::PrependScheme::Always,
-                true
-            ))
+            PreTokenizerWrapper::Metaspace(Metaspace::new('▁', metaspace::PrependScheme::Always))
         );
     }
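For completeness, the decoder side that the `decode` test exercises: the prepend scheme only decides what happens to the first token's leading marker. The second assertion below sits outside the hunks above, so treat it as an assumption about `PrependScheme::Never` keeping the marker as a space.

    use tokenizers::decoders::metaspace::{Metaspace, PrependScheme};
    use tokenizers::Decoder;

    fn main() -> tokenizers::Result<()> {
        // `Always` strips the word-boundary marker from the first token...
        let decoder = Metaspace::new('▁', PrependScheme::Always);
        let res = decoder.decode_chain(vec!["▁Hey".into(), "▁friend!".into()])?;
        assert_eq!(res, vec!["Hey", " friend!"]);

        // ...while `Never` turns it into a leading space (assumed expectation).
        let decoder = Metaspace::new('▁', PrependScheme::Never);
        let res = decoder.decode_chain(vec!["▁Hey".into(), "▁friend!".into()])?;
        assert_eq!(res, vec![" Hey", " friend!"]);
        Ok(())
    }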