diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index ab1cee2d3..d92e04a1d 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -761,7 +761,7 @@ where
     /// Encode the given input. This method accepts both single sequences, as well as pair
     /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly:
-    ///
+    /// Contrary to `encode`, it does not compute offsets.
     /// ```
     /// # use tokenizers::Tokenizer;
     /// # use tokenizers::models::bpe::BPE;
diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs
index 865aaea05..125689a81 100644
--- a/tokenizers/src/tokenizer/pre_tokenizer.rs
+++ b/tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -154,7 +154,7 @@ impl PreTokenizedString {
             .flat_map(|split| {
                 split.tokens.unwrap().into_iter().map(|token| {
                     // Replace this with the actual fields you need for the Encoding type
-                    (token.id, String::new(), (0, 0), None, 0)
+                    (token.id, String::with_capacity(0), (0, 0), None, 0)
                 })
             })
             .collect();
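
To make the second hunk easier to read in isolation, here is a minimal, self-contained sketch of the same mapping. The `Split` and `Token` structs below are simplified stand-ins with assumed fields, not the crate's actual types, and the tuple layout `(id, value, offsets, word, type_id)` is an assumption based on the placeholder values in the hunk; the point is only that each token becomes a row whose offsets stay `(0, 0)` because this path does not compute them.

```rust
// Simplified stand-ins for illustration only; not the tokenizers crate's types.
struct Token {
    id: u32,
}

struct Split {
    tokens: Option<Vec<Token>>,
}

fn main() {
    let splits = vec![
        Split {
            tokens: Some(vec![Token { id: 7 }, Token { id: 42 }]),
        },
        Split {
            tokens: Some(vec![Token { id: 3 }]),
        },
    ];

    // Mirror the `+` line of the diff: every token is flattened into a
    // (id, value, offsets, word, type_id)-style tuple with empty/zeroed
    // placeholders, since offsets are not computed on this path.
    let rows: Vec<(u32, String, (usize, usize), Option<u32>, u32)> = splits
        .into_iter()
        .flat_map(|split| {
            split.tokens.unwrap().into_iter().map(|token| {
                (token.id, String::with_capacity(0), (0, 0), None, 0)
            })
        })
        .collect();

    assert_eq!(rows.len(), 3);
    assert_eq!(rows[0].0, 7);
    assert_eq!(rows[0].2, (0, 0)); // offsets intentionally left at (0, 0)
}
```

Note that `String::with_capacity(0)`, like `String::new()`, creates an empty string without allocating, so the swap in the hunk does not change behavior.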