From b617b18bb681d33b0e1f203c29ca1ad8ebb38060 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Thu, 8 Aug 2024 14:47:17 +0200 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- tokenizers/src/tokenizer/mod.rs | 2 +- tokenizers/src/tokenizer/pre_tokenizer.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs index ab1cee2d3..d92e04a1d 100644 --- a/tokenizers/src/tokenizer/mod.rs +++ b/tokenizers/src/tokenizer/mod.rs @@ -761,7 +761,7 @@ where /// Encode the given input. This method accepts both single sequences, as well as pair /// sequences. Also, a sequence can be a string, or already pre-tokenized input directly: - /// + /// Contrary to `encode`, it does not compute offsets. /// ``` /// # use tokenizers::Tokenizer; /// # use tokenizers::models::bpe::BPE; diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs index 865aaea05..125689a81 100644 --- a/tokenizers/src/tokenizer/pre_tokenizer.rs +++ b/tokenizers/src/tokenizer/pre_tokenizer.rs @@ -154,7 +154,7 @@ impl PreTokenizedString { .flat_map(|split| { split.tokens.unwrap().into_iter().map(|token| { // Replace this with the actual fields you need for the Encoding type - (token.id, String::new(), (0, 0), None, 0) + (token.id,String::with_capacity(0), (0, 0), None, 0) }) }) .collect();