From 29fef1e7aa47216d3b9fe4a9416f6021c14b4561 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 12 Mar 2024 21:24:21 +1100 Subject: [PATCH] [`remove black`] And use ruff (#1436) * nits * Fixing deps. * Ruff update. * Import order matters. * Fix. * Revert ruff fix. * Visualizer. * Putting back the imports. --------- Co-authored-by: Nicolas Patry --- bindings/python/Makefile | 6 +- bindings/python/examples/example.py | 11 +-- .../python/examples/train_with_datasets.py | 2 +- .../python/py_src/tokenizers/__init__.pyi | 89 +++++++++++++++++-- .../py_src/tokenizers/decoders/__init__.pyi | 21 +++-- .../py_src/tokenizers/models/__init__.pyi | 35 ++++++-- .../tokenizers/normalizers/__init__.pyi | 40 ++++++--- .../tokenizers/pre_tokenizers/__init__.pyi | 35 +++++--- .../py_src/tokenizers/processors/__init__.pyi | 17 ++-- .../py_src/tokenizers/tools/visualizer.py | 6 +- .../py_src/tokenizers/trainers/__init__.pyi | 2 - bindings/python/pyproject.toml | 20 ++++- bindings/python/scripts/convert.py | 24 ++--- .../python/scripts/sentencepiece_extractor.py | 9 +- bindings/python/scripts/spm_parity_check.py | 39 +++----- bindings/python/stub.py | 31 +++---- bindings/python/tests/bindings/test_models.py | 1 - .../python/tests/bindings/test_normalizers.py | 3 +- .../python/tests/bindings/test_processors.py | 8 +- .../python/tests/bindings/test_tokenizer.py | 5 +- .../tests/documentation/test_pipeline.py | 1 - .../tests/documentation/test_quicktour.py | 4 - .../test_tutorial_train_from_iterators.py | 1 + .../implementations/test_base_tokenizer.py | 2 - .../implementations/test_bert_wordpiece.py | 2 - .../implementations/test_byte_level_bpe.py | 2 - .../tests/implementations/test_char_bpe.py | 2 - .../implementations/test_sentencepiece.py | 2 - bindings/python/tests/test_serialization.py | 7 +- 29 files changed, 258 insertions(+), 169 deletions(-) diff --git a/bindings/python/Makefile b/bindings/python/Makefile index f7a05dac7..8eeaf83a1 100644 --- a/bindings/python/Makefile +++ b/bindings/python/Makefile @@ -8,12 +8,14 @@ check_dirs := examples py_src/tokenizers tests # Format source code automatically style: python stub.py - black --line-length 119 --target-version py35 $(check_dirs) + ruff check $(check_dirs) --fix + ruff format $(check_dirs)t # Check the source code is formatted correctly check-style: python stub.py --check - black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests + ruff check examples py_src/tokenizers tests + ruff format --check examples py_src/tokenizers tests TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json diff --git a/bindings/python/examples/example.py b/bindings/python/examples/example.py index 97b903401..d62390644 100644 --- a/bindings/python/examples/example.py +++ b/bindings/python/examples/example.py @@ -4,16 +4,15 @@ from tqdm import tqdm - -logging.getLogger("transformers").disabled = True -logging.getLogger("transformers.tokenization_utils").disabled = True - from tokenizers import Tokenizer, decoders, pre_tokenizers from tokenizers.models import BPE, WordPiece from tokenizers.normalizers import BertNormalizer from tokenizers.processors import BertProcessing from transformers import BertTokenizer, GPT2Tokenizer +logging.getLogger("transformers").disabled = True +logging.getLogger("transformers.tokenization_utils").disabled = True + parser = argparse.ArgumentParser() parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)") @@ -51,9 +50,7 @@ 
If the implementation is hard to explain, it's a bad idea. If the implementation is easy to explain, it may be a good idea. Namespaces are one honking great idea -- let's do more of those! -""".split( - "\n" - ) +""".split("\n") if args.type == "gpt2": print("Running GPT-2 tokenizer") diff --git a/bindings/python/examples/train_with_datasets.py b/bindings/python/examples/train_with_datasets.py index 7c3168342..b54376a2a 100644 --- a/bindings/python/examples/train_with_datasets.py +++ b/bindings/python/examples/train_with_datasets.py @@ -1,6 +1,6 @@ import datasets -from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers +from tokenizers import Tokenizer, models, normalizers, pre_tokenizers # Build a tokenizer diff --git a/bindings/python/py_src/tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/__init__.pyi index 7c21c5b56..5dbc665dc 100644 --- a/bindings/python/py_src/tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/__init__.pyi @@ -34,39 +34,44 @@ class AddedToken: Defines whether this token should be skipped when decoding. """ - def __init__(self, content, single_word=False, lstrip=False, rstrip=False, normalized=True, special=False): pass + @property def content(self): """ Get the content of this :obj:`AddedToken` """ pass + @property def lstrip(self): """ Get the value of the :obj:`lstrip` option """ pass + @property def normalized(self): """ Get the value of the :obj:`normalized` option """ pass + @property def rstrip(self): """ Get the value of the :obj:`rstrip` option """ pass + @property def single_word(self): """ Get the value of the :obj:`single_word` option """ pass + @property def special(self): """ @@ -78,7 +83,6 @@ class Encoding: """ The :class:`~tokenizers.Encoding` represents the output of a :class:`~tokenizers.Tokenizer`. """ - @property def attention_mask(self): """ @@ -92,6 +96,7 @@ class Encoding: :obj:`List[int]`: The attention mask """ pass + def char_to_token(self, char_pos, sequence_index=0): """ Get the token that contains the char at the given position in the input sequence. @@ -106,6 +111,7 @@ class Encoding: :obj:`int`: The index of the token that contains this char in the encoded sequence """ pass + def char_to_word(self, char_pos, sequence_index=0): """ Get the word that contains the char at the given position in the input sequence. @@ -120,6 +126,7 @@ class Encoding: :obj:`int`: The index of the word that contains this char in the input sequence """ pass + @property def ids(self): """ @@ -132,6 +139,7 @@ class Encoding: :obj:`List[int]`: The list of IDs """ pass + @staticmethod def merge(encodings, growing_offsets=True): """ @@ -148,6 +156,7 @@ class Encoding: :class:`~tokenizers.Encoding`: The resulting Encoding """ pass + @property def n_sequences(self): """ @@ -157,6 +166,7 @@ class Encoding: :obj:`int`: The number of sequences in this :class:`~tokenizers.Encoding` """ pass + @property def offsets(self): """ @@ -169,6 +179,7 @@ class Encoding: A :obj:`List` of :obj:`Tuple[int, int]`: The list of offsets """ pass + @property def overflowing(self): """ @@ -183,6 +194,7 @@ class Encoding: maximum length. """ pass + def pad(self, length, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]"): """ Pad the :class:`~tokenizers.Encoding` at the given length @@ -204,6 +216,7 @@ class Encoding: The pad token to use """ pass + @property def sequence_ids(self): """ @@ -217,6 +230,7 @@ class Encoding: A :obj:`List` of :obj:`Optional[int]`: A list of optional sequence index. 
""" pass + def set_sequence_id(self, sequence_id): """ Set the given sequence index @@ -225,6 +239,7 @@ class Encoding: :class:`~tokenizers.Encoding`. """ pass + @property def special_tokens_mask(self): """ @@ -236,6 +251,7 @@ class Encoding: :obj:`List[int]`: The special tokens mask """ pass + def token_to_chars(self, token_index): """ Get the offsets of the token at the given index. @@ -252,6 +268,7 @@ class Encoding: :obj:`Tuple[int, int]`: The token offsets :obj:`(first, last + 1)` """ pass + def token_to_sequence(self, token_index): """ Get the index of the sequence represented by the given token. @@ -267,6 +284,7 @@ class Encoding: :obj:`int`: The sequence id of the given token """ pass + def token_to_word(self, token_index): """ Get the index of the word that contains the token in one of the input sequences. @@ -283,6 +301,7 @@ class Encoding: :obj:`int`: The index of the word in the relevant input sequence. """ pass + @property def tokens(self): """ @@ -294,6 +313,7 @@ class Encoding: :obj:`List[str]`: The list of tokens """ pass + def truncate(self, max_length, stride=0, direction="right"): """ Truncate the :class:`~tokenizers.Encoding` at the given length @@ -312,6 +332,7 @@ class Encoding: Truncate direction """ pass + @property def type_ids(self): """ @@ -324,6 +345,7 @@ class Encoding: :obj:`List[int]`: The list of type ids """ pass + @property def word_ids(self): """ @@ -341,6 +363,7 @@ class Encoding: A :obj:`List` of :obj:`Optional[int]`: A list of optional word index. """ pass + def word_to_chars(self, word_index, sequence_index=0): """ Get the offsets of the word at the given index in one of the input sequences. @@ -355,6 +378,7 @@ class Encoding: :obj:`Tuple[int, int]`: The range of characters (span) :obj:`(first, last + 1)` """ pass + def word_to_tokens(self, word_index, sequence_index=0): """ Get the encoded tokens corresponding to the word at the given index @@ -370,6 +394,7 @@ class Encoding: :obj:`Tuple[int, int]`: The range of tokens: :obj:`(first, last + 1)` """ pass + @property def words(self): """ @@ -404,37 +429,42 @@ class NormalizedString: sequence: str: The string sequence used to initialize this NormalizedString """ - def append(self, s): """ Append the given sequence to the string """ pass + def clear(self): """ Clears the string """ pass + def filter(self, func): """ Filter each character of the string using the given func """ pass + def for_each(self, func): """ Calls the given function for each character of the string """ pass + def lowercase(self): """ Lowercase the string """ pass + def lstrip(self): """ Strip the left of the string """ pass + def map(self, func): """ Calls the given function for each character of the string @@ -443,37 +473,44 @@ class NormalizedString: returned value **must** be a str of length 1 (ie a character). 
""" pass + def nfc(self): """ Runs the NFC normalization """ pass + def nfd(self): """ Runs the NFD normalization """ pass + def nfkc(self): """ Runs the NFKC normalization """ pass + def nfkd(self): """ Runs the NFKD normalization """ pass + @property def normalized(self): """ The normalized part of the string """ pass + def prepend(self, s): """ Prepend the given sequence to the string """ pass + def replace(self, pattern, content): """ Replace the content of the given pattern with the provided content @@ -486,16 +523,19 @@ class NormalizedString: The content to be used as replacement """ pass + def rstrip(self): """ Strip the right of the string """ pass + def slice(self, range): """ Slice the string using the given range """ pass + def split(self, pattern, behavior): """ Split the NormalizedString using the given pattern and the specified behavior @@ -513,11 +553,13 @@ class NormalizedString: A list of NormalizedString, representing each split """ pass + def strip(self): """ Strip both ends of the string """ pass + def uppercase(self): """ Uppercase the string @@ -542,9 +584,9 @@ class PreTokenizedString: sequence: str: The string sequence used to initialize this PreTokenizedString """ - def __init__(self, sequence): pass + def get_splits(self, offset_referential="original", offset_type="char"): """ Get the splits currently managed by the PreTokenizedString @@ -565,6 +607,7 @@ class PreTokenizedString: A list of splits """ pass + def normalize(self, func): """ Normalize each split of the `PreTokenizedString` using the given `func` @@ -576,6 +619,7 @@ class PreTokenizedString: NormalizedString allow its modification. """ pass + def split(self, func): """ Split the PreTokenizedString using the given `func` @@ -590,6 +634,7 @@ class PreTokenizedString: should come from calling either `.split` or `.slice` on the received one. """ pass + def to_encoding(self, type_id=0, word_idx=None): """ Return an Encoding generated from this PreTokenizedString @@ -607,6 +652,7 @@ class PreTokenizedString: An Encoding """ pass + def tokenize(self, func): """ Tokenize each split of the `PreTokenizedString` using the given `func` @@ -622,7 +668,6 @@ class Regex: """ Instantiate a new Regex with the given pattern """ - def __init__(self, pattern): pass @@ -639,9 +684,9 @@ class Tokenizer: The core algorithm that this :obj:`Tokenizer` should be using. """ - def __init__(self, model): pass + def add_special_tokens(self, tokens): """ Add the given special tokens to the Tokenizer. @@ -662,6 +707,7 @@ class Tokenizer: :obj:`int`: The number of tokens that were created in the vocabulary """ pass + def add_tokens(self, tokens): """ Add the given tokens to the vocabulary @@ -678,6 +724,7 @@ class Tokenizer: :obj:`int`: The number of tokens that were created in the vocabulary """ pass + def decode(self, ids, skip_special_tokens=True): """ Decode the given list of ids back to a string @@ -695,6 +742,7 @@ class Tokenizer: :obj:`str`: The decoded string """ pass + def decode_batch(self, sequences, skip_special_tokens=True): """ Decode a batch of ids back to their corresponding string @@ -710,12 +758,14 @@ class Tokenizer: :obj:`List[str]`: A list of decoded strings """ pass + @property def decoder(self): """ The `optional` :class:`~tokenizers.decoders.Decoder` in use by the Tokenizer """ pass + def enable_padding( self, direction="right", pad_id=0, pad_type_id=0, pad_token="[PAD]", length=None, pad_to_multiple_of=None ): @@ -745,6 +795,7 @@ class Tokenizer: the longest sequence in a batch. 
""" pass + def enable_truncation(self, max_length, stride=0, strategy="longest_first", direction="right"): """ Enable truncation @@ -765,6 +816,7 @@ class Tokenizer: Truncate direction """ pass + def encode(self, sequence, pair=None, is_pretokenized=False, add_special_tokens=True): """ Encode the given sequence and pair. This method can process raw text sequences @@ -803,6 +855,7 @@ class Tokenizer: """ pass + def encode_batch(self, input, is_pretokenized=False, add_special_tokens=True): """ Encode the given batch of inputs. This method accept both raw text sequences @@ -838,6 +891,7 @@ class Tokenizer: """ pass + @property def encode_special_tokens(self): """ @@ -850,6 +904,7 @@ class Tokenizer: """ pass + @staticmethod def from_buffer(buffer): """ @@ -863,6 +918,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + @staticmethod def from_file(path): """ @@ -877,6 +933,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + @staticmethod def from_pretrained(identifier, revision="main", auth_token=None): """ @@ -897,6 +954,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + @staticmethod def from_str(json): """ @@ -911,6 +969,7 @@ class Tokenizer: :class:`~tokenizers.Tokenizer`: The new tokenizer """ pass + def get_added_tokens_decoder(self): """ Get the underlying vocabulary @@ -919,6 +978,7 @@ class Tokenizer: :obj:`Dict[int, AddedToken]`: The vocabulary """ pass + def get_vocab(self, with_added_tokens=True): """ Get the underlying vocabulary @@ -931,6 +991,7 @@ class Tokenizer: :obj:`Dict[str, int]`: The vocabulary """ pass + def get_vocab_size(self, with_added_tokens=True): """ Get the size of the underlying vocabulary @@ -943,6 +1004,7 @@ class Tokenizer: :obj:`int`: The size of the vocabulary """ pass + def id_to_token(self, id): """ Convert the given id to its corresponding token if it exists @@ -955,28 +1017,33 @@ class Tokenizer: :obj:`Optional[str]`: An optional token, :obj:`None` if out of vocabulary """ pass + @property def model(self): """ The :class:`~tokenizers.models.Model` in use by the Tokenizer """ pass + def no_padding(self): """ Disable padding """ pass + def no_truncation(self): """ Disable truncation """ pass + @property def normalizer(self): """ The `optional` :class:`~tokenizers.normalizers.Normalizer` in use by the Tokenizer """ pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -984,6 +1051,7 @@ class Tokenizer: :return: """ pass + @property def padding(self): """ @@ -996,6 +1064,7 @@ class Tokenizer: A dict with the current padding parameters if padding is enabled """ pass + def post_process(self, encoding, pair=None, add_special_tokens=True): """ Apply all the post-processing steps to the given encodings. @@ -1022,18 +1091,21 @@ class Tokenizer: :class:`~tokenizers.Encoding`: The final post-processed encoding """ pass + @property def post_processor(self): """ The `optional` :class:`~tokenizers.processors.PostProcessor` in use by the Tokenizer """ pass + @property def pre_tokenizer(self): """ The `optional` :class:`~tokenizers.pre_tokenizers.PreTokenizer` in use by the Tokenizer """ pass + def save(self, path, pretty=True): """ Save the :class:`~tokenizers.Tokenizer` to the file at the given path. @@ -1046,6 +1118,7 @@ class Tokenizer: Whether the JSON file should be pretty formatted. 
""" pass + def to_str(self, pretty=False): """ Gets a serialized string representing this :class:`~tokenizers.Tokenizer`. @@ -1058,6 +1131,7 @@ class Tokenizer: :obj:`str`: A string representing the serialized Tokenizer """ pass + def token_to_id(self, token): """ Convert the given token to its corresponding id if it exists @@ -1070,6 +1144,7 @@ class Tokenizer: :obj:`Optional[int]`: An optional id, :obj:`None` if out of vocabulary """ pass + def train(self, files, trainer=None): """ Train the Tokenizer using the given files. @@ -1086,6 +1161,7 @@ class Tokenizer: An optional trainer that should be used to train our Model """ pass + def train_from_iterator(self, iterator, trainer=None, length=None): """ Train the Tokenizer using the provided iterator. @@ -1109,6 +1185,7 @@ class Tokenizer: provide meaningful progress tracking """ pass + @property def truncation(self): """ diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi index 83a0e827d..94dda2354 100644 --- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi +++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi @@ -6,7 +6,6 @@ class Decoder: This class is not supposed to be instantiated directly. Instead, any implementation of a Decoder will return an instance of this class when instantiated. """ - def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -29,9 +28,9 @@ class BPEDecoder(Decoder): The suffix that was used to caracterize an end-of-word. This suffix will be replaced by whitespaces during the decoding """ - def __init__(self, suffix=""): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -53,9 +52,9 @@ class ByteFallback(Decoder): cannot be decoded you will get � instead for each inconvertable byte token """ - def __init__(self): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -76,9 +75,9 @@ class ByteLevel(Decoder): This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.ByteLevel` :class:`~tokenizers.pre_tokenizers.PreTokenizer`. """ - def __init__(self): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -105,9 +104,9 @@ class CTC(Decoder): Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, and some abbreviated english forms. """ - def __init__(self, pad_token="", word_delimiter_token="|", cleanup=True): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -128,9 +127,9 @@ class Fuse(Decoder): This is the last step of decoding, this decoder exists only if there is need to add other decoders *after* the fusion """ - def __init__(self): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -157,9 +156,9 @@ class Metaspace(Decoder): Whether to add a space to the first word if there isn't already one. This lets us treat `hello` exactly like `say hello`. """ - def __init__(self, replacement="▁", add_prefix_space=True): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -180,9 +179,9 @@ class Replace(Decoder): This decoder is to be used in tandem with the :class:`~tokenizers.pre_tokenizers.Replace` :class:`~tokenizers.pre_tokenizers.PreTokenizer`. 
""" - def __init__(self, pattern, content): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -204,9 +203,9 @@ class Sequence(Decoder): decoders (:obj:`List[Decoder]`) The decoders that need to be chained """ - def __init__(self, decoders): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -225,9 +224,9 @@ class Strip(Decoder): Strip normalizer Strips n left characters of each token, or n right characters of each token """ - def __init__(self, content, left=0, right=0): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string @@ -253,9 +252,9 @@ class WordPiece(Decoder): Whether to cleanup some tokenization artifacts. Mainly spaces before punctuation, and some abbreviated english forms. """ - def __init__(self, prefix="##", cleanup=True): pass + def decode(self, tokens): """ Decode the given list of tokens to a final string diff --git a/bindings/python/py_src/tokenizers/models/__init__.pyi b/bindings/python/py_src/tokenizers/models/__init__.pyi index 0218f8e56..b46f32f25 100644 --- a/bindings/python/py_src/tokenizers/models/__init__.pyi +++ b/bindings/python/py_src/tokenizers/models/__init__.pyi @@ -8,7 +8,6 @@ class Model: This class cannot be constructed directly. Please use one of the concrete models. """ - def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -20,6 +19,7 @@ class Model: :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -32,6 +32,7 @@ class Model: :obj:`str`: The token associated to the ID """ pass + def save(self, folder, prefix): """ Save the current model @@ -51,6 +52,7 @@ class Model: :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -63,6 +65,7 @@ class Model: :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -110,7 +113,6 @@ class BPE(Model): byte_fallback (:obj:`bool`, `optional`): Whether to use spm byte-fallback trick (defaults to False) """ - def __init__( self, vocab=None, @@ -124,6 +126,7 @@ class BPE(Model): byte_fallback=False, ): pass + @staticmethod def from_file(cls, vocab, merge, **kwargs): """ @@ -149,6 +152,7 @@ class BPE(Model): :class:`~tokenizers.models.BPE`: An instance of BPE loaded from these files """ pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -160,6 +164,7 @@ class BPE(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -172,6 +177,7 @@ class BPE(Model): :obj:`str`: The token associated to the ID """ pass + @staticmethod def read_file(self, vocab, merges): """ @@ -193,6 +199,7 @@ class BPE(Model): The vocabulary and merges loaded into memory """ pass + def save(self, folder, prefix): """ Save the current model @@ -212,6 +219,7 @@ class BPE(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -224,6 +232,7 @@ class BPE(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -245,9 +254,9 @@ class Unigram(Model): vocab (:obj:`List[Tuple[str, float]]`, `optional`, `optional`): A list of vocabulary items and their relative score [("am", -0.2442),...] 
""" - def __init__(self, vocab, unk_id, byte_fallback): pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -259,6 +268,7 @@ class Unigram(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -271,6 +281,7 @@ class Unigram(Model): :obj:`str`: The token associated to the ID """ pass + def save(self, folder, prefix): """ Save the current model @@ -290,6 +301,7 @@ class Unigram(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -302,6 +314,7 @@ class Unigram(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -328,9 +341,9 @@ class WordLevel(Model): unk_token (:obj:`str`, `optional`): The unknown token to be used by the model. """ - def __init__(self, vocab, unk_token): pass + @staticmethod def from_file(vocab, unk_token): """ @@ -353,6 +366,7 @@ class WordLevel(Model): :class:`~tokenizers.models.WordLevel`: An instance of WordLevel loaded from file """ pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -364,6 +378,7 @@ class WordLevel(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -376,6 +391,7 @@ class WordLevel(Model): :obj:`str`: The token associated to the ID """ pass + @staticmethod def read_file(vocab): """ @@ -393,6 +409,7 @@ class WordLevel(Model): :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` """ pass + def save(self, folder, prefix): """ Save the current model @@ -412,6 +429,7 @@ class WordLevel(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -424,6 +442,7 @@ class WordLevel(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence @@ -451,9 +470,9 @@ class WordPiece(Model): max_input_chars_per_word (:obj:`int`, `optional`): The maximum number of characters to authorize in a single word. 
""" - def __init__(self, vocab, unk_token, max_input_chars_per_word): pass + @staticmethod def from_file(vocab, **kwargs): """ @@ -476,6 +495,7 @@ class WordPiece(Model): :class:`~tokenizers.models.WordPiece`: An instance of WordPiece loaded from file """ pass + def get_trainer(self): """ Get the associated :class:`~tokenizers.trainers.Trainer` @@ -487,6 +507,7 @@ class WordPiece(Model): :class:`~tokenizers.trainers.Trainer`: The Trainer used to train this model """ pass + def id_to_token(self, id): """ Get the token associated to an ID @@ -499,6 +520,7 @@ class WordPiece(Model): :obj:`str`: The token associated to the ID """ pass + @staticmethod def read_file(vocab): """ @@ -517,6 +539,7 @@ class WordPiece(Model): :obj:`Dict[str, int]`: The vocabulary as a :obj:`dict` """ pass + def save(self, folder, prefix): """ Save the current model @@ -536,6 +559,7 @@ class WordPiece(Model): :obj:`List[str]`: The list of saved files """ pass + def token_to_id(self, tokens): """ Get the ID associated to a token @@ -548,6 +572,7 @@ class WordPiece(Model): :obj:`int`: The ID associated to the token """ pass + def tokenize(self, sequence): """ Tokenize a sequence diff --git a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi index 09c2d8397..507d44731 100644 --- a/bindings/python/py_src/tokenizers/normalizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/normalizers/__init__.pyi @@ -6,7 +6,6 @@ class Normalizer: This class is not supposed to be instantiated directly. Instead, any implementation of a Normalizer will return an instance of this class when instantiated. """ - def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -22,6 +21,7 @@ class Normalizer: :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -62,9 +62,9 @@ class BertNormalizer(Normalizer): lowercase (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to lowercase. 
""" - def __init__(self, clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -80,6 +80,7 @@ class BertNormalizer(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -102,9 +103,9 @@ class Lowercase(Normalizer): """ Lowercase Normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -120,6 +121,7 @@ class Lowercase(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -142,9 +144,9 @@ class NFC(Normalizer): """ NFC Unicode Normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -160,6 +162,7 @@ class NFC(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -182,9 +185,9 @@ class NFD(Normalizer): """ NFD Unicode Normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -200,6 +203,7 @@ class NFD(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -222,9 +226,9 @@ class NFKC(Normalizer): """ NFKC Unicode Normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -240,6 +244,7 @@ class NFKC(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -262,9 +267,9 @@ class NFKD(Normalizer): """ NFKD Unicode Normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -280,6 +285,7 @@ class NFKD(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -302,9 +308,9 @@ class Nmt(Normalizer): """ Nmt normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -320,6 +326,7 @@ class Nmt(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -343,9 +350,9 @@ class Precompiled(Normalizer): Precompiled normalizer Don't use manually it is used for compatiblity for SentencePiece. 
""" - def __init__(self, precompiled_charsmap): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -361,6 +368,7 @@ class Precompiled(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -383,9 +391,9 @@ class Prepend(Normalizer): """ Prepend normalizer """ - def __init__(self, prepend): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -401,6 +409,7 @@ class Prepend(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -423,9 +432,9 @@ class Replace(Normalizer): """ Replace normalizer """ - def __init__(self, pattern, content): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -441,6 +450,7 @@ class Replace(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -468,7 +478,6 @@ class Sequence(Normalizer): normalizers (:obj:`List[Normalizer]`): A list of Normalizer to be run as a sequence """ - def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -484,6 +493,7 @@ class Sequence(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -506,9 +516,9 @@ class Strip(Normalizer): """ Strip normalizer """ - def __init__(self, left=True, right=True): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -524,6 +534,7 @@ class Strip(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string @@ -546,9 +557,9 @@ class StripAccents(Normalizer): """ StripAccents normalizer """ - def __init__(self): pass + def normalize(self, normalized): """ Normalize a :class:`~tokenizers.NormalizedString` in-place @@ -564,6 +575,7 @@ class StripAccents(Normalizer): :class:`~tokenizers.normalizers.Normalizer` """ pass + def normalize_str(self, sequence): """ Normalize the given string diff --git a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi index e3cb84dd2..9e975326f 100644 --- a/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/pre_tokenizers/__init__.pyi @@ -6,7 +6,6 @@ class PreTokenizer: This class is not supposed to be instantiated directly. Instead, any implementation of a PreTokenizer will return an instance of this class when instantiated. """ - def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -23,6 +22,7 @@ class PreTokenizer: :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -50,9 +50,9 @@ class BertPreTokenizer(PreTokenizer): This pre-tokenizer splits tokens on spaces, and also on punctuation. Each occurence of a punctuation character will be treated separately. 
""" - def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -69,6 +69,7 @@ class BertPreTokenizer(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -104,9 +105,9 @@ class ByteLevel(PreTokenizer): Set this to :obj:`False` to prevent this `pre_tokenizer` from using the GPT2 specific regexp for spliting on whitespace. """ - def __init__(self, add_prefix_space=True, use_regex=True): pass + @staticmethod def alphabet(): """ @@ -120,6 +121,7 @@ class ByteLevel(PreTokenizer): :obj:`List[str]`: A list of characters that compose the alphabet """ pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -136,6 +138,7 @@ class ByteLevel(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -164,7 +167,6 @@ class CharDelimiterSplit(PreTokenizer): delimiter: str: The delimiter char that will be used to split input """ - def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -181,6 +183,7 @@ class CharDelimiterSplit(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -215,9 +218,9 @@ class Digits(PreTokenizer): "Call 123 please" -> "Call ", "123", " please" """ - def __init__(self, individual_digits=False): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -234,6 +237,7 @@ class Digits(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -270,9 +274,9 @@ class Metaspace(PreTokenizer): Whether to add a space to the first word if there isn't already one. This lets us treat `hello` exactly like `say hello`. """ - def __init__(self, replacement="_", add_prefix_space=True): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -289,6 +293,7 @@ class Metaspace(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -319,9 +324,9 @@ class Punctuation(PreTokenizer): Choices: "removed", "isolated" (default), "merged_with_previous", "merged_with_next", "contiguous" """ - def __init__(self, behavior="isolated"): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -338,6 +343,7 @@ class Punctuation(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -362,9 +368,9 @@ class Sequence(PreTokenizer): """ This pre-tokenizer composes other pre_tokenizers and applies them in sequence """ - def __init__(self, pretokenizers): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -381,6 +387,7 @@ class Sequence(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -421,9 +428,9 @@ class Split(PreTokenizer): invert (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to invert the pattern. 
""" - def __init__(self, pattern, behavior, invert=False): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -440,6 +447,7 @@ class Split(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -467,9 +475,9 @@ class UnicodeScripts(PreTokenizer): Actually Hiragana and Katakana are fused with Han, and 0x30FC is Han too. This mimicks SentencePiece Unigram implementation. """ - def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -486,6 +494,7 @@ class UnicodeScripts(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -510,9 +519,9 @@ class Whitespace(PreTokenizer): """ This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+` """ - def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -529,6 +538,7 @@ class Whitespace(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string @@ -553,9 +563,9 @@ class WhitespaceSplit(PreTokenizer): """ This pre-tokenizer simply splits on the whitespace. Works like `.split()` """ - def __init__(self): pass + def pre_tokenize(self, pretok): """ Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place @@ -572,6 +582,7 @@ class WhitespaceSplit(PreTokenizer): :class:`~tokenizers.pre_tokenizers.PreTokenizer` """ pass + def pre_tokenize_str(self, sequence): """ Pre tokenize the given string diff --git a/bindings/python/py_src/tokenizers/processors/__init__.pyi b/bindings/python/py_src/tokenizers/processors/__init__.pyi index ab73a337c..5136d02bb 100644 --- a/bindings/python/py_src/tokenizers/processors/__init__.pyi +++ b/bindings/python/py_src/tokenizers/processors/__init__.pyi @@ -6,7 +6,6 @@ class PostProcessor: This class is not supposed to be instantiated directly. Instead, any implementation of a PostProcessor will return an instance of this class when instantiated. """ - def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -19,6 +18,7 @@ class PostProcessor: :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -53,9 +53,9 @@ class BertProcessing(PostProcessor): cls (:obj:`Tuple[str, int]`): A tuple with the string representation of the CLS token, and its id """ - def __init__(self, sep, cls): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -68,6 +68,7 @@ class BertProcessing(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -98,9 +99,9 @@ class ByteLevel(PostProcessor): trim_offsets (:obj:`bool`): Whether to trim the whitespaces from the produced offsets. """ - def __init__(self, trim_offsets=True): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. 
@@ -113,6 +114,7 @@ class ByteLevel(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -159,9 +161,9 @@ class RobertaProcessing(PostProcessor): Whether the add_prefix_space option was enabled during pre-tokenization. This is relevant because it defines the way the offsets are trimmed out. """ - def __init__(self, sep, cls, trim_offsets=True, add_prefix_space=True): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -174,6 +176,7 @@ class RobertaProcessing(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -201,9 +204,9 @@ class Sequence(PostProcessor): processors (:obj:`List[PostProcessor]`) The processors that need to be chained """ - def __init__(self, processors): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -216,6 +219,7 @@ class Sequence(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one @@ -302,9 +306,9 @@ class TemplateProcessing(PostProcessor): The given dict expects the provided :obj:`ids` and :obj:`tokens` lists to have the same length. """ - def __init__(self, single, pair, special_tokens): pass + def num_special_tokens_to_add(self, is_pair): """ Return the number of special tokens that would be added for single/pair sentences. @@ -317,6 +321,7 @@ class TemplateProcessing(PostProcessor): :obj:`int`: The number of tokens to add """ pass + def process(self, encoding, pair=None, add_special_tokens=True): """ Post-process the given encodings, generating the final one diff --git a/bindings/python/py_src/tokenizers/tools/visualizer.py b/bindings/python/py_src/tokenizers/tools/visualizer.py index da368054c..c988a6481 100644 --- a/bindings/python/py_src/tokenizers/tools/visualizer.py +++ b/bindings/python/py_src/tokenizers/tools/visualizer.py @@ -92,7 +92,7 @@ def __init__( if default_to_notebook: try: from IPython.core.display import HTML, display - except ImportError as e: + except ImportError: raise Exception( """We couldn't import IPython utils for html display. Are you running in a notebook? @@ -136,7 +136,7 @@ def __call__( if final_default_to_notebook: try: from IPython.core.display import HTML, display - except ImportError as e: + except ImportError: raise Exception( """We couldn't import IPython utils for html display. Are you running in a notebook?""" @@ -170,7 +170,7 @@ def calculate_label_colors(annotations: AnnotationList) -> Dict[str, str]: if h_step < 20: h_step = 20 s = 32 - l = 64 + l = 64 # noqa: E741 h = 10 colors = {} diff --git a/bindings/python/py_src/tokenizers/trainers/__init__.pyi b/bindings/python/py_src/tokenizers/trainers/__init__.pyi index 911fdeb29..d6c525718 100644 --- a/bindings/python/py_src/tokenizers/trainers/__init__.pyi +++ b/bindings/python/py_src/tokenizers/trainers/__init__.pyi @@ -80,7 +80,6 @@ class UnigramTrainer(Trainer): The number of iterations of the EM algorithm to perform before pruning the vocabulary. 
""" - def __init__( self, vocab_size=8000, @@ -143,7 +142,6 @@ class WordPieceTrainer(Trainer): end_of_word_suffix (:obj:`str`, `optional`): A suffix to be used for every subword that is a end-of-word. """ - def __init__( self, vocab_size=30000, diff --git a/bindings/python/pyproject.toml b/bindings/python/pyproject.toml index 81f70af3d..5cdf090fa 100644 --- a/bindings/python/pyproject.toml +++ b/bindings/python/pyproject.toml @@ -34,7 +34,7 @@ Source = 'https://github.com/huggingface/tokenizers' [project.optional-dependencies] -testing = ["pytest", "requests", "numpy", "datasets", "black==22.3"] +testing = ["pytest", "requests", "numpy", "datasets", "black==22.3", "ruff"] docs = ["sphinx", "sphinx_rtd_theme", "setuptools_rust"] dev = ["tokenizers[testing]"] @@ -52,3 +52,21 @@ features = ["pyo3/extension-module"] [tool.black] line-length = 119 target-version = ['py35'] + +[tool.ruff] +line-length = 119 +target-version = "py311" +lint.ignore = [ + # a == None in tests vs is None. + "E711", + # a == False in tests vs is False. + "E712", + # try.. import except.. pattern without using the lib. + "F401", + # Raw type equality is required in asserts + "E721", + # Import order + "E402", + # Fixtures unused import + "F811", +] diff --git a/bindings/python/scripts/convert.py b/bindings/python/scripts/convert.py index 6c812f8c7..e6df5ad8a 100644 --- a/bindings/python/scripts/convert.py +++ b/bindings/python/scripts/convert.py @@ -80,9 +80,7 @@ def tokenizer(self, proto): tokenizer = Tokenizer(Unigram(vocab, unk_id)) elif model_type == 2: vocab, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract() - tokenizer = Tokenizer( - BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True) - ) + tokenizer = Tokenizer(BPE(vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True)) else: raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm" @@ -105,12 +103,8 @@ def converted(self): replacement = "▁" add_prefix_space = True - tokenizer.pre_tokenizer = Metaspace( - replacement=replacement, add_prefix_space=add_prefix_space - ) - tokenizer.decoder = decoders.Metaspace( - replacement=replacement, add_prefix_space=add_prefix_space - ) + tokenizer.pre_tokenizer = Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) + tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space) post_processor = self.post_processor(tokenizer) if post_processor: tokenizer.post_processor = post_processor @@ -124,9 +118,7 @@ def converted(self): class AlbertConverter(SpmConverter): def vocab(self, proto): return [ - (piece.piece, piece.score) - if check_number_comma(piece.piece) - else (piece.piece, piece.score - 100) + (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100) for piece in proto.pieces ] @@ -261,9 +253,7 @@ def post_processor(self, tokenizer): class XLNetConverter(SpmConverter): def vocab(self, proto): return [ - (piece.piece, piece.score) - if check_number_comma(piece.piece) - else (piece.piece, piece.score - 100) + (piece.piece, piece.score) if check_number_comma(piece.piece) else (piece.piece, piece.score - 100) for piece in proto.pieces ] @@ -420,9 +410,7 @@ def main(): print(f"|{'-'*model_len}|{'-'*status_len}|{'-'*speedup_len}|") for pretrained in args.models: status, speedup = check(pretrained, args.filename) - print( - f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|" - ) 
+ print(f"|{pretrained:<{model_len}}|{status:^{status_len}}|{speedup:^{speedup_len - 1}.2f}x|") if __name__ == "__main__": diff --git a/bindings/python/scripts/sentencepiece_extractor.py b/bindings/python/scripts/sentencepiece_extractor.py index fba05d8f4..a7bce9b49 100644 --- a/bindings/python/scripts/sentencepiece_extractor.py +++ b/bindings/python/scripts/sentencepiece_extractor.py @@ -59,7 +59,6 @@ def __init__(self, model: str): def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: with open(self._model, "r") as model_f: - # Retrieve information nb_pieces, nb_merges = map(int, model_f.readline().split()) vocab, merges = {}, [] @@ -97,9 +96,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: choices=["sentencepiece", "youtokentome"], help="Indicate the format of the file.", ) - parser.add_argument( - "--model", type=str, required=True, help="SentencePiece model to extract vocab from." - ) + parser.add_argument("--model", type=str, required=True, help="SentencePiece model to extract vocab from.") parser.add_argument( "--vocab-output-path", type=str, @@ -128,9 +125,7 @@ def extract(self) -> Tuple[Dict[str, int], List[Tuple]]: args.model = f.name # Allocate extractor - extractor = ( - SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor - ) + extractor = SentencePieceExtractor if args.provider == "sentencepiece" else YouTokenToMeExtractor extractor = extractor(args.model) logger.info(f"Using {type(extractor).__name__}") diff --git a/bindings/python/scripts/spm_parity_check.py b/bindings/python/scripts/spm_parity_check.py index 09e5b9475..33cfff4fa 100644 --- a/bindings/python/scripts/spm_parity_check.py +++ b/bindings/python/scripts/spm_parity_check.py @@ -121,9 +121,7 @@ def check_train(args): break print(f"Tokenizer used {tokenizer_tokens}, where spm used {spm_tokens}") - assert ( - tokenizer_tokens < spm_tokens - ), "Our trainer should be at least more efficient than the SPM one" + assert tokenizer_tokens < spm_tokens, "Our trainer should be at least more efficient than the SPM one" print("Ok our trainer is at least more efficient than the SPM one") @@ -131,9 +129,7 @@ def check_diff(spm_diff, tok_diff, sp, tok): if spm_diff == list(reversed(tok_diff)): # AAA -> AA+A vs A+AA case. 
return True - elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode( - tok_diff - ): + elif len(spm_diff) == len(tok_diff) and tok.decode(spm_diff) == tok.decode(tok_diff): # Second order OK # Barrich -> Barr + ich vs Bar + rich return True @@ -173,24 +169,17 @@ def check_details(line, spm_ids, tok_ids, sp, tok): spms = Counter(spm_ids[first:last]) toks = Counter(tok_ids[first:last]) - removable_tokens = { - spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si - } + removable_tokens = {spm_ for (spm_, si) in spms.items() if toks.get(spm_, 0) == si} min_width = 3 for i in range(last - first - min_width): - if all( - spm_ids[first + i + j] in removable_tokens for j in range(min_width) - ): + if all(spm_ids[first + i + j] in removable_tokens for j in range(min_width)): possible_matches = [ k for k in range(last - first - min_width) - if tok_ids[first + k : first + k + min_width] - == spm_ids[first + i : first + i + min_width] + if tok_ids[first + k : first + k + min_width] == spm_ids[first + i : first + i + min_width] ] for j in possible_matches: - if check_diff( - spm_ids[first : first + i], tok_ids[first : first + j], sp, tok - ) and check_details( + if check_diff(spm_ids[first : first + i], tok_ids[first : first + j], sp, tok) and check_details( line, spm_ids[first + i : last], tok_ids[first + j : last], @@ -210,9 +199,7 @@ def check_details(line, spm_ids, tok_ids, sp, tok): wrong = tok.decode(spm_ids[first:last]) print() if has_color: - print( - f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}" - ) + print(f"{colored(ok_start, 'grey')}{colored(wrong, 'red')}{colored(ok_end, 'grey')}") else: print(wrong) return False @@ -251,9 +238,7 @@ def check_encode(args): if args.verbose: if i % 10000 == 0: - print( - f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})" - ) + print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})") print(f"SPM: {spm_total_time} - TOK: {tok_total_time}") if ids != encoded.ids: @@ -265,13 +250,13 @@ def check_encode(args): else: perfect += 1 - assert ids == encoded.ids, f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}" + assert ( + ids == encoded.ids + ), f"line {i}: {line} : \n\n{ids}\n{encoded.ids}\n{list(zip(encoded.ids, encoded.tokens))}" print(f"({perfect} / {imperfect} / {wrong} ----- {perfect + imperfect + wrong})") total = perfect + imperfect + wrong - print( - f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}" - ) + print(f"Accuracy {perfect * 100 / total:.2f} Slowdown : {tok_total_time/ spm_total_time:.2f}") if __name__ == "__main__": diff --git a/bindings/python/stub.py b/bindings/python/stub.py index c9a20237c..41ef2d6ec 100644 --- a/bindings/python/stub.py +++ b/bindings/python/stub.py @@ -3,8 +3,6 @@ import os from pathlib import Path -import black - INDENT = " " * 4 GENERATED_COMMENT = "# Generated content DO NOT EDIT\n" @@ -85,7 +83,7 @@ def pyi_file(obj, indent=""): body += f"{indent+INDENT}pass\n" body += "\n" - for (name, fn) in fns: + for name, fn in fns: body += pyi_file(fn, indent=indent) if not body: @@ -122,18 +120,17 @@ def py_file(module, origin): return string -def do_black(content, is_pyi): - mode = black.Mode( - target_versions={black.TargetVersion.PY35}, - line_length=119, - is_pyi=is_pyi, - string_normalization=True, - experimental_string_processing=False, - ) - try: - return black.format_file_contents(content, fast=True, mode=mode) - except 
black.NothingChanged: - return content +import subprocess +from typing import List, Optional, Tuple + + +def do_ruff(code, is_pyi: bool): + command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"] + if is_pyi: + command.extend(["--stdin-filename", "test.pyi"]) + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE) + stdout, _ = process.communicate(input=code.encode("utf-8")) + return stdout.decode("utf-8") def write(module, directory, origin, check=False): @@ -141,7 +138,7 @@ def write(module, directory, origin, check=False): filename = os.path.join(directory, "__init__.pyi") pyi_content = pyi_file(module) - pyi_content = do_black(pyi_content, is_pyi=True) + pyi_content = do_ruff(pyi_content, is_pyi=True) os.makedirs(directory, exist_ok=True) if check: with open(filename, "r") as f: @@ -153,7 +150,7 @@ def write(module, directory, origin, check=False): filename = os.path.join(directory, "__init__.py") py_content = py_file(module, origin) - py_content = do_black(py_content, is_pyi=False) + py_content = do_ruff(py_content, is_pyi=False) os.makedirs(directory, exist_ok=True) is_auto = False diff --git a/bindings/python/tests/bindings/test_models.py b/bindings/python/tests/bindings/test_models.py index b4b29682d..c6a50ce86 100644 --- a/bindings/python/tests/bindings/test_models.py +++ b/bindings/python/tests/bindings/test_models.py @@ -3,7 +3,6 @@ import pytest from tokenizers.models import BPE, Model, WordLevel, WordPiece - from ..utils import bert_files, data_dir, roberta_files diff --git a/bindings/python/tests/bindings/test_normalizers.py b/bindings/python/tests/bindings/test_normalizers.py index cf9f3d1a4..3fafd60d1 100644 --- a/bindings/python/tests/bindings/test_normalizers.py +++ b/bindings/python/tests/bindings/test_normalizers.py @@ -2,8 +2,7 @@ import pytest -from tokenizers import NormalizedString, Tokenizer -from tokenizers.models import BPE +from tokenizers import NormalizedString from tokenizers.normalizers import BertNormalizer, Lowercase, Normalizer, Sequence, Strip, Prepend diff --git a/bindings/python/tests/bindings/test_processors.py b/bindings/python/tests/bindings/test_processors.py index 14af9fbe4..842754a69 100644 --- a/bindings/python/tests/bindings/test_processors.py +++ b/bindings/python/tests/bindings/test_processors.py @@ -146,18 +146,18 @@ def test_instantiate(self): assert isinstance(pickle.loads(pickle.dumps(bert)), TemplateProcessing) # It is absolutely legal to have tokens with spaces in the name: - processor = TemplateProcessing( + TemplateProcessing( single=["[ C L S ]", "Token with space"], special_tokens=[("[ C L S ]", 0), ("Token with space", 1)], ) # Sequence identifiers must be well formed: with pytest.raises(Exception, match="Cannot build Piece"): - processor = TemplateProcessing(single="[CLS] $$ [SEP]") + TemplateProcessing(single="[CLS] $$ [SEP]") with pytest.raises(Exception, match="Cannot build Piece"): - processor = TemplateProcessing(single="[CLS] $A: [SEP]") + TemplateProcessing(single="[CLS] $A: [SEP]") # Special tokens must be provided when used in template: with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"): - processor = TemplateProcessing(single=["[CLS]"]) + TemplateProcessing(single=["[CLS]"]) def test_bert_parity(self): tokenizer = Tokenizer(BPE()) diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py index 2eb5ce59c..01deb7a85 100644 --- 
a/bindings/python/tests/bindings/test_tokenizer.py +++ b/bindings/python/tests/bindings/test_tokenizer.py @@ -5,10 +5,9 @@ from tokenizers import AddedToken, Encoding, Tokenizer from tokenizers.implementations import BertWordPieceTokenizer -from tokenizers.models import BPE, Model, WordPiece, Unigram -from tokenizers.normalizers import Lowercase +from tokenizers.models import BPE, Model, Unigram from tokenizers.pre_tokenizers import ByteLevel -from tokenizers.processors import BertProcessing, RobertaProcessing +from tokenizers.processors import RobertaProcessing from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files diff --git a/bindings/python/tests/documentation/test_pipeline.py b/bindings/python/tests/documentation/test_pipeline.py index 90117f075..25300ff64 100644 --- a/bindings/python/tests/documentation/test_pipeline.py +++ b/bindings/python/tests/documentation/test_pipeline.py @@ -2,7 +2,6 @@ from ..utils import data_dir, doc_pipeline_bert_tokenizer, doc_wiki_tokenizer - disable_printing = True original_print = print diff --git a/bindings/python/tests/documentation/test_quicktour.py b/bindings/python/tests/documentation/test_quicktour.py index 866a6f99d..a98b0c12e 100644 --- a/bindings/python/tests/documentation/test_quicktour.py +++ b/bindings/python/tests/documentation/test_quicktour.py @@ -1,8 +1,4 @@ from tokenizers import Tokenizer -from tokenizers.models import BPE -from tokenizers.pre_tokenizers import Whitespace -from tokenizers.trainers import BpeTrainer - from ..utils import data_dir, doc_wiki_tokenizer diff --git a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py index 58d93351d..fc9ffce19 100644 --- a/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py +++ b/bindings/python/tests/documentation/test_tutorial_train_from_iterators.py @@ -1,3 +1,4 @@ +# flake8: noqa import gzip import os diff --git a/bindings/python/tests/implementations/test_base_tokenizer.py b/bindings/python/tests/implementations/test_base_tokenizer.py index 5b4c45160..535964656 100644 --- a/bindings/python/tests/implementations/test_base_tokenizer.py +++ b/bindings/python/tests/implementations/test_base_tokenizer.py @@ -1,5 +1,3 @@ -import pytest - from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, processors from tokenizers.implementations import BaseTokenizer diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py index a05d98c94..4e7c29cf5 100644 --- a/bindings/python/tests/implementations/test_bert_wordpiece.py +++ b/bindings/python/tests/implementations/test_bert_wordpiece.py @@ -1,5 +1,3 @@ -import pytest - from tokenizers import BertWordPieceTokenizer from ..utils import bert_files, data_dir, multiprocessing_with_parallelism diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py index 579575d3f..441aded7a 100644 --- a/bindings/python/tests/implementations/test_byte_level_bpe.py +++ b/bindings/python/tests/implementations/test_byte_level_bpe.py @@ -1,5 +1,3 @@ -import pytest - from tokenizers import ByteLevelBPETokenizer from ..utils import data_dir, multiprocessing_with_parallelism, roberta_files diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py index 09b2fc6e1..3ce5cf9a3 100644 
--- a/bindings/python/tests/implementations/test_char_bpe.py +++ b/bindings/python/tests/implementations/test_char_bpe.py @@ -1,5 +1,3 @@ -import pytest - from tokenizers import CharBPETokenizer from ..utils import data_dir, multiprocessing_with_parallelism, openai_files diff --git a/bindings/python/tests/implementations/test_sentencepiece.py b/bindings/python/tests/implementations/test_sentencepiece.py index d9fade774..1da41fec0 100644 --- a/bindings/python/tests/implementations/test_sentencepiece.py +++ b/bindings/python/tests/implementations/test_sentencepiece.py @@ -1,5 +1,3 @@ -import os - import pytest from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer diff --git a/bindings/python/tests/test_serialization.py b/bindings/python/tests/test_serialization.py index 2057d763e..a56c6bb33 100644 --- a/bindings/python/tests/test_serialization.py +++ b/bindings/python/tests/test_serialization.py @@ -6,7 +6,6 @@ from huggingface_hub import HfApi, cached_download, hf_hub_url from tokenizers import Tokenizer - from .utils import albert_base, data_dir @@ -15,7 +14,7 @@ def test_full_serialization_albert(self, albert_base): # Check we can read this file. # This used to fail because of BufReader that would fail because the # file exceeds the buffer capacity - tokenizer = Tokenizer.from_file(albert_base) + Tokenizer.from_file(albert_base) def check(tokenizer_file) -> bool: @@ -51,8 +50,6 @@ def test_full_deserialization_hub(self): # Check we can read this file. # This used to fail because of BufReader that would fail because the # file exceeds the buffer capacity - api = HfApi() - not_loadable = [] invalid_pre_tokenizer = [] @@ -77,7 +74,7 @@ def test_full_deserialization_hub(self): except Exception as e: print(f"{model_id} is not loadable: {e}") not_loadable.append(model_id) - except: + except: # noqa: E722 print(f"{model_id} is not loadable: Rust error") not_loadable.append(model_id)
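
For reference, the `do_ruff` helper introduced in `stub.py` above shells out to `ruff format` over stdin instead of calling `black` as a library. Below is a minimal standalone sketch of that flow; it assumes `ruff` is installed and that it is run from `bindings/python` so that `--config pyproject.toml` resolves. The `format_with_ruff` name and the sample snippet are illustrative only and are not part of the patch.

import subprocess


def format_with_ruff(code: str, is_pyi: bool = False) -> str:
    # Same invocation as the do_ruff() helper added to stub.py: pipe the source
    # through `ruff format` and read the formatted result back from stdout.
    command = ["ruff", "format", "--config", "pyproject.toml", "--silent", "-"]
    if is_pyi:
        # Tell ruff the stdin payload is a .pyi stub so stub formatting rules apply.
        command.extend(["--stdin-filename", "test.pyi"])
    process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
    stdout, _ = process.communicate(input=code.encode("utf-8"))
    return stdout.decode("utf-8")


if __name__ == "__main__":
    # Deliberately badly formatted input, purely for illustration.
    messy = "def add( a,b ):\n        return a+b\n"
    print(format_with_ruff(messy), end="")

Unlike the previous black-based hook, formatting problems now surface only through ruff's exit status and stderr, which this sketch ignores in the same way stub.py does.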
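
The updated `check-style` Makefile target reduces to three commands. A throwaway wrapper that mirrors it from Python could look like the sketch below; it is hypothetical (not in the repo), and assumes it is executed from `bindings/python` with `ruff` on the PATH.

import subprocess
import sys

# Mirrors the updated `check-style` target: regenerate stubs in check mode,
# then run ruff's linter and formatter in check-only mode over the same dirs.
CHECK_DIRS = ["examples", "py_src/tokenizers", "tests"]

commands = [
    [sys.executable, "stub.py", "--check"],
    ["ruff", "check", *CHECK_DIRS],
    ["ruff", "format", "--check", *CHECK_DIRS],
]

failed = False
for command in commands:
    # Let each command print its own diagnostics; only track the aggregate status.
    if subprocess.run(command).returncode != 0:
        failed = True

sys.exit(1 if failed else 0)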