[remove black] And use ruff (#1436)
* nits

* Fixing deps.

* Ruff update.

* Import order matters.

* Fix.

* Revert ruff fix.

* Visualizer.

* Putting back the imports.

---------

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
ArthurZucker and Narsil authored Mar 12, 2024
1 parent 72a1973 commit 29fef1e
Showing 29 changed files with 258 additions and 169 deletions.
6 changes: 4 additions & 2 deletions bindings/python/Makefile
@@ -8,12 +8,14 @@ check_dirs := examples py_src/tokenizers tests
# Format source code automatically
style:
	python stub.py
-	black --line-length 119 --target-version py35 $(check_dirs)
+	ruff check $(check_dirs) --fix
+	ruff format $(check_dirs)

# Check the source code is formatted correctly
check-style:
	python stub.py --check
-	black --check --line-length 119 --target-version py35 examples py_src/tokenizers tests
+	ruff check examples py_src/tokenizers tests
+	ruff format --check examples py_src/tokenizers tests

TESTS_RESOURCES = $(DATA_DIR)/small.txt $(DATA_DIR)/roberta.json
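
With this change the two Make targets rely entirely on ruff: "ruff check --fix" applies lint autofixes, and "ruff format" takes over the formatting job black used to do. As a minimal sketch, the same commands could be driven from Python like this (assuming ruff is installed in the environment; the directory list mirrors the check_dirs variable above):

import subprocess

# Same directories as the Makefile's check_dirs variable.
dirs = ["examples", "py_src/tokenizers", "tests"]

# Mirror the "style" target: lint with autofixes, then reformat in place.
subprocess.run(["ruff", "check", *dirs, "--fix"], check=True)
subprocess.run(["ruff", "format", *dirs], check=True)

# Mirror the "check-style" target: exit non-zero if anything is off.
subprocess.run(["ruff", "check", *dirs], check=True)
subprocess.run(["ruff", "format", "--check", *dirs], check=True)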

11 changes: 4 additions & 7 deletions bindings/python/examples/example.py
@@ -4,16 +4,15 @@

from tqdm import tqdm

-
-logging.getLogger("transformers").disabled = True
-logging.getLogger("transformers.tokenization_utils").disabled = True
-
from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE, WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.processors import BertProcessing
from transformers import BertTokenizer, GPT2Tokenizer
+
+logging.getLogger("transformers").disabled = True
+logging.getLogger("transformers.tokenization_utils").disabled = True


parser = argparse.ArgumentParser()
parser.add_argument("--type", default="gpt2", type=str, help="The type of tokenizer (bert|gpt2)")
@@ -51,9 +50,7 @@
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
-""".split(
-    "\n"
-)
+""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
2 changes: 1 addition & 1 deletion bindings/python/examples/train_with_datasets.py
@@ -1,6 +1,6 @@
import datasets

-from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
+from tokenizers import Tokenizer, models, normalizers, pre_tokenizers


# Build a tokenizer
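
Dropping the unused trainers name is the kind of edit ruff's F401 rule ("imported but unused") produces, and "ruff check --fix" deletes such imports automatically. A hypothetical illustration of the rule, not taken from this repository:

import json  # used below, so ruff keeps it
import os  # F401: imported but unused; "ruff check --fix" would delete this line

print(json.dumps({"status": "ok"}))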