diff --git a/pii/ner/requirements.txt b/pii/ner/requirements.txt
new file mode 100644
index 0000000..4901e0b
--- /dev/null
+++ b/pii/ner/requirements.txt
@@ -0,0 +1,4 @@
+datasets
+transformers
+evaluate
+seqeval
\ No newline at end of file
diff --git a/pii/ner/train.py b/pii/ner/train.py
index c72677e..3f56d44 100644
--- a/pii/ner/train.py
+++ b/pii/ner/train.py
@@ -1,13 +1,10 @@
 import argparse
-import itertools
-import json
+import os
 from pprint import pprint
 
-import numpy as np
-import pandas as pd
-from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
-from huggingface_hub import notebook_login
+from datasets import DatasetDict, load_dataset
 from tqdm import tqdm
+from functools import partial
 from transformers import (
     AutoModelForTokenClassification,
     AutoTokenizer,
@@ -15,55 +12,12 @@
     EarlyStoppingCallback,
     Trainer,
     TrainingArguments,
+    set_seed,
+    logging
 )
 from utils.preprocessing import chunk_dataset, tokenize_and_label_batch
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder")
-    parser.add_argument(
-        "--dataset_name",
-        type=str,
-        default="bigcode/pii-annotated-toloka-donwsample-emails",
-    )
-    parser.add_argument("batch_size", type=int, default=16)
-    parser.add_argument("learning_rate", type=float, default=1e-5)
-    parser.add_argument("lr_scheduler_type", type=str, default="cosine")
-    parser.add_argument("num_train_epochs", type=int, default=3)
-    parser.add_argument("weight_decay", type=float, default=0.01)
-    parser.add_argument("gradient_checkpointing", action="store_true")
-    parser.add_argument("output_dir", type=str, default="finetuned-encoder-pii")
-    parser.add_argument("seed", type=int, default=0)
-    parser.add_argument("num_proc", type=int, default=8)
-    parser.add_argument("max_length", type=int, default=1024)
-    parser.add_argument("debug", action="store_true")
-    parser.add_argument("bf16", action="store_true")
-    parser.add_argument("fp16", action="store_true")
-    parser.add_argument("eval_freq", type=int, default=100)
-    parser.add_argument("save_freq", type=int, default=1000)
-    return parser.parse_args()
-
-
-def get_stats(data):
-    # get number of B-cat for cat in categories for each data split
-    stats = {cat: 0 for cat in CATEGORIES}
-    for entry in tqdm(data):
-        for label in entry["labels"]:
-            # only add labels for beginning with B-
-            if label > 0 and ID2LABEL[label].startswith("B-"):
-                stats[ID2LABEL[label][2:]] += 1
-    return stats
-
-
-def prepare_tokenizer(tokenizer):
-    tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
-    tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN})
-    tokenizer.add_special_tokens({"cls_token": CLS_TOKEN})
-    tokenizer.add_special_tokens({"mask_token": MASK_TOKEN})
-    tokenizer.model_max_length = 1024
-    return tokenizer
+from utils.eval import compute_metrics
 
 
 # Special tokens
@@ -95,14 +49,63 @@ def prepare_tokenizer(tokenizer):
     LABEL2ID[f"I-{cat}"] = len(LABEL2ID)
 ID2LABEL = {v: k for k, v in LABEL2ID.items()}
 
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder")
+    parser.add_argument(
+        "--dataset_name",
+        type=str,
+        default="bigcode/pii-annotated-toloka-donwsample-emails"
+    )
+    parser.add_argument("--batch_size", type=int, default=16)
+    parser.add_argument("--learning_rate", type=float, default=1e-5)
+    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
+    parser.add_argument("--num_train_epochs", type=int, default=20)
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--warmup_steps", type=int, default=100)
+    parser.add_argument("--gradient_checkpointing", action="store_true")
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
+    parser.add_argument("--num_proc", type=int, default=8)
+    parser.add_argument("--bf16", action="store_true")
+    parser.add_argument("--fp16", action="store_true")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--num_workers", type=int, default=16)
+    parser.add_argument("--eval_freq", type=int, default=100)
+    parser.add_argument("--save_freq", type=int, default=1000)
+    parser.add_argument("--debug", action="store_true")
+    parser.add_argument("--output_dir", type=str, default="finetuned-encoder-pii")
+    return parser.parse_args()
 
-def run_training(args, ner_dataset):
+
+def get_stats(data):
+    # get number of B-cat for cat in categories for each data split
+    stats = {cat: 0 for cat in CATEGORIES}
+    for entry in tqdm(data):
+        for label in entry["labels"]:
+            # only add labels for beginning with B-
+            if label > 0 and ID2LABEL[label].startswith("B-"):
+                stats[ID2LABEL[label][2:]] += 1
+    return stats
+
+
+def prepare_tokenizer(tokenizer):
+    tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
+    tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN})
+    tokenizer.add_special_tokens({"cls_token": CLS_TOKEN})
+    tokenizer.add_special_tokens({"mask_token": MASK_TOKEN})
+    tokenizer.model_max_length = 1024
+    return tokenizer
+
+
+def run_training(args, ner_dataset, model, tokenizer):
     print(f"Initializing Trainer...")
     training_args = TrainingArguments(
         output_dir=args.output_dir,
         evaluation_strategy="steps",
         num_train_epochs=args.num_train_epochs,
+        per_device_train_batch_size=args.batch_size,
+        per_device_eval_batch_size=args.batch_size,
         eval_steps=args.eval_freq,
         save_steps=args.save_freq,
         logging_steps=10,
@@ -111,16 +114,17 @@ def run_training(args, ner_dataset):
         weight_decay=args.weight_decay,
         learning_rate=args.learning_rate,
         lr_scheduler_type=args.lr_scheduler_type,
-        warmup_steps=args.num_warmup_steps,
-        gradient_checkpointing=args.no_gradient_checkpointing,
+        warmup_steps=args.warmup_steps,
+        gradient_checkpointing=args.gradient_checkpointing,
         gradient_accumulation_steps=args.gradient_accumulation_steps,
         fp16=args.fp16,
         bf16=args.bf16,
-        weight_decay=args.weight_decay,
-        run_name=f"pii-bs{batch_size}-lr{lr}-wd{wd}-epochs{max_epochs}",
+        run_name=f"pii-bs{args.batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-epochs{args.num_train_epochs}",
         report_to="wandb",
     )
+
+    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
 
     trainer = Trainer(
         model=model,
         args=training_args,
@@ -163,7 +167,7 @@ def main(args):
     data = dataset.map(
         partial(
             tokenize_and_label_batch,
-            tokenizer,
+            tokenizer=tokenizer,
            target_text="text",
             pii_column="fragments",
             LABEL2ID=LABEL2ID,
@@ -171,7 +175,7 @@
         ),
         batched=True,
         batch_size=1000,
-        num_proc=NUM_PROC,
+        num_proc=args.num_workers,
     )
 
     # split to train and test
@@ -201,7 +205,6 @@
     pprint({k: v for k, v in test_stats.items() if v < 100})
 
     print("Chunking the dataset...")
-    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
     ner_dataset = DatasetDict(
         train=chunk_dataset(train_data, tokenizer),
         validation=chunk_dataset(valid_data, tokenizer),
@@ -209,7 +212,7 @@
     )
     print(ner_dataset)
 
-    run_training(args, ner_dataset)
+    run_training(args, ner_dataset, model, tokenizer)
 
 
 if __name__ == "__main__":
diff --git a/pii/ner/utils/preprocessing.py b/pii/ner/utils/preprocessing.py
index 623d4e1..203e627 100644
--- a/pii/ner/utils/preprocessing.py
+++ b/pii/ner/utils/preprocessing.py
@@ -1,5 +1,7 @@
 # source: https://github.com/mponty/bigcode-dataset/tree/main/pii/ner_model_training/utils by @mponty
-
+import itertools
+from tqdm import tqdm
+from datasets import Dataset
 
 def is_overlap(span, reference_span):
     l1, r1 = min(*span), max(*span)
@@ -17,7 +19,7 @@ def label_tokenized(
     entry["labels"] = [LABEL2ID["O"]] * len(entry["offset_mapping"])
 
     for entity in pii:
-        if entity["category"] == IGNORE_CLASS:
+        if entity["category"] in IGNORE_CLASS:
             continue
         prefix = "B-"
         entity_span = tuple(entity["position"])