Commit

various fixes and add requirements
loubnabnl committed Aug 24, 2023
1 parent e2f3a9f commit f16d804
Showing 3 changed files with 72 additions and 63 deletions.
4 changes: 4 additions & 0 deletions pii/ner/requirements.txt
@@ -0,0 +1,4 @@
datasets
transformers
evaluate
seqeval
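
The new requirements.txt lists the training dependencies; evaluate and seqeval are the pieces that train.py's compute_metrics helper (imported from utils.eval, which is not part of this diff) typically builds on. A minimal sketch of such a helper, with every name assumed rather than taken from the repository:

import numpy as np
import evaluate

seqeval_metric = evaluate.load("seqeval")  # thin wrapper around the seqeval package

def compute_metrics(eval_preds, id2label):
    # Hypothetical helper: turn logits and label ids into tag strings, then score with seqeval.
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    # positions labelled -100 are padding/special tokens and are skipped
    true_tags = [
        [id2label[l] for l in label_row if l != -100]
        for label_row in labels
    ]
    pred_tags = [
        [id2label[p] for p, l in zip(pred_row, label_row) if l != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]
    results = seqeval_metric.compute(predictions=pred_tags, references=true_tags)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In practice the id2label argument would be bound with functools.partial before the helper is handed to the Trainer; whether the real utils/eval.py is structured this way is not shown here.
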
125 changes: 64 additions & 61 deletions pii/ner/train.py
@@ -1,69 +1,23 @@
import argparse
import itertools
import json
import os
from pprint import pprint

import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric
from huggingface_hub import notebook_login
from datasets import DatasetDict, load_dataset
from tqdm import tqdm
from functools import partial
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
DataCollatorForTokenClassification,
EarlyStoppingCallback,
Trainer,
TrainingArguments,
set_seed,
logging
)

from utils.preprocessing import chunk_dataset, tokenize_and_label_batch


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder")
parser.add_argument(
"--dataset_name",
type=str,
default="bigcode/pii-annotated-toloka-donwsample-emails",
)
parser.add_argument("batch_size", type=int, default=16)
parser.add_argument("learning_rate", type=float, default=1e-5)
parser.add_argument("lr_scheduler_type", type=str, default="cosine")
parser.add_argument("num_train_epochs", type=int, default=3)
parser.add_argument("weight_decay", type=float, default=0.01)
parser.add_argument("gradient_checkpointing", action="store_true")
parser.add_argument("output_dir", type=str, default="finetuned-encoder-pii")
parser.add_argument("seed", type=int, default=0)
parser.add_argument("num_proc", type=int, default=8)
parser.add_argument("max_length", type=int, default=1024)
parser.add_argument("debug", action="store_true")
parser.add_argument("bf16", action="store_true")
parser.add_argument("fp16", action="store_true")
parser.add_argument("eval_freq", type=int, default=100)
parser.add_argument("save_freq", type=int, default=1000)
return parser.parse_args()


def get_stats(data):
# get number of B-cat for cat in categories for each data split
stats = {cat: 0 for cat in CATEGORIES}
for entry in tqdm(data):
for label in entry["labels"]:
# only add labels for beginning with B-
if label > 0 and ID2LABEL[label].startswith("B-"):
stats[ID2LABEL[label][2:]] += 1
return stats


def prepare_tokenizer(tokenizer):
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN})
tokenizer.add_special_tokens({"cls_token": CLS_TOKEN})
tokenizer.add_special_tokens({"mask_token": MASK_TOKEN})
tokenizer.model_max_length = 1024
return tokenizer
from utils.eval import compute_metrics


# Special tokens
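
In the old get_args above, most hyperparameters were registered without a leading "--", which argparse treats as required positional arguments rather than optional flags; the rewritten parser in the next hunk adds the prefixes. A minimal illustration of the difference (argument names chosen only for demonstration):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("batch_size", type=int, default=16)           # positional: must always be supplied
parser.add_argument("--learning_rate", type=float, default=1e-5)  # optional flag: falls back to its default

# parser.parse_args([]) fails with "the following arguments are required: batch_size",
# whereas omitting --learning_rate simply yields learning_rate=1e-5.
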
@@ -95,14 +49,63 @@ def prepare_tokenizer(tokenizer):
LABEL2ID[f"I-{cat}"] = len(LABEL2ID)
ID2LABEL = {v: k for k, v in LABEL2ID.items()}

def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder")
parser.add_argument(
"--dataset_name",
type=str,
default="bigcode/pii-annotated-toloka-donwsample-emails"
)
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--learning_rate", type=float, default=1e-5)
parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
parser.add_argument("--num_train_epochs", type=int, default=20)
parser.add_argument("--weight_decay", type=float, default=0.01)
parser.add_argument("--warmup_steps", type=int, default=100)
parser.add_argument("--gradient_checkpointing", action="store_true")
parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
parser.add_argument("--num_proc", type=int, default=8)
parser.add_argument("--bf16", action="store_true")
parser.add_argument("--fp16", action="store_true")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--num_workers", type=int, default=16)
parser.add_argument("--eval_freq", type=int, default=100)
parser.add_argument("--save_freq", type=int, default=1000)
parser.add_argument("--debug", action="store_true")
parser.add_argument("--output_dir", type=str, default="finetuned-encoder-pii")
return parser.parse_args()

def run_training(args, ner_dataset):

def get_stats(data):
# count B-<category> labels for each category in the data split
stats = {cat: 0 for cat in CATEGORIES}
for entry in tqdm(data):
for label in entry["labels"]:
# only count labels beginning with B-
if label > 0 and ID2LABEL[label].startswith("B-"):
stats[ID2LABEL[label][2:]] += 1
return stats


def prepare_tokenizer(tokenizer):
tokenizer.add_special_tokens({"pad_token": PAD_TOKEN})
tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN})
tokenizer.add_special_tokens({"cls_token": CLS_TOKEN})
tokenizer.add_special_tokens({"mask_token": MASK_TOKEN})
tokenizer.model_max_length = 1024
return tokenizer


def run_training(args, ner_dataset, model, tokenizer):
print(f"Initializing Trainer...")

training_args = TrainingArguments(
output_dir=args.output_dir,
evaluation_strategy="steps",
num_train_epochs=args.num_train_epochs,
per_device_train_batch_size=args.batch_size,
per_device_eval_batch_size=args.batch_size,
eval_steps=args.eval_freq,
save_steps=args.save_freq,
logging_steps=10,
@@ -111,16 +114,17 @@ def run_training(args, ner_dataset):
weight_decay=args.weight_decay,
learning_rate=args.learning_rate,
lr_scheduler_type=args.lr_scheduler_type,
warmup_steps=args.num_warmup_steps,
gradient_checkpointing=args.no_gradient_checkpointing,
warmup_steps=args.warmup_steps,
gradient_checkpointing=args.gradient_checkpointing,
gradient_accumulation_steps=args.gradient_accumulation_steps,
fp16=args.fp16,
bf16=args.bf16,
weight_decay=args.weight_decay,
run_name=f"pii-bs{batch_size}-lr{lr}-wd{wd}-epochs{max_epochs}",
run_name=f"pii-bs{args.batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-epochs{args.num_train_epochs}",
report_to="wandb",
)


data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
trainer = Trainer(
model=model,
args=training_args,
@@ -163,15 +167,15 @@ def main(args):
data = dataset.map(
partial(
tokenize_and_label_batch,
tokenizer,
tokenizer=tokenizer,
target_text="text",
pii_column="fragments",
LABEL2ID=LABEL2ID,
IGNORE_CLASS=IGNORE_CLASS,
),
batched=True,
batch_size=1000,
num_proc=NUM_PROC,
num_proc=args.num_workers,
)

# split to train and test
@@ -201,15 +205,14 @@ def main(args):
pprint({k: v for k, v in test_stats.items() if v < 100})

print("Chunking the dataset...")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
ner_dataset = DatasetDict(
train=chunk_dataset(train_data, tokenizer),
validation=chunk_dataset(valid_data, tokenizer),
test=chunk_dataset(test_data, tokenizer),
)
print(ner_dataset)

run_training(args, ner_dataset)
run_training(args, ner_dataset, model, tokenizer)


if __name__ == "__main__":
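
One of the smaller fixes above is in the dataset.map call: the extra arguments for tokenize_and_label_batch are now bound by keyword (tokenizer=tokenizer) instead of positionally, which keeps the first positional slot free for the batch that Dataset.map passes to the mapped function. A toy sketch of the difference (the real tokenize_and_label_batch lives in utils/preprocessing.py, so the signature below is an assumption):

from functools import partial

def tokenize_and_label_batch(batch, tokenizer=None, target_text="text"):
    # stand-in for the real helper: tokenize the batch's text column
    return {"input_ids": tokenizer(batch[target_text])}

def toy_tokenizer(texts):
    return [[len(t)] for t in texts]  # trivial tokenizer, purely for illustration

batch = {"text": ["hello", "world"]}

broken = partial(tokenize_and_label_batch, toy_tokenizer)           # tokenizer lands in the `batch` slot
fixed = partial(tokenize_and_label_batch, tokenizer=toy_tokenizer)  # `batch` slot stays free for Dataset.map

fixed(batch)    # {'input_ids': [[5], [5]]}
# broken(batch) would pass the batch as `tokenizer` and the tokenizer as `batch`, and fail.
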
6 changes: 4 additions & 2 deletions pii/ner/utils/preprocessing.py
@@ -1,5 +1,7 @@
# source: https://github.com/mponty/bigcode-dataset/tree/main/pii/ner_model_training/utils by @mponty

import itertools
from tqdm import tqdm
from datasets import Dataset

def is_overlap(span, reference_span):
l1, r1 = min(*span), max(*span)
@@ -17,7 +19,7 @@ def label_tokenized(

entry["labels"] = [LABEL2ID["O"]] * len(entry["offset_mapping"])
for entity in pii:
if entity["category"] == IGNORE_CLASS:
if entity["category"] in IGNORE_CLASS:
continue
prefix = "B-"
entity_span = tuple(entity["position"])
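
The change from == to in means IGNORE_CLASS is treated as a collection of category names rather than a single string, so more than one PII category can be skipped during labeling. A small illustration (the category names and the contents of IGNORE_CLASS are made up for the example):

IGNORE_CLASS = ("AMBIGUOUS", "ID")  # assumed contents, for illustration only

entities = [
    {"category": "EMAIL"},
    {"category": "AMBIGUOUS"},
    {"category": "ID"},
]

labelled = [e for e in entities if e["category"] not in IGNORE_CLASS]
# labelled == [{'category': 'EMAIL'}]: both ignored categories are skipped,
# which a single `== IGNORE_CLASS` comparison could only do for one of them.
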
