From 1270f5eb716988bbf219a0e6fcffb043efa80521 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 21 Mar 2023 19:11:48 +0100 Subject: [PATCH 01/14] add ner code training --- pii/ner/train.py | 222 +++++++++++++++++++++++++++++++++ pii/ner/utils/eval.py | 46 +++++++ pii/ner/utils/preprocessing.py | 143 +++++++++++++++++++++ 3 files changed, 411 insertions(+) create mode 100644 pii/ner/train.py create mode 100644 pii/ner/utils/eval.py create mode 100644 pii/ner/utils/preprocessing.py diff --git a/pii/ner/train.py b/pii/ner/train.py new file mode 100644 index 0000000..c72677e --- /dev/null +++ b/pii/ner/train.py @@ -0,0 +1,222 @@ +import argparse +import itertools +import json +from pprint import pprint + +import numpy as np +import pandas as pd +from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric +from huggingface_hub import notebook_login +from tqdm import tqdm +from transformers import ( + AutoModelForTokenClassification, + AutoTokenizer, + DataCollatorForTokenClassification, + EarlyStoppingCallback, + Trainer, + TrainingArguments, +) + +from utils.preprocessing import chunk_dataset, tokenize_and_label_batch + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder") + parser.add_argument( + "--dataset_name", + type=str, + default="bigcode/pii-annotated-toloka-donwsample-emails", + ) + parser.add_argument("batch_size", type=int, default=16) + parser.add_argument("learning_rate", type=float, default=1e-5) + parser.add_argument("lr_scheduler_type", type=str, default="cosine") + parser.add_argument("num_train_epochs", type=int, default=3) + parser.add_argument("weight_decay", type=float, default=0.01) + parser.add_argument("gradient_checkpointing", action="store_true") + parser.add_argument("output_dir", type=str, default="finetuned-encoder-pii") + parser.add_argument("seed", type=int, default=0) + parser.add_argument("num_proc", type=int, default=8) + parser.add_argument("max_length", type=int, default=1024) + parser.add_argument("debug", action="store_true") + parser.add_argument("bf16", action="store_true") + parser.add_argument("fp16", action="store_true") + parser.add_argument("eval_freq", type=int, default=100) + parser.add_argument("save_freq", type=int, default=1000) + return parser.parse_args() + + +def get_stats(data): + # get number of B-cat for cat in categories for each data split + stats = {cat: 0 for cat in CATEGORIES} + for entry in tqdm(data): + for label in entry["labels"]: + # only add labels for beginning with B- + if label > 0 and ID2LABEL[label].startswith("B-"): + stats[ID2LABEL[label][2:]] += 1 + return stats + + +def prepare_tokenizer(tokenizer): + tokenizer.add_special_tokens({"pad_token": PAD_TOKEN}) + tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN}) + tokenizer.add_special_tokens({"cls_token": CLS_TOKEN}) + tokenizer.add_special_tokens({"mask_token": MASK_TOKEN}) + tokenizer.model_max_length = 1024 + return tokenizer + + +# Special tokens +MASK_TOKEN = "" +SEPARATOR_TOKEN = "" +PAD_TOKEN = "" +CLS_TOKEN = "" + +# NER tags +CATEGORIES = [ + "NAME", + "NAME_LICENSE", + "NAME_EXAMPLE", + "EMAIL", + "EMAIL_LICENSE", + "EMAIL_EXAMPLE", + "USERNAME", + "USERNAME_LICENSE", + "USERNAME_EXAMPLE", + "KEY", + "IP_ADDRESS", + "PASSWORD", +] +IGNORE_CLASS = ["AMBIGUOUS", "ID"] + +LABEL2ID = {"O": 0} +for cat in CATEGORIES: + LABEL2ID[f"B-{cat}"] = len(LABEL2ID) + LABEL2ID[f"I-{cat}"] = len(LABEL2ID) +ID2LABEL = {v: k for k, v in LABEL2ID.items()} + + 
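+# Quick sanity check on the label space defined above: one "O" tag plus a
+# B-/I- pair for each entry of CATEGORIES, i.e. 25 labels for the 12 categories.
+assert len(LABEL2ID) == 2 * len(CATEGORIES) + 1
+
+# Rough illustration of the resulting BIO labelling (the e-mail and the token
+# split below are made up for the example): for a source line such as
+#     "contact: jane@example.com"
+# label_tokenized() in utils/preprocessing.py would assign
+#     tokens: ["contact", ":", "jane", "@example", ".com"]
+#     labels: [ O,         O,   B-EMAIL,  I-EMAIL,   I-EMAIL ]
+# while the CLS/SEP tokens added by add_special_toks() get the label -100, so they
+# are ignored by the loss and by compute_metrics() in utils/eval.py.
+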
+def run_training(args, ner_dataset): + print(f"Initializing Trainer...") + + training_args = TrainingArguments( + output_dir=args.output_dir, + evaluation_strategy="steps", + num_train_epochs=args.num_train_epochs, + eval_steps=args.eval_freq, + save_steps=args.save_freq, + logging_steps=10, + metric_for_best_model="f1", + load_best_model_at_end=True, + weight_decay=args.weight_decay, + learning_rate=args.learning_rate, + lr_scheduler_type=args.lr_scheduler_type, + warmup_steps=args.num_warmup_steps, + gradient_checkpointing=args.no_gradient_checkpointing, + gradient_accumulation_steps=args.gradient_accumulation_steps, + fp16=args.fp16, + bf16=args.bf16, + weight_decay=args.weight_decay, + run_name=f"pii-bs{batch_size}-lr{lr}-wd{wd}-epochs{max_epochs}", + report_to="wandb", + ) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=ner_dataset["train"], + eval_dataset=ner_dataset["validation"], + data_collator=data_collator, + tokenizer=tokenizer, + compute_metrics=compute_metrics, + callbacks=[ + EarlyStoppingCallback( + early_stopping_patience=30, early_stopping_threshold=1e-3 + ) + ], + ) + + print("Training...") + trainer.train() + + print("Saving last checkpoint of the model") + model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/")) + + +def main(args): + # load model and tokenizer + model = AutoModelForTokenClassification.from_pretrained( + args.model_ckpt, + num_labels=len(ID2LABEL), + id2label=ID2LABEL, + label2id=LABEL2ID, + use_auth_token=True, + use_cache=not args.gradient_checkpointing, + ) + tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt, use_auth_token=True) + tokenizer = prepare_tokenizer(tokenizer) + + # load dataset + dataset = load_dataset(args.dataset_name, use_auth_token=True, split="train") + dataset = dataset.remove_columns(["id"]) + dataset = dataset.add_column("id", range(len(dataset))) + data = dataset.map( + partial( + tokenize_and_label_batch, + tokenizer, + target_text="text", + pii_column="fragments", + LABEL2ID=LABEL2ID, + IGNORE_CLASS=IGNORE_CLASS, + ), + batched=True, + batch_size=1000, + num_proc=NUM_PROC, + ) + + # split to train and test + data = data.train_test_split(test_size=0.2, shuffle=True, seed=args.seed) + test_valid = data["test"].train_test_split( + test_size=0.6, shuffle=True, seed=args.seed + ) + train_data = data["train"] + valid_data = test_valid["train"] + test_data = test_valid["test"] + test_data.to_json(f"{args.output_dir}/test_data.json") + print("Test data saved to test_data.json") + + if args.debug: + print( + f"Train size {len(train_data)}\nValid size {len(valid_data)}\nTest size {len(test_data)}" + ) + train_stats = get_stats(train_data) + valid_stats = get_stats(valid_data) + test_stats = get_stats(test_data) + print("Train low-resource stats") + # print stats for keys with less than 100 in teh value + pprint({k: v for k, v in train_stats.items() if v < 300}) + print("Valid low-resource stats") + pprint({k: v for k, v in valid_stats.items() if v < 100}) + print("Test low-resource stats") + pprint({k: v for k, v in test_stats.items() if v < 100}) + + print("Chunking the dataset...") + data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + ner_dataset = DatasetDict( + train=chunk_dataset(train_data, tokenizer), + validation=chunk_dataset(valid_data, tokenizer), + test=chunk_dataset(test_data, tokenizer), + ) + print(ner_dataset) + + run_training(args, ner_dataset) + + +if __name__ == "__main__": + args = get_args() + set_seed(args.seed) + 
os.makedirs(args.output_dir, exist_ok=True) + + logging.set_verbosity_error() + + main(args) diff --git a/pii/ner/utils/eval.py b/pii/ner/utils/eval.py new file mode 100644 index 0000000..44fa05f --- /dev/null +++ b/pii/ner/utils/eval.py @@ -0,0 +1,46 @@ +# source: https://github.com/mponty/bigcode-dataset/tree/main/pii/ner_model_training/utils by @mponty +import numpy as np +from evaluate import load +from scipy.special import softmax +from sklearn.metrics import average_precision_score + +_seqeval_metric = load("seqeval") + + +def compute_ap(pred, truth): + pred_proba = 1 - softmax(pred, axis=-1)[..., 0] + pred_proba, truth = pred_proba.flatten(), np.array(truth).flatten() + pred_proba = pred_proba[truth != -100] + truth = truth[truth != -100] + + return average_precision_score(truth != 0, pred_proba) + + +def compute_metrics(p): + predictions, labels = p + avg_prec = compute_ap(predictions, labels) + predictions = np.argmax(predictions, axis=2) + + # Remove ignored index (special tokens) + true_predictions = [ + [ID2LABEL[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [ID2LABEL[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + + results = _seqeval_metric.compute( + predictions=true_predictions, references=true_labels + ) + agg_metrics = { + "Avg.Precision": avg_prec, + "precision": results.pop("overall_precision"), + "recall": results.pop("overall_recall"), + "f1": results.pop("overall_f1"), + } + results.pop("overall_accuracy") + per_cat_metrics = {name: metrics["f1"] for name, metrics in results.items()} + + return dict(**agg_metrics, **per_cat_metrics) diff --git a/pii/ner/utils/preprocessing.py b/pii/ner/utils/preprocessing.py new file mode 100644 index 0000000..623d4e1 --- /dev/null +++ b/pii/ner/utils/preprocessing.py @@ -0,0 +1,143 @@ +# source: https://github.com/mponty/bigcode-dataset/tree/main/pii/ner_model_training/utils by @mponty + + +def is_overlap(span, reference_span): + l1, r1 = min(*span), max(*span) + l2, r2 = min(*reference_span), max(*reference_span) + return l1 <= l2 < r1 or l1 < r2 <= r1 or l2 <= l1 < r2 or l2 < r1 <= r2 + + +def label_tokenized( + entry, target_text="text", pii_column="fragments", LABEL2ID=None, IGNORE_CLASS=None +): + content, pii = entry[target_text], entry[pii_column] + + if entry["offset_mapping"][-1] == (0, 0): + entry["offset_mapping"][-1] = (len(content), len(content)) + + entry["labels"] = [LABEL2ID["O"]] * len(entry["offset_mapping"]) + for entity in pii: + if entity["category"] == IGNORE_CLASS: + continue + prefix = "B-" + entity_span = tuple(entity["position"]) + for i, span in enumerate(entry["offset_mapping"]): + if is_overlap(entity_span, span): + label = prefix + entity["category"] + entry["labels"][i] = LABEL2ID[label] + prefix = "I-" + + return entry + + +def add_special_toks(entry, target_text, tokenizer): + content = entry[target_text] + entry["input_ids"] = ( + [tokenizer.cls_token_id] + entry["input_ids"] + [tokenizer.sep_token_id] + ) + entry["attention_mask"] = [1] + entry["attention_mask"] + [1] + entry["offset_mapping"] = ( + [(0, 0)] + entry["offset_mapping"] + [(len(content), len(content))] + ) + entry["labels"] = [-100] + entry["labels"] + [-100] + return entry + + +def tokenize_and_label_batch( + entries, + tokenizer, + target_text="text", + pii_column="fragments", + LABEL2ID=None, + IGNORE_CLASS=None, +): + """Tokenize and label a batch of entries""" + list_inputs = { + k: 
[] for k in ["input_ids", "attention_mask", "offset_mapping", "labels"] + } + for text, fragments in zip(entries[target_text], entries[pii_column]): + entry = {"text": text, "fragments": fragments} + inputs = tokenizer.encode_plus( + text, return_offsets_mapping=True, add_special_tokens=False + ) + entry.update(inputs) + entry = label_tokenized( + entry, + target_text=target_text, + pii_column=pii_column, + LABEL2ID=LABEL2ID, + IGNORE_CLASS=IGNORE_CLASS, + ) + entry = add_special_toks(entry, target_text=target_text, tokenizer=tokenizer) + for k in list_inputs.keys(): + list_inputs[k].append(entry[k]) + return list_inputs + + +# Chunking +# we do all chunking with overlap_freq = 0 + + +def _get_chunking_step(length, overlap_freq): + step = length + if overlap_freq: + if overlap_freq > 1: + step = length // overlap_freq + else: + step = length // 2 + return step + + +def _chunked_seq(seq, length, overlap_freq=0): + step = _get_chunking_step(length, overlap_freq) + + for i in range(len(seq) // step + 1): + if i * step < len(seq): + yield seq[i * step : i * step + length] + + +def chunk_inputs( + input_ids, + attention_mask, + labels, + id, + *, + tokenizer, + max_length, + overlap_freq=0, + **kwargs +): + chunks = zip( + *[ + _chunked_seq(seq, max_length, overlap_freq) + for seq in (input_ids, attention_mask, labels) + ] + ) + return [ + dict( + input_ids=input_ids, + attention_mask=attention_mask, + labels=labels, + id=id, + chunk_id=i, + ) + for i, (input_ids, attention_mask, labels) in enumerate(chunks) + ] + + +def chunk_dataset(dataset, tokenizer, overlap_freq=0): + return Dataset.from_list( + list( + itertools.chain( + *( + chunk_inputs( + **entry, + tokenizer=tokenizer, + max_length=tokenizer.model_max_length, + overlap_freq=overlap_freq + ) + for entry in tqdm(list(dataset)) + ) + ) + ) + ) From 30e74d5a1603c9f5628f124f0db89edb3e74793f Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 21 Mar 2023 21:02:16 +0000 Subject: [PATCH 02/14] various fixes and add requirements --- pii/ner/requirements.txt | 4 ++ pii/ner/train.py | 125 +++++++++++++++++---------------- pii/ner/utils/preprocessing.py | 6 +- 3 files changed, 72 insertions(+), 63 deletions(-) create mode 100644 pii/ner/requirements.txt diff --git a/pii/ner/requirements.txt b/pii/ner/requirements.txt new file mode 100644 index 0000000..4901e0b --- /dev/null +++ b/pii/ner/requirements.txt @@ -0,0 +1,4 @@ +datasets +transformers +evaluate +seqeval \ No newline at end of file diff --git a/pii/ner/train.py b/pii/ner/train.py index c72677e..3f56d44 100644 --- a/pii/ner/train.py +++ b/pii/ner/train.py @@ -1,13 +1,10 @@ import argparse -import itertools -import json +import os from pprint import pprint -import numpy as np -import pandas as pd -from datasets import Dataset, DatasetDict, load_dataset, load_from_disk, load_metric -from huggingface_hub import notebook_login +from datasets import DatasetDict, load_dataset from tqdm import tqdm +from functools import partial from transformers import ( AutoModelForTokenClassification, AutoTokenizer, @@ -15,55 +12,12 @@ EarlyStoppingCallback, Trainer, TrainingArguments, + set_seed, + logging ) from utils.preprocessing import chunk_dataset, tokenize_and_label_batch - - -def get_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder") - parser.add_argument( - "--dataset_name", - type=str, - default="bigcode/pii-annotated-toloka-donwsample-emails", - ) - parser.add_argument("batch_size", type=int, default=16) - 
parser.add_argument("learning_rate", type=float, default=1e-5) - parser.add_argument("lr_scheduler_type", type=str, default="cosine") - parser.add_argument("num_train_epochs", type=int, default=3) - parser.add_argument("weight_decay", type=float, default=0.01) - parser.add_argument("gradient_checkpointing", action="store_true") - parser.add_argument("output_dir", type=str, default="finetuned-encoder-pii") - parser.add_argument("seed", type=int, default=0) - parser.add_argument("num_proc", type=int, default=8) - parser.add_argument("max_length", type=int, default=1024) - parser.add_argument("debug", action="store_true") - parser.add_argument("bf16", action="store_true") - parser.add_argument("fp16", action="store_true") - parser.add_argument("eval_freq", type=int, default=100) - parser.add_argument("save_freq", type=int, default=1000) - return parser.parse_args() - - -def get_stats(data): - # get number of B-cat for cat in categories for each data split - stats = {cat: 0 for cat in CATEGORIES} - for entry in tqdm(data): - for label in entry["labels"]: - # only add labels for beginning with B- - if label > 0 and ID2LABEL[label].startswith("B-"): - stats[ID2LABEL[label][2:]] += 1 - return stats - - -def prepare_tokenizer(tokenizer): - tokenizer.add_special_tokens({"pad_token": PAD_TOKEN}) - tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN}) - tokenizer.add_special_tokens({"cls_token": CLS_TOKEN}) - tokenizer.add_special_tokens({"mask_token": MASK_TOKEN}) - tokenizer.model_max_length = 1024 - return tokenizer +from utils.eval import compute_metrics # Special tokens @@ -95,14 +49,63 @@ def prepare_tokenizer(tokenizer): LABEL2ID[f"I-{cat}"] = len(LABEL2ID) ID2LABEL = {v: k for k, v in LABEL2ID.items()} +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--model_ckpt", type=str, default="bigcode/bigcode-encoder") + parser.add_argument( + "--dataset_name", + type=str, + default="bigcode/pii-annotated-toloka-donwsample-emails" + ) + parser.add_argument("--batch_size", type=int, default=16) + parser.add_argument("--learning_rate", type=float, default=1e-5) + parser.add_argument("--lr_scheduler_type", type=str, default="cosine") + parser.add_argument("--num_train_epochs", type=int, default=20) + parser.add_argument("--weight_decay", type=float, default=0.01) + parser.add_argument("--warmup_steps", type=int, default=100) + parser.add_argument("--gradient_checkpointing", action="store_true") + parser.add_argument("--gradient_accumulation_steps", type=int, default=1) + parser.add_argument("--num_proc", type=int, default=8) + parser.add_argument("--bf16", action="store_true") + parser.add_argument("--fp16", action="store_true") + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--num_workers", type=int, default=16) + parser.add_argument("--eval_freq", type=int, default=100) + parser.add_argument("--save_freq", type=int, default=1000) + parser.add_argument("--debug", action="store_true") + parser.add_argument("--output_dir", type=str, default="finetuned-encoder-pii") + return parser.parse_args() -def run_training(args, ner_dataset): + +def get_stats(data): + # get number of B-cat for cat in categories for each data split + stats = {cat: 0 for cat in CATEGORIES} + for entry in tqdm(data): + for label in entry["labels"]: + # only add labels for beginning with B- + if label > 0 and ID2LABEL[label].startswith("B-"): + stats[ID2LABEL[label][2:]] += 1 + return stats + + +def prepare_tokenizer(tokenizer): + tokenizer.add_special_tokens({"pad_token": 
PAD_TOKEN}) + tokenizer.add_special_tokens({"sep_token": SEPARATOR_TOKEN}) + tokenizer.add_special_tokens({"cls_token": CLS_TOKEN}) + tokenizer.add_special_tokens({"mask_token": MASK_TOKEN}) + tokenizer.model_max_length = 1024 + return tokenizer + + +def run_training(args, ner_dataset, model, tokenizer): print(f"Initializing Trainer...") training_args = TrainingArguments( output_dir=args.output_dir, evaluation_strategy="steps", num_train_epochs=args.num_train_epochs, + per_device_train_batch_size=args.batch_size, + per_device_eval_batch_size=args.batch_size, eval_steps=args.eval_freq, save_steps=args.save_freq, logging_steps=10, @@ -111,16 +114,17 @@ def run_training(args, ner_dataset): weight_decay=args.weight_decay, learning_rate=args.learning_rate, lr_scheduler_type=args.lr_scheduler_type, - warmup_steps=args.num_warmup_steps, - gradient_checkpointing=args.no_gradient_checkpointing, + warmup_steps=args.warmup_steps, + gradient_checkpointing=args.gradient_checkpointing, gradient_accumulation_steps=args.gradient_accumulation_steps, fp16=args.fp16, bf16=args.bf16, - weight_decay=args.weight_decay, - run_name=f"pii-bs{batch_size}-lr{lr}-wd{wd}-epochs{max_epochs}", + run_name=f"pii-bs{args.batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-epochs{args.num_train_epochs}", report_to="wandb", ) + + data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) trainer = Trainer( model=model, args=training_args, @@ -163,7 +167,7 @@ def main(args): data = dataset.map( partial( tokenize_and_label_batch, - tokenizer, + tokenizer=tokenizer, target_text="text", pii_column="fragments", LABEL2ID=LABEL2ID, @@ -171,7 +175,7 @@ def main(args): ), batched=True, batch_size=1000, - num_proc=NUM_PROC, + num_proc=args.num_workers, ) # split to train and test @@ -201,7 +205,6 @@ def main(args): pprint({k: v for k, v in test_stats.items() if v < 100}) print("Chunking the dataset...") - data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) ner_dataset = DatasetDict( train=chunk_dataset(train_data, tokenizer), validation=chunk_dataset(valid_data, tokenizer), @@ -209,7 +212,7 @@ def main(args): ) print(ner_dataset) - run_training(args, ner_dataset) + run_training(args, ner_dataset, model, tokenizer) if __name__ == "__main__": diff --git a/pii/ner/utils/preprocessing.py b/pii/ner/utils/preprocessing.py index 623d4e1..203e627 100644 --- a/pii/ner/utils/preprocessing.py +++ b/pii/ner/utils/preprocessing.py @@ -1,5 +1,7 @@ # source: https://github.com/mponty/bigcode-dataset/tree/main/pii/ner_model_training/utils by @mponty - +import itertools +from tqdm import tqdm +from datasets import Dataset def is_overlap(span, reference_span): l1, r1 = min(*span), max(*span) @@ -17,7 +19,7 @@ def label_tokenized( entry["labels"] = [LABEL2ID["O"]] * len(entry["offset_mapping"]) for entity in pii: - if entity["category"] == IGNORE_CLASS: + if entity["category"] in IGNORE_CLASS: continue prefix = "B-" entity_span = tuple(entity["position"]) From 22586db7c6f62b53eb57dd71c3e0b8857ff36ecf Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Wed, 22 Mar 2023 10:00:56 +0000 Subject: [PATCH 03/14] fixes --- pii/ner/train.py | 17 ++++++++++------- pii/ner/utils/eval.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pii/ner/train.py b/pii/ner/train.py index 3f56d44..3a6e96c 100644 --- a/pii/ner/train.py +++ b/pii/ner/train.py @@ -57,7 +57,8 @@ def get_args(): type=str, default="bigcode/pii-annotated-toloka-donwsample-emails" ) - parser.add_argument("--batch_size", 
type=int, default=16) + parser.add_argument("--train_batch_size", type=int, default=4) + parser.add_argument("--eval_batch_size", type=int, default=1) parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--lr_scheduler_type", type=str, default="cosine") parser.add_argument("--num_train_epochs", type=int, default=20) @@ -65,11 +66,12 @@ def get_args(): parser.add_argument("--warmup_steps", type=int, default=100) parser.add_argument("--gradient_checkpointing", action="store_true") parser.add_argument("--gradient_accumulation_steps", type=int, default=1) + parser.add_argument("--eval_accumulation_steps", type=int, default=4) parser.add_argument("--num_proc", type=int, default=8) parser.add_argument("--bf16", action="store_true") parser.add_argument("--fp16", action="store_true") parser.add_argument("--seed", type=int, default=0) - parser.add_argument("--num_workers", type=int, default=16) + parser.add_argument("--num_workers", type=int, default=8) parser.add_argument("--eval_freq", type=int, default=100) parser.add_argument("--save_freq", type=int, default=1000) parser.add_argument("--debug", action="store_true") @@ -104,8 +106,8 @@ def run_training(args, ner_dataset, model, tokenizer): output_dir=args.output_dir, evaluation_strategy="steps", num_train_epochs=args.num_train_epochs, - per_device_train_batch_size=args.batch_size, - per_device_eval_batch_size=args.batch_size, + per_device_train_batch_size=args.train_batch_size, + per_device_eval_batch_size=args.eval_batch_size, eval_steps=args.eval_freq, save_steps=args.save_freq, logging_steps=10, @@ -117,9 +119,10 @@ def run_training(args, ner_dataset, model, tokenizer): warmup_steps=args.warmup_steps, gradient_checkpointing=args.gradient_checkpointing, gradient_accumulation_steps=args.gradient_accumulation_steps, + eval_accumulation_steps=args.eval_accumulation_steps, fp16=args.fp16, bf16=args.bf16, - run_name=f"pii-bs{args.batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-epochs{args.num_train_epochs}", + run_name=f"pii-bs{args.train_batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-epochs{args.num_train_epochs}", report_to="wandb", ) @@ -179,9 +182,9 @@ def main(args): ) # split to train and test - data = data.train_test_split(test_size=0.2, shuffle=True, seed=args.seed) + data = data.train_test_split(test_size=0.1, shuffle=True, seed=args.seed) test_valid = data["test"].train_test_split( - test_size=0.6, shuffle=True, seed=args.seed + test_size=0.85, shuffle=True, seed=args.seed ) train_data = data["train"] valid_data = test_valid["train"] diff --git a/pii/ner/utils/eval.py b/pii/ner/utils/eval.py index 44fa05f..a5be311 100644 --- a/pii/ner/utils/eval.py +++ b/pii/ner/utils/eval.py @@ -7,6 +7,30 @@ _seqeval_metric = load("seqeval") +# NER tags +CATEGORIES = [ + "NAME", + "NAME_LICENSE", + "NAME_EXAMPLE", + "EMAIL", + "EMAIL_LICENSE", + "EMAIL_EXAMPLE", + "USERNAME", + "USERNAME_LICENSE", + "USERNAME_EXAMPLE", + "KEY", + "IP_ADDRESS", + "PASSWORD", +] +IGNORE_CLASS = ["AMBIGUOUS", "ID"] + +LABEL2ID = {"O": 0} +for cat in CATEGORIES: + LABEL2ID[f"B-{cat}"] = len(LABEL2ID) + LABEL2ID[f"I-{cat}"] = len(LABEL2ID) +ID2LABEL = {v: k for k, v in LABEL2ID.items()} + + def compute_ap(pred, truth): pred_proba = 1 - softmax(pred, axis=-1)[..., 0] pred_proba, truth = pred_proba.flatten(), np.array(truth).flatten() @@ -18,6 +42,8 @@ def compute_ap(pred, truth): def compute_metrics(p): predictions, labels = p + print(f"predictions.shape: {predictions.shape} and type {type(predictions)}") + 
print(f"labels.shape: {labels.shape} and type {type(labels)}") avg_prec = compute_ap(predictions, labels) predictions = np.argmax(predictions, axis=2) From 5f003f495d2f0b3ba6662ddfb94756f89c2196a7 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 24 Mar 2023 17:59:12 +0000 Subject: [PATCH 04/14] add inference notebook --- pii/ner/notebook_inference.ipynb | 512 +++++++++++++++++++++++++++++++ 1 file changed, 512 insertions(+) create mode 100644 pii/ner/notebook_inference.ipynb diff --git a/pii/ner/notebook_inference.ipynb b/pii/ner/notebook_inference.ipynb new file mode 100644 index 0000000..6310fde --- /dev/null +++ b/pii/ner/notebook_inference.ipynb @@ -0,0 +1,512 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import time\n", + "from accelerate import Accelerator\n", + "from dataclasses import dataclass, field\n", + "from functools import partial\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "import torch\n", + "from torch.utils.data import DataLoader\n", + "import datasets \n", + "from transformers import AutoTokenizer, AutoModelForTokenClassification, HfArgumentParser, DataCollatorForTokenClassification" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "KEEP_LABELS = [\n", + " \"NAME\",\n", + " \"NAME_LICENSE\",\n", + " \"NAME_EXAMPLE\",\n", + " \"EMAIL\",\n", + " \"EMAIL_LICENSE\",\n", + " \"EMAIL_EXAMPLE\",\n", + " \"USERNAME\",\n", + " \"USERNAME_LICENSE\",\n", + " \"USERNAME_EXAMPLE\",\n", + " \"KEY\",\n", + " \"IP_ADDRESS\",\n", + " \"PASSWORD\",\n", + "]\n", + "\n", + "# Special tokens\n", + "MASK_TOKEN = \"\"\n", + "SEPARATOR_TOKEN = \"\"\n", + "PAD_TOKEN = \"\"\n", + "CLS_TOKEN = \"\"\n", + "\n", + "@dataclass\n", + "class NerArguments:\n", + "\n", + " \"\"\"configuration for running NER model inference\n", + " \"\"\"\n", + " model_name: str = field(\n", + " default=\"bigcode/deberta-v3-large-pii-ner-v2\",\n", + " metadata={\n", + " \"help\": \"Name of model to use for inference\"\n", + " }\n", + " )\n", + " num_workers: int = field(\n", + " default=16,\n", + " metadata={\n", + " \"help\": \"Number of processes to use for inference\"\n", + " }\n", + " )\n", + " batch_size: int = field(\n", + " default=64,\n", + " metadata={\n", + " \"help\": \"the batch size to use for inference\"\n", + " }\n", + " )\n", + " dataset_name: str = field(\n", + " default=\"bigcode/pii-annotated-toloka\",\n", + " metadata={\n", + " \"help\": \"Name of dataset to use for inference\"\n", + " }\n", + " )\n", + " dryrun: bool = field(\n", + " default=False,\n", + " metadata={\n", + " \"help\": \"Run a dryrun with a small subset of the data\"\n", + " }\n", + " )\n", + " output_path: str = field(\n", + " default=\"output.json\",\n", + " metadata={\n", + " \"help\": \"Path to save output entities\"\n", + " }\n", + " )\n", + "\n", + "# Adapted from: transformers.pipelines.token_classification\n", + "def group_sub_entities(entities, tokenizer):\n", + " first_entity, last_entity = entities[0], entities[-1]\n", + " entity = first_entity[\"entity\"].split(\"-\")[-1]\n", + " scores = np.nanmean([entity[\"score\"] for entity in entities])\n", + " tokens = [entity[\"word\"] for entity in entities]\n", + "\n", + " return {\n", + " \"entity\": entity,\n", + " \"score\": np.mean(scores),\n", + " \"word\": tokenizer.convert_tokens_to_string(tokens)\n", + " }" + ] + 
}, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "args = NerArguments" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## utilities" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "# Adapted from: transformers.pipelines.token_classification\n", + "def group_entities(entities, tokenizer):\n", + " entity_groups = []\n", + " entity_group_disagg = []\n", + "\n", + " if entities:\n", + " last_idx = entities[-1][\"index\"]\n", + "\n", + " for entity in entities:\n", + " is_last_idx = entity[\"index\"] == last_idx\n", + " if not entity_group_disagg:\n", + " entity_group_disagg += [entity]\n", + " if is_last_idx:\n", + " entity_groups += [group_sub_entities(entity_group_disagg, tokenizer)]\n", + " continue\n", + "\n", + " is_entity_start = entity[\"entity\"].split(\"-\")[0] == \"B\"\n", + " curr_entity_type = entity[\"entity\"].split(\"-\")[-1]\n", + " prev_entity_type = entity_group_disagg[-1][\"entity\"].split(\"-\")[-1]\n", + " is_adjacent_entity = entity[\"index\"] == entity_group_disagg[-1][\"index\"] + 1\n", + "\n", + " is_same_entity_as_previous = (\n", + " curr_entity_type == prev_entity_type and not is_entity_start\n", + " ) and is_adjacent_entity\n", + " if is_same_entity_as_previous:\n", + " entity_group_disagg += [entity]\n", + " if is_last_idx:\n", + " entity_groups += [group_sub_entities(entity_group_disagg, tokenizer)]\n", + " else:\n", + " entity_groups += [group_sub_entities(entity_group_disagg, tokenizer)]\n", + " entity_group_disagg = [entity]\n", + " if is_last_idx:\n", + " entity_groups += [group_sub_entities(entity_group_disagg, tokenizer)]\n", + "\n", + " return entity_groups\n", + "\n", + "\n", + "def prepare_tokenizer(tokenizer):\n", + " tokenizer.add_special_tokens({\"pad_token\": PAD_TOKEN})\n", + " tokenizer.add_special_tokens({\"sep_token\": SEPARATOR_TOKEN})\n", + " tokenizer.add_special_tokens({\"cls_token\": CLS_TOKEN})\n", + " tokenizer.add_special_tokens({\"mask_token\": MASK_TOKEN})\n", + " tokenizer.model_max_length = 1024\n", + " return tokenizer\n", + "\n", + "\n", + "def tokenize_function(entries, tokenizer):\n", + " list_inputs = {\n", + " k: [] for k in [\"input_ids\", \"attention_mask\", \"special_tokens_mask\"]\n", + " }\n", + " for text in entries[\"text\"]:\n", + " inputs = tokenizer(text, return_special_tokens_mask=True)\n", + " for k in list_inputs.keys():\n", + " list_inputs[k].append(inputs[k])\n", + " return list_inputs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initializing dataset, model and accelerator" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration bigcode--pii-annotated-toloka-aa0ea1d4040d00a1\n", + "Found cached dataset json (/fsx/loubna/.cache/bigcode___json/bigcode--pii-annotated-toloka-aa0ea1d4040d00a1/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)\n" + ] + } + ], + "source": [ + "accelerator = Accelerator()\n", + "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", + "\n", + "# load model and tokenizer\n", + "model = AutoModelForTokenClassification.from_pretrained(args.model_name).to(device)\n", + "tokenizer = AutoTokenizer.from_pretrained(args.model_name)\n", + "tokenizer = prepare_tokenizer(tokenizer)\n", + "# labels\n", + "IGNORE_LABELS_IDX = [i for l, i in 
model.config.label2id.items() if l not in KEEP_LABELS]\n", + "id2label = model.config.id2label\n", + "\n", + "# load and tokenize dataset\n", + "dataset = datasets.load_dataset(args.dataset_name, split=\"train\")\n", + "metadata_columns = [c for c in dataset.column_names if c != \"text\"]\n", + "if args.dryrun:\n", + " dataset = dataset.select(range(1000))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 'O',\n", + " 1: 'B-AMBIGUOUS',\n", + " 2: 'I-AMBIGUOUS',\n", + " 3: 'B-EMAIL',\n", + " 4: 'I-EMAIL',\n", + " 5: 'B-IP_ADDRESS',\n", + " 6: 'I-IP_ADDRESS',\n", + " 7: 'B-KEY',\n", + " 8: 'I-KEY',\n", + " 9: 'B-NAME',\n", + " 10: 'I-NAME',\n", + " 11: 'B-PASSWORD',\n", + " 12: 'I-PASSWORD',\n", + " 13: 'B-USERNAME',\n", + " 14: 'I-USERNAME'}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "id2label" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Chunk dataset so we don't need to truncate long files and end up losing data" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "from datasets import Dataset\n", + "from tqdm import tqdm\n", + "\n", + "def _chunked_seq(seq, length):\n", + " step = length\n", + "\n", + " for i in range(len(seq) // step + 1):\n", + " if i * step < len(seq):\n", + " yield seq[i * step : i * step + length]\n", + "\n", + "\n", + "def chunk_inputs(\n", + " input_ids,\n", + " attention_mask,\n", + " special_tokens_mask,\n", + " id,\n", + " *,\n", + " tokenizer,\n", + " max_length,\n", + " **kwargs\n", + "):\n", + " chunks = zip(\n", + " *[\n", + " _chunked_seq(seq, max_length)\n", + " for seq in (input_ids, attention_mask, special_tokens_mask)\n", + " ]\n", + " )\n", + " return [\n", + " dict(\n", + " input_ids=input_ids,\n", + " attention_mask=attention_mask,\n", + " special_tokens_mask=special_tokens_mask,\n", + " id=id,\n", + " chunk_id=i,\n", + " )\n", + " for i, (input_ids, attention_mask, special_tokens_mask) in enumerate(chunks)\n", + " ]\n", + "\n", + "\n", + "def chunk_dataset(dataset, tokenizer):\n", + " return Dataset.from_list(\n", + " list(\n", + " itertools.chain(\n", + " *(\n", + " chunk_inputs(\n", + " entry[\"input_ids\"],\n", + " entry[\"attention_mask\"],\n", + " entry[\"special_tokens_mask\"],\n", + " entry[\"id\"],\n", + " tokenizer=tokenizer,\n", + " max_length=tokenizer.model_max_length,\n", + " )\n", + " for entry in tqdm(list(dataset))\n", + " )\n", + " )\n", + " )\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)\n", + "with accelerator.main_process_first():\n", + " tokenized_data = dataset.map(\n", + " partial(tokenize_function, tokenizer=tokenizer),\n", + " batched=True,\n", + " num_proc=args.num_workers,\n", + " remove_columns=metadata_columns,\n", + " )\n", + " tokenized_data = tokenized_data.add_column(\"id\", range(len(tokenized_data)))\n", + " tokenized_data = tokenized_data.remove_columns(\"text\")\n", + " chunked_data = chunk_dataset(tokenized_data, tokenizer)\n", + "\n", + "dataloader = DataLoader(chunked_data, batch_size=args.batch_size, shuffle=False, collate_fn=data_collator)\n", + "print(\"length dataloader is\", len(dataloader))\n", + "model, dataloader = accelerator.prepare(model, dataloader)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['input_ids', 'attention_mask', 'special_tokens_mask', 'id'],\n", + " num_rows: 12171\n", + "})" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tokenized_data" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "torch.Size([64, 1024])\n", + "------------------------------------------------------------ Example 0 ------------------------------------------------------------\n", + "[CLS] { \"first_name\": \"Yance\", \"last_name\": \"Bugbee\", \"email_address\": \"ybugbee7@narod.ru\", \"age\": 22, }, { \"first_name\": \"Zita\", \"last_name\": \"Walak\", \"email_address\": \"zwalak8@ebay.com\", \"age\": 57, }, { \"first_name\": \"Davie\", \"last_name\": \"Garmans\", \"email_address\": \"dgarmans9@biblegateway.com\", \"age\": 53, }, ] return data def start_producer(service_uri: str, ca_path: str, cert_path: str, key_path: str): \"\"\"Start the Kafka producer\"\"\" producer = KafkaProducer( bootstrap_servers=service_uri, security_protocol=\"SSL\", ssl_cafile=ca_path, ssl_certfile=cert_path, ssl_keyfile=key_path, ) return producer def send_messages_to_consumer(producer, topic_name: str = \"sample_customer_profile\"): \"\"\"Send messages from Kafka producer to consumer\"\"\" data = get_fake_data() for message in data: print(f\"Sending message from producer: {message}\") producer.send(topic_name, dumps(message).encode(\"utf-8\")) # Wait for all messages to be sent print(f\"All producermessages sent to consumer for topic {topic_name}\") producer.flush()[SEP]\n", + "tensor([1, 1, 1, ..., 0, 0, 0])\n", + "tensor([1, 0, 0, ..., 1, 1, 1])\n", + "------------------------------------------------------------ Example 1 ------------------------------------------------------------\n", + "[CLS] #!/usr/bin/env python3 # -*- coding: utf-8 -*- from .. 
import TestUnitBase class TestStegoUnit(TestUnitBase): def test_simple(self): expected_rows = [ bytes.fromhex(r) for r in [ '2C 15 15 75 50 50 A2 51 51 C1 85 85 AC 5B 5B C9 95 95 CD 9E 9E 98 40 40 00 00 00 00 00 00', '82 71 71 AE A0 A0 BF 8F 8F E0 C4 C4 D1 A5 A5 E3 CC CC EB DB DB CB 9C 9C 5B 58 58 00 00 00', '27 27 27 41 41 41 A4 9E 9E C5 C3 C3 C4 C0 C0 B8 B6 B6 D3 D2 D2 EF EB EB CD CD CD A2 9D 9D', '01 01 01 0B 0B 0B 6A 6A 6A 68 68 68 59 59 59 4E 4E 4E 81 81 81 C1 C1 C1 77 45 45 7B 00 00', '26 26 26 6E 6E 6E C5 C5 C5 BD BD BD C1 BF BF BF BF BF DB DB DB F1 F1 F1 7F 03 03 7F 00 00', 'D7 D7 D7 DE DE DE 96 96 96 B8 B1 B1 C0 95 95 D1 C7 C7 F9 F9 F9 EF EF EF 85 25 25 7D 00 00', 'FC FC FC D2 D2 D2 76 71 71 93 6B 6B 86 24 24 7B 4E 4E D4 D1 D1 F6 F6 F6 B7 A9 A9 86 3A 3A', 'BB BB BB CF C9 C9 BB 9A 9A C4 A0 A0 A7 7D 7D 87 7E 7E DC DC DC F9 F6 F6 CC B2 B2 BF AE AE', '00 00 00 26 14 14 A1 5F 5F B8 78 78 A6 95 95 D7 D7 D7 FB FB FB D2 B9 B9 70 22 22 3F 02 02', '00 00 00 02 00 00 55 41 41 AD 9A 9A 3F 3C 3C B0 B0 B0 FD FD FD BC B6 B6 24 01 01 17 00 00', ] ] image = bytes.fromhex( '89504E470D0A1A0A0000000D494844520000000A0000000A0802000000025058EA000000017352474200AECE1CE900' '00000467414D410000B18F0BFC6105000000097048597300000EC300000EC301C76FA8640000014149444154285301' '3601C9FE012C1515493B3B2D01011F3434EBD6D61D3A3A040909CBA2A268C0C0000000036C6767334040171717203A' '3A0B1616162F2F1326260A0F0FF60A0AD3D4D403E6EFEFD7DEDE243636031212F90C0CE5F0F0020A0A203434282C2C' '3C3737010101010A0A0A5F5F5FFEFEFEF1F1F1F5F5F5333333404040B6848404BBBB01262626484848575757F8F8F8' '040202FE00001C1C1C1616168E121200FDFD02B1B1B1707070D1D1D1FBF4F4FFD6D61208081E1E1EFEFEFE062222FE' '000003919191E5E5E5C2BDBDFCDADADDA4A4D0D9D91A2E2E151616FA1C1CECE6E6033D3D3D09030319FDFD1D1E1E02' '1B1BF619192F3535100D0DF4E3E3163838010000002614147B4B4B171919EE1D1D314242242424D7BEBE9E6969CFE0' 'E002000000DCECECB4E2E2F5222299A7A7D9D9D9020202EAFDFDB4DFDFD8FEFE8A567CCFC3DE9AB90000000049454E' '44AE426082' ) stego = self.\n", + "tensor([1, 1, 1, ..., 1, 1, 1])\n", + "tensor([1, 0, 0, ..., 0, 0, 0])\n" + ] + } + ], + "source": [ + "res = next(iter(dataloader))\n", + "print(res[\"input_ids\"].shape)\n", + "print(\"-\" * 60, \"Example 0\", \"-\" * 60)\n", + "tokens = tokenizer.convert_ids_to_tokens(res[\"input_ids\"][0])\n", + "print(tokenizer.convert_tokens_to_string(tokens))\n", + "print(res[\"attention_mask\"][0])\n", + "print(res[\"special_tokens_mask\"][0])\n", + "\n", + "print(\"-\" * 60, \"Example 1\", \"-\" * 60)\n", + "tokens = tokenizer.convert_ids_to_tokens(res[\"input_ids\"][1])\n", + "print(tokenizer.convert_tokens_to_string(tokens))\n", + "print(res[\"attention_mask\"][1])\n", + "print(res[\"special_tokens_mask\"][1])" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [] + } + ], + "source": [ + "all_entities = []\n", + "t_start = time.time()\n", + "for step, batch in tqdm(enumerate(dataloader)):\n", + " t_1 = time.time()\n", + " with torch.no_grad():\n", + " outputs = model(\n", + " input_ids=batch[\"input_ids\"],\n", + " attention_mask=batch[\"attention_mask\"]\n", + " )\n", + " # warning: not very sure if this works with multiple GPU\n", + " predictions, input_ids, special_tokens_mask = accelerator.gather((\n", + " outputs.logits.squeeze(), batch[\"input_ids\"], batch['special_tokens_mask']\n", + " ))\n", + " predictions = predictions.cpu().numpy()\n", + " scores = np.exp(predictions) / np.exp(predictions).sum(-1, keepdims=True)\n", + " 
batch_labels_idx = scores.argmax(axis=-1)\n", + " forward_time = time.time() - t_1\n", + " t_1 = time.time()\n", + " batch_entities = []\n", + " for text_id, labels_idx in enumerate(batch_labels_idx):\n", + " entities = []\n", + " filtered_labels_idx = [\n", + " (id, label_id) \n", + " for id, label_id in enumerate(labels_idx) \n", + " if label_id not in IGNORE_LABELS_IDX and not special_tokens_mask[text_id][id]\n", + " ]\n", + " for id, label_id in filtered_labels_idx:\n", + " entity = {\n", + " \"word\": tokenizer.convert_ids_to_tokens(int(input_ids[text_id][id])),\n", + " \"index\": id,\n", + " \"score\": float(scores[text_id][id][label_id]),\n", + " \"entity\": id2label[label_id],\n", + " }\n", + " entities += [entity]\n", + " #print(f\"post-processing time {time.time() - t_1}\")\n", + " batch_entities.append(group_entities(entities, tokenizer))\n", + " all_entities += batch_entities\n", + " if args.dryrun:\n", + " print(f\"Step {step}\")\n", + " print(f\"forward time {forward_time}\")\n", + " print(f\"post-processing time {time.time() - t_1}\")\n", + "t_end = time.time()\n", + "\n", + "print(f\"total time: {t_end - t_start:.2f} seconds\")\n", + "all_entities = all_entities[:len(dataset)]\n", + "if accelerator.is_main_process:\n", + " print(all_entities[14])\n", + " with open(args.output_path, \"w\") as f:\n", + " json.dump(all_entities, f)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.9 ('eval-harness': conda)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "271972ab9158cd42175bc1ec5288153b91d150291a0b625c2babd1911356e891" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From f3307147400e444ae1132f43faf6c6f85017985f Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Fri, 24 Mar 2023 17:59:44 +0000 Subject: [PATCH 05/14] add comment --- pii/ner/notebook_inference.ipynb | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pii/ner/notebook_inference.ipynb b/pii/ner/notebook_inference.ipynb index 6310fde..0b62081 100644 --- a/pii/ner/notebook_inference.ipynb +++ b/pii/ner/notebook_inference.ipynb @@ -1,5 +1,16 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Notebook for inference:\n", + "\n", + "Todo:\n", + "- check correctness of predictions\n", + "- improve chunking method to avoid splitting files in teh middle" + ] + }, { "cell_type": "markdown", "metadata": {}, From 3f4d5845b3f1eed58b24d15cd475e2f78c034999 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Sat, 25 Mar 2023 02:08:20 +0000 Subject: [PATCH 06/14] remove license and example classes and use a different dataset --- pii/ner/train.py | 108 +++++++++++++++++++++++++----------------- pii/ner/utils/eval.py | 11 +---- 2 files changed, 67 insertions(+), 52 deletions(-) diff --git a/pii/ner/train.py b/pii/ner/train.py index 3a6e96c..312fcbe 100644 --- a/pii/ner/train.py +++ b/pii/ner/train.py @@ -29,19 +29,14 @@ # NER tags CATEGORIES = [ "NAME", - "NAME_LICENSE", - "NAME_EXAMPLE", "EMAIL", - "EMAIL_LICENSE", "EMAIL_EXAMPLE", "USERNAME", - "USERNAME_LICENSE", - "USERNAME_EXAMPLE", "KEY", "IP_ADDRESS", "PASSWORD", ] -IGNORE_CLASS = ["AMBIGUOUS", "ID"] +IGNORE_CLASS = ["AMBIGUOUS", "ID", "NAME_EXAMPLE", "USERNAME_EXAMPLE"] LABEL2ID = {"O": 
0} for cat in CATEGORIES: @@ -55,21 +50,28 @@ def get_args(): parser.add_argument( "--dataset_name", type=str, - default="bigcode/pii-annotated-toloka-donwsample-emails" + default="bigcode/pii-full-ds" ) + # addprefix to wandb run + parser.add_argument("--prefix", type=str, default="") + parser.add_argument("--add_not_curated", action="store_true") parser.add_argument("--train_batch_size", type=int, default=4) - parser.add_argument("--eval_batch_size", type=int, default=1) + parser.add_argument("--eval_batch_size", type=int, default=4) + parser.add_argument("--num_train_epochs", type=int, default=100) + parser.add_argument("--learning_rate", type=float, default=1e-5) parser.add_argument("--lr_scheduler_type", type=str, default="cosine") - parser.add_argument("--num_train_epochs", type=int, default=20) parser.add_argument("--weight_decay", type=float, default=0.01) parser.add_argument("--warmup_steps", type=int, default=100) + parser.add_argument("--gradient_checkpointing", action="store_true") parser.add_argument("--gradient_accumulation_steps", type=int, default=1) - parser.add_argument("--eval_accumulation_steps", type=int, default=4) + parser.add_argument("--eval_accumulation_steps", type=int, default=1) parser.add_argument("--num_proc", type=int, default=8) parser.add_argument("--bf16", action="store_true") parser.add_argument("--fp16", action="store_true") + + parser.add_argument("--local_rank", type=int, default=0) parser.add_argument("--seed", type=int, default=0) parser.add_argument("--num_workers", type=int, default=8) parser.add_argument("--eval_freq", type=int, default=100) @@ -99,6 +101,23 @@ def prepare_tokenizer(tokenizer): return tokenizer +def prepare_dataset(dataset, tokenizer, args): + # tokenize and label + dataset = dataset.map( + partial( + tokenize_and_label_batch, + tokenizer=tokenizer, + target_text="text", + pii_column="fragments", + LABEL2ID=LABEL2ID, + IGNORE_CLASS=IGNORE_CLASS, + ), + batched=True, + batch_size=1000, + num_proc=args.num_workers, + ) + return dataset + def run_training(args, ner_dataset, model, tokenizer): print(f"Initializing Trainer...") @@ -122,7 +141,7 @@ def run_training(args, ner_dataset, model, tokenizer): eval_accumulation_steps=args.eval_accumulation_steps, fp16=args.fp16, bf16=args.bf16, - run_name=f"pii-bs{args.train_batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-epochs{args.num_train_epochs}", + run_name=f"{args.prefix}-bs{args.train_batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-ep{args.num_train_epochs}", report_to="wandb", ) @@ -138,7 +157,7 @@ def run_training(args, ner_dataset, model, tokenizer): compute_metrics=compute_metrics, callbacks=[ EarlyStoppingCallback( - early_stopping_patience=30, early_stopping_threshold=1e-3 + early_stopping_patience=20, early_stopping_threshold=1e-2 ) ], ) @@ -149,6 +168,10 @@ def run_training(args, ner_dataset, model, tokenizer): print("Saving last checkpoint of the model") model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/")) + # evaluate on test set + #print("Evaluating on test set...") + #trainer.evaluate(ner_dataset["test"]) + def main(args): # load model and tokenizer @@ -159,43 +182,39 @@ def main(args): label2id=LABEL2ID, use_auth_token=True, use_cache=not args.gradient_checkpointing, + output_hidden_states = False, ) tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt, use_auth_token=True) tokenizer = prepare_tokenizer(tokenizer) # load dataset - dataset = load_dataset(args.dataset_name, use_auth_token=True, split="train") - dataset = 
dataset.remove_columns(["id"]) - dataset = dataset.add_column("id", range(len(dataset))) - data = dataset.map( - partial( - tokenize_and_label_batch, - tokenizer=tokenizer, - target_text="text", - pii_column="fragments", - LABEL2ID=LABEL2ID, - IGNORE_CLASS=IGNORE_CLASS, - ), - batched=True, - batch_size=1000, - num_proc=args.num_workers, - ) - - # split to train and test - data = data.train_test_split(test_size=0.1, shuffle=True, seed=args.seed) - test_valid = data["test"].train_test_split( - test_size=0.85, shuffle=True, seed=args.seed + dataset = load_dataset(args.dataset_name, use_auth_token=True) + train_data = dataset["train"].shuffle(seed=args.seed) + test_data = dataset["test"] + valid_data = dataset["valid"] + + from datasets import concatenate_datasets + train_data = concatenate_datasets([train_data, test_data]) + print(f"Concatenated train and test data, new train size: {len(train_data)}") + + + if args.dataset_name == "bigcode/pii-full-ds": + if not args.add_not_curated: + print("Removing not curated data (-400 long files)...") + # keep only curated data + train_data = train_data.filter(lambda x: x["data_origin"] == "curated") + else: + print("Keeping not curated data...") + + + train_data = prepare_dataset(train_data, tokenizer, args) + test_data = prepare_dataset(test_data, tokenizer, args) + valid_data = prepare_dataset(valid_data, tokenizer, args) + print( + f"After tokenization:\nTrain size {len(train_data)}\nValid size {len(valid_data)}\nTest size {len(test_data)}" ) - train_data = data["train"] - valid_data = test_valid["train"] - test_data = test_valid["test"] - test_data.to_json(f"{args.output_dir}/test_data.json") - print("Test data saved to test_data.json") if args.debug: - print( - f"Train size {len(train_data)}\nValid size {len(valid_data)}\nTest size {len(test_data)}" - ) train_stats = get_stats(train_data) valid_stats = get_stats(valid_data) test_stats = get_stats(test_data) @@ -207,12 +226,15 @@ def main(args): print("Test low-resource stats") pprint({k: v for k, v in test_stats.items() if v < 100}) + print("Chunking the dataset...") ner_dataset = DatasetDict( train=chunk_dataset(train_data, tokenizer), validation=chunk_dataset(valid_data, tokenizer), test=chunk_dataset(test_data, tokenizer), ) + # remove columns + ner_dataset = ner_dataset.remove_columns(["id", "chunk_id"]) print(ner_dataset) run_training(args, ner_dataset, model, tokenizer) @@ -223,6 +245,6 @@ def main(args): set_seed(args.seed) os.makedirs(args.output_dir, exist_ok=True) - logging.set_verbosity_error() + logging.set_verbosity_info() - main(args) + main(args) \ No newline at end of file diff --git a/pii/ner/utils/eval.py b/pii/ner/utils/eval.py index a5be311..fbb3034 100644 --- a/pii/ner/utils/eval.py +++ b/pii/ner/utils/eval.py @@ -10,19 +10,14 @@ # NER tags CATEGORIES = [ "NAME", - "NAME_LICENSE", - "NAME_EXAMPLE", "EMAIL", - "EMAIL_LICENSE", "EMAIL_EXAMPLE", "USERNAME", - "USERNAME_LICENSE", - "USERNAME_EXAMPLE", "KEY", "IP_ADDRESS", "PASSWORD", ] -IGNORE_CLASS = ["AMBIGUOUS", "ID"] +IGNORE_CLASS = ["AMBIGUOUS", "ID", "NAME_EXAMPLE", "USERNAME_EXAMPLE"] LABEL2ID = {"O": 0} for cat in CATEGORIES: @@ -42,8 +37,6 @@ def compute_ap(pred, truth): def compute_metrics(p): predictions, labels = p - print(f"predictions.shape: {predictions.shape} and type {type(predictions)}") - print(f"labels.shape: {labels.shape} and type {type(labels)}") avg_prec = compute_ap(predictions, labels) predictions = np.argmax(predictions, axis=2) @@ -58,7 +51,7 @@ def compute_metrics(p): ] results = 
_seqeval_metric.compute( - predictions=true_predictions, references=true_labels + predictions=true_predictions, references=true_labels, zero_division=0, ) agg_metrics = { "Avg.Precision": avg_prec, From 7ee331b1fc142b3c0670481d34c09e18eba8febb Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Sat, 25 Mar 2023 02:12:45 +0000 Subject: [PATCH 07/14] add readme --- pii/ner/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 pii/ner/README.md diff --git a/pii/ner/README.md b/pii/ner/README.md new file mode 100644 index 0000000..c0c8685 --- /dev/null +++ b/pii/ner/README.md @@ -0,0 +1,14 @@ +# Fine-tuning Bigcode-Encoder on an NER task for PII detection + +To run the training on all the dataset `bigcode/pii-full-ds`, use the following command: +```bash +python -m torch.distributed.launch \ + --nproc_per_node number_of_gpus train.py \ + --dataset_name bigcode/pii-full-ds \ + --debug \ + --learning_rate 2e-5 \ + --train_batch_size 8 \ + --bf16 \ + --add_not_curated +``` +Note that we use a global batch size of 64 (8*8 GPUs). To use only curated dataset remove the flag `--add_not_curated`. \ No newline at end of file From 8e5348afbc36f34fc6706927291735c049a28d61 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Mon, 27 Mar 2023 11:10:20 +0000 Subject: [PATCH 08/14] add redaction code --- pii/ner/pii_redaction/README.md | 5 + pii/ner/pii_redaction/main_redact.py | 288 +++++++++++++++++++++++ pii/ner/pii_redaction/manual_sharding.py | 54 +++++ pii/ner/pii_redaction/replacements.json | 1 + pii/ner/pii_redaction/utils.py | 175 ++++++++++++++ pii/ner/train.py | 15 +- 6 files changed, 531 insertions(+), 7 deletions(-) create mode 100644 pii/ner/pii_redaction/README.md create mode 100644 pii/ner/pii_redaction/main_redact.py create mode 100644 pii/ner/pii_redaction/manual_sharding.py create mode 100644 pii/ner/pii_redaction/replacements.json create mode 100644 pii/ner/pii_redaction/utils.py diff --git a/pii/ner/pii_redaction/README.md b/pii/ner/pii_redaction/README.md new file mode 100644 index 0000000..4fd1c28 --- /dev/null +++ b/pii/ner/pii_redaction/README.md @@ -0,0 +1,5 @@ +# PII redaction + +```bash +python main_redact.py --dataset_name /fsx/leandro/data/pii_result/ada --target_dataset ada-no-pii --save_path_disk ada-no-pii-local +``` \ No newline at end of file diff --git a/pii/ner/pii_redaction/main_redact.py b/pii/ner/pii_redaction/main_redact.py new file mode 100644 index 0000000..aae3e1e --- /dev/null +++ b/pii/ner/pii_redaction/main_redact.py @@ -0,0 +1,288 @@ +"""Mask detected PII in a dataset. 
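+
+Example invocation, mirroring the command given in pii_redaction/README.md
+(the dataset path and target names are the example values used there):
+
+    python main_redact.py --dataset_name /fsx/leandro/data/pii_result/ada --target_dataset ada-no-pii --save_path_disk ada-no-pii-local
+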
+""" + +import argparse +import json +import logging +import random +import time +from functools import partial +from pprint import pformat + +from datasets import load_dataset +from datasets.utils.logging import set_verbosity_info + +from manual_sharding import save_manual_shards +from utils import get_replacements, redact_pii_batch + + +def parseArgs(): + parser = argparse.ArgumentParser(description="PII detection and redaction") + parser.add_argument( + "--dataset_name", + default="bigcode/pii-for-code", + type=str, + help="HF repo name/path of the dataset.", + ) + parser.add_argument( + "--num_load_proc", + default=64, + type=int, + help="Number of processes to use for loading the dataset", + ) + parser.add_argument( + "--lang", + default="ada", + type=str, + help="Language to redact PII in.", + ) + parser.add_argument( + "--text_column", + default="content", + type=str, + help="Text column to use, if will be renamed to content", + ) + parser.add_argument( + "--split", + default="train", + type=str, + help="Dataset split to process", + ) + parser.add_argument( + "--batch_size", + default=100, + type=int, + help="Batch size for the PII detection/redaction", + ) + parser.add_argument( + "--seed", + default=0, + type=int, + help="Seed for random", + ) + parser.add_argument( + "--num_proc", + default=96, + type=int, + help="Number of processes to use for the PII detection/redaction", + ) + parser.add_argument( + "--no_redaction", + action="store_true", + help="If set, we don't perform redaction", + ) + parser.add_argument( + "--load_replacements", + default=True, + help="If set, we load the replacements from file replacements.json", + ) + parser.add_argument( + "--add_reference_text", + default=False, + type=bool, + help="If True we add the reference text with PII between delimiters \ + in the redacted text -used for visualization-", + ) + parser.add_argument( + "--check_all_files", + action="store_true", + help="If set, we check all files, not only the ones that contain PII", + ) + parser.add_argument( + "--check_sampling_size", + default=0, + type=int, + help="Number of samples to check for PII", + ) + # for saving the dataset: either push to HF or save locally with datasets or save manual shards + parser.add_argument( + "--save_mode", + default="manual_shards", + type=str, + choices=["hub", "local", "manual_shards"], + help="How to save the dataset", + ) + parser.add_argument( + "--save_mode_checks", + default="hub", + type=str, + choices=["hub", "local", "manual_shards"], + help="How to save the checks dataset", + ) + # add argument for name of dataset on the hub + parser.add_argument( + "--target_dataset", + default="bigcode-pii2", + type=str, + help="HF repo name of the target dataset in save_mode=hub.", + ) + parser.add_argument( + "--hub_username", + default="loubnabnl", + type=str, + help="Username for the hub", + ) + parser.add_argument( + "--save_path_disk", + default="bigcode-pii2-local", + type=str, + help="Path to save the dataset on disk in save_mode=local.", + ) + return parser.parse_args() + + +def get_check_ds(ds, args): + if not args.check_all_files: + ds_checks = ds.filter( + lambda exs: exs["modified"], + batched=True, + batch_size=args.batch_size, + num_proc=args.num_proc, + ) + else: + ds_checks = ds + if not args.check_sampling_size: + sampling_size = len(ds_checks) + idx_samples = random.sample( + range(len(ds_checks)), min(len(ds_checks), sampling_size) + ) + ds_checks = ds_checks.select(idx_samples) + + return ds_checks + + +def check_uniques(example, uniques): + 
"""Check if current id is still in set of unique id and remove if true.""" + if example["id"] in uniques: + uniques.remove(example["id"]) + return True + else: + return False + + +def main(): + set_verbosity_info() + logger = logging.getLogger(__name__) + logger.setLevel(logging.INFO) + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + handlers=[logging.FileHandler("pii.log"), logging.StreamHandler()], + ) + args = parseArgs() + logger.info( + f"** The job is running with the following arguments: **\n{args}\n **** " + ) + + logger.info(f" ===== Loading {args.dataset_name} =====") + ds = load_dataset( + args.dataset_name, + split=args.split, + use_auth_token=True, + num_proc=args.num_load_proc, + ) + if args.text_column != "content": + ds = ds.rename_column(args.text_column, "content") + + logger.info(f" ===== Deduplicating dataset =====") + # Deduplication based on ids + uniques = set(ds["id"]) + frac = len(uniques) / len(ds) + logger.info(f"Fraction of duplicates: {1-frac:.2%}") + logger.info(f"Dataset:\n{ds}") + # Deduplicate data and apply heuristics + t_start = time.time() + ds_pii = ds.filter( + check_uniques, fn_kwargs={"uniques": uniques}, num_proc=args.num_proc + ) + logger.info(f"Time to filter dataset: {time.time()-t_start:.2f}") + logger.info(f"Dataset after dedup:\n{ds_pii}") + + logger.info( + f"Number of samples that contained PII: {sum([1 if x['entities'] else 0 for x in ds_pii])}" + ) + logger.info( + f"Total number of secrets found: {sum([len(x['entities']) for x in ds_pii])}" + ) + + # redact PII in the dataset + logger.info(f" ===== Applying PII redaction =====") + random.seed(args.seed) + + replacements = get_replacements() + with open("replacements.json", "w") as f: + json.dump(replacements, f) + logging.info(f"Using the following replacements:\n{pformat(replacements)}") + ds_pii = ds_pii.map( + partial( + redact_pii_batch, + replacements=replacements, + add_references=args.add_reference_text, + ), + batched=True, + batch_size=args.batch_size, + num_proc=args.num_proc, + load_from_cache_file=False, + ) + logging.info(f"Dataset info after PII redaction:\n{ds_pii}") + + # check the dataset + logger.info( + f" ===== Checking {args.check_sampling_size} samples from those modified in the dataset =====" + ) + ds_checks = get_check_ds(ds_pii, args) + + # save checks dataset + if len(ds_checks) == 0: + logger.info("Dataset was empty. 
Not saving anything.") + else: + logger.info(f"Checks dataset info {ds_checks}") + if args.save_mode_checks == "hub": + logger.info( + f"Pushing the checks dataset to the Hub as {args.target_dataset}_checks" + ) + ds_checks.push_to_hub(args.target_dataset + "_checks") + + elif args.save_mode_checks == "local": + logger.info(f"Saving the checks dataset to disk") + ds_checks.save_to_disk(args.save_path_disk + "_checks") + + elif args.save_mode_checks == "manual_shards": + logger.info(f"Saving the checks dataset in manual shards") + save_manual_shards( + ds_checks, + user=args.hub_username, + remote_dataset_repo=args.target_dataset + "_checks", + ) + + logger.info("Removing columns that are not needed for the final dataset") + columns = ["content", "modified", "entities"] + if args.add_reference_text: + columns.append("references") + ds_pii = ds_pii.remove_columns(columns) + ds_pii = ds_pii.rename_column("new_content", "content") + logger.info(f"Dataset info after removing columns:\n{ds_pii}") + + # save the final dataset + if args.save_mode == "hub": + logger.info( + f" ===== Pushing the dataset to the Hub as: {args.target_dataset} =====" + ) + ds_pii.push_to_hub(args.target_dataset) + + elif args.save_mode == "local": + logger.info(f" ===== Saving the dataset to disk =====") + ds_pii.save_to_disk(args.save_path_disk) + + elif args.save_mode == "manual_shards": + logger.info(f" ===== Saving the dataset in manual shards =====") + save_manual_shards( + ds_pii, user=args.hub_username, remote_dataset_repo=args.target_dataset + ) + + logger.info(f" ===== Dataset saved successfully =====") + + +if __name__ == "__main__": + main() diff --git a/pii/ner/pii_redaction/manual_sharding.py b/pii/ner/pii_redaction/manual_sharding.py new file mode 100644 index 0000000..8edf034 --- /dev/null +++ b/pii/ner/pii_redaction/manual_sharding.py @@ -0,0 +1,54 @@ +import os +import time +from multiprocessing import Pool +from tqdm import tqdm + +from huggingface_hub import Repository + + +def save_shard(shard_tuple): + """Save shard""" + filename, shard = shard_tuple + # use to_json instead to save as json file + shard.to_parquet(filename) + +def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): + """Save sharded data + Args: + ds (Dataset): dataset to be saved + user (str): user name + remote_dataset_repo (str): remote dataset repository + out_path (str): path to save the shards""" + # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO + # you can save the shards inside it and do git add/commit/push to push data to the hub + out_path = remote_dataset_repo + # if out path doesnt already exist + if not os.path.exists(out_path): + repo = Repository( + local_dir=out_path, + clone_from=user + "/" + remote_dataset_repo, + repo_type="dataset", + use_auth_token=True, + git_user=user + ) + + # files will be numerous we save them in a folder called data inside out_path + os.mkdir(out_path + "/data") + SHARD_SIZE = 1000 << 20 + if ds._indices is not None: + dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) + else: + dataset_nbytes = ds.data.nbytes + num_shards = int(dataset_nbytes / SHARD_SIZE) + 1 + print(f"Number of shards: {num_shards}") + + print("sharding the dataset") + t_start = time.time() + shards = (ds.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)) + # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files + filenames = 
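For scale, `SHARD_SIZE = 1000 << 20` is 1000 MiB, so `save_manual_shards` writes roughly one parquet file per gigabyte of Arrow data into the `data/` folder of a local clone of the target dataset repo. A usage sketch with the defaults from the function signature (the git push is manual, as the closing comment in the module notes); note that `os.mkdir(out_path + "/data")` would raise `FileExistsError` on a re-run over an existing clone, where `os.makedirs(..., exist_ok=True)` would not:

```python
# Write ~1 GiB parquet shards into a clone of <user>/<remote_dataset_repo>/data,
# then push them by hand with git from inside that folder.
save_manual_shards(ds_pii, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj")
# cd bigcode-pii-pjj && git add data && git commit -m "add parquet shards" && git push
```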
(f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" for index in range(num_shards)) + + with Pool(16) as p: + list(tqdm(p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), total=num_shards)) + print(f"Time to save dataset: {time.time()-t_start:.2f}") + # to push dataset to hub do: git add/commit/push inside OUT_PATH \ No newline at end of file diff --git a/pii/ner/pii_redaction/replacements.json b/pii/ner/pii_redaction/replacements.json new file mode 100644 index 0000000..474eccc --- /dev/null +++ b/pii/ner/pii_redaction/replacements.json @@ -0,0 +1 @@ +{"EMAIL": [""], "KEY": [""], "NAME": [""], "PASSWORD": [""], "IP_ADDRESS": {"IPv4": ["172.16.31.10", "172.16.58.3", "172.16.17.32", "192.168.127.12", "192.168.3.11"], "IPv6": ["fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b", "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b", "fc00:e968:6179::de52:7100", "fc00:db20:35b:7399::5", "fdf8:f53e:61e4::18"]}} \ No newline at end of file diff --git a/pii/ner/pii_redaction/utils.py b/pii/ner/pii_redaction/utils.py new file mode 100644 index 0000000..a054b7c --- /dev/null +++ b/pii/ner/pii_redaction/utils.py @@ -0,0 +1,175 @@ +import ipaddress +import json +import random + +IGNORE = ["AMBIGUOUS", "USERNAME"] +# List of random private IP addresses to use as replacements +REPLACEMENTS_IP = { + "IPv4": [ + "172.16.31.10", + "172.16.58.3", + "172.16.17.32", + "192.168.127.12", + "192.168.3.11", + ], + "IPv6": [ + "fd00:c2b6:b24b:be67:2827:688d:e6a1:6a3b", + "fd00:a516:7c1b:17cd:6d81:2137:bd2a:2c5b", + "fc00:e968:6179::de52:7100", + "fc00:db20:35b:7399::5", + "fdf8:f53e:61e4::18", + ], +} + +# DNS to avoid masking +POPULAR_DNS_SERVERS = [ + "8.8.8.8", + "8.8.4.4", + "1.1.1.1", + "1.0.0.1", + "76.76.19.19", + "76.223.122.150", + "9.9.9.9", + "149.112.112.112", + "208.67.222.222", + "208.67.220.220", + "8.26.56.26", + "8.20.247.20", + "94.140.14.14", + "94.140.15.15", +] + + +def load_json(sample): + try: + return json.loads(sample) + except ValueError: + return [] + + +def get_replacements(n=10): + """Build dictionaries of replacements for PII (key, email, IP address, name, password)""" + ip_addresses = REPLACEMENTS_IP + return { + "EMAIL": [""], + "KEY": [""], + "NAME": [""], + "PASSWORD": [""], + "IP_ADDRESS": ip_addresses, + } + + +def replace_ip(value, replacements_dict): + """Replace an IP address with a synthetic IP address of the same format""" + try: + ipaddress.IPv4Address(value) + return random.choice(replacements_dict["IP_ADDRESS"]["IPv4"]) + except ValueError: + try: + ipaddress.IPv6Address(value) + return random.choice(replacements_dict["IP_ADDRESS"]["IPv6"]) + except ValueError: + # this doesn't happen if we already use ipaddress filter in the detection + print("Invalid IP address") + return value + + +def is_private_ip(ip): + """Check if an IP address is allocated for private networks (non internet facing), or is not an ip address at all""" + try: + ip = ipaddress.ip_address(ip) + except ValueError: + # not an ip address + return True + return ip.is_private + + +def redact_pii_text(text, secrets, replacements, add_references=False): + """Redact PII in a text + Args: + text (str): text to redact + secrets (list): list with the secrets to redact + replacements (dict): dictionary of replacements for each PII type + add_references (bool): whether to add references to the redacted text (delimiters to PII) + for vizualization + Returns: + text (str): new text with redacted secrets + """ + modified = False + if secrets: + secrets = sorted(secrets, key=lambda x: x["start"]) + # store 
the secrets that were replaced here with their replacements + replaced_secrets = {} + subparts = [] + references = [] + step = 0 + last_text = text + for secret in secrets: + if secret["tag"] in IGNORE: + continue + if secret["tag"] == "IP_ADDRESS": + # skip secret if it is not actual ip address, is apopular DNS server or private IP address + if is_private_ip(secret["value"]) or ( + secret["value"] in POPULAR_DNS_SERVERS + ): + continue + modified = True + subtext = text[step : secret["start"]] + subpart = subtext if subtext else " " + subparts.append(subpart) + # if secret is already in replaced_secrets, use the same replacement + if secret["value"] in replaced_secrets: + replacement = replaced_secrets[secret["value"]] + else: + if secret["tag"] == "IP_ADDRESS": + replacement = replace_ip(secret["value"], replacements) + else: + replacement = random.choice(replacements[secret["tag"]]) + replaced_secrets[secret["value"]] = replacement + subparts.append(replacement) + replaced_secrets[secret["value"]] = replacement + if add_references: + references.append(subpart) + references.append(f"PI:{secret['tag']}:{replacement}END_PI") + last_text = text[secret["end"] :] + step = secret["end"] + # if supbarpts are not empty join them (it can be empty when all secrets were skipped) + new_text = "".join(subparts) + last_text if subparts else last_text + if add_references: + references = "".join(references) + last_text if references else "" + else: + new_text = text + references = "" + result = ( + (new_text, references, modified) if add_references else (new_text, modified) + ) + return result + + +def redact_pii_batch(examples, replacements, add_references=True): + """Anonymize PII in a batch of examples from a dataset""" + new_contents = [] + references = [] + modified = [] + for text, secrets in zip( + examples["content"], + examples["entities"], + ): + if secrets: + if add_references: + new_text, reference, modif = redact_pii_text( + text, secrets, replacements, add_references + ) + references.append(reference) + else: + new_text, modif = redact_pii_text(text, secrets, replacements) + new_contents.append(new_text) + modified.append(modif) + else: + new_contents.append(text) + references.append(text) + modified.append(False) + result = {"new_content": new_contents, "modified": modified} + if add_references: + result.update({"references": references}) + return result diff --git a/pii/ner/train.py b/pii/ner/train.py index 312fcbe..4e764cc 100644 --- a/pii/ner/train.py +++ b/pii/ner/train.py @@ -141,7 +141,7 @@ def run_training(args, ner_dataset, model, tokenizer): eval_accumulation_steps=args.eval_accumulation_steps, fp16=args.fp16, bf16=args.bf16, - run_name=f"{args.prefix}-bs{args.train_batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-ep{args.num_train_epochs}", + run_name=f"{args.prefix}-bs{args.train_batch_size}-lr{args.learning_rate}-wd{args.weight_decay}-ep{args.num_train_epochs}-last", report_to="wandb", ) @@ -157,26 +157,27 @@ def run_training(args, ner_dataset, model, tokenizer): compute_metrics=compute_metrics, callbacks=[ EarlyStoppingCallback( - early_stopping_patience=20, early_stopping_threshold=1e-2 + early_stopping_patience=15, early_stopping_threshold=1e-2 ) ], ) print("Training...") - trainer.train() + #trainer.train() print("Saving last checkpoint of the model") - model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/")) + #model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint_last_exp/")) # evaluate on test set - #print("Evaluating on test 
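A toy usage example for `redact_pii_text` above (hypothetical input, not taken from the dataset): entities arrive as dicts with character offsets, a tag, and the matched value; `USERNAME` and `AMBIGUOUS` tags are skipped via `IGNORE`, and private or well-known IP addresses are left untouched.

```python
text = "contact me at alice@example.com from 10.0.0.1"
secrets = [
    {"tag": "EMAIL", "value": "alice@example.com", "start": 14, "end": 31},
    {"tag": "IP_ADDRESS", "value": "10.0.0.1", "start": 37, "end": 45},
]
replacements = get_replacements()
new_text, modified = redact_pii_text(text, secrets, replacements)
# The email span is swapped for the configured EMAIL placeholder; the IP span is
# skipped because 10.0.0.1 is a private address, so it survives unchanged.
# `modified` is True since at least one replacement happened.
```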
set...") - #trainer.evaluate(ner_dataset["test"]) + print("Evaluating on test set...") + trainer.evaluate(ner_dataset["validation"]) def main(args): # load model and tokenizer model = AutoModelForTokenClassification.from_pretrained( - args.model_ckpt, + #args.model_ckpt, + "/fsx/loubna/code/bigcode-dataset/pii/ner/finetuned-encoder-pii/final_checkpoint-all-noexamples", num_labels=len(ID2LABEL), id2label=ID2LABEL, label2id=LABEL2ID, From 86cbfe65a3cbf0c9d31d589e3542729059d68839 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Mon, 27 Mar 2023 11:58:03 +0000 Subject: [PATCH 09/14] update code --- pii/ner/pii_redaction/README.md | 3 ++- pii/ner/pii_redaction/main_redact.py | 16 ++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/pii/ner/pii_redaction/README.md b/pii/ner/pii_redaction/README.md index 4fd1c28..60240f8 100644 --- a/pii/ner/pii_redaction/README.md +++ b/pii/ner/pii_redaction/README.md @@ -1,5 +1,6 @@ # PII redaction ```bash -python main_redact.py --dataset_name /fsx/leandro/data/pii_result/ada --target_dataset ada-no-pii --save_path_disk ada-no-pii-local +LANG=python +python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local ``` \ No newline at end of file diff --git a/pii/ner/pii_redaction/main_redact.py b/pii/ner/pii_redaction/main_redact.py index aae3e1e..0a5be9e 100644 --- a/pii/ner/pii_redaction/main_redact.py +++ b/pii/ner/pii_redaction/main_redact.py @@ -30,12 +30,6 @@ def parseArgs(): type=int, help="Number of processes to use for loading the dataset", ) - parser.add_argument( - "--lang", - default="ada", - type=str, - help="Language to redact PII in.", - ) parser.add_argument( "--text_column", default="content", @@ -78,7 +72,7 @@ def parseArgs(): ) parser.add_argument( "--add_reference_text", - default=False, + default=True, type=bool, help="If True we add the reference text with PII between delimiters \ in the redacted text -used for visualization-", @@ -162,15 +156,15 @@ def check_uniques(example, uniques): def main(): set_verbosity_info() + args = parseArgs() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, - handlers=[logging.FileHandler("pii.log"), logging.StreamHandler()], + handlers=[logging.FileHandler(f"logs/pii-{args.dataset_name.split('/')[-1]}.log"), logging.StreamHandler()], ) - args = parseArgs() logger.info( f"** The job is running with the following arguments: **\n{args}\n **** " ) @@ -193,9 +187,7 @@ def main(): logger.info(f"Dataset:\n{ds}") # Deduplicate data and apply heuristics t_start = time.time() - ds_pii = ds.filter( - check_uniques, fn_kwargs={"uniques": uniques}, num_proc=args.num_proc - ) + ds_pii = ds.filter(check_uniques, fn_kwargs={"uniques": uniques}) logger.info(f"Time to filter dataset: {time.time()-t_start:.2f}") logger.info(f"Dataset after dedup:\n{ds_pii}") From 133e7441604eee7eb6a0befb905ae555fffc2b70 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 28 Mar 2023 10:08:07 +0000 Subject: [PATCH 10/14] add file metadata and postprocessinf of secrets --- pii/ner/pii_redaction/README.md | 9 ++- pii/ner/pii_redaction/main_redact.py | 80 +++++++++++++++++++++--- pii/ner/pii_redaction/manual_sharding.py | 44 ++++++++----- pii/ner/pii_redaction/utils.py | 41 ++++++++---- 4 files changed, 138 insertions(+), 36 deletions(-) diff --git a/pii/ner/pii_redaction/README.md 
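Dropping `num_proc` from the dedup `filter` call above is consistent with how `check_uniques` works: it keeps the first occurrence of each `id` by mutating the shared `uniques` set, and that state is only coherent in a single process — with several workers, each one mutates its own copy of the set. If parallel dedup were needed, an order-independent variant is possible; a sketch, not part of the patch:

```python
def build_keep_indices(ids):
    # index of the first occurrence of every id, computed once up front
    first_seen = {}
    for idx, id_ in enumerate(ids):
        first_seen.setdefault(id_, idx)
    return set(first_seen.values())

# keep = build_keep_indices(ds["id"])
# ds_pii = ds.filter(lambda _, idx: idx in keep, with_indices=True, num_proc=args.num_proc)
```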
b/pii/ner/pii_redaction/README.md index 60240f8..596413b 100644 --- a/pii/ner/pii_redaction/README.md +++ b/pii/ner/pii_redaction/README.md @@ -1,6 +1,13 @@ # PII redaction ```bash -LANG=python +LANG=jupyter-scripts-dedup-filtered +python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local +``` +/fsx/leandro/data/pii_result/jupyter-scripts-dedup-filtered +/fsx/leandro/data/pii_result/jupyter-structured-clean-dedup +/fsx/leandro/data/pii_result/github-issues-filtered-structured + +```bash python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local ``` \ No newline at end of file diff --git a/pii/ner/pii_redaction/main_redact.py b/pii/ner/pii_redaction/main_redact.py index 0a5be9e..a94c7a0 100644 --- a/pii/ner/pii_redaction/main_redact.py +++ b/pii/ner/pii_redaction/main_redact.py @@ -6,6 +6,7 @@ import logging import random import time +import numpy as np from functools import partial from pprint import pformat @@ -16,6 +17,44 @@ from utils import get_replacements, redact_pii_batch +REPONAME_TOKEN = "" +FILENAME_TOKEN = "" +STARS_TOKEN = "" + + +def get_num_stars_bucket(num_stars: int) -> str: + if num_stars is None or num_stars == 0: + return "0" + elif num_stars <= 10: + return "1-10" + elif num_stars <= 100: + return "10-100" + elif num_stars <= 1000: + return "100-1000" + else: + return "1000+" + + +def content_with_meta(example): + # TODO + res = "" + # repo-name + if np.random.binomial(n=1, p=0.2): + res += f"{REPONAME_TOKEN}{example['max_stars_repo_name']}" + # file-name + if np.random.binomial(n=1, p=0.2): + res += f"{FILENAME_TOKEN}{example['max_stars_repo_path']}" + # number of stars + if np.random.binomial(n=1, p=0.2): + num_stars = get_num_stars_bucket(example["max_stars_count"]) + res += f"{STARS_TOKEN}{num_stars}" + if len(res) > 0: + res += "\n" + res += example["content"] + + return {"content_with_meta": res} + + def parseArgs(): parser = argparse.ArgumentParser(description="PII detection and redaction") parser.add_argument( @@ -24,6 +63,12 @@ def parseArgs(): type=str, help="HF repo name/path of the dataset.", ) + # add arg true add metadata + parser.add_argument( + "--add_metadata", + action="store_true", + help="If set, we add metadata to the text", + ) parser.add_argument( "--num_load_proc", default=64, @@ -118,7 +163,7 @@ def parseArgs(): ) parser.add_argument( "--save_path_disk", - default="bigcode-pii2-local", + default="/fsx/loubna/data/the-stack-march-no-pii", type=str, help="Path to save the dataset on disk in save_mode=local.", ) @@ -163,7 +208,10 @@ def main(): format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, - handlers=[logging.FileHandler(f"logs/pii-{args.dataset_name.split('/')[-1]}.log"), logging.StreamHandler()], + handlers=[ + logging.FileHandler(f"logs/pii-{args.dataset_name.split('/')[-1]}.log"), + logging.StreamHandler(), + ], ) logger.info( f"** The job is running with the following arguments: **\n{args}\n **** " @@ -194,9 +242,9 @@ def main(): logger.info( f"Number of samples that contained PII: {sum([1 if x['entities'] else 0 for x in ds_pii])}" ) - logger.info( - f"Total number of secrets found: {sum([len(x['entities']) for x in ds_pii])}" - ) + # logger.info( + # f"Total number of secrets found: {sum([len(x['entities']) for x in ds_pii])}" + # ) # redact PII in the dataset logger.info(f" ===== Applying PII redaction =====") @@ -215,7 
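The metadata change above optionally prefixes each document with its repo name, file path, and a bucketed star count, each included with an independent 20% probability (`np.random.binomial(n=1, p=0.2)`) and separated from the original content by a newline; the literal token strings are the constants defined at the top of the file. A few spot checks of the bucketing:

```python
# Expected outputs of get_num_stars_bucket as defined in the patch above.
assert get_num_stars_bucket(None) == "0"
assert get_num_stars_bucket(0) == "0"
assert get_num_stars_bucket(7) == "1-10"
assert get_num_stars_bucket(42) == "10-100"
assert get_num_stars_bucket(999) == "100-1000"
assert get_num_stars_bucket(5000) == "1000+"
```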
+263,6 @@ def main(): batched=True, batch_size=args.batch_size, num_proc=args.num_proc, - load_from_cache_file=False, ) logging.info(f"Dataset info after PII redaction:\n{ds_pii}") @@ -234,7 +281,7 @@ def main(): logger.info( f"Pushing the checks dataset to the Hub as {args.target_dataset}_checks" ) - ds_checks.push_to_hub(args.target_dataset + "_checks") + ds_checks.push_to_hub(args.target_dataset + "_checks", private=True) elif args.save_mode_checks == "local": logger.info(f"Saving the checks dataset to disk") @@ -246,6 +293,7 @@ def main(): ds_checks, user=args.hub_username, remote_dataset_repo=args.target_dataset + "_checks", + local_dir="/fsx/loubna/data/the-stack-march-no-pii_checks", ) logger.info("Removing columns that are not needed for the final dataset") @@ -256,21 +304,33 @@ def main(): ds_pii = ds_pii.rename_column("new_content", "content") logger.info(f"Dataset info after removing columns:\n{ds_pii}") + if args.add_metadata: + logger.info(f" ===== Adding metadata =====") + ds_pii = ds_pii.map( + content_with_meta, remove_columns=["content"], num_proc=args.num_proc + ) + ds_pii = ds_pii.rename_column("content_with_meta", "content") + # save the final dataset if args.save_mode == "hub": logger.info( f" ===== Pushing the dataset to the Hub as: {args.target_dataset} =====" ) - ds_pii.push_to_hub(args.target_dataset) + ds_pii.push_to_hub(args.target_dataset, private=True) elif args.save_mode == "local": logger.info(f" ===== Saving the dataset to disk =====") ds_pii.save_to_disk(args.save_path_disk) elif args.save_mode == "manual_shards": - logger.info(f" ===== Saving the dataset in manual shards =====") + logger.info( + f" ===== Saving the dataset in manual shards to {args.save_path_disk} =====" + ) save_manual_shards( - ds_pii, user=args.hub_username, remote_dataset_repo=args.target_dataset + ds_pii, + user=args.hub_username, + remote_dataset_repo="the-stack-no-pii-march", + local_dir=args.save_path_disk, ) logger.info(f" ===== Dataset saved successfully =====") diff --git a/pii/ner/pii_redaction/manual_sharding.py b/pii/ner/pii_redaction/manual_sharding.py index 8edf034..8e9f894 100644 --- a/pii/ner/pii_redaction/manual_sharding.py +++ b/pii/ner/pii_redaction/manual_sharding.py @@ -12,29 +12,34 @@ def save_shard(shard_tuple): # use to_json instead to save as json file shard.to_parquet(filename) -def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pjj"): + +def save_manual_shards( + ds, + user="loubnabnl", + remote_dataset_repo="bigcode-pii-pjj", + local_dir="/fsx/loubna/data/the-stack-march-no-pii", +): """Save sharded data Args: ds (Dataset): dataset to be saved user (str): user name - remote_dataset_repo (str): remote dataset repository out_path (str): path to save the shards""" # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO # you can save the shards inside it and do git add/commit/push to push data to the hub - out_path = remote_dataset_repo + out_path = remote_dataset_repo if local_dir is None else local_dir # if out path doesnt already exist if not os.path.exists(out_path): repo = Repository( - local_dir=out_path, - clone_from=user + "/" + remote_dataset_repo, - repo_type="dataset", - use_auth_token=True, - git_user=user - ) + local_dir=out_path, + clone_from=user + "/" + remote_dataset_repo, + repo_type="dataset", + use_auth_token=True, + git_user=user, + ) # files will be numerous we save them in a folder called data inside out_path os.mkdir(out_path + "/data") - SHARD_SIZE = 1000 << 20 + SHARD_SIZE = 1000 << 20 if 
ds._indices is not None: dataset_nbytes = ds.data.nbytes * len(ds._indices) / len(ds.data) else: @@ -44,11 +49,22 @@ def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pj print("sharding the dataset") t_start = time.time() - shards = (ds.shard(num_shards=num_shards, index=i, contiguous=True) for i in range(num_shards)) + shards = ( + ds.shard(num_shards=num_shards, index=i, contiguous=True) + for i in range(num_shards) + ) # use f"{OUT_PATH}/data/train-{index:05d}-of-{num_shards:05d}.json" instead for json files - filenames = (f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" for index in range(num_shards)) + filenames = ( + f"{out_path}/data/train-{index:05d}-of-{num_shards:05d}.parquet" + for index in range(num_shards) + ) with Pool(16) as p: - list(tqdm(p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), total=num_shards)) + list( + tqdm( + p.imap_unordered(save_shard, zip(filenames, shards), chunksize=4), + total=num_shards, + ) + ) print(f"Time to save dataset: {time.time()-t_start:.2f}") - # to push dataset to hub do: git add/commit/push inside OUT_PATH \ No newline at end of file + # to push dataset to hub do: git add/commit/push inside OUT_PATH diff --git a/pii/ner/pii_redaction/utils.py b/pii/ner/pii_redaction/utils.py index a054b7c..9927596 100644 --- a/pii/ner/pii_redaction/utils.py +++ b/pii/ner/pii_redaction/utils.py @@ -1,6 +1,6 @@ import ipaddress -import json import random +from gibberish_detector import detector IGNORE = ["AMBIGUOUS", "USERNAME"] # List of random private IP addresses to use as replacements @@ -40,14 +40,29 @@ ] -def load_json(sample): - try: - return json.loads(sample) - except ValueError: - return [] +def is_key(matched_str): + """Checks to make sure the PII span is long enough and is gibberish and not word like""" + # pip install gibberish-detector + # download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt + # run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds) + Detector = detector.create_from_model( + "/fsx/loubna/code/bigcode-dataset/pii/gibberish_data/big.model" + ) + is_gibberish = Detector.is_gibberish(matched_str.lower()) + return is_gibberish and len(matched_str) > 8 + + +def is_secret(matched_str): + """Checks to make sure the PII span is long enough""" + return len(matched_str) > 3 -def get_replacements(n=10): +def is_full_name(matched_str): + """Checks if detected name is a full names and not just first or last name""" + return len(matched_str.split()) > 1 + + +def get_replacements(): """Build dictionaries of replacements for PII (key, email, IP address, name, password)""" ip_addresses = REPLACEMENTS_IP return { @@ -74,7 +89,7 @@ def replace_ip(value, replacements_dict): return value -def is_private_ip(ip): +def is_secret_ip(ip): """Check if an IP address is allocated for private networks (non internet facing), or is not an ip address at all""" try: ip = ipaddress.ip_address(ip) @@ -105,14 +120,18 @@ def redact_pii_text(text, secrets, replacements, add_references=False): step = 0 last_text = text for secret in secrets: - if secret["tag"] in IGNORE: + if secret["tag"] in IGNORE or not is_secret(secret["value"]): continue if secret["tag"] == "IP_ADDRESS": - # skip secret if it is not actual ip address, is apopular DNS server or private IP address - if is_private_ip(secret["value"]) or ( + # skip if it's not actual ip address, is a popular DNS server or private IP address + if 
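`is_key` above rebuilds the gibberish-detector model from disk on every call, i.e. once per detected `KEY` span; since the model file never changes during a run, loading it once at module import is the cheaper pattern. A sketch using the same `gibberish_detector` API as the patch (the model path is a placeholder for wherever `big.model` lives):

```python
from gibberish_detector import detector

# pip install gibberish-detector; train the model once with:
#   gibberish-detector train big.txt > big.model
_DETECTOR = detector.create_from_model("gibberish_data/big.model")  # placeholder path

def is_key(matched_str: str) -> bool:
    """Keep only spans that are long enough and look like random strings."""
    return len(matched_str) > 8 and _DETECTOR.is_gibberish(matched_str.lower())
```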
is_secret_ip(secret["value"]) or ( secret["value"] in POPULAR_DNS_SERVERS ): continue + if secret["tag"] == "KEY" and not is_key(secret["value"]): + continue + if secret["tag"] == "NAME" and not is_full_name(secret["value"]): + continue modified = True subtext = text[step : secret["start"]] subpart = subtext if subtext else " " From 09da2037137d61e8263f485eca1ffca9d79b6330 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 28 Mar 2023 10:25:03 +0000 Subject: [PATCH 11/14] add slurm script --- pii/ner/pii_redaction/run_pii_slurm.py | 207 +++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 pii/ner/pii_redaction/run_pii_slurm.py diff --git a/pii/ner/pii_redaction/run_pii_slurm.py b/pii/ner/pii_redaction/run_pii_slurm.py new file mode 100644 index 0000000..0427152 --- /dev/null +++ b/pii/ner/pii_redaction/run_pii_slurm.py @@ -0,0 +1,207 @@ +import os +import argparse +import subprocess + +SCRIPT_DIR = "/fsx/loubna/code/bigcode-dataset/pii/ner/pii_redaction/jobs/scripts" +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--start", type=int, default=0) + parser.add_argument("--end", type=int, default=10) + parser.add_argument("--text_column", type=str, default="content") + args = parser.parse_args() + return args + + +def submit_job(job, job_name="job"): + with open(f"{SCRIPT_DIR}/{job_name}.sbatch", "w") as fp: + fp.write(job) + #os.system(f"{SCRIPT_DIR}/{job_name}.sbatch") + subprocess.run(["sbatch", f"{SCRIPT_DIR}/{job_name}.sbatch"]) + + +def makejob(JOB_NAME="pii-redaction", LANG=None, TEXT_COLUMN="content"): + return f"""#!/bin/bash + +#SBATCH --job-name={JOB_NAME} +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! +#SBATCH --cpus-per-task=96 +#SBATCH --gres=gpu:8 +#SBATCH --partition=production-cluster +#SBATCH -o /fsx/loubna/code/bigcode-dataset/pii/ner/pii_redaction/jobs/logs/%x-%j.out +#SBATCH -e /fsx/loubna/code/bigcode-dataset/pii/ner/pii_redaction/jobs/logs/%x-%j.err + +set -x -e +source /admin/home/loubna/.bashrc +conda activate eval-harness + +# File Path setup +echo "START TIME: $(date)" + +# Experiment parameters +LANG={LANG} + +# Training Setup +GPUS_PER_NODE=8 +# so processes know who to talk to +MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +MASTER_PORT=6000 +NNODES=$SLURM_NNODES +NODE_RANK=$SLURM_PROCID +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + + + +CMD=" \ + /fsx/loubna/code/bigcode-dataset/pii/ner/pii_redaction/main_redact.py \ + --add_metadata \ + --text_column {TEXT_COLUMN} \ + --dataset_name /fsx/leandro/data/pii_result/{LANG} \ + --target_dataset the-stack-no-pii-{LANG} \ + --save_path_disk /fsx/loubna/data/the-stack-march-no-pii-test/{LANG} + " + +export LAUNCHER="python \ + " + +# force crashing on nccl issues like hanging broadcast +export NCCL_ASYNC_ERROR_HANDLING=1 +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=COLL +# export NCCL_SOCKET_NTHREADS=1 +# export NCCL_NSOCKS_PERTHREAD=1 +# export CUDA_LAUNCH_BLOCKING=1 + +# AWS specific +export NCCL_PROTO=simple +export RDMAV_FORK_SAFE=1 +export FI_EFA_FORK_SAFE=1 +export FI_EFA_USE_DEVICE_RDMA=1 +export FI_PROVIDER=efa +export FI_LOG_LEVEL=1 +export NCCL_IB_DISABLE=1 +export NCCL_SOCKET_IFNAME=ens + +echo $CMD + +# srun error handling: +# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks +# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code +SRUN_ARGS=" \ + --wait=60 \ + --kill-on-bad-exit=1 \ + " + +# py-spy top -s -i 
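`run_pii_slurm.py` follows a plain generate-and-submit pattern: render one sbatch script per language from an f-string template, write it under a scripts directory, and hand it to `sbatch`. A trimmed sketch of that pattern (the directory, SBATCH fields, and dataset paths are placeholders, not the production values):

```python
import subprocess
from pathlib import Path

SCRIPT_DIR = Path("jobs/scripts")  # placeholder

def makejob(job_name: str, lang: str, text_column: str = "content") -> str:
    # render a minimal sbatch script for one language
    return f"""#!/bin/bash
#SBATCH --job-name={job_name}
#SBATCH --nodes=1
#SBATCH --cpus-per-task=96
python main_redact.py --add_metadata --text_column {text_column} --dataset_name /path/to/pii_result/{lang} --target_dataset the-stack-no-pii-{lang}
"""

def submit_job(job: str, job_name: str) -> None:
    SCRIPT_DIR.mkdir(parents=True, exist_ok=True)
    script = SCRIPT_DIR / f"{job_name}.sbatch"
    script.write_text(job)                               # materialize the script
    subprocess.run(["sbatch", str(script)], check=True)  # submit it to SLURM

# submit_job(makejob("python-pii-redaction", "python"), "python-pii-redaction")
```

Writing the script to disk before submitting, rather than piping it to `sbatch` on stdin, leaves an auditable copy of exactly what was launched for each language.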
-n -- $LAUNCHER --node_rank $SLURM_PROCID --role $SLURMD_NODENAME: $CMD +clear; srun $SRUN_ARGS --jobid $SLURM_JOB_ID bash -c "$LAUNCHER $CMD" 2>&1 | tee $LOG_PATH + +echo "END TIME: $(date)" +""" + + +if __name__ == "__main__": + args = get_args() + # 88 PLs + languages = [ + "markdown", + "cpp", + "java", + "c-sharp", + "php", + "assembly", + "html", + "c", + "javascript", + "python", + "haskell", + "fortran", + "typescript", + "sparql", + "antlr", + "tex", + "lean", + "literate-haskell", + "elm", + "standard-ml", + "powershell", + "stan", + "matlab", + "solidity", + "smalltalk", + "tcsh", + "idris", + "julia", + "bluespec", + "visual-basic", + "java-server-pages", + "cuda", + "yacc", + "racket", + "thrift", + "sql", + "protocol-buffer", + "elixir", + "kotlin", + "vhdl", + "scheme", + "tcl", + "isabelle", + "prolog", + "json", + "restructuredtext", + "ada", + "rmarkdown", + "clojure", + "r", + "zig", + "ruby", + "batchfile", + "erlang", + "stata", + "xslt", + "css", + "augeas", + "agda", + "awk", + "groovy", + "coffeescript", + "lua", + "systemverilog", + "common-lisp", + "scala", + "verilog", + "dart", + "maple", + "shell", + "alloy", + "rust", + "sas", + "ocaml", + "go", + "literate-coffeescript", + "emacs-lisp", + "literate-agda", + "f-sharp", + "pascal", + "applescript", + "glsl", + "yaml", + "makefile", + "perl", + "mathematica", + "dockerfile", + "cmake", + ] + for i in range(args.start, args.end + 1): + #language = languages[i] + language="github-issues-filtered-structured" + print(f"Submitting jobs for experiment on language {language}") + job_name = f"{language}-pii-redaction-idx_{i}" + job = makejob( + JOB_NAME=job_name, + LANG=language, + TEXT_COL=args.text_column, + ) + # submit the job + print(f"Job for lang {language} ready and saved at jobs/{job_name}.sbatch") + submit_job(job, job_name) From 6abe5ebb4f2824c519ce12a4ffa94f722b248882 Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Tue, 28 Mar 2023 12:29:30 +0200 Subject: [PATCH 12/14] Update README.md --- pii/ner/pii_redaction/README.md | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/pii/ner/pii_redaction/README.md b/pii/ner/pii_redaction/README.md index 596413b..370b38a 100644 --- a/pii/ner/pii_redaction/README.md +++ b/pii/ner/pii_redaction/README.md @@ -1,13 +1,12 @@ # PII redaction - +To run PII redaction on a dataset that went though PII detection with this [NER model](https://huggingface.co/bigcode/bigcode-encoder-pii-ner-v2). 
```bash -LANG=jupyter-scripts-dedup-filtered -python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local +LANG=python +python main_redact.py --dataset_name $DATA_PATH --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local ``` -/fsx/leandro/data/pii_result/jupyter-scripts-dedup-filtered -/fsx/leandro/data/pii_result/jupyter-structured-clean-dedup -/fsx/leandro/data/pii_result/github-issues-filtered-structured + +To run multiple `slurm` jobs for each programming language ```bash -python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local -``` \ No newline at end of file +python run_pii_slurm.py --start 0 --end 88 +``` From 0b5198b1314753df739ac1e5448a26e78e73015b Mon Sep 17 00:00:00 2001 From: Loubna Ben Allal <44069155+loubnabnl@users.noreply.github.com> Date: Tue, 28 Mar 2023 12:30:20 +0200 Subject: [PATCH 13/14] Update utils.py --- pii/ner/pii_redaction/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pii/ner/pii_redaction/utils.py b/pii/ner/pii_redaction/utils.py index 9927596..6ecdd27 100644 --- a/pii/ner/pii_redaction/utils.py +++ b/pii/ner/pii_redaction/utils.py @@ -46,7 +46,7 @@ def is_key(matched_str): # download the training corpora from https://raw.githubusercontent.com/domanchi/gibberish-detector/master/examples/big.txt # run gibberish-detector train big.txt > big.model to generate the model (it takes 3 seconds) Detector = detector.create_from_model( - "/fsx/loubna/code/bigcode-dataset/pii/gibberish_data/big.model" + "/bigcode-dataset/pii/gibberish_data/big.model" ) is_gibberish = Detector.is_gibberish(matched_str.lower()) return is_gibberish and len(matched_str) > 8 From 6611b430bf947f59373860f2801c6907bae5cd41 Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Tue, 28 Mar 2023 10:46:47 +0000 Subject: [PATCH 14/14] updates --- pii/ner/pii_redaction/run_pii_slurm.py | 5 ++--- pii/ner/pii_redaction/utils.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pii/ner/pii_redaction/run_pii_slurm.py b/pii/ner/pii_redaction/run_pii_slurm.py index 0427152..5aee4ab 100644 --- a/pii/ner/pii_redaction/run_pii_slurm.py +++ b/pii/ner/pii_redaction/run_pii_slurm.py @@ -193,14 +193,13 @@ def makejob(JOB_NAME="pii-redaction", LANG=None, TEXT_COLUMN="content"): "cmake", ] for i in range(args.start, args.end + 1): - #language = languages[i] - language="github-issues-filtered-structured" + language = languages[i] print(f"Submitting jobs for experiment on language {language}") job_name = f"{language}-pii-redaction-idx_{i}" job = makejob( JOB_NAME=job_name, LANG=language, - TEXT_COL=args.text_column, + TEXT_COLUMN=args.text_column, ) # submit the job print(f"Job for lang {language} ready and saved at jobs/{job_name}.sbatch") diff --git a/pii/ner/pii_redaction/utils.py b/pii/ner/pii_redaction/utils.py index 6ecdd27..bd4ed42 100644 --- a/pii/ner/pii_redaction/utils.py +++ b/pii/ner/pii_redaction/utils.py @@ -120,6 +120,7 @@ def redact_pii_text(text, secrets, replacements, add_references=False): step = 0 last_text = text for secret in secrets: + # some post-processing if secret["tag"] in IGNORE or not is_secret(secret["value"]): continue if secret["tag"] == "IP_ADDRESS":
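After the final patch, the per-secret post-processing in `redact_pii_text` amounts to the following decision, condensed here for reference using the helpers defined in `utils.py` above:

```python
def keep_secret(secret) -> bool:
    # Mirrors the chain of `continue` checks applied before a span is redacted.
    tag, value = secret["tag"], secret["value"]
    if tag in IGNORE or not is_secret(value):
        return False  # AMBIGUOUS/USERNAME tags, or spans of 3 characters or fewer
    if tag == "IP_ADDRESS" and (is_secret_ip(value) or value in POPULAR_DNS_SERVERS):
        return False  # private/invalid IPs and well-known DNS servers stay as-is
    if tag == "KEY" and not is_key(value):
        return False  # keys must be long enough and look like gibberish
    if tag == "NAME" and not is_full_name(value):
        return False  # only full names are redacted, not bare first/last names
    return True
```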