From 4aaaf8a2a65fb07842f379fe1ea4879b7313f20b Mon Sep 17 00:00:00 2001 From: loubnabnl Date: Mon, 27 Mar 2023 11:58:03 +0000 Subject: [PATCH] update code --- pii/ner/pii_redaction/README.md | 3 ++- pii/ner/pii_redaction/main_redact.py | 16 ++++------------ 2 files changed, 6 insertions(+), 13 deletions(-) diff --git a/pii/ner/pii_redaction/README.md b/pii/ner/pii_redaction/README.md index 4fd1c28..60240f8 100644 --- a/pii/ner/pii_redaction/README.md +++ b/pii/ner/pii_redaction/README.md @@ -1,5 +1,6 @@ # PII redaction ```bash -python main_redact.py --dataset_name /fsx/leandro/data/pii_result/ada --target_dataset ada-no-pii --save_path_disk ada-no-pii-local +LANG=python +python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local ``` \ No newline at end of file diff --git a/pii/ner/pii_redaction/main_redact.py b/pii/ner/pii_redaction/main_redact.py index aae3e1e..0a5be9e 100644 --- a/pii/ner/pii_redaction/main_redact.py +++ b/pii/ner/pii_redaction/main_redact.py @@ -30,12 +30,6 @@ def parseArgs(): type=int, help="Number of processes to use for loading the dataset", ) - parser.add_argument( - "--lang", - default="ada", - type=str, - help="Language to redact PII in.", - ) parser.add_argument( "--text_column", default="content", @@ -78,7 +72,7 @@ def parseArgs(): ) parser.add_argument( "--add_reference_text", - default=False, + default=True, type=bool, help="If True we add the reference text with PII between delimiters \ in the redacted text -used for visualization-", @@ -162,15 +156,15 @@ def check_uniques(example, uniques): def main(): set_verbosity_info() + args = parseArgs() logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) logging.basicConfig( format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, - handlers=[logging.FileHandler("pii.log"), logging.StreamHandler()], + handlers=[logging.FileHandler(f"logs/pii-{args.dataset_name.split('/')[-1]}.log"), logging.StreamHandler()], ) - args = parseArgs() logger.info( f"** The job is running with the following arguments: **\n{args}\n **** " ) @@ -193,9 +187,7 @@ def main(): logger.info(f"Dataset:\n{ds}") # Deduplicate data and apply heuristics t_start = time.time() - ds_pii = ds.filter( - check_uniques, fn_kwargs={"uniques": uniques}, num_proc=args.num_proc - ) + ds_pii = ds.filter(check_uniques, fn_kwargs={"uniques": uniques}) logger.info(f"Time to filter dataset: {time.time()-t_start:.2f}") logger.info(f"Dataset after dedup:\n{ds_pii}")