Skip to content

Commit

Permalink
update code
Browse files (browse the repository at this point in the history)
  • Loading branch information
loubnabnl committed Aug 24, 2023
1 parent c04356b commit 4aaaf8a
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 13 deletions.
3 changes: 2 additions & 1 deletion pii/ner/pii_redaction/README.md
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
# PII redaction

```bash
python main_redact.py --dataset_name /fsx/leandro/data/pii_result/ada --target_dataset ada-no-pii --save_path_disk ada-no-pii-local
LANG=python
python main_redact.py --dataset_name /fsx/leandro/data/pii_result/$LANG --target_dataset $LANG-no-pii --save_path_disk $LANG-no-pii-local
```
16 changes: 4 additions & 12 deletions pii/ner/pii_redaction/main_redact.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -30,12 +30,6 @@ def parseArgs():
type=int,
help="Number of processes to use for loading the dataset",
)
parser.add_argument(
"--lang",
default="ada",
type=str,
help="Language to redact PII in.",
)
parser.add_argument(
"--text_column",
default="content",
Expand Down Expand Up @@ -78,7 +72,7 @@ def parseArgs():
)
parser.add_argument(
"--add_reference_text",
default=False,
default=True,
type=bool,
help="If True we add the reference text with PII between delimiters \
in the redacted text -used for visualization-",
Expand Down Expand Up @@ -162,15 +156,15 @@ def check_uniques(example, uniques):

def main():
set_verbosity_info()
args = parseArgs()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
datefmt="%m/%d/%Y %H:%M:%S",
level=logging.INFO,
handlers=[logging.FileHandler("pii.log"), logging.StreamHandler()],
handlers=[logging.FileHandler(f"logs/pii-{args.dataset_name.split('/')[-1]}.log"), logging.StreamHandler()],
)
args = parseArgs()
logger.info(
f"** The job is running with the following arguments: **\n{args}\n **** "
)
Expand All @@ -193,9 +187,7 @@ def main():
logger.info(f"Dataset:\n{ds}")
# Deduplicate data and apply heuristics
t_start = time.time()
ds_pii = ds.filter(
check_uniques, fn_kwargs={"uniques": uniques}, num_proc=args.num_proc
)
ds_pii = ds.filter(check_uniques, fn_kwargs={"uniques": uniques})
logger.info(f"Time to filter dataset: {time.time()-t_start:.2f}")
logger.info(f"Dataset after dedup:\n{ds_pii}")

Expand Down

0 comments on commit 4aaaf8a

Please sign in to comment.