diff --git a/README.md b/README.md index 9d6462a..95ffe7e 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ necessary used for model training. - `preprocessing`: code for filtering code datasets based on: - line length and percentage of alphanumeric characters (basic filter) - number of stars, comments to code ratio, tokenizer fertility - - Additionnal filters used for StarCoder Training: + - Additional filters used for StarCoder Training: - basic-filter with parameters that depend on the file's extension. - filter to remove XML files - filter for HTML based on displayed-text VS code ratio diff --git a/pii/README.md b/pii/README.md index 63d1a20..fd5df5c 100644 --- a/pii/README.md +++ b/pii/README.md @@ -4,12 +4,12 @@ We provide code to detect Names, Emails, IP addresses, Passwords API/SSH keys in ## NER approach For the **NER** model based approach (e.g [StarPII](https://huggingface.co/bigcode/starpii)), please go to the `ner` folder. -We provide the code used for training a PII NER model to detect : Names, Emails, Keys, Passwords & IP addresses (more details in our paper: [StarCoder: May The Source Be With You](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view)). You will also find the code (and `slurm` scripts) used for running PII Inference on [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), we were able to detect PII in 800GB of text in 800 GPU-hours on A100 80GB. To replace secrets we used teh following tokens: +We provide the code used for training a PII NER model to detect : Names, Emails, Keys, Passwords & IP addresses (more details in our paper: [StarCoder: May The Source Be With You](https://drive.google.com/file/d/1cN-b9GnWtHzQRoE7M7gAEyivY0kl4BYs/view)). You will also find the code (and `slurm` scripts) used for running PII Inference on [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), we were able to detect PII in 800GB of text in 800 GPU-hours on A100 80GB. 
To replace secrets we used the following tokens: `, , , ` To mask IP addresses, we randomly selected an IP address from 5~synthetic, private, non-internet-facing IP addresses of the same type. ## Regex approach -Below we explain the regex based approach to dectect Emails, IP addresses adn keys only: +Below we explain the regex based approach to detect Emails, IP addresses and keys only: We use regexes for emails and IP addresses (they are adapted from [BigScience PII pipeline](https://github.com/bigscience-workshop/data-preparation/tree/main/preprocessing/training/02_pii)). And we use [detect-secrets](https://github.com/Yelp/detect-secrets) for finding secrets keys. We additionally implement some filters on top to reduce the number of false positives. There is also some evaluation code to test the pipeline on a PII benchmark we annotated. ## Usage of the regex approach diff --git a/pii/main.py b/pii/main.py index c253d67..661a164 100644 --- a/pii/main.py +++ b/pii/main.py @@ -182,7 +182,7 @@ def main(): 'max_forks_repo_path', 'max_forks_repo_name', 'max_forks_repo_head_hexsha', \ 'max_forks_repo_licenses', 'max_forks_count', 'max_forks_repo_forks_event_min_datetime', 'max_forks_repo_forks_event_max_datetime'] ds = ds.remove_columns(columns) - logger.info(f"New dataset fomat: {ds}") + logger.info(f"New dataset format: {ds}") # add id column to dataset logger.info(f" ===== Adding an index column =====") ds = ds.add_column("index", list(range(len(ds)))) diff --git a/pii/ner/pii_inference/notebooks/Pipeline with sliding-window.ipynb b/pii/ner/pii_inference/notebooks/Pipeline with sliding-window.ipynb index 0ca86ed..6a9dce9 100644 --- a/pii/ner/pii_inference/notebooks/Pipeline with sliding-window.ipynb +++ b/pii/ner/pii_inference/notebooks/Pipeline with sliding-window.ipynb @@ -296,8 +296,8 @@ "id": "33a72a0a", "metadata": {}, "source": [ - "### 5 seconds out 12 (41% of time) consumed on transfering from cpu to cuda \n", - "### 7 seconds out 12 (58% of time) consumed on 
transfering from cuda to cpu \n", + "### 5 seconds out 12 (41% of time) consumed on transferring from cpu to cuda \n", + "### 7 seconds out 12 (58% of time) consumed on transferring from cuda to cpu \n", "### so, most of the time consumption is just a bus bottleneck =/" ] } diff --git a/pii/ner/pii_inference/notebooks/Train DeBERTa-v3-base on pseudo-labeled data.ipynb b/pii/ner/pii_inference/notebooks/Train DeBERTa-v3-base on pseudo-labeled data.ipynb index 4508cfc..e824a33 100644 --- a/pii/ner/pii_inference/notebooks/Train DeBERTa-v3-base on pseudo-labeled data.ipynb +++ b/pii/ner/pii_inference/notebooks/Train DeBERTa-v3-base on pseudo-labeled data.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### The fisrt stage of training \n", + "### The first stage of training \n", "The initial training on pseudo-labeled data " ] }, diff --git a/pii/ner/pii_redaction/manual_sharding.py b/pii/ner/pii_redaction/manual_sharding.py index 8e9f894..e2ec08c 100644 --- a/pii/ner/pii_redaction/manual_sharding.py +++ b/pii/ner/pii_redaction/manual_sharding.py @@ -27,7 +27,7 @@ def save_manual_shards( # this will create a folder OUT_PATH that is a clone of REMOTE_DATASET_REPO # you can save the shards inside it and do git add/commit/push to push data to the hub out_path = remote_dataset_repo if local_dir is None else local_dir - # if out path doesnt already exist + # if out path doesn't already exist if not os.path.exists(out_path): repo = Repository( local_dir=out_path, diff --git a/pii/ner/pii_redaction/utils.py b/pii/ner/pii_redaction/utils.py index bd4ed42..ca5bf20 100644 --- a/pii/ner/pii_redaction/utils.py +++ b/pii/ner/pii_redaction/utils.py @@ -106,7 +106,7 @@ def redact_pii_text(text, secrets, replacements, add_references=False): secrets (list): list with the secrets to redact replacements (dict): dictionary of replacements for each PII type add_references (bool): whether to add references to the redacted text (delimiters to PII) - for 
vizualization + for visualization Returns: text (str): new text with redacted secrets """ diff --git a/pii/ner/pii_train_ner/README.md b/pii/ner/pii_train_ner/README.md index 1a97624..66ef172 100644 --- a/pii/ner/pii_train_ner/README.md +++ b/pii/ner/pii_train_ner/README.md @@ -1,6 +1,6 @@ # Fine-tuning StarEncoder on an NER task for PII detection -To run the training on an annotated PII dataset (`bigcode/pii-full-ds` in our case, you might need to adpat the code to fit your dataset), use the following command: +To run the training on an annotated PII dataset (`bigcode/pii-full-ds` in our case, you might need to adapt the code to fit your dataset), use the following command: ```bash python -m torch.distributed.launch \ --nproc_per_node number_of_gpus train.py \ diff --git a/pii/ner/pii_train_ner/train.py b/pii/ner/pii_train_ner/train.py index 4e764cc..0ba7830 100644 --- a/pii/ner/pii_train_ner/train.py +++ b/pii/ner/pii_train_ner/train.py @@ -220,7 +220,7 @@ def main(args): valid_stats = get_stats(valid_data) test_stats = get_stats(test_data) print("Train low-resource stats") - # print stats for keys with less than 100 in teh value + # print stats for keys with less than 100 in the value pprint({k: v for k, v in train_stats.items() if v < 300}) print("Valid low-resource stats") pprint({k: v for k, v in valid_stats.items() if v < 100}) diff --git a/pii/notebooks/analysis_detect_secrets_tool.ipynb b/pii/notebooks/analysis_detect_secrets_tool.ipynb index 82e9558..d6ffe90 100644 --- a/pii/notebooks/analysis_detect_secrets_tool.ipynb +++ b/pii/notebooks/analysis_detect_secrets_tool.ipynb @@ -1103,7 +1103,7 @@ "id": "f8a18b9a", "metadata": {}, "source": [ - "Conclusion: Gibberish detector can filter the false postives of this detector" + "Conclusion: Gibberish detector can filter the false positives of this detector" ] }, { diff --git a/pii/notebooks/data_prefiltering_lightag.ipynb b/pii/notebooks/data_prefiltering_lightag.ipynb index 42e6532..45b8865 100644 --- 
a/pii/notebooks/data_prefiltering_lightag.ipynb +++ b/pii/notebooks/data_prefiltering_lightag.ipynb @@ -1012,7 +1012,7 @@ " for j in range(len(labels[\"examples\"][i][\"annotations\"])):\n", " tags.append(labels[\"examples\"][i][\"annotations\"][j][\"tag\"])\n", "Counter(tags)\n", - "# plot distibution of tags\n", + "# plot distribution of tags\n", "import matplotlib.pyplot as plt\n", "plt.bar(Counter(tags).keys(), Counter(tags).values())\n", "# add title to the plot saying tags from 180 annotated files\n", diff --git a/pii/notebooks/early_qualitative_analysis.ipynb b/pii/notebooks/early_qualitative_analysis.ipynb index 0e9df5d..da436fe 100644 --- a/pii/notebooks/early_qualitative_analysis.ipynb +++ b/pii/notebooks/early_qualitative_analysis.ipynb @@ -23,7 +23,7 @@ " * Some hashes are not labelled as gibberish by the gibberish detector(=> not filtered), not sure if they are really secrets, for an example see `./experiments/file_with_hashes.txt` (some other hashes -from that file- are filtered though)\n", " * There are still some false positives like name/path (labeled as gibberish) in this format \"e2e_mask_rcnn_X-152-32x8d-FPN-IN5k_1.44x\" and \"//deno.land/std@0.142.0/testing/asserts.ts\"\n", " * If there is an \"=\" or \"id=\" in front of the key it is detected\n", - " * Some instances like \"f47dbc9c:\" and \"dc22a3aa:\" are detected, tehy seems like ids of patch releases, their context is saved in `./experiments/short_keys_patch_releases.txt`\n", + " * Some instances like \"f47dbc9c:\" and \"dc22a3aa:\" are detected, they seem like ids of patch releases, their context is saved in `./experiments/short_keys_patch_releases.txt`\n", " * You can check all detected keys by looking for 'KEY' tags in `./experiments/list_detected_pii.txt` \n", "* TODO: get precision numbers and try adding more filters (from detect-secrets fore example)\n", "2.
For email detection:\n", @@ -374,15 +374,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "nb dectected by detect-secrets: 2\n", - "nb dectected by regexes: 21\n", + "nb detected by detect-secrets: 2\n", + "nb detected by regexes: 21\n", "number true API keys: 30\n" ] } ], "source": [ - "print(f\"nb dectected by detect-secrets: {detect_secrets_nb}\")\n", - "print(f\"nb dectected by regexes: {regexes_nb}\")\n", + "print(f\"nb detected by detect-secrets: {detect_secrets_nb}\")\n", + "print(f\"nb detected by regexes: {regexes_nb}\")\n", "print(f\"number true API keys: {len(api_keys_clean)}\")" ] }, @@ -417,7 +417,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "detect-secrets has a very low recall: 2 out of 30, let's anlyze the regex detections" + "detect-secrets has a very low recall: 2 out of 30, let's analyze the regex detections" ] }, { @@ -532,15 +532,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "nb dectected by detect-secrets: 0\n", - "nb dectected by regexes: 7\n", + "nb detected by detect-secrets: 0\n", + "nb detected by regexes: 7\n", "number true ssh keys: 7\n" ] } ], "source": [ - "print(f\"nb dectected by detect-secrets: {detect_secrets_nb}\")\n", - "print(f\"nb dectected by regexes: {regexes_nb}\")\n", + "print(f\"nb detected by detect-secrets: {detect_secrets_nb}\")\n", + "print(f\"nb detected by regexes: {regexes_nb}\")\n", "print(f\"number true ssh keys: {len(ssh_keys_clean)}\")" ] }, @@ -782,7 +782,7 @@ "source": [ "### Email detection\n", "\n", - "* our current regex detects many false positives taht are derivatives of: dusk-network/helpers@4.6.12\n", + "* our current regex detects many false positives that are derivatives of: dusk-network/helpers@4.6.12\n", "* bigscience updated regex: can't detect emails well when they are in this format: and also labels dusk-network/helpers@4.6.12 as emails, see https://regex101.com/r/LNwpG1/1" ] }, @@ -895,7 +895,7 @@ "metadata": {}, "outputs": [], "source": [ - "# fiw an 
issue with this annotation\n", + "# fix an issue with this annotation\n", "ds_emails[142] = ds_emails[142][:-1]\n", "ds_emails[108] = ds_emails[108].strip()" ] @@ -953,15 +953,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "nb emails dectected by old regex: 170\n", - "nb emails dectected by new BS regex: 169\n", + "nb emails detected by old regex: 170\n", + "nb emails detected by new BS regex: 169\n", "number true EMAILS: 170\n" ] } ], "source": [ - "print(f\"nb emails dectected by old regex: {old_regex_nb}\")\n", - "print(f\"nb emails dectected by new BS regex: {new_regex_nb}\")\n", + "print(f\"nb emails detected by old regex: {old_regex_nb}\")\n", + "print(f\"nb emails detected by new BS regex: {new_regex_nb}\")\n", "print(f\"number true EMAILS: {len(ds_emails)}\")" ] }, @@ -1065,8 +1065,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "nb emails dectected by old regex: 7\n", - "nb emails dectected by new BS regex: 2\n", + "nb emails detected by old regex: 7\n", + "nb emails detected by new BS regex: 2\n", "number true EMAILS: 7\n" ] } @@ -1088,8 +1088,8 @@ " new_regex_nb += 1\n", " new_regex_results.append(output_2)\n", "\n", - "print(f\"nb emails dectected by old regex: {old_regex_nb}\")\n", - "print(f\"nb emails dectected by new BS regex: {new_regex_nb}\")\n", + "print(f\"nb emails detected by old regex: {old_regex_nb}\")\n", + "print(f\"nb emails detected by new BS regex: {new_regex_nb}\")\n", "print(f\"number true EMAILS: {len(ds_emails[:7])}\")" ] }, @@ -1147,7 +1147,7 @@ "source": [ "def compare_intervals(ref_intervals, pred_intervals):\n", " \"\"\"Compare two lists of intervals and return the number of true positives, false positives and false negatives\n", - " authur : @copilot\n", + " author : @copilot\n", " \"\"\"\n", " ref_intervals = sorted(ref_intervals, key=lambda x: x[0])\n", " pred_intervals = sorted(pred_intervals, key=lambda x: x[0])\n", diff --git a/pii/notebooks/initial_analysis_regexes.ipynb
b/pii/notebooks/initial_analysis_regexes.ipynb index 020c7af..f449c53 100644 --- a/pii/notebooks/initial_analysis_regexes.ipynb +++ b/pii/notebooks/initial_analysis_regexes.ipynb @@ -991,9 +991,9 @@ "metadata": {}, "source": [ "In this section we test Presidio for the detection of the entities [\"EMAIL_ADDRESS\",\"PERSON\", \"CREDIT_CARD\", \"IP_ADDRESS\", \"IBAN_CODE\"]\n", - "* We first apply it on the code files then on their dosctrings/comments only\n", + "* We first apply it on the code files then on their docstrings/comments only\n", "\n", - "* Even with a high threshold on prediction score (0.8) the **predictiosn are bad**. The underhood NER must have trouble infering context from technical text\n", + "* Even with a high threshold on prediction score (0.8) the **predictions are bad**. The underhood NER must have trouble inferring context from technical text\n", "which often include python keywords and code" ] }, diff --git a/pii/notebooks/postprocess_annotations_lightag.ipynb b/pii/notebooks/postprocess_annotations_lightag.ipynb index 6bccd43..cabc246 100644 --- a/pii/notebooks/postprocess_annotations_lightag.ipynb +++ b/pii/notebooks/postprocess_annotations_lightag.ipynb @@ -91,7 +91,7 @@ " 'archived': False,\n", " 'priority': 1,\n", " 'active': True,\n", - " 'guidelines': '## Task Overview\\n\\nWelcome to our annotation task. In this task we\\'ll present you with one code file at a time and ask you to tag specific entities. We\\'ll be using this data to evaluate PII detection tools on source code from different programming languages. \\n\\n1. Please highlight the entire span for each tags where applicable. For example: For tag `NAME`, if the text presented has John Doe, please highlight John Doe as one span, instead of highlighting John and Doe separately.\\n2. If you think a word that should be highlighted, but unsure about which tag to go to, use `AMBIGUOUS` instead.\\n3. **Do not overlap** tags. Tag the one most applicable to the entire span. 
For example, if a person\\'s name is part of `EMAIL`, do not tag `NAME`.\\n\\n## Tags Guidelines\\n\\nFor each file, please highlight you find any corresponding to the following tags: \\n\\n### API_KEY\\n\\nIncluding API Keys or API Tokens, Bearer Tokens, OAuth Tokens, see [here](https://www.freecodecamp.org/news/best-practices-for-building-api-keys-97c26eabfea9/#:~:text=the%20right%20way.-,API%20Key%20Generation,-Since%20the%20API) for example. Please highlight only the actual key.\\n\\n### AMBIGUOUS\\n\\nIf unsure whether to highlight a text, or which tag to highlight a text for, tag as `AMBIGUOUS` to allow reviewers to take a look later.\\n\\n### EMAIL\\n\\nEmail address, including generic emails such as support@organization.com.\\n\\n### IP_ADDRESS\\n\\nIP addresses could come in two different formats, see [IPv4 and IPv6 address formats](https://www.ibm.com/docs/en/ts3500-tape-library?topic=functionality-ipv4-ipv6-address-formats). Please tag both formats. If unsure a span is an IP address or not, tag `AMBIGUOUS` instead.\\n\\n### NAME\\n\\nA person\\'s name, including just first name or last name only. Do not include title used. For example, if \"Ms. Doe\" is used, please highlight \"Doe\" only.\\nFor now we are not tagging usernames.\\n\\n### PASSWORD\\n\\nAny authentication credentials not applicable to SSH keys or API keys. If unsure, tag `AMBIGUOUS`.\\n\\n### SSH_KEY\\nSecure Shell Key, the output usually looks like `ssh-rsa public_key account`, see example [here](https://git-scm.com/book/en/v2/Git-on-the-Server-Generating-Your-SSH-Public-Key). Please highlight the entire public key span. \\n\\n### USERNAME\\n\\nAny username that is used, including for credentials or handles (such as GitHub handles). 
If username is an email, tag `EMAIL` instead.\\n\\n## Additional note: Class CONTAINS_NON_ENGLISH\\n\\nIf you find a file with comments in a natural language you don\\'t undertsand and can\\'t decide if it includes PII, label it with the class `CONTAINS_NON_ENGLISH`.',\n", + " 'guidelines': '## Task Overview\\n\\nWelcome to our annotation task. In this task we\\'ll present you with one code file at a time and ask you to tag specific entities. We\\'ll be using this data to evaluate PII detection tools on source code from different programming languages. \\n\\n1. Please highlight the entire span for each tags where applicable. For example: For tag `NAME`, if the text presented has John Doe, please highlight John Doe as one span, instead of highlighting John and Doe separately.\\n2. If you think a word that should be highlighted, but unsure about which tag to go to, use `AMBIGUOUS` instead.\\n3. **Do not overlap** tags. Tag the one most applicable to the entire span. For example, if a person\\'s name is part of `EMAIL`, do not tag `NAME`.\\n\\n## Tags Guidelines\\n\\nFor each file, please highlight you find any corresponding to the following tags: \\n\\n### API_KEY\\n\\nIncluding API Keys or API Tokens, Bearer Tokens, OAuth Tokens, see [here](https://www.freecodecamp.org/news/best-practices-for-building-api-keys-97c26eabfea9/#:~:text=the%20right%20way.-,API%20Key%20Generation,-Since%20the%20API) for example. Please highlight only the actual key.\\n\\n### AMBIGUOUS\\n\\nIf unsure whether to highlight a text, or which tag to highlight a text for, tag as `AMBIGUOUS` to allow reviewers to take a look later.\\n\\n### EMAIL\\n\\nEmail address, including generic emails such as support@organization.com.\\n\\n### IP_ADDRESS\\n\\nIP addresses could come in two different formats, see [IPv4 and IPv6 address formats](https://www.ibm.com/docs/en/ts3500-tape-library?topic=functionality-ipv4-ipv6-address-formats). Please tag both formats. 
If unsure a span is an IP address or not, tag `AMBIGUOUS` instead.\\n\\n### NAME\\n\\nA person\\'s name, including just first name or last name only. Do not include title used. For example, if \"Ms. Doe\" is used, please highlight \"Doe\" only.\\nFor now we are not tagging usernames.\\n\\n### PASSWORD\\n\\nAny authentication credentials not applicable to SSH keys or API keys. If unsure, tag `AMBIGUOUS`.\\n\\n### SSH_KEY\\nSecure Shell Key, the output usually looks like `ssh-rsa public_key account`, see example [here](https://git-scm.com/book/en/v2/Git-on-the-Server-Generating-Your-SSH-Public-Key). Please highlight the entire public key span. \\n\\n### USERNAME\\n\\nAny username that is used, including for credentials or handles (such as GitHub handles). If username is an email, tag `EMAIL` instead.\\n\\n## Additional note: Class CONTAINS_NON_ENGLISH\\n\\nIf you find a file with comments in a natural language you don\\'t understand and can\\'t decide if it includes PII, label it with the class `CONTAINS_NON_ENGLISH`.',\n", " 'schema_id': '33a5ef29-c22f-4e64-8fee-6019dde1c76d',\n", " 'dataset_id': '5ed27353-49a7-42e9-bc84-81e15f3f4162',\n", " 'project_id': '87c47b2c-d503-4967-b314-c04d1e7f8be7',\n", @@ -788,7 +788,7 @@ "metadata": {}, "outputs": [], "source": [ - "# remove file at index 11 many ambigous keys\n", + "# remove file at index 11 many ambiguous keys\n", "# remove file 51 many incorrect names" ] }, @@ -3155,7 +3155,7 @@ " counters_sanity_check.append(1)\n", "\n", " def test_rules_priority_forwarded(self, setup, direction, ptfadapter, counters_sanity_check, ip_version):\n", - " \"\"\"Verify that we respect rule priorites in the forwarding case.\"\"\"\n", + " \"\"\"Verify that we respect rule priorities in the forwarding case.\"\"\"\n", " src_ip = \"20.0.0.7\" if ip_version == \"ipv4\" else \"60c0:a800::7\"\n", " pkt = self.tcp_packet(setup, direction, ptfadapter, ip_version, src_ip=src_ip)\n", "\n", @@ -3163,7 +3163,7 @@ " 
counters_sanity_check.append(20)\n", "\n", " def test_rules_priority_dropped(self, setup, direction, ptfadapter, counters_sanity_check, ip_version):\n", - " \"\"\"Verify that we respect rule priorites in the drop case.\"\"\"\n", + " \"\"\"Verify that we respect rule priorities in the drop case.\"\"\"\n", " src_ip = \"20.0.0.3\" if ip_version == \"ipv4\" else \"60c0:a800::4\"\n", " pkt = self.tcp_packet(setup, direction, ptfadapter, ip_version, src_ip=src_ip)\n", "\n", diff --git a/pii/notebooks/quantitative_evaluation_regexes.ipynb b/pii/notebooks/quantitative_evaluation_regexes.ipynb index 91bb97f..420e51b 100644 --- a/pii/notebooks/quantitative_evaluation_regexes.ipynb +++ b/pii/notebooks/quantitative_evaluation_regexes.ipynb @@ -159,7 +159,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The following data is used to build a vizualization space" + "The following data is used to build a visualization space" ] }, { @@ -181,7 +181,7 @@ " # sort elements inside FN by start index\n", " details[\"FN\"] = sorted(details[\"FN\"], key=lambda x: x[\"start\"])\n", " print(f\"not skipped {i}\")\n", - " # add content to each detection while hightlighting detected words\n", + " # add content to each detection while highlighting detected words\n", " subparts = []\n", " advance = 0\n", " for detection in details[\"FN\"]:\n", @@ -208,7 +208,7 @@ " # sort elements inside FP by start index\n", " details[\"FP\"] = sorted(details[\"FP\"], key=lambda x: x[\"start\"])\n", " print(f\"not skipped {i}\")\n", - " # add content to each detection while hightlighting detected words\n", + " # add content to each detection while highlighting detected words\n", " subparts = []\n", " advance = 0\n", " for detection in details[\"FP\"]:\n", diff --git a/pii/notebooks/statistics_pii_dataset.ipynb b/pii/notebooks/statistics_pii_dataset.ipynb index f55fdf5..df9f526 100644 --- a/pii/notebooks/statistics_pii_dataset.ipynb +++ b/pii/notebooks/statistics_pii_dataset.ipynb @@ -249,7 +249,7 @@ 
} ], "source": [ - "# plot distibution of tags\n", + "# plot distribution of tags\n", "import matplotlib.pyplot as plt\n", "# use ggplot style defaults and set the default figure size\n", "plt.style.use('ggplot')\n", diff --git a/pii/pii_redaction.py b/pii/pii_redaction.py index 059edb0..ecbbd74 100644 --- a/pii/pii_redaction.py +++ b/pii/pii_redaction.py @@ -16,7 +16,7 @@ ], } -# providergs = ["google", "cloudfare", "alternate-dns", "quad9","open-dns", "comodo", "adguard"] +# providergs = ["google", "cloudflare", "alternate-dns", "quad9","open-dns", "comodo", "adguard"] POPULAR_DNS_SERVERS = [ "8.8.8.8", "8.8.4.4", diff --git a/preprocessing/README.md b/preprocessing/README.md index dce1f39..ddd9f37 100644 --- a/preprocessing/README.md +++ b/preprocessing/README.md @@ -9,7 +9,7 @@ Three filters for the preprocessing of The Stack are available: * We compute the comment to code ratio of a file by counting the number of characters in comments over the total number of characters in the file. * **fertility**: filter based on the character to token ratio after calling the tokenizer on the code file. Different thresholds for Python (2.5), Java (2.9) and JavaScript (2.6) for data after near-dedup + basic filtering & PII redaction. -* Additionnal filters used for StarCoder Training [StarCodeData](https://huggingface.co/datasets/bigcode/starcoderdata): +* Additional filters used for StarCoder Training [StarCodeData](https://huggingface.co/datasets/bigcode/starcoderdata): - basic-filter with parameters that depend on the file's extension. 
- filter to remove XML files - filter for HTML based on displayed-text VS code ratio diff --git a/preprocessing/arguments.py b/preprocessing/arguments.py index ea0419d..d02b11e 100644 --- a/preprocessing/arguments.py +++ b/preprocessing/arguments.py @@ -79,7 +79,7 @@ class FilteringArguments: ) out_path: Optional[str] = field( default=None, - metadata={"help": "Local path to save the ouptut dataset."}, + metadata={"help": "Local path to save the output dataset."}, ) log_file: Optional[str] = field( default="filtering.log", @@ -107,7 +107,7 @@ class ContentWithMetaArguments: ) split: Optional[str] = field( default="train", - metadata={"help": "Datasset split to process."}, + metadata={"help": "Dataset split to process."}, ) add_repo_name_prob: float = field( default=.2, @@ -143,7 +143,7 @@ class ContentWithMetaArguments: ) out_path: Optional[str] = field( default=None, - metadata={"help": "Local path to save the ouptut dataset."}, + metadata={"help": "Local path to save the output dataset."}, ) log_file: Optional[str] = field( default="filtering.log", diff --git a/preprocessing/jupyter-structured/jupyter-generate-triplets.py b/preprocessing/jupyter-structured/jupyter-generate-triplets.py index 79de712..41d994f 100644 --- a/preprocessing/jupyter-structured/jupyter-generate-triplets.py +++ b/preprocessing/jupyter-structured/jupyter-generate-triplets.py @@ -27,7 +27,7 @@ def parse_data(ds): types = types[1:] #else: # drop first the two cells of markdown followed by code - # the first markown cell of a notebook is often a long description of the whole notebook + # the first markdown cell of a notebook is often a long description of the whole notebook # cells = notebooks["cells"][2:] # types = notebooks["types"][2:] if len(types)>0: diff --git a/preprocessing/jupyter-structured/jupyter-segment-notebooks.py b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py index 2e9f4e2..a7dfb6b 100644 --- a/preprocessing/jupyter-structured/jupyter-segment-notebooks.py +++ 
b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py @@ -59,7 +59,7 @@ def segment(batch): dataset = load_dataset("bigcode/the-stack",data_dir="data/jupyter-notebook", split="train",use_auth_token=True) # segment notebooks dataset = dataset.map(segment) - # filter out erronous cells via placeholders + # filter out erroneous cells via placeholders dataset = dataset.filter(lambda entry: entry['cell_types']!=['empty']) # push to hub dataset.push_to_hub("bigcode/jupyter-parsed") \ No newline at end of file diff --git a/preprocessing/statistics.py b/preprocessing/statistics.py index f55f7e0..d0f016f 100644 --- a/preprocessing/statistics.py +++ b/preprocessing/statistics.py @@ -83,7 +83,7 @@ def get_unrecognized_ext(log_file: str): m = re.search(pattern, line) if m is not None: res.append(m.group(2)) - # Count each occurence + # Count each occurrence res = dict(Counter(res)) return res diff --git a/preprocessing/utils/manual_sharding.py b/preprocessing/utils/manual_sharding.py index 2d1f875..48e453a 100644 --- a/preprocessing/utils/manual_sharding.py +++ b/preprocessing/utils/manual_sharding.py @@ -26,7 +26,7 @@ def save_manual_shards(ds, user="loubnabnl", remote_dataset_repo="bigcode-pii-pj # you can save the shards inside it and do git add/commit/push to push data to the hub out_path = remote_dataset_repo if out_path is None else out_path - # if out path doesnt already exist + # if out path doesn't already exist if not os.path.exists(out_path): repo = Repository( local_dir=out_path, diff --git a/preprocessing/utils/utils_issues.py b/preprocessing/utils/utils_issues.py index 1000045..ce94f64 100644 --- a/preprocessing/utils/utils_issues.py +++ b/preprocessing/utils/utils_issues.py @@ -125,7 +125,7 @@ def strip_automated_email_text(example): def truncate_long_comments(example, max_lines=80): - """Truncates long comments in the middle (we keep teh last 20 lnes)""" + """Truncates long comments in the middle (we keep the last 20 lines)""" for event in
example["events"]: lines = event["text"].split("\n") nb_lines = len(lines)