diff --git a/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py b/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py index 35dca2b7e..a6d15d6ad 100644 --- a/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py +++ b/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py @@ -24,12 +24,9 @@ import pandas as pd import struct from collections import defaultdict -import dpk_rep_removal.utils import transformers from transformers import GPT2Tokenizer -run_in_OCP = True - #### Save the tokenizer in a local path to speed up the process #### Get tokenizer from the local path to speed up the process @@ -81,9 +78,6 @@ def load_pq_docs_once_avoidIO(pq_df, content_col, save_dir, dataset_name, tokeni global args_tokenize, encoded_docs, loaded_size args_tokenize = tokenize - pre_sep = b"\xff\xff" - post_sep = b"" - if not os.path.exists(save_dir): os.mkdir(save_dir) @@ -207,9 +201,3 @@ def extract_dup_per_doc_avoidIO_further(repeated_pairs): int(min(int(remove[ptr][1] - byte_start), byte_end - byte_start)) - 6)) ################## added -6 to exclude sep ptr += 1 - # print ('############# Number of duplicate made from two subsequent documents: ', count_between_docs) - # print ('############# Number of duplicate made from two subsequent documents: ', duplicate_between_docs) - - # df_dict = pd.DataFrame(remove_ex) - # print(remove_ex) - # return remove_ex \ No newline at end of file diff --git a/transforms/universal/rep_removal/dpk_rep_removal/runtime.py b/transforms/universal/rep_removal/dpk_rep_removal/runtime.py index 00f0fd193..da7a71497 100644 --- a/transforms/universal/rep_removal/dpk_rep_removal/runtime.py +++ b/transforms/universal/rep_removal/dpk_rep_removal/runtime.py @@ -55,16 +55,16 @@ def add_input_params(self, parser: ArgumentParser) -> None: ) parser.add_argument( "--rep_removal_length_thresh", - type=str, + type=int, required=False, - default="50", + default=50, help="Length threshold for processing", ) parser.add_argument( "--rep_removal_frequency_threshold", - type=str, + type=int, required=False, - default="1", + default=1, help="Frequency threshold for processing.", ) parser.add_argument( @@ -83,14 +83,14 @@ def add_input_params(self, parser: ArgumentParser) -> None: ) parser.add_argument( "--rep_removal_num_threads", - type=str, + type=int, required=False, - default=str(cpu_count(logical=False)), + default=cpu_count(logical=False), help="Value for number of threads to use for processing", ) parser.add_argument( "--rep_removal_num_cpus", - type=str, + type=int, required=False, default=cpu_count(logical=False), help="Value for number of cpus allocated for processing", diff --git a/transforms/universal/rep_removal/dpk_rep_removal/transform.py b/transforms/universal/rep_removal/dpk_rep_removal/transform.py index f67737764..a1118814f 100644 --- a/transforms/universal/rep_removal/dpk_rep_removal/transform.py +++ b/transforms/universal/rep_removal/dpk_rep_removal/transform.py @@ -13,7 +13,6 @@ import os import subprocess import tempfile -import datetime import pyarrow as pa import pandas as pd from dpk_rep_removal.dedup_pq_level import load_pq_docs_once_avoidIO, extract_dup_per_doc_avoidIO_further, save_deduped_pq_once @@ -32,12 +31,12 @@ def __init__(self, config: dict[str, Any]): self.contents_column_name = config.get("rep_removal_contents_column_name", "contents") self.dedup_level = config.get("rep_removal_dedup_level_name", "parquet") - self.length_thresh = config.get("rep_removal_length_thresh", str(50)) - self.frequency_threshold = config.get("rep_removal_frequency_threshold", str(1)) + self.length_thresh = str(config.get("rep_removal_length_thresh", 5)) + self.frequency_threshold = str(config.get("rep_removal_frequency_threshold", 1)) self.retain_first_copy = str(config.get("rep_removal_retain_first_copy", True)) self.tokenize = str(config.get("rep_removal_tokenize", True)) - self.num_threads = config.get("rep_removal_num_threads", str(cpu_count(logical=False))) - self.num_cpus = config.get("rep_removal_num_cpus", cpu_count(logical=False)) + self.num_threads = str(config.get("rep_removal_num_threads", cpu_count(logical=False))) + self.num_cpus = str(config.get("rep_removal_num_cpus", cpu_count(logical=False))) if self.retain_first_copy.lower() == 'false': self.retain_first_copy = False diff --git a/transforms/universal/rep_removal/rep_removal.ipynb b/transforms/universal/rep_removal/rep_removal.ipynb index 1c529765b..afaacff50 100644 --- a/transforms/universal/rep_removal/rep_removal.ipynb +++ b/transforms/universal/rep_removal/rep_removal.ipynb @@ -38,16 +38,6 @@ "set the $PATH to include `/Users/USERNAME/.cargo/bin/`" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "88527732-fcaf-4fac-9120-43c67dba76d3", - "metadata": {}, - "outputs": [], - "source": [ - "import os" - ] - }, { "cell_type": "code", "execution_count": null, @@ -66,6 +56,7 @@ "outputs": [], "source": [ "# set $PATH env to append the rust path\n", + "import os\n", "os.environ['PATH'] = os.environ['PATH'] + ':/OUTPUT/OF/WHEREIS/CARGO/UP/TO/BIN/'" ] }, @@ -105,7 +96,6 @@ "RepRemoval(input_folder= \"test-data/input\",\n", " output_folder= \"test-data/output\",\n", " rep_removal_contents_column_name='text', \n", - " rep_removal_num_threads='1',\n", " ).transform()" ] },