Skip to content

Commit

Permalink
updated per PR comments
Browse files: browse the repository at this point in the history
  • Loading branch information
Shalisha Witherspoon committed Feb 6, 2025
1 parent b68a6a1 commit 46097d5
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 35 deletions.
12 changes: 0 additions & 12 deletions transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,9 @@
import pandas as pd
import struct
from collections import defaultdict
import dpk_rep_removal.utils
import transformers
from transformers import GPT2Tokenizer

run_in_OCP = True

#### Save the tokenizer in a local path to speed up the process
#### Get tokenizer from the local path to speed up the process

Expand Down Expand Up @@ -81,9 +78,6 @@ def load_pq_docs_once_avoidIO(pq_df, content_col, save_dir, dataset_name, tokeni
global args_tokenize, encoded_docs, loaded_size
args_tokenize = tokenize

pre_sep = b"\xff\xff"
post_sep = b""

if not os.path.exists(save_dir):
os.mkdir(save_dir)

Expand Down Expand Up @@ -207,9 +201,3 @@ def extract_dup_per_doc_avoidIO_further(repeated_pairs):
int(min(int(remove[ptr][1] - byte_start),
byte_end - byte_start)) - 6)) ################## added -6 to exclude sep
ptr += 1
# print ('############# Number of duplicate made from two subsequent documents: ', count_between_docs)
# print ('############# Number of duplicate made from two subsequent documents: ', duplicate_between_docs)

# df_dict = pd.DataFrame(remove_ex)
# print(remove_ex)
# return remove_ex
14 changes: 7 additions & 7 deletions transforms/universal/rep_removal/dpk_rep_removal/runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,16 @@ def add_input_params(self, parser: ArgumentParser) -> None:
)
parser.add_argument(
"--rep_removal_length_thresh",
type=str,
type=int,
required=False,
default="50",
default=50,
help="Length threshold for processing",
)
parser.add_argument(
"--rep_removal_frequency_threshold",
type=str,
type=int,
required=False,
default="1",
default=1,
help="Frequency threshold for processing.",
)
parser.add_argument(
Expand All @@ -83,14 +83,14 @@ def add_input_params(self, parser: ArgumentParser) -> None:
)
parser.add_argument(
"--rep_removal_num_threads",
type=str,
type=int,
required=False,
default=str(cpu_count(logical=False)),
default=cpu_count(logical=False),
help="Value for number of threads to use for processing",
)
parser.add_argument(
"--rep_removal_num_cpus",
type=str,
type=int,
required=False,
default=cpu_count(logical=False),
help="Value for number of cpus allocated for processing",
Expand Down
9 changes: 4 additions & 5 deletions transforms/universal/rep_removal/dpk_rep_removal/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
import os
import subprocess
import tempfile
import datetime
import pyarrow as pa
import pandas as pd
from dpk_rep_removal.dedup_pq_level import load_pq_docs_once_avoidIO, extract_dup_per_doc_avoidIO_further, save_deduped_pq_once
Expand All @@ -32,12 +31,12 @@ def __init__(self, config: dict[str, Any]):

self.contents_column_name = config.get("rep_removal_contents_column_name", "contents")
self.dedup_level = config.get("rep_removal_dedup_level_name", "parquet")
self.length_thresh = config.get("rep_removal_length_thresh", str(50))
self.frequency_threshold = config.get("rep_removal_frequency_threshold", str(1))
self.length_thresh = str(config.get("rep_removal_length_thresh", 5))
self.frequency_threshold = str(config.get("rep_removal_frequency_threshold", 1))
self.retain_first_copy = str(config.get("rep_removal_retain_first_copy", True))
self.tokenize = str(config.get("rep_removal_tokenize", True))
self.num_threads = config.get("rep_removal_num_threads", str(cpu_count(logical=False)))
self.num_cpus = config.get("rep_removal_num_cpus", cpu_count(logical=False))
self.num_threads = str(config.get("rep_removal_num_threads", cpu_count(logical=False)))
self.num_cpus = str(config.get("rep_removal_num_cpus", cpu_count(logical=False)))

if self.retain_first_copy.lower() == 'false':
self.retain_first_copy = False
Expand Down
12 changes: 1 addition & 11 deletions transforms/universal/rep_removal/rep_removal.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,16 +38,6 @@
"set the $PATH to include `/Users/USERNAME/.cargo/bin/`"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "88527732-fcaf-4fac-9120-43c67dba76d3",
"metadata": {},
"outputs": [],
"source": [
"import os"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -66,6 +56,7 @@
"outputs": [],
"source": [
"# set $PATH env to append the rust path\n",
"import os\n",
"os.environ['PATH'] = os.environ['PATH'] + ':/OUTPUT/OF/WHEREIS/CARGO/UP/TO/BIN/'"
]
},
Expand Down Expand Up @@ -105,7 +96,6 @@
"RepRemoval(input_folder= \"test-data/input\",\n",
" output_folder= \"test-data/output\",\n",
" rep_removal_contents_column_name='text', \n",
" rep_removal_num_threads='1',\n",
" ).transform()"
]
},
Expand Down

0 comments on commit 46097d5

Please sign in to comment.