diff --git a/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py b/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py
index 35dca2b7e..a6d15d6ad 100644
--- a/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py
+++ b/transforms/universal/rep_removal/dpk_rep_removal/dedup_pq_level.py
@@ -24,12 +24,9 @@
 import pandas as pd
 import struct
 from collections import defaultdict
-import dpk_rep_removal.utils
 import transformers
 from transformers import GPT2Tokenizer
 
-run_in_OCP = True
-
 #### Save the tokenizer in a local path to speed up the process
 #### Get tokenizer from the local path to speed up the process
 
@@ -81,9 +78,6 @@ def load_pq_docs_once_avoidIO(pq_df, content_col, save_dir, dataset_name, tokeni
     global args_tokenize, encoded_docs, loaded_size
     args_tokenize = tokenize
 
-    pre_sep = b"\xff\xff"
-    post_sep = b""
-
     if not os.path.exists(save_dir):
         os.mkdir(save_dir)
 
@@ -207,9 +201,3 @@ def extract_dup_per_doc_avoidIO_further(repeated_pairs):
                                  int(min(int(remove[ptr][1] - byte_start),
                                          byte_end - byte_start)) - 6))  ################## added -6 to exclude sep
             ptr += 1
-    # print ('############# Number of duplicate made from two subsequent documents: ', count_between_docs)
-    # print ('############# Number of duplicate made from two subsequent documents: ', duplicate_between_docs)
-
-    # df_dict = pd.DataFrame(remove_ex)
-    # print(remove_ex)
-    # return remove_ex
\ No newline at end of file
diff --git a/transforms/universal/rep_removal/dpk_rep_removal/runtime.py b/transforms/universal/rep_removal/dpk_rep_removal/runtime.py
index 00f0fd193..da7a71497 100644
--- a/transforms/universal/rep_removal/dpk_rep_removal/runtime.py
+++ b/transforms/universal/rep_removal/dpk_rep_removal/runtime.py
@@ -55,16 +55,16 @@ def add_input_params(self, parser: ArgumentParser) -> None:
         )
         parser.add_argument(
             "--rep_removal_length_thresh",
-            type=str,
+            type=int,
             required=False,
-            default="50",
+            default=50,
             help="Length threshold for processing",
         )
         parser.add_argument(
             "--rep_removal_frequency_threshold",
-            type=str,
+            type=int,
             required=False,
-            default="1",
+            default=1,
             help="Frequency threshold for processing.",
         )
         parser.add_argument(
@@ -83,14 +83,14 @@ def add_input_params(self, parser: ArgumentParser) -> None:
         )
         parser.add_argument(
             "--rep_removal_num_threads",
-            type=str,
+            type=int,
             required=False,
-            default=str(cpu_count(logical=False)),
+            default=cpu_count(logical=False),
             help="Value for number of threads to use for processing",
         )
         parser.add_argument(
             "--rep_removal_num_cpus",
-            type=str,
+            type=int,
             required=False,
             default=cpu_count(logical=False),
             help="Value for number of cpus allocated for processing",
diff --git a/transforms/universal/rep_removal/dpk_rep_removal/transform.py b/transforms/universal/rep_removal/dpk_rep_removal/transform.py
index f67737764..a1118814f 100644
--- a/transforms/universal/rep_removal/dpk_rep_removal/transform.py
+++ b/transforms/universal/rep_removal/dpk_rep_removal/transform.py
@@ -13,7 +13,6 @@
 import os
 import subprocess
 import tempfile
-import datetime
 import pyarrow as pa
 import pandas as pd
 from dpk_rep_removal.dedup_pq_level import load_pq_docs_once_avoidIO, extract_dup_per_doc_avoidIO_further, save_deduped_pq_once
@@ -32,12 +31,12 @@ def __init__(self, config: dict[str, Any]):
 
         self.contents_column_name = config.get("rep_removal_contents_column_name", "contents")
         self.dedup_level = config.get("rep_removal_dedup_level_name", "parquet")
-        self.length_thresh = config.get("rep_removal_length_thresh", str(50))
-        self.frequency_threshold = config.get("rep_removal_frequency_threshold", str(1))
+        self.length_thresh = str(config.get("rep_removal_length_thresh", 5))
+        self.frequency_threshold = str(config.get("rep_removal_frequency_threshold", 1))
         self.retain_first_copy = str(config.get("rep_removal_retain_first_copy", True))
         self.tokenize = str(config.get("rep_removal_tokenize", True))
-        self.num_threads = config.get("rep_removal_num_threads", str(cpu_count(logical=False)))
-        self.num_cpus = config.get("rep_removal_num_cpus", cpu_count(logical=False))
+        self.num_threads = str(config.get("rep_removal_num_threads", cpu_count(logical=False)))
+        self.num_cpus = str(config.get("rep_removal_num_cpus", cpu_count(logical=False)))
 
         if self.retain_first_copy.lower() == 'false':
             self.retain_first_copy = False
diff --git a/transforms/universal/rep_removal/rep_removal.ipynb b/transforms/universal/rep_removal/rep_removal.ipynb
index 1c529765b..afaacff50 100644
--- a/transforms/universal/rep_removal/rep_removal.ipynb
+++ b/transforms/universal/rep_removal/rep_removal.ipynb
@@ -38,16 +38,6 @@
     "set the $PATH to include `/Users/USERNAME/.cargo/bin/`"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "88527732-fcaf-4fac-9120-43c67dba76d3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -66,6 +56,7 @@
    "outputs": [],
    "source": [
     "# set $PATH env to append the rust path\n",
+    "import os\n",
     "os.environ['PATH'] = os.environ['PATH'] + ':/OUTPUT/OF/WHEREIS/CARGO/UP/TO/BIN/'"
    ]
   },
@@ -105,7 +96,6 @@
     "RepRemoval(input_folder= \"test-data/input\",\n",
     "            output_folder= \"test-data/output\",\n",
     "            rep_removal_contents_column_name='text', \n",
-    "            rep_removal_num_threads='1',\n",
     "            ).transform()"
    ]
   },