From fa67db0b21111d8df7956edd2e87f264b938c13c Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Thu, 11 May 2023 17:50:21 +1000
Subject: [PATCH 1/5] Add Jupyter script conversion

Add the script for converting Jupyter notebooks to Jupyter scripts
---
 preprocessing/jupyter_script_conversion.py | 86 ++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 preprocessing/jupyter_script_conversion.py

diff --git a/preprocessing/jupyter_script_conversion.py b/preprocessing/jupyter_script_conversion.py
new file mode 100644
index 0000000..950fd4c
--- /dev/null
+++ b/preprocessing/jupyter_script_conversion.py
@@ -0,0 +1,86 @@
+import json
+import nbformat
+import jupytext
+from glob import glob
+from tqdm import tqdm
+from guesslang import Guess
+from datasets import load_dataset
+
+# Function to load dataset and instantiate Guess
+def load_data():
+    guess = Guess()
+    dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train")
+    return guess, dataset
+
+# Function to convert a notebook to a script and handle exceptions
+def convert_notebook_to_script(nb, file_path):
+    try:
+        jupytext.write(nb, file_path)
+    except:
+        return False
+    return True
+
+# Function to convert dataset to script
+def convert_to_script(dataset, success_dir, unknown_dir):
+    for i in tqdm(range(len(dataset))):
+        try:
+            # Convert JSON content to nbformat notebook
+            notebook = nbformat.from_dict(json.loads(dataset[i]["content"]))
+            # Join cell sources
+            for cell in notebook.cells:
+                cell.source = "".join(cell.source)
+            # Construct file path
+            file_path = f"{success_dir}/{i}{notebook.metadata.language_info.file_extension}"
+            # Try writing the notebook as a script
+            if not convert_notebook_to_script(notebook, file_path):
+                if "py" in str(notebook.metadata.kernelspec):
+                    file_path = f"{success_dir}/{i}.py"
+                else:
+                    file_path = f"{unknown_dir}/{i}.ipynb"
+                convert_notebook_to_script(notebook, file_path)
+        except:
+            # Log unreadable files
+            with open("unreadable_files_index.txt","a+") as f:
+                f.write(f"{i}\n")
+
+# Function to convert unknown files
+def convert_unknown_files(guess, success_dir, unknown_dir, lang2ext):
+    for file in tqdm(glob(f"{unknown_dir}/*")):
+        try:
+            # Extract index from filename
+            i = file.split("/")[-1].replace(".ipynb","")
+            # Read notebook
+            notebook = jupytext.read(file)
+            # Set notebook metadata filter
+            notebook.metadata["jupytext"] = {"notebook_metadata_filter": "-all"}
+            # Extract code from notebook cells
+            data = "\n".join([cell.source for cell in notebook.cells if cell.cell_type == 'code'])
+            if not data.isspace():
+                # Guess language of the code
+                for p in guess.probabilities(data):
+                    if p[1] >= 0.5:
+                        try:
+                            # Get file extension for the guessed language
+                            ext = lang2ext[[lang for lang in lang2ext if lang in p[0].lower()][0]]
+                            # Construct file path
+                            file_path = f"{success_dir}/{i}{ext}"
+                            # Write notebook as a script
+                            convert_notebook_to_script(notebook, file_path)
+                            break
+                        except IndexError:
+                            pass
+        except:
+            pass
+
+if __name__ == "__main__":
+    guess, dataset = load_data()
+    lang2ext = {
+        'qsharp': '.py', 'python': '.py', 'coconut': '.coco', 'R': '.r', 'julia': '.jl',
+        'c++': '.cpp', 'scheme': '.scm', 'clojure': '.clj', 'bash': '.sh', 'powershell': '.ps1',
+        'q': '.q', 'matlab': '.m', 'Wolfram Language': '.wolfram', 'idl': '.pro', 'javascript': '.js',
+        'typescript': '.ts', 'scala': '.scala', 'rust': '.rs', 'robotframework': '.resource', 'csharp': '.cs',
+        'fsharp': '.fs', 'sos': '.sos', 'java': '.java', 'groovy': '.groovy', 'sage': '.sage', 'ocaml': '.ml',
+        'haskell': '.hs', 'tcl': '.tcl', 'maxima': '.mac', 'gnuplot': '.gp'
+    }
+    convert_to_script(dataset, "script", "unknown")
+    convert_unknown_files(guess, lang2ext)

From 863b6ee546531f0369d62346b2cda1da988a8720 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:19:40 +1000
Subject: [PATCH 2/5] Update jupyter_script_conversion.py

---
 preprocessing/jupyter_script_conversion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preprocessing/jupyter_script_conversion.py b/preprocessing/jupyter_script_conversion.py
index 950fd4c..f8a2e53 100644
--- a/preprocessing/jupyter_script_conversion.py
+++ b/preprocessing/jupyter_script_conversion.py
@@ -83,4 +83,4 @@ def convert_unknown_files(guess, success_dir, unknown_dir, lang2ext):
         'haskell': '.hs', 'tcl': '.tcl', 'maxima': '.mac', 'gnuplot': '.gp'
     }
     convert_to_script(dataset, "script", "unknown")
-    convert_unknown_files(guess, lang2ext)
+    convert_unknown_files(guess, "script", "unknown", lang2ext)

From d167c2d7b892b27dba257e3ad0cf19ff029dbe6a Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:20:55 +1000
Subject: [PATCH 3/5] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c5ae082..8b7e0fa 100644
--- a/README.md
+++ b/README.md
@@ -20,4 +20,5 @@ necessary used for model training.
   - code to generate full-content with meta (repo-name, filename, num stars) for training
   - Filters for GitHub Issues
   - Filters for Git Commits
+  - Script to convert Jupyter notebooks to scripts
 - `decontamination`: script to remove files that match test-samples from code generation benchmarks.

From 645dcd78b0fb1ae83c8327791b7d571b2085df31 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:23:29 +1000
Subject: [PATCH 4/5] Update README.md

---
 preprocessing/README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/preprocessing/README.md b/preprocessing/README.md
index dffd547..a768928 100644
--- a/preprocessing/README.md
+++ b/preprocessing/README.md
@@ -26,10 +26,17 @@ A log file `filtering.log` is saved in the working directory with details about
 
 # Filtering GitHub Issues
-We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and providethe code used to curate the dataset:
+We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset:
 ```bash
 python filtering_issues.py --dataset_name bigcode/subset-github-issues --subset data/ --hub_username loubnabnl --remote_repo test_filter_github_issues
 ```
 
 # Filtering Git Commits
-We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`. 
+We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`.
+
+# Converting Jupyter Notebooks to Scripts
+We release the Jupyter scripts dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset:
+```bash
+python jupyter_script_conversion.py
+```
+

From 224c3bde5421201c91726286e4c1615dc6051f9f Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:28:16 +1000
Subject: [PATCH 5/5] Update requirements.txt

---
 preprocessing/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/preprocessing/requirements.txt b/preprocessing/requirements.txt
index 18d18dc..899163a 100644
--- a/preprocessing/requirements.txt
+++ b/preprocessing/requirements.txt
@@ -2,3 +2,5 @@ datasets==2.7.1
 huggingface_hub==0.10.1
 transformers==4.23.1
 pygments==2.11.2
+guesslang==2.2.1
+nbformat==14.4.4
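
For reference only (not part of the patch series above): a minimal sketch of the nbformat/jupytext round trip that `jupyter_script_conversion.py` builds on, converting one notebook's JSON into a plain Python script. The paths `example.ipynb` and `example.py` are hypothetical placeholders.

```python
import json

import jupytext
import nbformat

# Hypothetical local paths used only for illustration.
NOTEBOOK_PATH = "example.ipynb"
SCRIPT_PATH = "example.py"

# Parse the raw notebook JSON into an nbformat NotebookNode,
# mirroring what the script does for each dataset row's "content" field.
with open(NOTEBOOK_PATH, encoding="utf-8") as f:
    notebook = nbformat.from_dict(json.load(f))

# Cell sources may be stored as lists of lines; join them so each cell
# is a single string before serialization.
for cell in notebook.cells:
    if isinstance(cell.source, list):
        cell.source = "".join(cell.source)

# jupytext infers the output format from the target extension,
# so writing to a .py path produces a plain Python script.
jupytext.write(notebook, SCRIPT_PATH)
```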