From fa67db0b21111d8df7956edd2e87f264b938c13c Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Thu, 11 May 2023 17:50:21 +1000
Subject: [PATCH 1/5] Add Jupyter script conversion

Add the script for converting Jupyter notebooks to Jupyter scripts
---
 preprocessing/jupyter_script_conversion.py | 86 ++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 preprocessing/jupyter_script_conversion.py

diff --git a/preprocessing/jupyter_script_conversion.py b/preprocessing/jupyter_script_conversion.py
new file mode 100644
index 0000000..950fd4c
--- /dev/null
+++ b/preprocessing/jupyter_script_conversion.py
@@ -0,0 +1,86 @@
+import json
+import nbformat
+import jupytext
+from glob import glob
+from tqdm import tqdm
+from guesslang import Guess
+from datasets import load_dataset
+
+# Function to load dataset and instantiate Guess
+def load_data():
+    guess = Guess()
+    dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train")
+    return guess, dataset
+
+# Function to convert a notebook to a script and handle exceptions
+def convert_notebook_to_script(nb, file_path):
+    try:
+        jupytext.write(nb, file_path)
+    except:
+        return False
+    return True
+
+# Function to convert dataset to script
+def convert_to_script(dataset, success_dir, unknown_dir):
+    for i in tqdm(range(len(dataset))):
+        try:
+            # Convert JSON content to nbformat notebook
+            notebook = nbformat.from_dict(json.loads(dataset[i]["content"]))
+            # Join cell sources
+            for cell in notebook.cells:
+                cell.source = "".join(cell.source)
+            # Construct file path
+            file_path = f"{success_dir}/{i}{notebook.metadata.language_info.file_extension}"
+            # Try writing the notebook as a script
+            if not convert_notebook_to_script(notebook, file_path):
+                if "py" in str(notebook.metadata.kernelspec):
+                    file_path = f"{success_dir}/{i}.py"
+                else:
+                    file_path = f"{unknown_dir}/{i}.ipynb"
+                convert_notebook_to_script(notebook, file_path)
+        except:
+            # Log unreadable files
+            with open("unreadable_files_index.txt","a+") as f:
+                f.write(f"{i}\n")
+
+# Function to convert unknown files
+def convert_unknown_files(guess, success_dir, unknown_dir, lang2ext):
+    for file in tqdm(glob(f"{unknown_dir}/*")):
+        try:
+            # Extract index from filename
+            i = file.split("/")[-1].replace(".ipynb","")
+            # Read notebook
+            notebook = jupytext.read(file)
+            # Set notebook metadata filter
+            notebook.metadata["jupytext"] = {"notebook_metadata_filter": "-all"}
+            # Extract code from notebook cells
+            data = "\n".join([cell.source for cell in notebook.cells if cell.cell_type == 'code'])
+            if not data.isspace():
+                # Guess language of the code
+                for p in guess.probabilities(data):
+                    if p[1] >= 0.5:
+                        try:
+                            # Get file extension for the guessed language
+                            ext = lang2ext[[lang for lang in lang2ext if lang in p[0].lower()][0]]
+                            # Construct file path
+                            file_path = f"{success_dir}/{i}{ext}"
+                            # Write notebook as a script
+                            convert_notebook_to_script(notebook, file_path)
+                            break
+                        except IndexError:
+                            pass
+        except:
+            pass
+
+if __name__ == "__main__":
+    guess, dataset = load_data()
+    lang2ext = {
+        'qsharp': '.py', 'python': '.py', 'coconut': '.coco', 'R': '.r', 'julia': '.jl',
+        'c++': '.cpp', 'scheme': '.scm', 'clojure': '.clj', 'bash': '.sh', 'powershell': '.ps1',
+        'q': '.q', 'matlab': '.m', 'Wolfram Language': '.wolfram', 'idl': '.pro', 'javascript': '.js',
+        'typescript': '.ts', 'scala': '.scala', 'rust': '.rs', 'robotframework': '.resource', 'csharp': '.cs',
+        'fsharp': '.fs', 'sos': '.sos', 'java': '.java', 'groovy': '.groovy', 'sage': '.sage', 'ocaml': '.ml',
+        'haskell': '.hs', 'tcl': '.tcl', 'maxima': '.mac', 'gnuplot': '.gp'
+    }
+    convert_to_script(dataset, "script", "unknown")
+    convert_unknown_files(guess, lang2ext)

From 863b6ee546531f0369d62346b2cda1da988a8720 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:19:40 +1000
Subject: [PATCH 2/5] Update jupyter_script_conversion.py

---
 preprocessing/jupyter_script_conversion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/preprocessing/jupyter_script_conversion.py b/preprocessing/jupyter_script_conversion.py
index 950fd4c..f8a2e53 100644
--- a/preprocessing/jupyter_script_conversion.py
+++ b/preprocessing/jupyter_script_conversion.py
@@ -83,4 +83,4 @@ def convert_unknown_files(guess, success_dir, unknown_dir, lang2ext):
         'haskell': '.hs', 'tcl': '.tcl', 'maxima': '.mac', 'gnuplot': '.gp'
     }
     convert_to_script(dataset, "script", "unknown")
-    convert_unknown_files(guess, lang2ext)
+    convert_unknown_files(guess, "script", "unknown", lang2ext)

From d167c2d7b892b27dba257e3ad0cf19ff029dbe6a Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:20:55 +1000
Subject: [PATCH 3/5] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index c5ae082..8b7e0fa 100644
--- a/README.md
+++ b/README.md
@@ -20,4 +20,5 @@ necessary used for model training.
   - code to generate full-content with meta (repo-name, filename, num stars) for training
   - Filters for GitHub Issues
   - Filters for Git Commits
+  - Script to convert Jupyter notebooks to scripts
 - `decontamination`: script to remove files that match test-samples from code generation benchmarks.

From 645dcd78b0fb1ae83c8327791b7d571b2085df31 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:23:29 +1000
Subject: [PATCH 4/5] Update README.md

---
 preprocessing/README.md | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/preprocessing/README.md b/preprocessing/README.md
index dffd547..a768928 100644
--- a/preprocessing/README.md
+++ b/preprocessing/README.md
@@ -26,10 +26,17 @@ A log file `filtering.log` is saved in the working directory with details about
 
 # Filtering GitHub Issues
-We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and providethe code used to curate the dataset:
+We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset:
 ```bash
 python filtering_issues.py --dataset_name bigcode/subset-github-issues --subset data/ --hub_username loubnabnl --remote_repo test_filter_github_issues
 ```
 
 # Filtering Git Commits
-We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`. 
+We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`.
+
+# Converting Jupyter Notebooks to Scripts
+We release the Jupyter scripts dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset:
+```bash
+python jupyter_script_conversion.py
+```
+

From 224c3bde5421201c91726286e4c1615dc6051f9f Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Fri, 12 May 2023 19:28:16 +1000
Subject: [PATCH 5/5] Update requirements.txt

---
 preprocessing/requirements.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/preprocessing/requirements.txt b/preprocessing/requirements.txt
index 18d18dc..899163a 100644
--- a/preprocessing/requirements.txt
+++ b/preprocessing/requirements.txt
@@ -2,3 +2,5 @@ datasets==2.7.1
 huggingface_hub==0.10.1
 transformers==4.23.1
 pygments==2.11.2
+guesslang==2.2.1
+nbformat==14.4.4
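
For reference only (not part of the patch series above): a minimal sketch of the nbformat/jupytext round trip that `jupyter_script_conversion.py` builds on, converting one notebook's JSON into a plain Python script. The paths `example.ipynb` and `example.py` are hypothetical placeholders.

```python
import json

import jupytext
import nbformat

# Hypothetical local paths used only for illustration.
NOTEBOOK_PATH = "example.ipynb"
SCRIPT_PATH = "example.py"

# Parse the raw notebook JSON into an nbformat NotebookNode,
# mirroring what the script does for each dataset row's "content" field.
with open(NOTEBOOK_PATH, encoding="utf-8") as f:
    notebook = nbformat.from_dict(json.load(f))

# Cell sources may be stored as lists of lines; join them so each cell
# is a single string before serialization.
for cell in notebook.cells:
    if isinstance(cell.source, list):
        cell.source = "".join(cell.source)

# jupytext infers the output format from the target extension,
# so writing to a .py path produces a plain Python script.
jupytext.write(notebook, SCRIPT_PATH)
```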