diff --git a/README.md b/README.md index c5ae082..8b7e0fa 100644 --- a/README.md +++ b/README.md @@ -20,4 +20,5 @@ necessary used for model training. - code to generate full-content with meta (repo-name, filename, num stars) for training - Filters for GitHub Issues - Filters for Git Commits + - Script to convert Jupyter notebooks to scripts - `decontamination`: script to remove files that match test-samples from code generation benchmarks. diff --git a/preprocessing/README.md b/preprocessing/README.md index dffd547..a768928 100644 --- a/preprocessing/README.md +++ b/preprocessing/README.md @@ -26,10 +26,17 @@ A log file `filtering.log` is saved in the working directory with details about # Filtering GitHub Issues -We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and providethe code used to curate the dataset: +We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset: ```bash python filtering_issues.py --dataset_name bigcode/subset-github-issues --subset data/ --hub_username loubnabnl --remote_repo test_filter_github_issues ``` # Filtering Git Commits -We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`. +We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`. + +# Converting Jupyter Notebooks to Scripts +We release the Jupyter scripts dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset: +```bash +python jupyter_script_conversion.py +``` + diff --git a/preprocessing/jupyter_script_conversion.py b/preprocessing/jupyter_script_conversion.py new file mode 100644 index 0000000..f8a2e53 --- /dev/null +++ b/preprocessing/jupyter_script_conversion.py @@ -0,0 +1,86 @@ +import json +import nbformat +import jupytext +from glob import glob +from tqdm import tqdm +from guesslang import Guess +from datasets import load_dataset + +# Function to load dataset and instantiate Guess +def load_data(): + guess = Guess() + dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train") + return guess, dataset + +# Function to convert a notebook to a script and handle exceptions +def convert_notebook_to_script(nb, file_path): + try: + jupytext.write(nb, file_path) + except: + return False + return True + +# Function to convert dataset to script +def convert_to_script(dataset, success_dir, unknown_dir): + for i in tqdm(range(len(dataset))): + try: + # Convert JSON content to nbformat notebook + notebook = nbformat.from_dict(json.loads(dataset[i]["content"])) + # Join cell sources + for cell in notebook.cells: + cell.source = "".join(cell.source) + # Construct file path + file_path = f"{success_dir}/{i}{notebook.metadata.language_info.file_extension}" + # Try writing the notebook as a script + if not convert_notebook_to_script(notebook, file_path): + if "py" in str(notebook.metadata.kernelspec): + file_path = f"{success_dir}/{i}.py" + else: + file_path = f"{unknown_dir}/{i}.ipynb" + convert_notebook_to_script(notebook, file_path) + except: + # Log unreadable files + with open("unreadable_files_index.txt","a+") as f: + f.write(f"{i}\n") + +# Function to convert unknown files +def convert_unknown_files(guess, success_dir, unknown_dir, lang2ext): + for file in tqdm(glob(f"{unknown_dir}/*")): + try: + # Extract index from filename + i = file.split("/")[-1].replace(".ipynb","") + # Read notebook + notebook = jupytext.read(file) + # Set notebook metadata filter + notebook.metadata["jupytext"] = {"notebook_metadata_filter": "-all"} + # Extract code from notebook cells + data = "\n".join([cell.source for cell in notebook.cells if cell.cell_type == 'code']) + if not data.isspace(): + # Guess language of the code + for p in guess.probabilities(data): + if p[1] >= 0.5: + try: + # Get file extension for the guessed language + ext = lang2ext[[lang for lang in lang2ext if lang in p[0].lower()][0]] + # Construct file path + file_path = f"{success_dir}/{i}{ext}" + # Write notebook as a script + convert_notebook_to_script(notebook, file_path) + break + except IndexError: + pass + except: + pass + +if __name__ == "__main__": + guess, dataset = load_data() + lang2ext = { + 'qsharp': '.py', 'python': '.py', 'coconut': '.coco', 'R': '.r', 'julia': '.jl', + 'c++': '.cpp', 'scheme': '.scm', 'clojure': '.clj', 'bash': '.sh', 'powershell': '.ps1', + 'q': '.q', 'matlab': '.m', 'Wolfram Language': '.wolfram', 'idl': '.pro', 'javascript': '.js', + 'typescript': '.ts', 'scala': '.scala', 'rust': '.rs', 'robotframework': '.resource', 'csharp': '.cs', + 'fsharp': '.fs', 'sos': '.sos', 'java': '.java', 'groovy': '.groovy', 'sage': '.sage', 'ocaml': '.ml', + 'haskell': '.hs', 'tcl': '.tcl', 'maxima': '.mac', 'gnuplot': '.gp' + } + convert_to_script(dataset, "script", "unknown") + convert_unknown_files(guess, "script", "unknown", lang2ext) diff --git a/preprocessing/requirements.txt b/preprocessing/requirements.txt index 18d18dc..899163a 100644 --- a/preprocessing/requirements.txt +++ b/preprocessing/requirements.txt @@ -2,3 +2,5 @@ datasets==2.7.1 huggingface_hub==0.10.1 transformers==4.23.1 pygments==2.11.2 +guesslang==2.2.1 +nbformat==14.4.4