Skip to content

Commit

Permalink
Merge pull request #50 from terryyz/main
Browse files Browse the repository at this point in the history
Add Jupyter script conversion
  • Loading branch information
loubnabnl authored May 12, 2023
2 parents 966a101 + 224c3bd commit e806727
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 2 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,5 @@ necessary used for model training.
- code to generate full-content with meta (repo-name, filename, num stars) for training
- Filters for GitHub Issues
- Filters for Git Commits
- Script to convert Jupyter notebooks to scripts
- `decontamination`: script to remove files that match test-samples from code generation benchmarks.
11 changes: 9 additions & 2 deletions preprocessing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,17 @@ A log file `filtering.log` is saved in the working directory with details about

# Filtering GitHub Issues

We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and providethe code used to curate the dataset:
We release the filtered GitHub issues dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset:
```bash
python filtering_issues.py --dataset_name bigcode/subset-github-issues --subset data/ --hub_username loubnabnl --remote_repo test_filter_github_issues
```

# Filtering Git Commits
We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`.
We release the filtered Git Commits dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide a notebook with the cleaning steps at `filtering_git_commits.ipynb`.

# Converting Jupyter Notebooks to Scripts
We release the Jupyter scripts dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata), and provide the code used to curate the dataset:
```bash
python jupyter_script_conversion.py
```

86 changes: 86 additions & 0 deletions preprocessing/jupyter_script_conversion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import json
import os
from glob import glob

import jupytext
import nbformat
from datasets import load_dataset
from guesslang import Guess
from tqdm import tqdm

# Build the language guesser and load the notebook dataset
def load_data():
    """Instantiate ``Guess`` and load the Jupyter notebook split.

    Returns:
        A ``(guess, dataset)`` tuple: a new ``Guess`` instance and the
        ``train`` split of ``bigcode/the-stack`` loaded with
        ``data_dir="data/jupyter-notebook"``.
    """
    language_guesser = Guess()
    notebooks = load_dataset(
        "bigcode/the-stack",
        data_dir="data/jupyter-notebook",
        split="train",
    )
    return language_guesser, notebooks

# Function to convert a notebook to a script and handle exceptions
def convert_notebook_to_script(nb, file_path):
    """Write ``nb`` to ``file_path`` with jupytext and report whether it worked.

    Args:
        nb: Notebook object handed to ``jupytext.write``.
        file_path: Destination path handed to ``jupytext.write``.

    Returns:
        True if the write completed, False if it raised an ``Exception``.
    """
    try:
        jupytext.write(nb, file_path)
    except Exception:
        # Deliberately best-effort: any conversion/IO failure is reported as
        # False. KeyboardInterrupt and SystemExit are not Exception
        # subclasses, so a long batch run can still be interrupted.
        return False
    return True

# Function to convert dataset to script
def convert_to_script(dataset, success_dir, unknown_dir):
    """Convert every notebook in ``dataset`` to a script file on disk.

    For each row, the ``"content"`` field is parsed as JSON into a notebook
    and written to ``{success_dir}/{index}{ext}``, where ``ext`` is the
    notebook's ``metadata.language_info.file_extension``. If that write
    fails, the notebook is written as ``{success_dir}/{index}.py`` when
    ``"py"`` appears in the string form of its kernelspec, otherwise as
    ``{unknown_dir}/{index}.ipynb``. Rows that raise during processing have
    their index appended to ``unreadable_files_index.txt``.

    Args:
        dataset: Indexable collection supporting ``len()`` whose rows expose
            a ``"content"`` entry holding notebook JSON text.
        success_dir: Directory for successfully converted scripts.
        unknown_dir: Directory for notebooks whose language is unresolved.
    """
    # The write helper swallows its own errors, so a missing output directory
    # would make every conversion fail without any visible signal.
    os.makedirs(success_dir, exist_ok=True)
    os.makedirs(unknown_dir, exist_ok=True)

    # Index-based access keeps the row fetch inside the try block, so one bad
    # row is logged instead of aborting the whole run.
    for i in tqdm(range(len(dataset))):
        try:
            # Convert JSON content to nbformat notebook
            notebook = nbformat.from_dict(json.loads(dataset[i]["content"]))
            # Cell sources may be stored as lists of lines; join them
            for cell in notebook.cells:
                cell.source = "".join(cell.source)
            # NOTE(review): a notebook without metadata.language_info
            # presumably raises here and is logged as unreadable rather than
            # reaching the kernelspec fallback below - confirm this is intended.
            file_path = f"{success_dir}/{i}{notebook.metadata.language_info.file_extension}"
            # Try writing the notebook as a script
            if not convert_notebook_to_script(notebook, file_path):
                if "py" in str(notebook.metadata.kernelspec):
                    file_path = f"{success_dir}/{i}.py"
                else:
                    file_path = f"{unknown_dir}/{i}.ipynb"
                convert_notebook_to_script(notebook, file_path)
        except Exception:
            # Log unreadable files; KeyboardInterrupt/SystemExit propagate so
            # the run can be stopped without polluting the log.
            with open("unreadable_files_index.txt", "a", encoding="utf-8") as f:
                f.write(f"{i}\n")

# Resolve a guessed language name to a script extension
def _extension_for_language(language, lang2ext):
    """Return the extension mapped to ``language``, or None when unmapped.

    Names are compared case-insensitively and must match in full, so keys
    containing capitals (e.g. ``'R'``) are reachable and short keys such as
    ``'q'`` no longer match unrelated names that merely contain them
    (e.g. ``'sql'``).
    """
    wanted = language.lower()
    for name, ext in lang2ext.items():
        if name.lower() == wanted:
            return ext
    return None


# Function to convert unknown files
def convert_unknown_files(guess, success_dir, unknown_dir, lang2ext):
    """Guess the language of notebooks in ``unknown_dir`` and convert them.

    Every file in ``unknown_dir`` is read with jupytext, the sources of its
    code cells are joined, and ``guess.probabilities`` is asked for language
    candidates. The first candidate with probability >= 0.5 that has an
    entry in ``lang2ext`` determines the extension, and the notebook is
    written to ``{success_dir}/{index}{ext}``. Files with no code, no
    confident mapped candidate, or that raise while processing are skipped.

    Args:
        guess: Object whose ``probabilities(text)`` yields
            ``(language_name, probability)`` pairs.
        success_dir: Directory for converted scripts.
        unknown_dir: Directory holding the notebooks to classify.
        lang2ext: Mapping from language name to file extension.
    """
    os.makedirs(success_dir, exist_ok=True)

    for file in tqdm(glob(f"{unknown_dir}/*")):
        try:
            # Extract index from filename (basename handles any path separator)
            i = os.path.basename(file).replace(".ipynb", "")
            # Read notebook
            notebook = jupytext.read(file)
            # Set notebook metadata filter
            notebook.metadata["jupytext"] = {"notebook_metadata_filter": "-all"}
            # Extract code from notebook cells
            data = "\n".join(
                cell.source for cell in notebook.cells if cell.cell_type == "code"
            )
            # "".isspace() is False, so test the stripped text to also skip
            # notebooks that contain no code at all.
            if not data.strip():
                continue
            # Guess language of the code
            for language, probability in guess.probabilities(data):
                if probability < 0.5:
                    continue
                ext = _extension_for_language(language, lang2ext)
                if ext is None:
                    # Confident guess without a known extension: try the next
                    continue
                # Write notebook as a script
                convert_notebook_to_script(notebook, f"{success_dir}/{i}{ext}")
                break
        except Exception:
            # Best-effort pass: a notebook that cannot be read or classified
            # stays in unknown_dir. KeyboardInterrupt/SystemExit propagate.
            continue

if __name__ == "__main__":
    # Output locations: converted scripts, and notebooks whose language
    # could not be resolved from their metadata.
    script_dir = "script"
    unresolved_dir = "unknown"

    guess, dataset = load_data()

    # Language name -> script extension, handed to convert_unknown_files.
    lang2ext = {
        "qsharp": ".py",
        "python": ".py",
        "coconut": ".coco",
        "R": ".r",
        "julia": ".jl",
        "c++": ".cpp",
        "scheme": ".scm",
        "clojure": ".clj",
        "bash": ".sh",
        "powershell": ".ps1",
        "q": ".q",
        "matlab": ".m",
        "Wolfram Language": ".wolfram",
        "idl": ".pro",
        "javascript": ".js",
        "typescript": ".ts",
        "scala": ".scala",
        "rust": ".rs",
        "robotframework": ".resource",
        "csharp": ".cs",
        "fsharp": ".fs",
        "sos": ".sos",
        "java": ".java",
        "groovy": ".groovy",
        "sage": ".sage",
        "ocaml": ".ml",
        "haskell": ".hs",
        "tcl": ".tcl",
        "maxima": ".mac",
        "gnuplot": ".gp",
    }

    # First pass uses notebook metadata; second pass guesses the language of
    # whatever the first pass left in the unresolved directory.
    convert_to_script(dataset, script_dir, unresolved_dir)
    convert_unknown_files(guess, script_dir, unresolved_dir, lang2ext)
2 changes: 2 additions & 0 deletions preprocessing/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,5 @@ datasets==2.7.1
huggingface_hub==0.10.1
transformers==4.23.1
pygments==2.11.2
guesslang==2.2.1
nbformat==14.4.4

0 comments on commit e806727

Please sign in to comment.