From af9975300ad59cdb18a7d5437bc99e3aafcae097 Mon Sep 17 00:00:00 2001 From: lingjzhu <> Date: Fri, 12 May 2023 10:16:30 -0700 Subject: [PATCH] update correct scripts and readme.md --- README.md | 1 + preprocessing/README.md | 19 + preprocessing/jupyter-structured/README.md | 17 - .../jupyter-generate-triplets.py | 597 ++---------------- 4 files changed, 88 insertions(+), 546 deletions(-) delete mode 100644 preprocessing/jupyter-structured/README.md diff --git a/README.md b/README.md index 8b7e0fa..ecdbd1d 100644 --- a/README.md +++ b/README.md @@ -21,4 +21,5 @@ necessary used for model training. - Filters for GitHub Issues - Filters for Git Commits - Script to convert Jupyter notebooks to scripts + - Scripts to convert Jupyter notebooks to structured markdown-code-output triplets - `decontamination`: script to remove files that match test-samples from code generation benchmarks. diff --git a/preprocessing/README.md b/preprocessing/README.md index a768928..dce1f39 100644 --- a/preprocessing/README.md +++ b/preprocessing/README.md @@ -40,3 +40,22 @@ We release the Jupyter scripts dataset as part of [StarCoderData](https://huggin python jupyter_script_conversion.py ``` +# Creating Jupyter-structured dataset + +## Step 1 +Parse Jupyter notebooks from `the Stack`. +``` +python jupyter-structured/jupyter-segment-notebooks.py +``` + +## Step 2 +Generate markdown-code-output triplets. +``` +python jupyter-structured/jupyter-generate-triplets.py +``` + +## Step 3 +Create notebook-level structured dataset using `jupyter-structured/jupyter-structured.ipynb`. + + + diff --git a/preprocessing/jupyter-structured/README.md b/preprocessing/jupyter-structured/README.md deleted file mode 100644 index 9297e53..0000000 --- a/preprocessing/jupyter-structured/README.md +++ /dev/null @@ -1,17 +0,0 @@ -## Creating Jupyter-structured dataset - -### Step 1 -Parse Jupyter notebooks from `the Stack`. 
-``` -python jupyter-segment-notebooks.py -``` - -### Step 2 -Generate markdown-code-output triplets. -``` -python jupyter-generate-triplets.py -``` - -### Step 3 -Create notebook-level structured dataset using `jupyter-structured.ipynb`. - diff --git a/preprocessing/jupyter-structured/jupyter-generate-triplets.py b/preprocessing/jupyter-structured/jupyter-generate-triplets.py index 4e7accd..79de712 100644 --- a/preprocessing/jupyter-structured/jupyter-generate-triplets.py +++ b/preprocessing/jupyter-structured/jupyter-generate-triplets.py @@ -1,529 +1,68 @@ - - - - - - - - - - - - - - - - - - - - - - - - generate_triplets.py · bigcode/jupyter-code-text-pairs at main - - -
-
Hugging Face's logo - - -
- -
- - -
-
-
- - - -
- -
- - - - -
-
jupyter-code-text-pairs - / - generate_triplets.py
- - -
-
lingjzhu's picture - -
Upload generate_triplets.py
- a75cfd0 - -
- - -
-
- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import json
import itertools
from datasets import load_dataset, load_from_disk, Dataset
import re
from tqdm import tqdm
import sys
-
def clean_markdown(text):
    """Strip HTML tags, newlines, and '#' heading markers from *text*.

    Used to flatten a raw markdown cell into a single plain-text line.
    """
    # Remove inline HTML tags (non-greedy so separate tags stay separate).
    without_tags = re.sub(r'<.*?>', '', text)
    # Collapse the cell onto one line by deleting every newline run.
    single_line = re.sub(r'\n+', '', without_tags)
    # Drop markdown heading markers.
    return single_line.replace('#', '')
-
-
def parse_data(ds):
    """Yield markdown-code-output triplets from parsed notebooks.

    Each notebook is trimmed so it starts with a markdown cell and ends
    with a code cell; notebooks that then alternate perfectly between
    markdown and code are split into (markdown, code) pairs, and one
    record is yielded per pair.
    """
    for notebook in tqdm(ds):
        cells = notebook["cells"]
        types = notebook["cell_types"]

        # Drop a leading code cell so the notebook opens with markdown.
        if types and types[0] == "code":
            cells, types = cells[1:], types[1:]
        # Drop a trailing markdown cell so the notebook closes with code.
        if types and types[-1] == "markdown":
            cells, types = cells[:-1], types[:-1]

        # Only an even cell count can alternate markdown/code cleanly.
        if len(cells) % 2 != 0:
            continue

        # Even positions are markdown cells, odd positions the code cells
        # that follow them.
        for md_cell, code_cell in zip(cells[0::2], cells[1::2]):
            text = ' '.join(clean_markdown(part[0]) for part in md_cell)
            source = '\n'.join(part[0] for part in code_cell)
            # Keep only the output of the last snippet in the code cell.
            last_output = code_cell[-1][1]

            yield {'markdown': text,
                   'code': source,
                   'output': last_output,
                   'license': notebook['max_issues_repo_licenses'][0],
                   'path': notebook['max_stars_repo_path'],
                   'repo_name': notebook['max_stars_repo_name'],
                   }
-
if __name__ == "__main__":
    # Destination path for the generated JSONL triplets.
    file = sys.argv[1]

    dataset = load_dataset("bigcode/jupyter-parsed")
    with open(file, 'w') as out:
        # Fix: the original iterated over an undefined name `data` (NameError).
        # Iterate the loaded dataset's train split; iterating the DatasetDict
        # itself would only yield split-name strings.
        # NOTE(review): assumes the hub dataset exposes a "train" split — confirm.
        for line in parse_data(dataset["train"]):
            out.write(json.dumps(line) + '\n')

    # Fix: `ata_files` was a typo for the `data_files` keyword argument.
    dataset = load_dataset('json', data_files=file)

    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")
-
-
- - - - - - - - - - - - - +import json +import itertools +from datasets import load_dataset, load_from_disk, Dataset +import re +from tqdm import tqdm +import sys + +def clean_markdown(text): + text = re.sub(r'<.*?>','',text) + text = re.sub(r'\n+','',text) + text = text.replace('#','') + return text + + +def parse_data(ds): + """Parse data into markdown-code pairs""" + + for notebook in tqdm(ds): + + types = notebook["cell_types"] + cells = notebook["cells"] + + if len(types)>0: + if types[0] == "code": + # drop first cell of code to have the notebook start with markdown + cells = cells[1:] + types = types[1:] + #else: + # drop first the two cells of markdown followed by code + # the first markown cell of a notebook is often a long description of the whole notebook + # cells = notebooks["cells"][2:] + # types = notebooks["types"][2:] + if len(types)>0: + if types[-1] == 'markdown': + cells = cells[:-1] + types = types[:-1] + + if len(cells) % 2 == 0: + inner_markdowns = [cells[j] for j in range(len(cells)) if j % 2 == 0] + inner_code_snippets = [cells[j+1] for j in range(len(cells) - 1) if j % 2 == 0] + + + for markdown_block, code_snippet in zip(inner_markdowns,inner_code_snippets): + markdown_block = ' '.join([clean_markdown(block[0]) for block in markdown_block]) + code = '\n'.join([snippet[0] for snippet in code_snippet]) + output = [snippet[1] for snippet in code_snippet][-1] + + line = {'markdown':markdown_block, + 'code':code, + 'output':output, + 'license':notebook['max_issues_repo_licenses'][0], + 'path':notebook['max_stars_repo_path'], + 'repo_name':notebook['max_stars_repo_name'], + } + yield line + + +if __name__ == "__main__": + file = sys.argv[1] + + dataset = load_dataset("bigcode/jupyter-parsed") + with open(file,'w') as out: + for line in parse_data(data): + out.write(json.dumps(line)+'\n') + + dataset = load_dataset('json',ata_files=file) + + dataset.push_to_hub("bigcode/jupyter-code-text-pairs")