From 588df437170242b03a1cacc3ab60e47072c3e86c Mon Sep 17 00:00:00 2001
From: lingjzhu <>
Date: Thu, 11 May 2023 13:40:36 -0700
Subject: [PATCH 1/3] add jupyter-structured scripts

---
 preprocessing/jupyter-structured/README.md    |  17 +
 .../jupyter-generate-triplets.py              | 529 ++++++++++++++++++
 .../jupyter-segment-notebooks.py              |  65 +++
 .../jupyter-structured.ipynb                  | 237 ++++++++
 4 files changed, 848 insertions(+)
 create mode 100644 preprocessing/jupyter-structured/README.md
 create mode 100644 preprocessing/jupyter-structured/jupyter-generate-triplets.py
 create mode 100644 preprocessing/jupyter-structured/jupyter-segment-notebooks.py
 create mode 100644 preprocessing/jupyter-structured/jupyter-structured.ipynb

diff --git a/preprocessing/jupyter-structured/README.md b/preprocessing/jupyter-structured/README.md
new file mode 100644
index 0000000..9297e53
--- /dev/null
+++ b/preprocessing/jupyter-structured/README.md
@@ -0,0 +1,17 @@
+## Creating Jupyter-structured dataset
+
+### Step 1
+Parse Jupyter notebooks from `the Stack`.
+```
+python jupyter-segment-notebooks.py
+```
+
+### Step 2
+Generate markdown-code-output triplets.
+```
+python jupyter-generate-triplets.py
+```
+
+### Step 3
+Create notebook-level structured dataset using `jupyter-structured.ipynb`.
+
diff --git a/preprocessing/jupyter-structured/jupyter-generate-triplets.py b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
new file mode 100644
index 0000000..4e7accd
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
@@ -0,0 +1,529 @@
+import json
+import itertools
+from datasets import load_dataset, load_from_disk, Dataset
+import re
+from tqdm import tqdm
+import sys
+
+def clean_markdown(text):
+    text = re.sub(r'<.*?>','',text)
+    text = re.sub(r'\n+','',text)
+    text = text.replace('#','')
+    return text
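+# e.g. clean_markdown('<b># Load data</b>\nPlot results') -> ' Load dataPlot results'
+# (HTML tags and '#' markers are stripped; newlines are removed outright)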
+
+
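+# Each record from bigcode/jupyter-parsed carries two parallel lists produced by
+# jupyter-segment-notebooks.py:
+#   cell_types: one entry ("markdown" or "code") per group of consecutive same-type cells
+#   cells: for each group, a list of [source, output] pairs, where output is a
+#          placeholder string when a cell produced no text output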
+def parse_data(ds):
+    """Parse data into markdown-code-output triplets"""
+    for notebook in tqdm(ds):
+        types = notebook["cell_types"]
+        cells = notebook["cells"]
+        if len(types)>0:
+            if types[0] == "code":
+                # drop a leading code group so the notebook starts with markdown
+                cells = cells[1:]
+                types = types[1:]
+            #else:
+                # drop the first two groups (markdown followed by code):
+                # the first markdown cell of a notebook is often a long description of the whole notebook
+                # cells = notebooks["cells"][2:]
+                # types = notebooks["types"][2:]
+        if len(types)>0:
+            if types[-1] == 'markdown':
+                cells = cells[:-1]
+                types = types[:-1]
+
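+        # groups alternate markdown/code, so after trimming a leading code group and a
+        # trailing markdown group, an even group count means every markdown group is
+        # immediately followed by the code group it describes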
+        if len(cells) % 2 == 0:
+            inner_markdowns = [cells[j] for j in range(len(cells)) if j % 2 == 0]
+            inner_code_snippets = [cells[j+1] for j in range(len(cells) - 1) if j % 2 == 0]
+
+
+            for markdown_block, code_snippet in zip(inner_markdowns,inner_code_snippets):
+                markdown_block = ' '.join([clean_markdown(block[0]) for block in markdown_block])
+                code = '\n'.join([snippet[0] for snippet in code_snippet])
+                output = [snippet[1] for snippet in code_snippet][-1]
+
+                line = {'markdown':markdown_block,
+                        'code':code,
+                        'output':output,
+                        'license':notebook['max_issues_repo_licenses'][0],
+                        'path':notebook['max_stars_repo_path'],
+                        'repo_name':notebook['max_stars_repo_name'],
+                        }
+                yield line
+
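+# Usage: python jupyter-generate-triplets.py <output_file.jsonl>
+# Writes one markdown-code-output triplet per line, then pushes the result to the Hub.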
+if __name__ == "__main__":
+    file = sys.argv[1]
+    dataset = load_dataset("bigcode/jupyter-parsed", split="train")
+    with open(file,'w') as out:
+        for line in parse_data(dataset):
+            out.write(json.dumps(line)+'\n')
+
+    dataset = load_dataset('json', data_files=file)
+    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")
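+# Note: reading bigcode/jupyter-parsed and pushing to the Hub both assume an
+# authenticated Hugging Face session (e.g. via `huggingface-cli login`).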
diff --git a/preprocessing/jupyter-structured/jupyter-segment-notebooks.py b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py
new file mode 100644
index 0000000..2e9f4e2
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py
@@ -0,0 +1,65 @@
+import json
+import itertools
+from datasets import load_dataset
+
+
+def segment_blocks(content):
+    # flatten each cell into a [source, output] pair and record its type
+    cells = []
+    cell_types = []
+    for cell in content['cells']:
+        if len(cell['source']) > 0:
+            output = '_____no_output_____'
+            if 'outputs' in cell.keys():
+                if len(cell['outputs'])>0:
+                    if 'text' in cell['outputs'][0].keys():
+                        output = cell['outputs'][0]['text']
+            cells.append([''.join(cell['source']),''.join(output)])
+            cell_types.append(cell['cell_type'])
+    return cells, cell_types
+
+
+def segment(batch):
+    try:
+        content = json.loads(batch['content'])
+        if 'py' in json.dumps(content['metadata']):  # crude check for a Python kernel
+            cells, types = segment_blocks(content)
+
+            # group consecutive cells of the same type
+            cell_type_groups = [list(g) for k,g in itertools.groupby(types)]
+            cell_types = [k for k,g in itertools.groupby(types)]
+            cell_groups = []
+
+            group_start = 0
+            for g in cell_type_groups:
+                cell_groups.append(cells[group_start:group_start+len(g)])
+                group_start += len(g)
+
+            batch['cells'] = cell_groups
+            batch['cell_types'] = cell_types
+            batch['cell_type_groups'] = cell_type_groups
+
+        else:
+            batch['cells'] = [[['empty']]]
+            batch['cell_types'] = ['empty']
+            batch['cell_type_groups'] = [['empty']]
+
+    except:  # malformed notebook JSON: mark with placeholders so it is filtered out below
+
+        batch['cells'] = [[['empty']]]
+        batch['cell_types'] = ['empty']
+        batch['cell_type_groups'] = [['empty']]
+
+    del batch['content']
+    return batch
+
+
+if __name__ == "__main__":
+
+    # load dataset
+    dataset = load_dataset("bigcode/the-stack",data_dir="data/jupyter-notebook", split="train",use_auth_token=True)
+    # segment notebooks
+    dataset = dataset.map(segment)
+    # filter out erroneous cells via placeholders
+    dataset = dataset.filter(lambda entry: entry['cell_types']!=['empty'])
+    # push to hub
+    dataset.push_to_hub("bigcode/jupyter-parsed")
\ No newline at end of file
diff --git a/preprocessing/jupyter-structured/jupyter-structured.ipynb b/preprocessing/jupyter-structured/jupyter-structured.ipynb
new file mode 100644
index 0000000..ce51d4b
From af9975300ad59cdb18a7d5437bc99e3aafcae097 Mon Sep 17 00:00:00 2001
From: lingjzhu <>
Date: Fri, 12 May 2023 10:16:30 -0700
Subject: [PATCH 2/3] update correct scripts and readme.md

---
 README.md                                     |   1 +
 preprocessing/README.md                       |  19 +
 preprocessing/jupyter-structured/README.md    |  17 -
 .../jupyter-generate-triplets.py              | 597 ++----------------
 4 files changed, 88 insertions(+), 546 deletions(-)
 delete mode 100644 preprocessing/jupyter-structured/README.md

diff --git a/README.md b/README.md
index 8b7e0fa..ecdbd1d 100644
--- a/README.md
+++ b/README.md
@@ -21,4 +21,5 @@ necessary used for model training.
 - Filters for GitHub Issues
 - Filters for Git Commits
 - Script to convert Jupyter notebooks to scripts
+ - Scripts to convert Jupyter notebooks to structured markdown-code-output triplets
 - `decontamination`: script to remove files that match test-samples from code generation benchmarks.
diff --git a/preprocessing/README.md b/preprocessing/README.md
index a768928..dce1f39 100644
--- a/preprocessing/README.md
+++ b/preprocessing/README.md
@@ -40,3 +40,22 @@ We release the Jupyter scripts dataset as part of [StarCoderData](https://huggin
 python jupyter_script_conversion.py
 ```
 
+# Creating Jupyter-structured dataset
+
+## Step 1
+Parse Jupyter notebooks from `the Stack`.
+```
+python jupyter-structured/jupyter-segment-notebooks.py
+```
+
+## Step 2
+Generate markdown-code-output triplets.
+```
+python jupyter-structured/jupyter-generate-triplets.py
+```
+
+## Step 3
+Create notebook-level structured dataset using `jupyter-structured/jupyter-structured.ipynb`.
+
+
+
diff --git a/preprocessing/jupyter-structured/README.md b/preprocessing/jupyter-structured/README.md
deleted file mode 100644
index 9297e53..0000000
--- a/preprocessing/jupyter-structured/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-## Creating Jupyter-structured dataset
-
-### Step 1
-Parse Jupyter notebooks from `the Stack`.
-```
-python jupyter-segment-notebooks.py
-```
-
-### Step 2
-Generate markdown-code-output triplets.
-```
-python jupyter-generate-triplets.py
-```
-
-### Step 3
-Create notebook-level structured dataset using `jupyter-structured.ipynb`.
-
diff --git a/preprocessing/jupyter-structured/jupyter-generate-triplets.py b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
index 4e7accd..79de712 100644
--- a/preprocessing/jupyter-structured/jupyter-generate-triplets.py
+++ b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
@@ -1,529 +1,68 @@
+import json
+import itertools
+from datasets import load_dataset, load_from_disk, Dataset
+import re
+from tqdm import tqdm
+import sys
+
+def clean_markdown(text):
+    text = re.sub(r'<.*?>','',text)
+    text = re.sub(r'\n+','',text)
+    text = text.replace('#','')
+    return text
+
+
+def parse_data(ds):
+    """Parse data into markdown-code-output triplets"""
+
+    for notebook in tqdm(ds):
+
+        types = notebook["cell_types"]
+        cells = notebook["cells"]
+
+        if len(types)>0:
+            if types[0] == "code":
+                # drop a leading code group so the notebook starts with markdown
+                cells = cells[1:]
+                types = types[1:]
+            #else:
+                # drop the first two groups (markdown followed by code):
+                # the first markdown cell of a notebook is often a long description of the whole notebook
+                # cells = notebooks["cells"][2:]
+                # types = notebooks["types"][2:]
+        if len(types)>0:
+            if types[-1] == 'markdown':
+                cells = cells[:-1]
+                types = types[:-1]
+
+        if len(cells) % 2 == 0:
+            inner_markdowns = [cells[j] for j in range(len(cells)) if j % 2 == 0]
+            inner_code_snippets = [cells[j+1] for j in range(len(cells) - 1) if j % 2 == 0]
+
+
+            for markdown_block, code_snippet in zip(inner_markdowns,inner_code_snippets):
+                markdown_block = ' '.join([clean_markdown(block[0]) for block in markdown_block])
+                code = '\n'.join([snippet[0] for snippet in code_snippet])
+                output = [snippet[1] for snippet in code_snippet][-1]
+
+                line = {'markdown':markdown_block,
+                        'code':code,
+                        'output':output,
+                        'license':notebook['max_issues_repo_licenses'][0],
+                        'path':notebook['max_stars_repo_path'],
+                        'repo_name':notebook['max_stars_repo_name'],
+                        }
+                yield line
+
+
+if __name__ == "__main__":
+    file = sys.argv[1]
+
+    dataset = load_dataset("bigcode/jupyter-parsed", split="train")
+    with open(file,'w') as out:
+        for line in parse_data(dataset):
+            out.write(json.dumps(line)+'\n')
+
+    dataset = load_dataset('json', data_files=file)
+
+    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")

From 73793e476855547c9f153c37078125a35896e1b4 Mon Sep 17 00:00:00 2001
From: lingjzhu <>
Date: Fri, 12 May 2023 10:20:44 -0700
Subject: [PATCH 3/3] update correct scripts and readme.md

---
 .../jupyter-structured.ipynb | 1273 ++++++++++++++---
 1 file changed, 1036 insertions(+), 237 deletions(-)

diff --git a/preprocessing/jupyter-structured/jupyter-structured.ipynb b/preprocessing/jupyter-structured/jupyter-structured.ipynb
index ce51d4b..2bda98d 100644
--- a/preprocessing/jupyter-structured/jupyter-structured.ipynb
+++ b/preprocessing/jupyter-structured/jupyter-structured.ipynb
@@ -1,237 +1,1036 @@
- - - - - - - - - - - - - +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset, Dataset\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e5ec42c4eaf0494ea715e21510dc0b99", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading readme: 0%| | 0.00/558 [00:001000:\n", + " output_str = output[:1000] + \"[...]\"\n", + " elif output == \"_____no_output_____\":\n", + " output_str = \"\"\n", + " else:\n", + " output_str = output\n", + " \n", + " content = f\"{markdown.strip()}{code.strip()}{output_str.strip()}\"\n", + " return content\n", + "\n", + "current_repo_path = ds[0][\"repo_name\"] + \"/\" + ds[0][\"path\"]\n", + "current_content = \"\"\n", + "current_chain_length = 0\n", + "\n", + "data_dict = {\"content\": [], \"license\": [ds[0][\"license\"]], \"path\": [ds[0][\"path\"]], \"repo_name\": [ds[0][\"path\"]], \"chain_length\": []}\n", + "\n", + "for i in tqdm(range(int(len(ds)/1_000)+1)):\n", + " end_index = min((i+1)*1000, len(ds))\n", + " element = ds[i*1000:end_index]\n", + " for j in range(min(1_000, end_index-i*1000)):\n", + " repo_path = element[\"repo_name\"][j] + \"/\" + element[\"path\"][j]\n", + " \n", + " if repo_path != current_repo_path:\n", + " data_dict[\"content\"].append(current_content)\n", + " data_dict[\"chain_length\"].append(current_chain_length)\n", + " data_dict[\"license\"].append(element[\"license\"][j])\n", + " data_dict[\"path\"].append(element[\"path\"][j])\n", + " data_dict[\"repo_name\"].append(element[\"repo_name\"][j])\n", + "\n", + " current_repo_path = repo_path\n", + " current_content = \"\"\n", + " current_chain_length = 0\n", + " \n", + " current_content += build_content(element[\"markdown\"][j], element[\"code\"][j], element[\"output\"][j])\n", + " current_chain_length +=1\n", + "\n", + "data_dict[\"content\"].append(current_content)\n", + "data_dict[\"chain_length\"].append(current_chain_length)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1045605" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data_dict[\"content\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11.187439308" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum([len(content) for content in data_dict[\"content\"]])/1000/1000/1000" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.900101854906968\n" + ] + } + ], + "source": [ + "print(np.mean(data_dict[\"chain_length\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "ds_clean = Dataset.from_dict(data_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['content', 'license', 'path', 'repo_name', 'chain_length'],\n", + " num_rows: 1045605\n", + "})" + ] + }, + "execution_count": 103, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_clean" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5bb9a7b00fcf445a9ba840fa68f8f816", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Pushing dataset shards to the dataset hub: 0%| | 0/23 [00:00