diff --git a/README.md b/README.md
index 8b7e0fa..ecdbd1d 100644
--- a/README.md
+++ b/README.md
@@ -21,4 +21,5 @@ necessary used for model training.
   - Filters for GitHub Issues
   - Filters for Git Commits
   - Script to convert Jupyter notebooks to scripts
+  - Scripts to convert Jupyter notebooks to structured markdown-code-output triplets
 - `decontamination`: script to remove files that match test-samples from code generation benchmarks.
diff --git a/preprocessing/README.md b/preprocessing/README.md
index a768928..dce1f39 100644
--- a/preprocessing/README.md
+++ b/preprocessing/README.md
@@ -40,3 +40,19 @@ We release the Jupyter scripts dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata).
 ```
 python jupyter_script_conversion.py
 ```
+# Creating the Jupyter-structured dataset
+
+## Step 1
+Parse Jupyter notebooks from The Stack.
+```
+python jupyter-structured/jupyter-segment-notebooks.py
+```
+
+## Step 2
+Generate markdown-code-output triplets.
+```
+python jupyter-structured/jupyter-generate-triplets.py
+```
+
+## Step 3
+Create the notebook-level structured dataset using `jupyter-structured/jupyter-structured.ipynb`.
diff --git a/preprocessing/jupyter-structured/jupyter-generate-triplets.py b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
new file mode 100644
index 0000000..79de712
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
@@ -0,0 +1,71 @@
+import json
+import re
+import sys
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+def clean_markdown(text):
+    # strip HTML tags, newlines and Markdown heading markers
+    text = re.sub(r'<.*?>', '', text)
+    text = re.sub(r'\n+', '', text)
+    text = text.replace('#', '')
+    return text
+
+
+def parse_data(ds):
+    """Parse segmented notebooks into markdown-code-output triplets"""
+
+    for notebook in tqdm(ds):
+
+        types = notebook["cell_types"]
+        cells = notebook["cells"]
+
+        if len(types) > 0 and types[0] == "code":
+            # drop a leading code group so the notebook starts with markdown
+            cells = cells[1:]
+            types = types[1:]
+        # alternative: drop the first two groups (markdown followed by code);
+        # the first markdown cell of a notebook is often a long description
+        # of the whole notebook:
+        # cells = notebook["cells"][2:]
+        # types = notebook["cell_types"][2:]
+
+        if len(types) > 0 and types[-1] == "markdown":
+            # drop a trailing markdown group so the notebook ends with code
+            cells = cells[:-1]
+            types = types[:-1]
+
+        if len(cells) % 2 == 0:
+            # an even number of groups means alternating markdown/code pairs
+            inner_markdowns = [cells[j] for j in range(len(cells)) if j % 2 == 0]
+            inner_code_snippets = [cells[j + 1] for j in range(len(cells) - 1) if j % 2 == 0]
+
+            for markdown_block, code_snippet in zip(inner_markdowns, inner_code_snippets):
+                markdown_block = ' '.join([clean_markdown(block[0]) for block in markdown_block])
+                code = '\n'.join([snippet[0] for snippet in code_snippet])
+                # keep only the output of the last code cell in the group
+                output = code_snippet[-1][1]
+
+                line = {'markdown': markdown_block,
+                        'code': code,
+                        'output': output,
+                        'license': notebook['max_issues_repo_licenses'][0],
+                        'path': notebook['max_stars_repo_path'],
+                        'repo_name': notebook['max_stars_repo_name'],
+                        }
+                yield line
+
+
+if __name__ == "__main__":
+    file = sys.argv[1]
+
+    dataset = load_dataset("bigcode/jupyter-parsed", split="train")
+    with open(file, 'w') as out:
+        for line in parse_data(dataset):
+            out.write(json.dumps(line) + '\n')
+
+    dataset = load_dataset('json', data_files=file)
+
+    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")
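To make the triplet extraction concrete, here is a minimal sketch of what `parse_data` consumes and yields. The toy record below is made up for illustration; its field names mirror the segmented dataset, and `parse_data` is assumed to be in scope (the file's hyphenated name prevents a plain `import`, so e.g. paste the script into a REPL):

```python
# One segmented notebook record: alternating markdown/code cell groups,
# where every cell is a [source, output] pair (toy values, not real data).
record = {
    "cell_types": ["markdown", "code"],
    "cells": [
        # markdown group: a single cell with the no-output placeholder
        [["# Train a model", "_____no_output_____"]],
        # code group: two cells; only the last cell's output is kept
        [["import numpy as np", "_____no_output_____"],
         ["print(np.zeros(2))", "[0. 0.]\n"]],
    ],
    "max_issues_repo_licenses": ["mit"],
    "max_stars_repo_path": "examples/train.ipynb",
    "max_stars_repo_name": "user/repo",
}

for triplet in parse_data([record]):
    print(triplet["markdown"])  # ' Train a model' (heading marker stripped)
    print(triplet["code"])      # 'import numpy as np\nprint(np.zeros(2))'
    print(triplet["output"])    # last cell's output: '[0. 0.]\n'
```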
diff --git a/preprocessing/jupyter-structured/jupyter-segment-notebooks.py b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py
new file mode 100644
index 0000000..2e9f4e2
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py
@@ -0,0 +1,65 @@
+import json
+import itertools
+
+from datasets import load_dataset
+
+
+def segment_blocks(content):
+    # collect [source, output] pairs and cell types for all non-empty cells
+    cells = []
+    cell_types = []
+    for cell in content['cells']:
+        if len(cell['source']) > 0:
+            output = '_____no_output_____'
+            if 'outputs' in cell.keys():
+                if len(cell['outputs']) > 0:
+                    if 'text' in cell['outputs'][0].keys():
+                        output = cell['outputs'][0]['text']
+            cells.append([''.join(cell['source']), ''.join(output)])
+            cell_types.append(cell['cell_type'])
+    return cells, cell_types
+
+
+def segment(batch):
+    try:
+        content = json.loads(batch['content'])
+        if 'py' in json.dumps(content['metadata']):
+            cells, types = segment_blocks(content)
+
+            # group consecutive cells of the same type
+            cell_type_groups = [list(g) for k, g in itertools.groupby(types)]
+            cell_types = [k for k, g in itertools.groupby(types)]
+            cell_groups = []
+
+            group_start = 0
+            for g in cell_type_groups:
+                cell_groups.append(cells[group_start:group_start + len(g)])
+                group_start += len(g)
+
+            batch['cells'] = cell_groups
+            batch['cell_types'] = cell_types
+            batch['cell_type_groups'] = cell_type_groups
+        else:
+            # not a Python notebook: mark with placeholders
+            batch['cells'] = [[['empty']]]
+            batch['cell_types'] = ['empty']
+            batch['cell_type_groups'] = [['empty']]
+    except Exception:
+        # unparseable notebook: mark with placeholders
+        batch['cells'] = [[['empty']]]
+        batch['cell_types'] = ['empty']
+        batch['cell_type_groups'] = [['empty']]
+
+    del batch['content']
+    return batch
+
+
+if __name__ == "__main__":
+    # load the Jupyter notebooks from The Stack
+    dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train", use_auth_token=True)
+    # segment notebooks into groups of consecutive same-type cells
+    dataset = dataset.map(segment)
+    # filter out erroneous notebooks via the placeholders
+    dataset = dataset.filter(lambda entry: entry['cell_types'] != ['empty'])
+    # push to hub
+    dataset.push_to_hub("bigcode/jupyter-parsed")
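As a quick sanity check of the segmentation above, the sketch below runs one synthetic notebook through `segment`. The toy notebook and the repo-root-relative path are assumptions for illustration; the script is loaded via `importlib` because its hyphenated file name cannot be imported directly:

```python
import importlib.util
import json

# load the script despite its hyphenated (non-importable) file name;
# the path assumes the sketch is run from the repository root
spec = importlib.util.spec_from_file_location(
    "segmenter", "preprocessing/jupyter-structured/jupyter-segment-notebooks.py")
segmenter = importlib.util.module_from_spec(spec)
spec.loader.exec_module(segmenter)

# toy notebook JSON shaped like the `content` column of the-stack
toy_notebook = {
    "metadata": {"kernelspec": {"language": "python"}},
    "cells": [
        {"cell_type": "markdown", "source": ["# Load data"]},
        {"cell_type": "code", "source": ["import pandas as pd"], "outputs": []},
        {"cell_type": "code", "source": ["print(1 + 1)"], "outputs": [{"text": "2\n"}]},
    ],
}

batch = segmenter.segment({"content": json.dumps(toy_notebook)})
print(batch["cell_types"])      # ['markdown', 'code']: consecutive cells grouped by type
print(len(batch["cells"][1]))   # 2 code cells in the second group
print(batch["cells"][1][1][1])  # output of the last code cell: '2\n'
```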
diff --git a/preprocessing/jupyter-structured/jupyter-structured.ipynb b/preprocessing/jupyter-structured/jupyter-structured.ipynb
new file mode 100644
index 0000000..2bda98d
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-structured.ipynb
@@ -0,0 +1,1036 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, Dataset\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e5ec42c4eaf0494ea715e21510dc0b99",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading readme:   0%|          | 0.00/558 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "ds = load_dataset(\"bigcode/jupyter-code-text-pairs\", split=\"train\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_content(markdown, code, output):\n",
+    "    if len(output) > 1000:\n",
+    "        output_str = output[:1000] + \"[...]\"\n",
+    "    elif output == \"_____no_output_____\":\n",
+    "        output_str = \"\"\n",
+    "    else:\n",
+    "        output_str = output\n",
+    "    \n",
+    "    content = f\"<jupyter_text>{markdown.strip()}<jupyter_code>{code.strip()}<jupyter_output>{output_str.strip()}\"\n",
+    "    return content\n",
+    "\n",
+    "current_repo_path = ds[0][\"repo_name\"] + \"/\" + ds[0][\"path\"]\n",
+    "current_content = \"\"\n",
+    "current_chain_length = 0\n",
+    "\n",
+    "data_dict = {\"content\": [], \"license\": [ds[0][\"license\"]], \"path\": [ds[0][\"path\"]], \"repo_name\": [ds[0][\"repo_name\"]], \"chain_length\": []}\n",
+    "\n",
+    "for i in tqdm(range(int(len(ds)/1_000)+1)):\n",
+    "    end_index = min((i+1)*1000, len(ds))\n",
+    "    element = ds[i*1000:end_index]\n",
+    "    for j in range(min(1_000, end_index-i*1000)):\n",
+    "        repo_path = element[\"repo_name\"][j] + \"/\" + element[\"path\"][j]\n",
+    "        \n",
+    "        if repo_path != current_repo_path:\n",
+    "            data_dict[\"content\"].append(current_content)\n",
+    "            data_dict[\"chain_length\"].append(current_chain_length)\n",
+    "            data_dict[\"license\"].append(element[\"license\"][j])\n",
+    "            data_dict[\"path\"].append(element[\"path\"][j])\n",
+    "            data_dict[\"repo_name\"].append(element[\"repo_name\"][j])\n",
+    "\n",
+    "            current_repo_path = repo_path\n",
+    "            current_content = \"\"\n",
+    "            current_chain_length = 0\n",
+    "        \n",
+    "        current_content += build_content(element[\"markdown\"][j], element[\"code\"][j], element[\"output\"][j])\n",
+    "        current_chain_length += 1\n",
+    "\n",
+    "data_dict[\"content\"].append(current_content)\n",
+    "data_dict[\"chain_length\"].append(current_chain_length)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1045605"
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(data_dict[\"content\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11.187439308"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum([len(content) for content in data_dict[\"content\"]])/1000/1000/1000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8.900101854906968\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(np.mean(data_dict[\"chain_length\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_clean = Dataset.from_dict(data_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['content', 'license', 'path', 'repo_name', 'chain_length'],\n",
+       "    num_rows: 1045605\n",
+       "})"
+      ]
+     },
+     "execution_count": 103,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds_clean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5bb9a7b00fcf445a9ba840fa68f8f816",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Pushing dataset shards to the dataset hub:   0%|          | 0/23 [00:00<?, ?it/s]"
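To make the Step 3 rendering concrete outside the notebook, here is a compact standalone sketch: each markdown-code-output triplet is serialized with separator tokens and consecutive triplets from the same notebook are concatenated into one training document. The `<jupyter_text>`/`<jupyter_code>`/`<jupyter_output>` token names and the toy triplet values are assumptions based on the StarCoder notebook format:

```python
def build_content(markdown, code, output):
    # mirror of the notebook's helper: drop the no-output placeholder,
    # truncate very long outputs, and join the fields with separator tokens
    if output == "_____no_output_____":
        output = ""
    elif len(output) > 1000:
        output = output[:1000] + "[...]"
    return f"<jupyter_text>{markdown.strip()}<jupyter_code>{code.strip()}<jupyter_output>{output.strip()}"

# toy triplets, shaped like rows of the markdown-code-output pairs dataset
triplets = [
    {"markdown": "Load data", "code": "import pandas as pd", "output": "_____no_output_____"},
    {"markdown": "Peek at it", "code": "print(1 + 1)", "output": "2\n"},
]

document = "".join(build_content(t["markdown"], t["code"], t["output"]) for t in triplets)
print(document)
# <jupyter_text>Load data<jupyter_code>import pandas as pd<jupyter_output><jupyter_text>Peek at it<jupyter_code>print(1 + 1)<jupyter_output>2
```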