Skip to content

Commit

Permalink
Merge pull request #51 from lingjzhu/main
Browse files Browse the repository at this point in the history
add jupyter-structured scripts
  • Loading branch information
loubnabnl authored May 12, 2023
2 parents e806727 + 73793e4 commit 6c269c7
Show file tree
Hide file tree
Showing 5 changed files with 1,189 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ necessary used for model training.
- Filters for GitHub Issues
- Filters for Git Commits
- Script to convert Jupyter notebooks to scripts
- Scripts to convert Jupyter notebooks to structured markdown-code-output triplets
- `decontamination`: script to remove files that match test-samples from code generation benchmarks.
19 changes: 19 additions & 0 deletions preprocessing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,22 @@ We release the Jupyter scripts dataset as part of [StarCoderData](https://huggin
python jupyter_script_conversion.py
```

# Creating Jupyter-structured dataset

## Step 1
Parse Jupyter notebooks from `the Stack`.
```
python jupyter-structured/jupyter-segment-notebooks.py
```

## Step 2
Generate markdown-code-output triplets.
```
python jupyter-structured/jupyter-generate-triplets.py
```

## Step 3
Create notebook-level structured dataset using `jupyter-structured/jupyter-structured.ipynb`.



68 changes: 68 additions & 0 deletions preprocessing/jupyter-structured/jupyter-generate-triplets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
import itertools
from datasets import load_dataset, load_from_disk, Dataset
import re
from tqdm import tqdm
import sys

def clean_markdown(text):
    """Normalize a markdown cell's source for the triplet dataset.

    Strips HTML tags, collapses away newlines, and removes heading
    markers ('#') so the text is a single flat line.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    without_newlines = re.sub(r'\n+', '', without_tags)
    return without_newlines.replace('#', '')


def parse_data(ds):
    """Yield markdown-code-output triplets from segmented notebooks.

    Each *notebook* in ``ds`` carries alternating groups of cells in
    ``notebook["cells"]`` with matching group types in
    ``notebook["cell_types"]``.  A notebook is trimmed so it starts with a
    markdown group and ends with a code group; notebooks that do not then
    contain an even number of groups are skipped.
    """
    for notebook in tqdm(ds):

        types = notebook["cell_types"]
        cells = notebook["cells"]

        # Drop a leading code group so every notebook opens with markdown.
        if types and types[0] == "code":
            cells, types = cells[1:], types[1:]

        # Drop a trailing markdown group so every notebook ends with code.
        if types and types[-1] == 'markdown':
            cells, types = cells[:-1], types[:-1]

        # After trimming, groups must strictly alternate markdown/code.
        if len(cells) % 2 != 0:
            continue

        # Even positions are markdown groups, odd positions code groups.
        for markdown_block, code_snippet in zip(cells[0::2], cells[1::2]):
            markdown_text = ' '.join(clean_markdown(block[0]) for block in markdown_block)
            code = '\n'.join(snippet[0] for snippet in code_snippet)
            # Keep only the output of the last code cell in the group.
            output = code_snippet[-1][1]

            yield {'markdown': markdown_text,
                   'code': code,
                   'output': output,
                   'license': notebook['max_issues_repo_licenses'][0],
                   'path': notebook['max_stars_repo_path'],
                   'repo_name': notebook['max_stars_repo_name'],
                   }


if __name__ == "__main__":
    # Output path for the intermediate JSON-lines file.
    file = sys.argv[1]

    dataset = load_dataset("bigcode/jupyter-parsed")
    with open(file, 'w') as out:
        # BUG FIX: was `parse_data(data)` — `data` is undefined (NameError);
        # the loaded dataset is bound to `dataset`.
        # NOTE(review): load_dataset returns a DatasetDict; parse_data likely
        # expects a split (e.g. split="train") — confirm against the hub repo.
        for line in parse_data(dataset):
            out.write(json.dumps(line) + '\n')

    # BUG FIX: was `ata_files=file` — typo for the `data_files` keyword,
    # which would raise a TypeError.
    dataset = load_dataset('json', data_files=file)

    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")
65 changes: 65 additions & 0 deletions preprocessing/jupyter-structured/jupyter-segment-notebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import json
import itertools
from datasets import load_dataset


def segment_blocks(content):
    """Flatten a parsed notebook into parallel lists.

    Returns ``(cells, cell_types)`` where each entry of *cells* is a
    ``[source, output]`` pair of strings and *cell_types* holds the
    matching cell type.  Cells with an empty source are skipped; cells
    without a usable text output get the ``'_____no_output_____'``
    placeholder.
    """
    cells, cell_types = [], []
    for cell in content['cells']:
        if not cell['source']:
            continue
        output = '_____no_output_____'
        outputs = cell.get('outputs') or []
        # Only the first output is inspected, and only its 'text' field.
        if outputs and 'text' in outputs[0]:
            output = outputs[0]['text']
        cells.append([''.join(cell['source']), ''.join(output)])
        cell_types.append(cell['cell_type'])
    return cells, cell_types


def segment(batch):
    """Map function: parse one raw notebook row into grouped cells.

    Parses ``batch['content']`` as notebook JSON, keeps notebooks whose
    metadata mentions "py" (a crude Python-kernel filter), and groups
    consecutive cells of the same type.  Rows that fail to parse or are
    filtered out get the ``'empty'`` placeholder values, which a later
    ``filter`` step removes.  The raw ``content`` field is dropped from
    every row.
    """
    # Placeholder assigned to filtered-out / unparseable rows.
    empty_cells, empty_types, empty_groups = [[['empty']]], ['empty'], [['empty']]

    try:
        content = json.loads(batch['content'])
        # Crude language filter: notebook metadata usually names the kernel
        # (e.g. "python3"), so "py" in the serialized metadata keeps Python.
        if 'py' in json.dumps(content['metadata']):
            cells, types = segment_blocks(content)

            # Group consecutive cells of the same type (markdown runs, code runs).
            cell_type_groups = [list(g) for k, g in itertools.groupby(types)]
            cell_types = [k for k, g in itertools.groupby(types)]

            # Slice `cells` into chunks matching each type group.
            cell_groups = []
            group_start = 0
            for g in cell_type_groups:
                cell_groups.append(cells[group_start:group_start + len(g)])
                group_start += len(g)

            batch['cells'] = cell_groups
            batch['cell_types'] = cell_types
            batch['cell_type_groups'] = cell_type_groups
        else:
            batch['cells'] = empty_cells
            batch['cell_types'] = empty_types
            batch['cell_type_groups'] = empty_groups

    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit.  Malformed JSON / missing keys are still treated as
    # best-effort placeholders.
    except Exception:
        batch['cells'] = empty_cells
        batch['cell_types'] = empty_types
        batch['cell_type_groups'] = empty_groups

    del batch['content']
    return batch


if __name__ == "__main__":

    # Load the Jupyter-notebook subset of The Stack (requires HF auth).
    dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train", use_auth_token=True)

    # Segment every notebook, then drop rows marked with the 'empty'
    # placeholder (unparseable or non-Python notebooks).
    dataset = dataset.map(segment).filter(lambda entry: entry['cell_types'] != ['empty'])

    # Publish the parsed dataset.
    dataset.push_to_hub("bigcode/jupyter-parsed")
Loading

0 comments on commit 6c269c7

Please sign in to comment.