+
+ |
+ import json |
+
+
+ |
+ import itertools |
+
+
+ |
+ from datasets import load_dataset, load_from_disk, Dataset |
+
+
+ |
+ import re |
+
+
+ |
+ from tqdm import tqdm |
+
+
+ |
+ import sys |
+
+
+ |
+
+ |
+
+
+ |
+ def clean_markdown(text): |
+
+
+ |
+ text = re.sub(r'<.*?>','',text) |
+
+
+ |
+ text = re.sub(r'\n+','',text) |
+
+
+ |
+ text = text.replace('#','') |
+
+
+ |
+ return text |
+
+
+ |
+
+ |
+
+
+ |
+
+ |
+
+
+ |
+ def parse_data(ds): |
+
+
+ |
+ """Parse data into markdown-code pairs""" |
+
+
+ |
+ |
+
+
+ |
+ for notebook in tqdm(ds): |
+
+
+ |
+ |
+
+
+ |
+ types = notebook["cell_types"] |
+
+
+ |
+ cells = notebook["cells"] |
+
+
+ |
+ |
+
+
+ |
+ if len(types)>0: |
+
+
+ |
+ if types[0] == "code": |
+
+
+ |
+ |
+
+
+ |
+ cells = cells[1:] |
+
+
+ |
+ types = types[1:] |
+
+
+ |
+ |
+
+
+ |
+ |
+
+
+ |
+ |
+
+
+ |
+ |
+
+
+ |
+ |
+
+
+ |
+ if len(types)>0: |
+
+
+ |
+ if types[-1] == 'markdown': |
+
+
+ |
+ cells = cells[:-1] |
+
+
+ |
+ types = types[:-1] |
+
+
+ |
+
+ |
+
+
+ |
+ if len(cells) % 2 == 0: |
+
+
+ |
+ inner_markdowns = [cells[j] for j in range(len(cells)) if j % 2 == 0] |
+
+
+ |
+ inner_code_snippets = [cells[j+1] for j in range(len(cells) - 1) if j % 2 == 0] |
+
+
+ |
+
+ |
+
+
+ |
+
+ |
+
+
+ |
+ for markdown_block, code_snippet in zip(inner_markdowns,inner_code_snippets): |
+
+
+ |
+ markdown_block = ' '.join([clean_markdown(block[0]) for block in markdown_block]) |
+
+
+ |
+ code = '\n'.join([snippet[0] for snippet in code_snippet]) |
+
+
+ |
+ output = [snippet[1] for snippet in code_snippet][-1] |
+
+
+ |
+
+ |
+
+
+ |
+ line = {'markdown':markdown_block, |
+
+
+ |
+ 'code':code, |
+
+
+ |
+ 'output':output, |
+
+
+ |
+ 'license':notebook['max_issues_repo_licenses'][0], |
+
+
+ |
+ 'path':notebook['max_stars_repo_path'], |
+
+
+ |
+ 'repo_name':notebook['max_stars_repo_name'], |
+
+
+ |
+ } |
+
+
+ |
+ yield line |
+
+
+ |
+
+ |
+
+
+ |
+ |
+
+
+ |
+ if __name__ == "__main__": |
+
+
+ |
+ file = sys.argv[1] |
+
+
+ |
+ |
+
+
+ |
+ dataset = load_dataset("bigcode/jupyter-parsed") |
+
+
+ |
+ with open(file,'w') as out: |
+
+
+ |
+ for line in parse_data(data): |
+
+
+ |
+ out.write(json.dumps(line)+'\n') |
+
+
+ |
+
+ |
+
+
+ |
+ dataset = load_dataset('json',ata_files=file) |
+
+
+ |
+ |
+
+
+ |
+ dataset.push_to_hub("bigcode/jupyter-code-text-pairs") |
+