Skip to content

Commit

Permalink
Merge pull request #51 from lingjzhu/main
Browse files Browse the repository at this point in the history
add jupyter-structured scripts
  • Loading branch information
loubnabnl authored May 12, 2023
2 parents e806727 + 73793e4 commit 6c269c7
Show file tree
Hide file tree
Showing 5 changed files with 1,189 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,5 @@ necessary used for model training.
- Filters for GitHub Issues
- Filters for Git Commits
- Script to convert Jupyter notebooks to scripts
- Scripts to convert Jupyter notebooks to structured markdown-code-output triplets
- `decontamination`: script to remove files that match test-samples from code generation benchmarks.
19 changes: 19 additions & 0 deletions preprocessing/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,22 @@ We release the Jupyter scripts dataset as part of [StarCoderData](https://huggin
python jupyter_script_conversion.py
```

# Creating Jupyter-structured dataset

## Step 1
Parse Jupyter notebooks from `the Stack`.
```
python jupyter-structured/jupyter-segment-notebooks.py
```

## Step 2
Generate markdown-code-output triplets.
```
python jupyter-structured/jupyter-generate-triplets.py
```

## Step 3
Create notebook-level structured dataset using `jupyter-structured/jupyter-structured.ipynb`.



68 changes: 68 additions & 0 deletions preprocessing/jupyter-structured/jupyter-generate-triplets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
import json
import itertools
from datasets import load_dataset, load_from_disk, Dataset
import re
from tqdm import tqdm
import sys

def clean_markdown(text):
    """Normalize a markdown cell's source for the triplet dataset.

    Strips HTML tags, collapses away newlines, and removes heading
    markers ('#') so the text is a single flat line.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    without_newlines = re.sub(r'\n+', '', without_tags)
    return without_newlines.replace('#', '')


def parse_data(ds):
    """Yield markdown-code-output triplets from segmented notebooks.

    Each *notebook* in ``ds`` carries alternating groups of cells in
    ``notebook["cells"]`` with matching group types in
    ``notebook["cell_types"]``.  A notebook is trimmed so it starts with a
    markdown group and ends with a code group; notebooks that do not then
    contain an even number of groups are skipped.
    """
    for notebook in tqdm(ds):

        types = notebook["cell_types"]
        cells = notebook["cells"]

        # Drop a leading code group so every notebook opens with markdown.
        if types and types[0] == "code":
            cells, types = cells[1:], types[1:]

        # Drop a trailing markdown group so every notebook ends with code.
        if types and types[-1] == 'markdown':
            cells, types = cells[:-1], types[:-1]

        # After trimming, groups must strictly alternate markdown/code.
        if len(cells) % 2 != 0:
            continue

        # Even positions are markdown groups, odd positions code groups.
        for markdown_block, code_snippet in zip(cells[0::2], cells[1::2]):
            markdown_text = ' '.join(clean_markdown(block[0]) for block in markdown_block)
            code = '\n'.join(snippet[0] for snippet in code_snippet)
            # Keep only the output of the last code cell in the group.
            output = code_snippet[-1][1]

            yield {'markdown': markdown_text,
                   'code': code,
                   'output': output,
                   'license': notebook['max_issues_repo_licenses'][0],
                   'path': notebook['max_stars_repo_path'],
                   'repo_name': notebook['max_stars_repo_name'],
                   }


if __name__ == "__main__":
    # Output path for the intermediate JSON-lines file.
    file = sys.argv[1]

    dataset = load_dataset("bigcode/jupyter-parsed")
    with open(file, 'w') as out:
        # BUG FIX: was `parse_data(data)` — `data` is undefined (NameError);
        # the loaded dataset is bound to `dataset`.
        # NOTE(review): load_dataset returns a DatasetDict; parse_data likely
        # expects a split (e.g. split="train") — confirm against the hub repo.
        for line in parse_data(dataset):
            out.write(json.dumps(line) + '\n')

    # BUG FIX: was `ata_files=file` — typo for the `data_files` keyword,
    # which would raise a TypeError.
    dataset = load_dataset('json', data_files=file)

    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")
65 changes: 65 additions & 0 deletions preprocessing/jupyter-structured/jupyter-segment-notebooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import json
import itertools
from datasets import load_dataset


def segment_blocks(content):
    """Flatten a parsed notebook into parallel lists.

    Returns ``(cells, cell_types)`` where each entry of *cells* is a
    ``[source, output]`` pair of strings and *cell_types* holds the
    matching cell type.  Cells with an empty source are skipped; cells
    without a usable text output get the ``'_____no_output_____'``
    placeholder.
    """
    cells, cell_types = [], []
    for cell in content['cells']:
        if not cell['source']:
            continue
        output = '_____no_output_____'
        outputs = cell.get('outputs') or []
        # Only the first output is inspected, and only its 'text' field.
        if outputs and 'text' in outputs[0]:
            output = outputs[0]['text']
        cells.append([''.join(cell['source']), ''.join(output)])
        cell_types.append(cell['cell_type'])
    return cells, cell_types


def segment(batch):
    """Map function: parse one raw notebook row into grouped cells.

    Parses ``batch['content']`` as notebook JSON, keeps notebooks whose
    metadata mentions "py" (a crude Python-kernel filter), and groups
    consecutive cells of the same type.  Rows that fail to parse or are
    filtered out get the ``'empty'`` placeholder values, which a later
    ``filter`` step removes.  The raw ``content`` field is dropped from
    every row.
    """
    # Placeholder assigned to filtered-out / unparseable rows.
    empty_cells, empty_types, empty_groups = [[['empty']]], ['empty'], [['empty']]

    try:
        content = json.loads(batch['content'])
        # Crude language filter: notebook metadata usually names the kernel
        # (e.g. "python3"), so "py" in the serialized metadata keeps Python.
        if 'py' in json.dumps(content['metadata']):
            cells, types = segment_blocks(content)

            # Group consecutive cells of the same type (markdown runs, code runs).
            cell_type_groups = [list(g) for k, g in itertools.groupby(types)]
            cell_types = [k for k, g in itertools.groupby(types)]

            # Slice `cells` into chunks matching each type group.
            cell_groups = []
            group_start = 0
            for g in cell_type_groups:
                cell_groups.append(cells[group_start:group_start + len(g)])
                group_start += len(g)

            batch['cells'] = cell_groups
            batch['cell_types'] = cell_types
            batch['cell_type_groups'] = cell_type_groups
        else:
            batch['cells'] = empty_cells
            batch['cell_types'] = empty_types
            batch['cell_type_groups'] = empty_groups

    # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
    # and SystemExit.  Malformed JSON / missing keys are still treated as
    # best-effort placeholders.
    except Exception:
        batch['cells'] = empty_cells
        batch['cell_types'] = empty_types
        batch['cell_type_groups'] = empty_groups

    del batch['content']
    return batch


if __name__ == "__main__":

    # Load the Jupyter-notebook subset of The Stack (requires HF auth).
    dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train", use_auth_token=True)

    # Segment every notebook, then drop rows marked with the 'empty'
    # placeholder (unparseable or non-Python notebooks).
    dataset = dataset.map(segment).filter(lambda entry: entry['cell_types'] != ['empty'])

    # Publish the parsed dataset.
    dataset.push_to_hub("bigcode/jupyter-parsed")
Loading

0 comments on commit 6c269c7

Please sign in to comment.