diff --git a/README.md b/README.md
index 8b7e0fa..ecdbd1d 100644
--- a/README.md
+++ b/README.md
@@ -21,4 +21,5 @@ necessary used for model training.
   - Filters for GitHub Issues
   - Filters for Git Commits
   - Script to convert Jupyter notebooks to scripts
+  - Scripts to convert Jupyter notebooks to structured markdown-code-output triplets
 - `decontamination`: script to remove files that match test-samples from code generation benchmarks.
diff --git a/preprocessing/README.md b/preprocessing/README.md
index a768928..dce1f39 100644
--- a/preprocessing/README.md
+++ b/preprocessing/README.md
@@ -40,3 +40,19 @@ We release the Jupyter scripts dataset as part of [StarCoderData](https://huggingface.co/datasets/bigcode/starcoderdata).
 ```
 python jupyter_script_conversion.py
 ```
+# Creating the Jupyter-structured dataset
+
+## Step 1
+Parse Jupyter notebooks from The Stack.
+```
+python jupyter-structured/jupyter-segment-notebooks.py
+```
+
+## Step 2
+Generate markdown-code-output triplets.
+```
+python jupyter-structured/jupyter-generate-triplets.py
+```
+
+## Step 3
+Create the notebook-level structured dataset using `jupyter-structured/jupyter-structured.ipynb`.
diff --git a/preprocessing/jupyter-structured/jupyter-generate-triplets.py b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
new file mode 100644
index 0000000..79de712
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-generate-triplets.py
@@ -0,0 +1,71 @@
+import json
+import re
+import sys
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+
+def clean_markdown(text):
+    # strip HTML tags, newlines and Markdown heading markers
+    text = re.sub(r'<.*?>', '', text)
+    text = re.sub(r'\n+', '', text)
+    text = text.replace('#', '')
+    return text
+
+
+def parse_data(ds):
+    """Parse segmented notebooks into markdown-code-output triplets"""
+
+    for notebook in tqdm(ds):
+
+        types = notebook["cell_types"]
+        cells = notebook["cells"]
+
+        if len(types) > 0 and types[0] == "code":
+            # drop a leading code group so the notebook starts with markdown
+            cells = cells[1:]
+            types = types[1:]
+        # alternative: drop the first two groups (markdown followed by code);
+        # the first markdown cell of a notebook is often a long description
+        # of the whole notebook:
+        # cells = notebook["cells"][2:]
+        # types = notebook["cell_types"][2:]
+
+        if len(types) > 0 and types[-1] == "markdown":
+            # drop a trailing markdown group so the notebook ends with code
+            cells = cells[:-1]
+            types = types[:-1]
+
+        if len(cells) % 2 == 0:
+            # an even number of groups means alternating markdown/code pairs
+            inner_markdowns = [cells[j] for j in range(len(cells)) if j % 2 == 0]
+            inner_code_snippets = [cells[j + 1] for j in range(len(cells) - 1) if j % 2 == 0]
+
+            for markdown_block, code_snippet in zip(inner_markdowns, inner_code_snippets):
+                markdown_block = ' '.join([clean_markdown(block[0]) for block in markdown_block])
+                code = '\n'.join([snippet[0] for snippet in code_snippet])
+                # keep only the output of the last code cell in the group
+                output = code_snippet[-1][1]
+
+                line = {'markdown': markdown_block,
+                        'code': code,
+                        'output': output,
+                        'license': notebook['max_issues_repo_licenses'][0],
+                        'path': notebook['max_stars_repo_path'],
+                        'repo_name': notebook['max_stars_repo_name'],
+                        }
+                yield line
+
+
+if __name__ == "__main__":
+    file = sys.argv[1]
+
+    dataset = load_dataset("bigcode/jupyter-parsed", split="train")
+    with open(file, 'w') as out:
+        for line in parse_data(dataset):
+            out.write(json.dumps(line) + '\n')
+
+    dataset = load_dataset('json', data_files=file)
+
+    dataset.push_to_hub("bigcode/jupyter-code-text-pairs")
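To make the triplet extraction concrete, here is a minimal sketch of what `parse_data` consumes and yields. The toy record below is made up for illustration; its field names mirror the segmented dataset, and `parse_data` is assumed to be in scope (the file's hyphenated name prevents a plain `import`, so e.g. paste the script into a REPL):

```python
# One segmented notebook record: alternating markdown/code cell groups,
# where every cell is a [source, output] pair (toy values, not real data).
record = {
    "cell_types": ["markdown", "code"],
    "cells": [
        # markdown group: a single cell with the no-output placeholder
        [["# Train a model", "_____no_output_____"]],
        # code group: two cells; only the last cell's output is kept
        [["import numpy as np", "_____no_output_____"],
         ["print(np.zeros(2))", "[0. 0.]\n"]],
    ],
    "max_issues_repo_licenses": ["mit"],
    "max_stars_repo_path": "examples/train.ipynb",
    "max_stars_repo_name": "user/repo",
}

for triplet in parse_data([record]):
    print(triplet["markdown"])  # ' Train a model' (heading marker stripped)
    print(triplet["code"])      # 'import numpy as np\nprint(np.zeros(2))'
    print(triplet["output"])    # last cell's output: '[0. 0.]\n'
```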
diff --git a/preprocessing/jupyter-structured/jupyter-segment-notebooks.py b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py
new file mode 100644
index 0000000..2e9f4e2
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-segment-notebooks.py
@@ -0,0 +1,65 @@
+import json
+import itertools
+
+from datasets import load_dataset
+
+
+def segment_blocks(content):
+    # collect [source, output] pairs and cell types for all non-empty cells
+    cells = []
+    cell_types = []
+    for cell in content['cells']:
+        if len(cell['source']) > 0:
+            output = '_____no_output_____'
+            if 'outputs' in cell.keys():
+                if len(cell['outputs']) > 0:
+                    if 'text' in cell['outputs'][0].keys():
+                        output = cell['outputs'][0]['text']
+            cells.append([''.join(cell['source']), ''.join(output)])
+            cell_types.append(cell['cell_type'])
+    return cells, cell_types
+
+
+def segment(batch):
+    try:
+        content = json.loads(batch['content'])
+        if 'py' in json.dumps(content['metadata']):
+            cells, types = segment_blocks(content)
+
+            # group consecutive cells of the same type
+            cell_type_groups = [list(g) for k, g in itertools.groupby(types)]
+            cell_types = [k for k, g in itertools.groupby(types)]
+            cell_groups = []
+
+            group_start = 0
+            for g in cell_type_groups:
+                cell_groups.append(cells[group_start:group_start + len(g)])
+                group_start += len(g)
+
+            batch['cells'] = cell_groups
+            batch['cell_types'] = cell_types
+            batch['cell_type_groups'] = cell_type_groups
+        else:
+            # not a Python notebook: mark with placeholders
+            batch['cells'] = [[['empty']]]
+            batch['cell_types'] = ['empty']
+            batch['cell_type_groups'] = [['empty']]
+    except Exception:
+        # unparseable notebook: mark with placeholders
+        batch['cells'] = [[['empty']]]
+        batch['cell_types'] = ['empty']
+        batch['cell_type_groups'] = [['empty']]
+
+    del batch['content']
+    return batch
+
+
+if __name__ == "__main__":
+    # load the Jupyter notebooks from The Stack
+    dataset = load_dataset("bigcode/the-stack", data_dir="data/jupyter-notebook", split="train", use_auth_token=True)
+    # segment notebooks into groups of consecutive same-type cells
+    dataset = dataset.map(segment)
+    # filter out erroneous notebooks via the placeholders
+    dataset = dataset.filter(lambda entry: entry['cell_types'] != ['empty'])
+    # push to hub
+    dataset.push_to_hub("bigcode/jupyter-parsed")
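As a quick sanity check of the segmentation above, the sketch below runs one synthetic notebook through `segment`. The toy notebook and the repo-root-relative path are assumptions for illustration; the script is loaded via `importlib` because its hyphenated file name cannot be imported directly:

```python
import importlib.util
import json

# load the script despite its hyphenated (non-importable) file name;
# the path assumes the sketch is run from the repository root
spec = importlib.util.spec_from_file_location(
    "segmenter", "preprocessing/jupyter-structured/jupyter-segment-notebooks.py")
segmenter = importlib.util.module_from_spec(spec)
spec.loader.exec_module(segmenter)

# toy notebook JSON shaped like the `content` column of the-stack
toy_notebook = {
    "metadata": {"kernelspec": {"language": "python"}},
    "cells": [
        {"cell_type": "markdown", "source": ["# Load data"]},
        {"cell_type": "code", "source": ["import pandas as pd"], "outputs": []},
        {"cell_type": "code", "source": ["print(1 + 1)"], "outputs": [{"text": "2\n"}]},
    ],
}

batch = segmenter.segment({"content": json.dumps(toy_notebook)})
print(batch["cell_types"])      # ['markdown', 'code']: consecutive cells grouped by type
print(len(batch["cells"][1]))   # 2 code cells in the second group
print(batch["cells"][1][1][1])  # output of the last code cell: '2\n'
```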
diff --git a/preprocessing/jupyter-structured/jupyter-structured.ipynb b/preprocessing/jupyter-structured/jupyter-structured.ipynb
new file mode 100644
index 0000000..2bda98d
--- /dev/null
+++ b/preprocessing/jupyter-structured/jupyter-structured.ipynb
@@ -0,0 +1,1036 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset, Dataset\n",
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "from tqdm import tqdm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "e5ec42c4eaf0494ea715e21510dc0b99",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading readme:   0%|          | 0.00/558 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "ds = load_dataset(\"bigcode/jupyter-code-text-pairs\", split=\"train\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def build_content(markdown, code, output):\n",
+    "    if len(output) > 1000:\n",
+    "        output_str = output[:1000] + \"[...]\"\n",
+    "    elif output == \"_____no_output_____\":\n",
+    "        output_str = \"\"\n",
+    "    else:\n",
+    "        output_str = output\n",
+    "    \n",
+    "    content = f\"<jupyter_text>{markdown.strip()}<jupyter_code>{code.strip()}<jupyter_output>{output_str.strip()}\"\n",
+    "    return content\n",
+    "\n",
+    "current_repo_path = ds[0][\"repo_name\"] + \"/\" + ds[0][\"path\"]\n",
+    "current_content = \"\"\n",
+    "current_chain_length = 0\n",
+    "\n",
+    "data_dict = {\"content\": [], \"license\": [ds[0][\"license\"]], \"path\": [ds[0][\"path\"]], \"repo_name\": [ds[0][\"repo_name\"]], \"chain_length\": []}\n",
+    "\n",
+    "for i in tqdm(range(int(len(ds)/1_000)+1)):\n",
+    "    end_index = min((i+1)*1000, len(ds))\n",
+    "    element = ds[i*1000:end_index]\n",
+    "    for j in range(min(1_000, end_index-i*1000)):\n",
+    "        repo_path = element[\"repo_name\"][j] + \"/\" + element[\"path\"][j]\n",
+    "        \n",
+    "        if repo_path != current_repo_path:\n",
+    "            data_dict[\"content\"].append(current_content)\n",
+    "            data_dict[\"chain_length\"].append(current_chain_length)\n",
+    "            data_dict[\"license\"].append(element[\"license\"][j])\n",
+    "            data_dict[\"path\"].append(element[\"path\"][j])\n",
+    "            data_dict[\"repo_name\"].append(element[\"repo_name\"][j])\n",
+    "\n",
+    "            current_repo_path = repo_path\n",
+    "            current_content = \"\"\n",
+    "            current_chain_length = 0\n",
+    "        \n",
+    "        current_content += build_content(element[\"markdown\"][j], element[\"code\"][j], element[\"output\"][j])\n",
+    "        current_chain_length += 1\n",
+    "\n",
+    "data_dict[\"content\"].append(current_content)\n",
+    "data_dict[\"chain_length\"].append(current_chain_length)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "1045605"
+      ]
+     },
+     "execution_count": 99,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(data_dict[\"content\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11.187439308"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum([len(content) for content in data_dict[\"content\"]])/1000/1000/1000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "8.900101854906968\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(np.mean(data_dict[\"chain_length\"]))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 102,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_clean = Dataset.from_dict(data_dict)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Dataset({\n",
+       "    features: ['content', 'license', 'path', 'repo_name', 'chain_length'],\n",
+       "    num_rows: 1045605\n",
+       "})"
+      ]
+     },
+     "execution_count": 103,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "ds_clean"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5bb9a7b00fcf445a9ba840fa68f8f816",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Pushing dataset shards to the dataset hub:   0%|          | 0/23 [00:00<?, ?it/s]"
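To make the Step 3 rendering concrete outside the notebook, here is a compact standalone sketch: each markdown-code-output triplet is serialized with separator tokens and consecutive triplets from the same notebook are concatenated into one training document. The `<jupyter_text>`/`<jupyter_code>`/`<jupyter_output>` token names and the toy triplet values are assumptions based on the StarCoder notebook format:

```python
def build_content(markdown, code, output):
    # mirror of the notebook's helper: drop the no-output placeholder,
    # truncate very long outputs, and join the fields with separator tokens
    if output == "_____no_output_____":
        output = ""
    elif len(output) > 1000:
        output = output[:1000] + "[...]"
    return f"<jupyter_text>{markdown.strip()}<jupyter_code>{code.strip()}<jupyter_output>{output.strip()}"

# toy triplets, shaped like rows of the markdown-code-output pairs dataset
triplets = [
    {"markdown": "Load data", "code": "import pandas as pd", "output": "_____no_output_____"},
    {"markdown": "Peek at it", "code": "print(1 + 1)", "output": "2\n"},
]

document = "".join(build_content(t["markdown"], t["code"], t["output"]) for t in triplets)
print(document)
# <jupyter_text>Load data<jupyter_code>import pandas as pd<jupyter_output><jupyter_text>Peek at it<jupyter_code>print(1 + 1)<jupyter_output>2
```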