diff --git a/preprocessing/jupyter-structured/jupyter-structured.ipynb b/preprocessing/jupyter-structured/jupyter-structured.ipynb index ce51d4b..2bda98d 100644 --- a/preprocessing/jupyter-structured/jupyter-structured.ipynb +++ b/preprocessing/jupyter-structured/jupyter-structured.ipynb @@ -1,237 +1,1036 @@ - - - - - - - - - - - - - - - - - - - - - - - - jupyter-pairs.ipynb · bigcode/jupyter-structured-clean at main - - -
-
Hugging Face's logo - - -
- -
- - -
-
-
- - - -
- -
- - - - -
-
jupyter-structured-clean - / - jupyter-pairs.ipynb
- - -
-
lvwerra's picture -
lvwerra - -
HF staff -
-
-
Upload jupyter-pairs.ipynb
- bcb1c3a - -
- - -
Open in Colab -
-
- - - - - - - - - - - - - +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import load_dataset, Dataset\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e5ec42c4eaf0494ea715e21510dc0b99", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading readme: 0%| | 0.00/558 [00:001000:\n", + " output_str = output[:1000] + \"[...]\"\n", + " elif output == \"_____no_output_____\":\n", + " output_str = \"\"\n", + " else:\n", + " output_str = output\n", + " \n", + " content = f\"{markdown.strip()}{code.strip()}{output_str.strip()}\"\n", + " return content\n", + "\n", + "current_repo_path = ds[0][\"repo_name\"] + \"/\" + ds[0][\"path\"]\n", + "current_content = \"\"\n", + "current_chain_length = 0\n", + "\n", + "data_dict = {\"content\": [], \"license\": [ds[0][\"license\"]], \"path\": [ds[0][\"path\"]], \"repo_name\": [ds[0][\"path\"]], \"chain_length\": []}\n", + "\n", + "for i in tqdm(range(int(len(ds)/1_000)+1)):\n", + " end_index = min((i+1)*1000, len(ds))\n", + " element = ds[i*1000:end_index]\n", + " for j in range(min(1_000, end_index-i*1000)):\n", + " repo_path = element[\"repo_name\"][j] + \"/\" + element[\"path\"][j]\n", + " \n", + " if repo_path != current_repo_path:\n", + " data_dict[\"content\"].append(current_content)\n", + " data_dict[\"chain_length\"].append(current_chain_length)\n", + " data_dict[\"license\"].append(element[\"license\"][j])\n", + " data_dict[\"path\"].append(element[\"path\"][j])\n", + " data_dict[\"repo_name\"].append(element[\"repo_name\"][j])\n", + "\n", + " current_repo_path = repo_path\n", + " current_content = \"\"\n", + " current_chain_length = 0\n", + " \n", + " current_content += build_content(element[\"markdown\"][j], element[\"code\"][j], element[\"output\"][j])\n", + " current_chain_length +=1\n", + "\n", + "data_dict[\"content\"].append(current_content)\n", + "data_dict[\"chain_length\"].append(current_chain_length)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1045605" + ] + }, + "execution_count": 99, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(data_dict[\"content\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "11.187439308" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum([len(content) for content in data_dict[\"content\"]])/1000/1000/1000" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "8.900101854906968\n" + ] + } + ], + "source": [ + "print(np.mean(data_dict[\"chain_length\"]))" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "ds_clean = Dataset.from_dict(data_dict)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Dataset({\n", + " features: ['content', 'license', 'path', 'repo_name', 'chain_length'],\n", + " num_rows: 1045605\n", + "})" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds_clean" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5bb9a7b00fcf445a9ba840fa68f8f816", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Pushing dataset shards to the dataset hub: 0%| | 0/23 [00:00