diff --git a/ColumnTransformer Meets NLP.ipynb b/ColumnTransformer Meets NLP.ipynb new file mode 100644 index 0000000..3ae1f2c --- /dev/null +++ b/ColumnTransformer Meets NLP.ipynb @@ -0,0 +1,573 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "from scipy import stats\n", + "from scipy.stats import norm, skew\n", + "from sklearn import preprocessing\n", + "from sklearn.linear_model import Ridge\n", + "from sklearn.metrics import mean_squared_error\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "from sklearn.preprocessing import OneHotEncoder\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.pipeline import Pipeline\n", + "pd.set_option('display.float_format', lambda x: '%.3f' % x)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('train.tsv', sep = '\\t')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
train_idnameitem_condition_idcategory_namebrand_namepriceshippingitem_description
00MLB Cincinnati Reds T Shirt Size XL3Men/Tops/T-shirtsNaN10.0001No description yet
11Razer BlackWidow Chroma Keyboard3Electronics/Computers & Tablets/Components & P...Razer52.0000This keyboard is in great condition and works ...
22AVA-VIV Blouse1Women/Tops & Blouses/BlouseTarget10.0001Adorable top with a hint of lace and a key hol...
33Leather Horse Statues1Home/Home Décor/Home Décor AccentsNaN35.0001New with tags. Leather horses. Retail for [rm]...
4424K GOLD plated rose1Women/Jewelry/NecklacesNaN44.0000Complete with certificate of authenticity
\n", + "
" + ], + "text/plain": [ + " train_id name item_condition_id \\\n", + "0 0 MLB Cincinnati Reds T Shirt Size XL 3 \n", + "1 1 Razer BlackWidow Chroma Keyboard 3 \n", + "2 2 AVA-VIV Blouse 1 \n", + "3 3 Leather Horse Statues 1 \n", + "4 4 24K GOLD plated rose 1 \n", + "\n", + " category_name brand_name price \\\n", + "0 Men/Tops/T-shirts NaN 10.000 \n", + "1 Electronics/Computers & Tablets/Components & P... Razer 52.000 \n", + "2 Women/Tops & Blouses/Blouse Target 10.000 \n", + "3 Home/Home Décor/Home Décor Accents NaN 35.000 \n", + "4 Women/Jewelry/Necklaces NaN 44.000 \n", + "\n", + " shipping item_description \n", + "0 1 No description yet \n", + "1 0 This keyboard is in great condition and works ... \n", + "2 1 Adorable top with a hint of lace and a key hol... \n", + "3 1 New with tags. Leather horses. Retail for [rm]... \n", + "4 0 Complete with certificate of authenticity " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 1482535 entries, 0 to 1482534\n", + "Data columns (total 8 columns):\n", + "train_id 1482535 non-null int64\n", + "name 1482535 non-null object\n", + "item_condition_id 1482535 non-null int64\n", + "category_name 1476208 non-null object\n", + "brand_name 849853 non-null object\n", + "price 1482535 non-null float64\n", + "shipping 1482535 non-null int64\n", + "item_description 1482531 non-null object\n", + "dtypes: float64(1), int64(3), object(4)\n", + "memory usage: 90.5+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1482535.000\n", + "mean 26.738\n", + "std 38.586\n", + "min 0.000\n", + "25% 10.000\n", + "50% 17.000\n", + "75% 29.000\n", + "max 2009.000\n", + 
"Name: price, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.price.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " mu = 26.75 and sigma = 38.59\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY4AAAEWCAYAAABxMXBSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvOIA7rQAAIABJREFUeJzt3Xm4XWV59/HvLydhSAIkOScimQEBGxUjngJiXxWhiIgiVnGIgjjklThgra1YvKrFxnJpHasgwSJQjgoqIO2LQsCpziQUkVEikEBAMjMHMtzvH+vZZOfk7CnZa689/D7XlWvt9ay11773SbLu8wzreRQRmJmZ1WtU0QGYmVlnceIwM7OGOHGYmVlDnDjMzKwhThxmZtYQJw4zM2uIE4dZGUmfknTJDr73nZJ+UeX4DyWdMtK5kh6TtN+OfG6DMf5U0nvy/hzrbk4c1vEk3SvpyXTzfUjShZLGFx3XcBHx6oi4qMKx8RFxN0CK/1929HOa8fOQNEtSSBq9o3FY93LisG7x2ogYDxwCDAKfGH6CMr3yb77mz8NsR/XKfyLrERGxAvgh8Hx4pmlmgaRfAk8A+0maIukqSWslLZX03mGX2U3SpZIelXSjpBeWDkg6Q9Kf0rHbJJ047L2S9FVJD0u6Q9JRZQcqNhOl3+6fI2keMBf4h1Rj+C9Jfy/p+8PO/4qkLzf68xh2jVGSPiFpmaSVki6WtFc6/PO0XZ/ieEmtz7Le4cRhXUXSdOA44H/Lit8BzAP2AJYB3wHuB6YAbwQ+I+mVZeefAHwXmAR8C7hS0ph07E/A/wH2Av4ZuETSPmXvPSydMwB8Erhc0qR644+IhcAQ8NnUfPVa4BLgWEkT0nccDbwFuLjW9Sr8PEremf4cCewHjAe+mo69LG0npDh+Xe93sO7nxGHd4kpJ64FfAD8DPlN27MKIuDUiNgHPBl4KfCwiNkTETcA3gJPLzl8SEd+LiI3AF4DdgMMBIuK7EfFARGyJiEuBu4BDy967EvhSRGxMx+8EXrMzXywiHiSrAbwpFR0LrI6IJVXeVu3nUTIX+EJE3B0RjwEfB97ifg2rxf9ArFu8PiKuq3DsvrLXU4C1EfFoWdkysn6A7c6PiC2SSrUTJJ0MfASYlU4ZT1a7KFkR284cuqz03p10EXAacD7wduA/a5xf7edRMoUsvpJlZPeEvXc0SOsNrnFYLyi/kT8ATJK0R1nZDGBF2f700ovUmT4NeEDSTLIb9weA/oiYANwCqOy9UyWV789In7mj8ZZcCRws6fnA8WTNWTvrAWBm2f4MYBPwUIUYzAAnDusxEXEf8CvgXyXtJulg4N1k/QglL5b0htRk82HgKeA3wDiyG+oqAEmnsn2n87OAD0kaI+lNwF8AVzcY5kNkfQ7lcW8AvkfW5/K7iFje4DVH8m3gbyXtm4brfga4NDXprQK2DI/DDJw4rDe9layp6QHgCuCTw5p1fgC8GVhH1rH+htRncRvweeDXZDf3FwC/HHbt3wIHAKuBBcAbI2JNg/H9BzBb0npJV5aVX5Q+s1YzVb0uSNf6OXAPsAH4IEBEPEEW/y9THIc36TOtC8gLOZl1BkkzgDuAZ0fEI0XHY73LNQ6zDpD6Wj4CfMdJw4rmUVVmbU7SOLKmsWVkQ3HNCuWmKjMza4ibqszMrCFd2VQ1MDAQs2bNKjoMM7OOsmTJktURMbnWeV2ZOGbNmsXixYuLDsPMrKNIWlb7LDdVmZlZg5w4zMysIU4cZmbWECcOMzNriBOHmZk1xInDzKwLDA3BrFkwalS2HWrGxPsV5JY4JE2X9JO0LvOtkk5P5ZMkLZJ0V9pOTOVK6ygvlXSzpEPKrnVKO
v8uSafkFbOZWRGGhmBgAKSR//T1ZdvRo7PtwED2p5Qk5s+HefNg2TKIyLbz5uWXPHKbciStw7xPRNyYFs1ZAryebI3jtRFxtqQzgIkR8TFJx5FN6Xwc2brNX46Iw9J6zYvJVmiLdJ0XR8S6Sp89ODgYfo7DzDrB0BCceips3Ljj15CyhDHczJlw772NXEdLImKw1nm51Tgi4sGIuDG9fhS4HZgKnEC2rgBp+/r0+gTg4sj8BpiQks+rgEURsTYli0V4ojcz6xJnnrlzSQNGThoAy5ux3NcIWtLHIWkW8CKyRW72jogH06E/s3V946lsuzb0/amsUvnwz5gnabGkxatWrWpq/GZmecnr5g4wY0Y+1809caQlKb8PfHj4OgKRtZM1pa0sIhZGxGBEDE6eXHOqFTOzttCsm/s2K90DY8fCggXNufZwuSYOSWPIksZQRFyeih9KTVClfpCVqXwFML3s7dNSWaVyM7OOt2ABjBmzc9cYOxbe976sT0PKtgsXwty5zYlxuDxHVYls7eTbI+ILZYeuAkojo04hW9+5VH5yGl11OPBwatK6BjhG0sQ0AuuYVGZm1vHmzoVvfhP6+yufMyrdqfv6sm1/f/anPEmcc07WEb5lS7bNK2lAvrPjvhR4B/AHSTelsn8EzgYuk/RushXNTkrHriYbUbUUeAI4FSAi1kr6NHBDOu+siFibY9xmZi01d26+N/pm68oVAD0c18yscYUPxzUzs9rmz9/6YN/o0dl+u+vKhZzMzDrB/Plw7rlb9zdv3rp/zjnFxFQP1zjMzAqycGFj5e3CicPMrCCbNzdW3i6cOMzMClIaXltvebtw4jAzK8hBB41cPm9ea+NolBOHmVkB5s+H227bvnz27PbuGAcnDjOzQlTqAL/zztbGsSOcOMzMCtCpHePgxGFm1nJTt1sYYqt27xgHPwBoZtZSEyfC+vWVj7d7xzi4xmFm1jJHH109aUD7d4yDE4eZWctcf331453QTAVOHGZmbaMTmqnAicPMrCWGhqofnzKlM5qpwInDzKwl3ve+yscmTIAVHbQgthOHmVnOhobgsccqH1+3rnWxNIMTh5lZzs48s+gImsuJw8wsZ8uXFx1BczlxmJnlbNy4ysf6+1sXR7M4cZiZ5axa/8aXv9y6OJrFicPMrEBz5xYdQeOcOMzMrCFOHGZmOavUx1Gt76OdOXGYmeXsvPNg1LC77ahRWXkncuIwM8vZ3Llw8cUwcyZI2fbiizuzfwOcOMzMcjc0lD0EuHw5zJgBCxZ0btIAL+RkZparoSF417vg6aez/WXLsn3o3OThGoeZWY5OP31r0ih5+umsvFM5cZiZ5WjNmsbKO4ETh5lZTubPLzqCfDhxmJnlZOHCoiPIhxOHmVlONm8uOoJ8OHGYmRWgE2fFLXHiMDMrQCfOilvixGFmlpO+vpHLpc59hgOcOMzMclOpjyOitXE0mxOHmVkOqg3FrVQT6RS5JQ5JF0haKemWsrJPSVoh6ab057iyYx+XtFTSnZJeVVZ+bCpbKumMvOI1M2umc8+tfKzTR1vlWeO4EDh2hPIvRsSc9OdqAEmzgbcAz0vvOUdSn6Q+4GvAq4HZwFvTuWZmbWvixOrHZ85sTRx5yW2Sw4j4uaRZdZ5+AvCdiHgKuEfSUuDQdGxpRNwNIOk76dzbmhyumVnTrF9f/fiCBa2JIy9F9HF8QNLNqSmrlJenAveVnXN/KqtUvh1J8yQtlrR41apVecRtZtYUnTyiClqfOM4F9gfmAA8Cn2/WhSNiYUQMRsTg5MmTm3VZM7OmOuqooiPYeS1djyMiHiq9lnQ+8N9pdwUwvezUaamMKuVmZm1naKj68euua00ceWppjUPSPmW7JwKlEVdXAW+RtKukfYEDgN8BNwAHSNpX0i5kHehXtTJmM7NGnHlm5WOd3ilekluNQ9K3gVcAA5LuBz4JvELSHCCAe4H/CxARt0q6jKzTexPw/ojYnK7zAeAaoA+4ICJuz
StmM7OdtXx55WOd3ileouj0RxhHMDg4GIsXLy46DDPrQbNmZcvDDtffD6tXtzychkhaEhGDtc7zk+NmZk00btzI5Sed1No48uTEYWbWJPPnw20VnjLrpkWdnDjMzJrkvPMqH+v0aUbKOXGYmTXJli1FR9AaThxmZtYQJw4zsxaYMqXoCJrHicPMrAmOPrr68RVdNOeFE4eZWRNcf33REbROQ4lD0kRJB+cVjJmZtb+aiUPSTyXtKWkScCNwvqQv5B+amZm1o3pqHHtFxCPAG4CLI+IwoEZrnpmZlXTDVOrl6kkco9OstiexdRp0MzNLanWMd8NU6uXqSRxnkc1O+6eIuEHSfsBd+YZlZtY5eqljHOqYVj0ivgt8t2z/buBv8gzKzMzaVz2d4wdKul7SLWn/YEmfyD80M7POt8suRUfQfPU0VZ0PfBzYCBARN5OtxGdm1vPmz69+/IILWhNHK9WTOMZGxO+GlW3KIxgzs05Ta7r0uXNbE0cr1ZM4Vkvan2y5VyS9EXgw16jMzDpEN02XXq961hx/P7AQeK6kFcA9wNtzjcrMzNpWPaOq7gaOljQOGBURj+YflplZ55swoegI8lEzcUj6p2H7AETEWTnFZGbWEdLtsKJ161oTR6vV01T1eNnr3YDjgdvzCcfMzNpdPU1Vny/fl/RvZE+Sm5lZD9qR9TjGAtOaHYiZWTfZffeiI8hPPX0cfyANxQX6gMlk81eZmVkFTzxRdAT5qaeP4/iy15uAhyLCDwCamfWoiokjLdwEMHz47Z6SiIi1+YVlZtb+Ro2CLVtGLu9m1WocS8iaqEYacBbAfrlEZGbWIUZKGtXKu0XFxBER+7YyEDOzTjI0VHQExamnjwNJE4EDyJ7jACAifp5XUGZm7e7tVSZeqvVgYKerZ1TVe4DTyYbg3gQcDvwaeGW+oZmZtadaS8VGVD/e6erpwjkd+EtgWUQcCbwIWJ9rVGZmbazWUrEzZ7YmjqLUkzg2RMQGAEm7RsQdwEH5hmVm1rkWLCg6gnzV08dxv6QJwJXAIknrgGX5hmVm1rm6cfGmcvXMVXVievkpST8B9gJ+lGtUZmZtqtZSsd081UhJtQcArwa+BVwZEY8BRMTPWhWYmVk7qrVUbDdPNVJSrY/jPOA1wD2SLpN0oqRdWhSXmVlb6sWlYoermDgi4gcR8VZgJvB94GRguaRvSvrrVgVoZmbtpeaoqoh4IiIuTX0dxwBzqKOPQ9IFklZKuqWsbJKkRZLuStuJqVySviJpqaSbJR1S9p5T0vl3STplh76lmVkT1OrfGDOmNXEUrWbikLS3pA9K+iXZyKprgENqvA3gQuDYYWVnANdHxAHA9Wkf4NVkT6YfAMwDzk2fPQn4JHAYcCjwyVKyMTNrtXPPrX786adbE0fRKiYOSe+V9GPgRrIb+t9HxH4RcUZE/L7WhdOUJMNn0D0BuCi9vgh4fVn5xZH5DTBB0j7Aq4BFEbE2ItYBi9g+GZmZWQtVG477EuBfyWoIzZrrce+IeDC9/jOwd3o9Fbiv7Lz7U1ml8u1ImkdWW2HGjBlNCtfMzIar1jn+rohY1MSkMfz6wdaVBZtxvYURMRgRg5MnT27WZc3MgN6eDXe4Vi838lBqgiJtV6byFcD0svOmpbJK5WZmLVVtNlzo/okNy7U6cVwFlEZGnQL8oKz85DS66nDg4dSkdQ1wjKSJqVP8mFRmZmYFqWfp2BHVWjpW0reBVwADku4nGx11NnCZpHeTzXd1Ujr9auA4YCnwBHBq6TMkfRq4IZ13lpesNbN2M25c0RG0Vr1Lx84A1qXXE4DlQNUVAtPDgyM5aoRzA3h/hetcAFxQ7bPMzPJU6/mN885rTRztolrn+L4RsR9wHfDaiBiIiH7geODaVgVoZla0Ws9vdPtsuMPV08dxeERcXdqJiB8CR+QXkplZ++j2ZWB3RD3rcTwg6RPAJWl/LvBAfiGZmVk7q6fG8VZgMnAFcHl6Xan/wsyspxy1X
a9t96tnIae1wOmSxkXE4y2IycysY1x3XdERtF49kxweIek24Pa0/0JJ5+QemZlZwdy/MbJ6mqq+SDbZ4BqANMHhy/IMysysE/TS0+Ll6npyPCLuG1bkNbDMzHpUPaOq7pN0BBCSxgCnk5qtzMy6VV9f0RG0r3pqHO8je6p7KtkEg3Oo8JS3mVm32FJjXvDTTmtNHO2oao1DUh/wjojosecizayX1TOF+jk9PESoao0jIjYDb2tRLGZmbaHWFOq9rp4+jl9I+ipwKfDMcxwRcWNuUZmZtbFefOivXD2JY07anlVWFsArmx+OmVmxas2EC7350F+5ep4cP7IVgZiZtYNaM+FafU+O7y3pPyT9MO3PTgsxmZl1lXo6xXv1ob9y9QzHvZBsudYpaf+PwIfzCsjMrCjuFK9PPYljICIuA7YARMQm/OS4mVnPqidxPC6pn6xDHEmHAw/nGpWZWRtyM1WmnlFVHwGuAvaX9Euy9TjemGtUZmYtVk//hmXqGVV1o6SXAwcBAu6MiI25R2Zm1kK1+jdc29iqYuKQ9IYKhw6URERcnlNMZmbWxqrVOF6bts8CjgB+nPaPBH5FtoysmVnH84JNjamYOCLiVABJ1wKzI+LBtL8P2RBdMzPrQfWMqppeShrJQ8CMnOIxM2upqVNrn+P+jW3VM6rqeknXAN9O+28GenymFjPrFg88UHQEnaeeUVUfkHQiW9cZXxgRV+QblplZe5gwoegI2k89CzldlyY6dLIws56zbl3REbSfehZy2iJprxbFY2bWMvX0b9j26unjeAz4g6RFbLuQ04dyi8rMrAVq9W+4U3xk9SSOy/EzG2bWZSZOLDqCzlVP4rgUeE56vTQiNuQYj5lZS6xfX3QEnatiH4ek0ZI+C9wPXARcDNwn6bOSxrQqQDMzay/VOsc/B0wC9o2IF0fEIcD+wATg31oRnJlZHupppnL/RmXVEsfxwHsj4tFSQUQ8ApwGHJd3YGZmeXEz1c6pljgiYvucm4boOhebWddybaO6aonjNkknDy+U9HbgjvxCMjPLzy67FB1B56s2qur9wOWS3gUsSWWDwO7AiXkHZmaWh41ehm6nVZtWfQVwmKRXAs9LxVdHxPU7+6GS7gUeBTYDmyJiUNIksqG/s4B7gZMiYp0kAV8m61d5AnhnRNy4szGYWe+pp7bhZqra6pnk8MdsXcSpmY6MiNVl+2cA10fE2ZLOSPsfA14NHJD+HAacm7ZmZg1xbaM56lmPo1VOIHtehLR9fVn5xZH5DTAhLSZlZmYFKCpxBHCtpCWS5qWyvcsWjPozsHd6PRW4r+y996eybUiaJ2mxpMWrVq3KK24z61D1LA/rZqr61DPlSB7+KiJWSHoWsEjSNqO0IiIkNfRXGBELgYUAg4OD/us3M8tJITWO1PFORKwkW+fjUOChUhNU2q5Mp68Appe9fVoqMzOri2sbzdXyxCFpnKQ9Sq+BY4BbgKuAU9JppwA/SK+vAk5W5nDg4WFroJuZVVRP0rDGFNFUtTdwRTbKltHAtyLiR5JuAC6T9G5gGXBSOv9qsqG4S8mG457a+pDNrJuddlrREXQWjTCrSMcbHByMxYsXFx2GmbUBN1PVT9KSiBisdV47Dcc1M2sqN1Plw4nDzLpSvUnDtY3GOXGYWdepdyJDJ40d48RhZl3HU4vky4nDzLqK+zXy58RhZl2jkaThZqod58RhZl2hnnXES5w0do4Th5l1hXrXEXfS2HlOHGbW8dyv0VpOHGbW0dyv0XpOHGbWsZw0iuHEYWYdyUmjOE4cZtZx3KdRLCcOM+tqrm00X1FLx5qZNazRmsaYMfnE0etc4zCzjrAjSePpp/OJpde5xmFmba/RpOHmqXy5xmFmbc1Jo/04cZhZ2/Loqfbkpiozazs7mjBc22gN1zjMrK04abQ/1zjMrC04YXQO1zjMrHBOGp3FNQ4zK8zOdH47aRTHicPMWm5nR0s5aRTLTVVm1lJOGjnasgUefzz3j
3GNw8xy14znMXouYWzeDGvXwpo1sHp1fdu1a+ElL4Ff/CLX0Jw4zCw3zXqAr+OTxsaN2U293gSwenW2iHqlL77rrjAwAP392fbgg7fuH3hg7l/HicPMmqqZT3u3ZcJ4+untb/K1EsHDD1e+3u67Zzf90o1/1qytCaHSduzYQh+rd+Iws52Wxz2sJUljw4bGagFr1sCjj1a+3vjx297kn/Oc6kmgvz9LAh3GicPMdkhev/DuUMKIgCeeaCwBrF6dvaeSPffceoOfPBme+9zqtYD+/qwJqQc4cZhZXfJuGXkmYUTAY4813hy0YUPli0+YsLU5aMqUrE+gWgKYNAl22SXfL9zBnDjMbDvNSxLBnjzCAKvpZ03F7ZtesRoOLksClVZgkrKbeukmP2MGHHJI7SQw2re6ZvJP06yHNZIgxBYmsL5qAhi+7WcNY9g08gX7+rKb+sAAbOqH/feHQw+t3hw0YUL2PiuUE4dZl6qWFEaxmYms46A6E8AAq5nEWvrYMuL1NjKa1Qywhn5WM8AdPHeb/fLtb+5KiWCvvWCUn0HuRE4cZm2uVq2gj01MYu12N/uPVUkCE1nHKEbuhX6KXba52d/C80dMAuWvH2UPoHKgbTms1naYE4dZE+1s38AYnn6miad0o59XozYwkfUVr/cku21zk1/OjBFrAeXbxxlHtSRQLyeL7uXEYU3n5T4zu7Khof6AAVazJ5WfEXiMcdvc4O9mv6oJYA39PElrnxFwsugNHZM4JB0LfBnoA74REWc3+zOGhuDMM2H58mywxoIFWXmpbNKkbH/Nmqx/bvPm7Cbp/yzdb3eeaDgJjKfyZHOPsMc2N/k7Oahqc9Aa+nmK3Vr4jevjf/u9qSMSh6Q+4GvAXwP3AzdIuioibmvWZwwNwbx5W58HWrYMTj01SwylkYFr1mw9f/PmbOv/OJ0mGMfjDSWAftYwlicrXnEdE5650f+ZZ3MLz69aC1hDPxvpvGcE/G/dSjoicQCHAksj4m4ASd8BTgCaljjOPHP7h0g3bmzW1S0f2TMCjdYEdmXkZwS2INYx8Zkb/H1M5ybmVG0OWsskNnfMf6P6OEFYLZ3yL34qcF/Z/v3AYeUnSJoHzAOYMWNGwx+wfPlORGc7TWxhLx6u++Zfel3pGYHNjGItk565yd/NftzAX243Gqh8u46JbKG7nxFwUrBm6JTEUVNELAQWAgwODjb832PGjKx5ynbeKDYzgfUN1QImsZbRbB7xepvo2+Ym/0cO5FccUbU5aD0TiC5ep8wJwIrUKYljBTC9bH9aKmuaBQu27eMAGDNm2z6OXlR6RqCR5qBJrK36jED5Df5WnldzeOgj7EkzhocWxTd56zadkjhuAA6QtC9ZwngL8LZmfsDcudm2m0dVjWbjds8IVGsGyh4Uq/yMwAZ23abp5ybmjPhw2Br6WXxP9rTwruPHM0ViSgu/t5k1l6JD7nqSjgO+RDYc94KIWFDp3MHBwVi8eHHLYivEU081PoX0I49Uvt7YsbWnjB6+LXgxGTNrLklLImKw1nmdUuMgIq4Gri46jlw8+WTji8k89ljl6+2xx7Y3+AMPrL2YzO67t+77mllH65jE0RFKi8nUc/Mvf/1k5WcE2GuvrTf5Zz0LZs+uXhOYNKlnFpMxs2I4cVQSkS0R2Whz0FNPVb7mxIlbb/LTpsELX1g7CYwZ07rvbGZWByeOcitXwlFHbU0ElZ4AlLY28QwMZIvLDw5Wbw6aONGLyZhZV/CdrNz48dni8ocfvvWmX2kxGa8jYGY9yomj3NixcMUVRUdhZtbW/GuzmZk1xInDzMwa4sRhZmYNceIwM7OGOHGYmVlDnDjMzKwhThxmZtYQJw4zM2tIx0yr3ghJq4BOWs9vAFhddBAF8PfuLf7e7W9mREyudVJXJo5OI2lxPXPgdxt/797i79093FRlZmYNceIwM7OGOHG0h4VFB1AQf+/e4u/dJdzHYWZmDXGNw8zMGuLEYWZmDXHiaDOS/
k5SSBooOpZWkPQ5SXdIulnSFZImFB1TXiQdK+lOSUslnVF0PK0gabqkn0i6TdKtkk4vOqZWktQn6X8l/XfRsTSTE0cbkTQdOAZYXnQsLbQIeH5EHAz8Efh4wfHkQlIf8DXg1cBs4K2SZhcbVUtsAv4uImYDhwPv75HvXXI6cHvRQTSbE0d7+SLwD0DPjFiIiGsjYlPa/Q0wrch4cnQosDQi7o6Ip4HvACcUHFPuIuLBiLgxvX6U7CY6tdioWkPSNOA1wDeKjqXZnDjahKQTgBUR8fuiYynQu4AfFh1ETqYC95Xt30+P3EBLJM0CXgT8tthIWuZLZL8Ibik6kGYbXXQAvUTSdcCzRzh0JvCPZM1UXafa946IH6RzziRr1hhqZWzWGpLGA98HPhwRjxQdT94kHQ+sjIglkl5RdDzN5sTRQhFx9Ejlkl4A7Av8XhJkzTU3Sjo0Iv7cwhBzUel7l0h6J3A8cFR074NFK4DpZfvTUlnXkzSGLGkMRcTlRcfTIi8FXifpOGA3YE9Jl0TE2wuOqyn8AGAbknQvMBgRnTKj5g6TdCzwBeDlEbGq6HjyImk0Wef/UWQJ4wbgbRFxa6GB5UzZb0IXAWsj4sNFx1OEVOP4aEQcX3QszeI+DivaV4E9gEWSbpL09aIDykMaAPAB4BqyDuLLuj1pJC8F3gG8Mv393pR+C7cO5hqHmZk1xDUOMzNriBOHmZk1xInDzMwa4sRhZmYNceIwM7OGOHFYW5PUXzaM88+SVqTX6yXd1uJY5pQPJZX0uh2d5VbSvUXNgCzpnZKmlO1/ozTxYJFxWedw4rC2FhFrImJORMwBvg58Mb2eQw5zAKUH9SqZAzyTOCLiqog4u9kxtMA7gWcSR0S8JyJamoStszlxWCfrk3R+WufhWkm7A0jaX9KPJC2R9D+SnpvKZ0n6cVr743pJM1L5hZK+Lum3wGcljZN0gaTfpbUUTpC0C3AW8OZU43lz+s39q+kae6f1RH6f/hyRyq9McdwqaV6tLyTpVEl/TJ99ftn1L5T0xrLzHkvb8em73CjpD2myzNJ3vX34zyddYxAYSt9jd0k/lTQ4QixvT3HcJOm8tLZEX4rllvR5f7sTf3/WoZw4rJMdAHwtIp4HrAf+JpUvBD4YES8GPgqck8r/Hbgorf0xBHyl7FrTgCMi4iNkk07+OCIOBY4EPgeMAf4JuDTVgC4dFstXgJ9FxAuBQ4DSU+HvSnEMAh+S1F/py0jaB/hnsqf4BlXZAAACeElEQVSt/4ps3Y5aNgAnRsQhKdbPp2k+Rvz5RMT3gMXA3PQ9nqwQy18AbwZemmp4m4G5ZLWuqRHx/Ih4AfDNOmK0LuNJDq2T3RMRN6XXS4BZaRbWI4Dvbr1/smvavgR4Q3r9n8Bny6713YjYnF4fQzZB3UfT/m7AjBqxvBI4GSBd5+FU/iFJJ6bX08lu5msqXOMw4KelObskXQocWONzBXxG0svImu6mAnunY9v9fGpcq9xRwIuBG9LPcXdgJfBfwH6S/h34f8C1DVzTuoQTh3Wyp8pebya7uY0C1qffkhvxeNlrkf12fmf5CZIOa+SCaXK7o4GXRMQTkn5KloR2xCZSC4GkUcAuqXwuMBl4cURsTBNklj5jpJ9P3eGT1c62W5FR0guBVwHvA04iW0fFeoibqqyrpLUe7pH0JshmZ003OoBfAW9Jr+cC/1PhMtcAHyw1+Uh6USp/lGxCxpFcD5yWzu+TtBewF7AuJY3nki2dWs1vgZenkWRjgDeVHbuXrAYA8DqypjPSZ6xMSeNIYGaNz6j1Pcq/zxslPSt9p0mSZqYRV6Mi4vvAJ8ia5azHOHFYN5oLvFvS78n6GkpLtH4QOFXSzWQztp5e4f2fJrsx3yzp1rQP8BNgdqlzfNh7TgeOlPQHsmah2cCPgNGSbgfOJlsat6KIeBD4FPBr4Jdsu1b1+WRJ5fdkTW6lGtIQMJg+92TgjmqfkVwIfL3UOV4hltvIEsO16ee1C
NiHrCnsp5JuAi6hS9eIt+o8O65Zm1K2wNVgRHyg6FjMyrnGYWZmDXGNw8zMGuIah5mZNcSJw8zMGuLEYWZmDXHiMDOzhjhxmJlZQ/4/0fprih1Lu38AAAAASUVORK5CYII=\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "df = df.loc[df['price'] > 0]\n", + "\n", + "sns.distplot(df['price'], fit = norm);\n", + "(mu, sigma) = norm.fit(df['price'])\n", + "print( '\\n mu = {:.2f} and sigma = {:.2f}\\n'.format(mu, sigma))\n", + "\n", + "plt.legend(['Normal dist. ($\\mu=$ {:.2f} and $\\sigma=$ {:.2f} )'.format(mu, sigma)],\n", + " loc='best')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Price distribution')\n", + "\n", + "fig = plt.figure()\n", + "res = stats.probplot(df['price'], plot=plt)\n", + "plt.show();" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + " mu = 2.98 and sigma = 0.75\n", + "\n" + ] + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "sns.distplot(np.log1p(df['price']), fit = norm);\n", + "(mu, sigma) = norm.fit(np.log1p(df['price']))\n", + "print( '\\n mu = {:.2f} and sigma = {:.2f}\\n'.format(mu, sigma))\n", + "\n", + "plt.legend(['Normal dist. ($\\mu=$ {:.2f} and $\\sigma=$ {:.2f} )'.format(mu, sigma)],\n", + " loc='best')\n", + "plt.ylabel('Frequency')\n", + "plt.title('Log (Price+1) distribution')\n", + "\n", + "fig = plt.figure()\n", + "res = stats.probplot(np.log1p(df['price']), plot=plt)\n", + "plt.show();" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df[\"price\"] = np.log1p(df[\"price\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "train_id 0\n", + "name 0\n", + "item_condition_id 0\n", + "category_name 6314\n", + "brand_name 632336\n", + "price 0\n", + "shipping 0\n", + "item_description 4\n", + "dtype: int64" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will replace all NaN values in these 3 columns with \"missing\". There are also 82,427 descriptions that read \"No description yet\", so we will replace those with \"missing\" too." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "NUM_BRANDS = 2500\n", + "NAME_MIN_DF = 10\n", + "MAX_FEAT_DESCP = 50000\n", + "\n", + "df[\"category_name\"] = df[\"category_name\"].fillna(\"Other\").astype(\"category\")\n", + "df[\"brand_name\"] = df[\"brand_name\"].fillna(\"unknown\")\n", + "\n", + "pop_brands = df[\"brand_name\"].value_counts().index[:NUM_BRANDS]\n", + "df.loc[~df[\"brand_name\"].isin(pop_brands), \"brand_name\"] = \"Other\"\n", + "\n", + "df[\"item_description\"] = df[\"item_description\"].fillna(\"None\")\n", + "df[\"item_condition_id\"] = df[\"item_condition_id\"].astype(\"category\")\n", + "df[\"brand_name\"] = df[\"brand_name\"].astype(\"category\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "train_id int64\n", + "name object\n", + "item_condition_id category\n", + "category_name category\n", + "brand_name category\n", + "price float64\n", + "shipping int64\n", + "item_description object\n", + "dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "target = df.price.values\n", + "features = df[['name', 'item_condition_id', 'category_name', 'brand_name', 'shipping', 'item_description']].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "name object\n", + "item_condition_id category\n", + "category_name category\n", + "brand_name category\n", + "shipping int64\n", + "item_description object\n", + 
"dtype: object" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "preprocess = ColumnTransformer(\n", + " [('item_condition_category', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['item_condition_id']),\n", + " ('brand_name_category', OneHotEncoder(dtype='int', handle_unknown='ignore'), ['brand_name']),\n", + " ('category_name_countvec', CountVectorizer(), 'category_name'),\n", + " ('name_countvec', CountVectorizer(min_df=NAME_MIN_DF), 'name'),\n", + " ('description_tfidf', TfidfVectorizer(max_features = MAX_FEAT_DESCP, stop_words = 'english', ngram_range=(1,3)), 'item_description')],\n", + " remainder='passthrough')" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "model = make_pipeline(\n", + " preprocess,\n", + " Ridge(solver = \"lsqr\", fit_intercept=False))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('columntransformer', ColumnTransformer(n_jobs=None, remainder='passthrough', sparse_threshold=0.3,\n", + " transformer_weights=None,\n", + " transformers=[('item_condition_category', OneHotEncoder(categorical_features=None, categories=None, dtype='int',\n", + " handle_unknown='ignore', n_va...t_intercept=False, max_iter=None,\n", + " normalize=False, random_state=None, solver='lsqr', tol=0.001))])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train RMSE: 0.4534\n", + "Test RMSE: 0.4674\n" + ] + } + ], + "source": [ + 
"y_train_pred = model.predict(X_train)\n", + "y_pred = model.predict(X_test)\n", + "\n", + "train_rmse = np.sqrt(mean_squared_error(y_train_pred, y_train))\n", + "test_rmse = np.sqrt(mean_squared_error(y_pred, y_test))\n", + "print('Train RMSE: %.4f' % train_rmse)\n", + "print('Test RMSE: %.4f' % test_rmse)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}