Update notebook

susanli2016 · Oct 22, 2018 · 965a222 · 965a222
1 parent 89f084d
commit 965a222
Showing 1 changed file with 647 additions and 0 deletions.
diff --git a/BOW_TFIDF_Xgboost_update.ipynb b/BOW_TFIDF_Xgboost_update.ipynb
@@ -0,0 +1,647 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
+    "from sklearn import linear_model\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import scipy\n",
+    "from sklearn.metrics import log_loss\n",
+    "import xgboost as xgb\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "from sklearn.metrics import roc_auc_score\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>qid1</th>\n",
+       "      <th>qid2</th>\n",
+       "      <th>question1</th>\n",
+       "      <th>question2</th>\n",
+       "      <th>is_duplicate</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>2</td>\n",
+       "      <td>What is the step by step guide to invest in sh...</td>\n",
+       "      <td>What is the step by step guide to invest in sh...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>3</td>\n",
+       "      <td>4</td>\n",
+       "      <td>What is the story of Kohinoor (Koh-i-Noor) Dia...</td>\n",
+       "      <td>What would happen if the Indian government sto...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>5</td>\n",
+       "      <td>6</td>\n",
+       "      <td>How can I increase the speed of my internet co...</td>\n",
+       "      <td>How can Internet speed be increased by hacking...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3</td>\n",
+       "      <td>7</td>\n",
+       "      <td>8</td>\n",
+       "      <td>Why am I mentally very lonely? How can I solve...</td>\n",
+       "      <td>Find the remainder when [math]23^{24}[/math] i...</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4</td>\n",
+       "      <td>9</td>\n",
+       "      <td>10</td>\n",
+       "      <td>Which one dissolve in water quikly sugar, salt...</td>\n",
+       "      <td>Which fish would survive in salt water?</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   id  qid1  qid2                                          question1  \\\n",
+       "0   0     1     2  What is the step by step guide to invest in sh...   \n",
+       "1   1     3     4  What is the story of Kohinoor (Koh-i-Noor) Dia...   \n",
+       "2   2     5     6  How can I increase the speed of my internet co...   \n",
+       "3   3     7     8  Why am I mentally very lonely? How can I solve...   \n",
+       "4   4     9    10  Which one dissolve in water quikly sugar, salt...   \n",
+       "\n",
+       "                                           question2  is_duplicate  \n",
+       "0  What is the step by step guide to invest in sh...             0  \n",
+       "1  What would happen if the Indian government sto...             0  \n",
+       "2  How can Internet speed be increased by hacking...             0  \n",
+       "3  Find the remainder when [math]23^{24}[/math] i...             0  \n",
+       "4            Which fish would survive in salt water?             0  "
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.read_csv('quora_train.csv')\n",
+    "df = df.dropna(how=\"any\").reset_index(drop=True)\n",
+    "\n",
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<matplotlib.axes._subplots.AxesSubplot at 0x1ea602c84e0>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    },
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAY0AAAEHCAYAAABSjBpvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAEjNJREFUeJzt3X+s3XV9x/Hny1acDhWUSliLK9EuE11EbYDofjBZoLBlxQ0y2CIdNqszkGiim2iygD9IMIuasSkLhEoxTmSoo3HV2iHOGRF6kQpU1N4gSi2BahFxRB343h/nc/VwOb3303srp3ifj+Sb8z3vz4/v5yRtX/n+OKepKiRJ6vGUcS9AkvTkYWhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSeq2eNwL2N8OO+ywWr58+biXIUlPKrfccsv3qmrJbP1+5UJj+fLlTExMjHsZkvSkkuTbPf28PCVJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqduv3Jf7niyWn/+f417Cr5S7L/7jcS9BWhBmPdNIcmSSG5LcmWR7kje2+oVJvptkW9tOHRrztiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f788JKkfdNzeeoR4M1V9SLgeODcJEe3tvdX1TFt2wTQ2s4EXgysAj6YZFGSRcAHgFOAo4GzhuZ5T5trBfAAsLbV1wIPVNULgfe3fpKkMZk1NKrq3qr6Stt/CLgTWDrDkNXA1VX1k6r6FjAJHNu2yaq6q6p+ClwNrE4S4NXAtW38BuC0obk2tP1rgRNbf0nSGOzTjfB2eehlwE2tdF6S25KsT3Joqy0F7hkatrPV9lZ/LvCDqnpkWv0xc7X2B1t/SdIYdIdGkoOBjwNvqqofApcCLwCOAe4F3jvVdcTwmkN9prmmr21dkokkE7t3757xc0iS5q4rNJI8lUFgfKSqPgFQVfdV1aNV9TPgcgaXn2BwpnDk0PBlwK4Z6t8DDkmyeFr9MXO19mcDe6avr6ouq6qVVbVyyZJZfw5ekjRHPU9PBbgCuLOq3jdUP2Ko22uAO9r+RuDM9uTTUcAK4GZgK7CiPSl1EIOb5RurqoAbgNPb+DXAdUNzrWn7pwOfa/0lSWPQ8z2NVwGvBW5Psq3V3s7g6adjGFwuuht4PUBVbU9yDfA1Bk9enVtVjwIkOQ/YDCwC1lfV9jbfW4Grk7wbuJVBSNFeP5xkksEZxpnz+KySpHmaNTSq6ouMvrewaYYxFwEXjahvGjWuqu7iF5e3hus/Bs6YbY2SpCeGPyMiSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKnbrKGR5MgkNyS5M8n2JG9s9eck2ZJkR3s9tNWT5JIkk0luS/LyobnWtP47kqwZqr8iye1tzCVJMtMxJEnj0XOm8Qjw5qp6EXA8cG6So4HzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVuVavv7RiSpDGYNTSq6t6q+krbfwi4E1gKrAY2tG4bgNPa/mrgqhr4MnBIkiOAk4EtVbWnqh4AtgCrWtuzqurGqirgqmlzjTqGJGkM9umeRpLlwMuAm4DDq+peGAQL8LzWbSlwz9Cwna02U33niDozHGP6utYlmUgysXv37n35SJKkfdAdGkkOBj4OvKmqfjhT1xG1mkO9W1VdVlUrq2rlkiVL9mWoJGkfdIVGkqcyCIyPVNUnWvm+dmmJ9np/q+8EjhwavgzYNUt92Yj6TMeQJI1Bz9NTAa4A7qyq9w01bQSmnoBaA1w3VD+7PUV1PPBgu7S0GTgpyaHtBvhJwObW9lCS49uxzp4216hjSJLGYHFHn1cBrwVuT7Kt1d4OXAxck2Qt8B3gjNa2CTgVmAQeBs4BqKo9Sd4FbG393llVe9r+G4ArgacDn24bMxxDkjQGs4ZGVX2R0fcdAE4c0b+Ac/cy13pg/Yj6BPCSEfXvjzqGJGk8/Ea4JKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSuhkakqRuhoYkqZuhIUnqZmhIkroZGpKkboaGJKmboSFJ6mZoSJK6GRqSpG6GhiSpm6EhSepmaEiSus0aGknWJ7k/yR1DtQuTfDfJtradOtT2tiSTSb6R5OSh+qpWm0xy/lD9qCQ3JdmR5GNJDmr1p7X3k619+f760JKkuek507gSWDWi/v6qOqZtmwCSHA2cCby4jflgkkVJFgEfAE4BjgbOan0B3tPmWgE8AKxt9bXAA1X1QuD9rZ8kaYxmDY2q+gKwp3O+1cDVVfWTqvoWMAkc27bJqrqrqn4KXA2sThLg1cC1bfwG4LShuTa0/WuBE1t/SdKYzOeexnlJbmuXrw5ttaXAPUN9drba3urPBX5QVY9Mqz9mrtb+YOsvSRqTxXMcdynwLqDa63uB1wGjzgSK0eFUM/RnlrbHSLIOWAfw/Oc/f6Z1S5rNhc8e9wp+tVz44LhXsF/N6Uyjqu6rqker6mfA5QwuP8HgTOHIoa7LgF0z1L8HHJJk8bT6Y+Zq7c9mL5fJquqyqlpZVSuXLFkyl48kSeowp9BIcsTQ29cAU09WbQTObE8+HQWsAG4GtgIr2pNSBzG4Wb6xqgq4ATi9jV8DXDc015q2fzrwudZfkjQms16eSvJR4ATgsCQ7gQuAE5Icw+By0d3A6wGqanuSa4CvAY8A51bVo22e84DNwCJgfVVtb4d4K3B1kncDtwJXtPoVwIeTTDI4wzhz3p9WkjQvs4ZGVZ01onzFiNpU/4uAi0bUNwGbRtTv4heXt4brPwbOmG19kqQnjt8IlyR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUbdbQSLI+yf1J7hiqPSfJliQ72uuhrZ4klySZTHJbkpcPjVnT+u9Ismao/ookt7cxlyTJTMeQJI1Pz5nGlcCqabXzgeuragVwfXsPcAqwom3rgEthEADABcBxwLHABUMhcGnrOzVu1SzHkCSNyayhUVVfAPZMK68GNrT9DcBpQ/WrauDLwCFJjgBOBrZU1Z6qegDYAqxqbc+qqhurqoCrps016hiSpDGZ6z2Nw6vqXoD2+rxWXwrcM9RvZ6vNVN85oj7TMR4nybokE0kmdu/ePcePJEmazf6+EZ4RtZpDfZ9U1WVVtbKqVi5ZsmRfh0uSOs01NO5rl5Zor/e3+k7gyKF+y4Bds9SXjajPdAxJ0pjMNTQ2AlNPQK0Brhuqn92eojoeeLBdWtoMnJTk0HYD/CRgc2t7KMnx7amps6fNNeoYkqQxWTxbhyQfBU4ADkuyk8FTUBcD1yRZC3wHOKN13wScCkwCDwPnAFTVniTvAra2fu+sqqmb629g8ITW04FPt40ZjiFJGpNZQ6OqztpL04kj+hZw7l7mWQ+sH1GfAF4yov79UceQJI2P3wiXJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktRtXqGR5O4ktyfZlmSi1Z6TZEuSHe310FZPkkuSTCa5LcnLh+ZZ0/rvSLJmqP6KNv9kG5v5rFeSND/740zjD6vqmKpa2d6fD1xfVSuA69t7gFOAFW1bB1wKg5ABLgCOA44FLpgKmtZn3dC4VfthvZKkOfplXJ5aDWxo+xuA04bqV9XAl4FDkhwBnAxsqao9VfUAsAVY1dqeVVU3VlUBVw3NJUkag/mGRgGfTXJLknWtdnhV3QvQXp/X6kuBe4bG7my1meo7R9QfJ8m6JBNJJnbv3j3PjyRJ2pvF8xz/qqraleR5wJYkX5+h76j7ETWH+uOLVZcBlwGsXLlyZB9J0vzN60yjqna11/uBTzK4J3Ffu7REe72/dd8JHDk0fBmwa5b6shF1SdKYzDk0kvx6kmdO7QMnAXcAG4GpJ6DWANe1/Y3A2e0pquOBB9vlq83ASUkObTfATwI2t7aHkhzfnpo6e2guSdIYzOfy1OHAJ9tTsIuBf6uqzyTZClyTZC3wHeCM1n8TcCowCTwMnANQVXuSvAvY2vq9s6r2tP03AFcCTwc+3TZJ0pjMOTSq6i7gpSPq3wdOHFEv4Ny9zLUeWD+iPgG8ZK5rlCTtX34jXJLUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0MDUlSN0NDktTN0JAkdTM0JEndDA1JUjdDQ5LUzdCQJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1MzQkSd0O+NBIsirJN5JMJjl/3OuRpIXsgA6NJIuADwCnAEcDZyU5eryrkqSF64AODeBYYLKq7qqqnwJXA6vHvCZJWrAO9NBYCtwz9H5nq0mSxmDxuBcwi4yo1eM6JeuAde3tj5J845e6qoXlMOB7417EbPKeca9AY/Ck+LPJO0b9M3ZA+s2eTgd6aOwEjhx6vwzYNb1TVV0GXPZELWohSTJRVSvHvQ5pOv9sjseBfnlqK7AiyVFJDgLOBDaOeU2StGAd0GcaVfVIkvOAzcAiYH1VbR/zsiRpwTqgQwOgqjYBm8a9jgXMy346UPlncwxS9bj7ypIkjXSg39OQJB1ADA1JUrcD/p6GnjhJfpvBN+6XMvg+zC5gY1XdOdaFSTpgeKYhAJK8lcHPtAS4mcHjzgE+6g9FSprijXABkOSbwIur6v+m1Q8CtlfVivGsTJpZknOq6kPjXsdC4ZmGpvwM+I0R9SNam3Sgese4F7CQeE9DU94EXJ9kB7/4kcjnAy8EzhvbqiQgyW17awIOfyLXstB5eUo/l+QpDH6OfimDv4w7ga1V9ehYF6YFL8l9wMnAA9ObgC9V1aizZP0SeKahn6uqnwFfHvc6pBE+BRxcVdumNyT5/BO/nIXLMw1JUjdvhEuSuhkakqRuhoYWrCRfmuf4v07yL/MYf3eSw+azliSnJTl6rmuQ9pWhoQWrql457jVMmcdaTgMMDT1hDA0tWEl+1F6PSPKFJNuS3JHk92YYc06Sbyb5b+BVQ/Urk5w+Yu4T2tyfTPK1JP/aHm0euZa2//dJbk/y1SQXt9rfJNnaah9P8owkrwT+FPjHtvYXtO0zSW5J8j/t98Sk/cZHbiX4S2BzVV2UZBHwjFGdkhzB4NvHrwAeBG4Abu2Y/1gGZwPfBj4D/Blw7V6OcQqDs4fjqurhJM9pTZ+oqstbn3cDa6vqn5NsBD5VVde2tuuBv62qHUmOAz4IvLpjjVIXQ0Ma/Djj+iRPBf5j1HcBmuOAz1fVboAkHwN+q2P+m6vqrjbmo8DvspfQAP4I+FBVPQxQVXta/SUtLA4BDmbwXyA/RpKDgVcC/55kqvy0jvVJ3bw8pQWvqr4A/D7wXeDDSc6eqfte6o/Q/j5l8C/2QTOMmenLUdlL+5XAeVX1OwzOdn5tRJ+nAD+oqmOGthfNcCxpnxkaWvCS/CZwf7v8cwXw8r10vQk4Iclz21nJGUNtdzO4bAWD/5PkqUNtxyY5qt3L+AvgizMs57PA65I8o61t6vLUM4F723H/aqj/Q62Nqvoh8K0kZ7SxSfLSGY4l7TNDQ4ITgG1JbgX+HPinUZ2q6l7gQuBG4L+Arww1Xw78QZKbGVzG+t+hthuBi4E7gG8Bn9zbQqrqM8BGYCLJNuAtrekfGITWFuDrQ0OuBv4uya1JXsAgUNYm+SqwnUGASfuNPyMi/RIlOQF4S1X9ybjXIu0PnmlIkrp5piGNkOQmHv/k0Wur6vZxrEc6UBgakqRuXp6SJHUzNCRJ3QwNSVI3Q0OS1M3QkCR1+38qOCjM258g7gAAAABJRU5ErkJggg==\n",
+      "text/plain": [
+       "<matplotlib.figure.Figure at 0x1ea4641b470>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "df.groupby(\"is_duplicate\")['id'].count().plot.bar()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "What is the step by step guide to invest in share market in india?\n",
+      "What is the step by step guide to invest in share market?\n",
+      "\n",
+      "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n",
+      "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n",
+      "\n",
+      "How can I increase the speed of my internet connection while using a VPN?\n",
+      "How can Internet speed be increased by hacking through DNS?\n",
+      "\n",
+      "Why am I mentally very lonely? How can I solve it?\n",
+      "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n",
+      "\n",
+      "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n",
+      "Which fish would survive in salt water?\n",
+      "\n",
+      "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n",
+      "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n",
+      "\n",
+      "Should I buy tiago?\n",
+      "What keeps childern active and far from phone and video games?\n",
+      "\n",
+      "How can I be a good geologist?\n",
+      "What should I do to be a great geologist?\n",
+      "\n",
+      "When do you use シ instead of し?\n",
+      "When do you use \"&\" instead of \"and\"?\n",
+      "\n",
+      "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n",
+      "How do I hack Motorola DCX3400 for free internet?\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = 0 \n",
+    "for i in range(a,a+10):\n",
+    "    print(df.question1[i])\n",
+    "    print(df.question2[i])\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "SPECIAL_TOKENS = {\n",
+    "    'quoted': 'quoted_item',\n",
+    "    'non-ascii': 'non_ascii_word',\n",
+    "    'undefined': 'something'\n",
+    "}\n",
+    "\n",
+    "def clean(text, stem_words=True):\n",
+    "    import re\n",
+    "    from string import punctuation\n",
+    "    from nltk.stem import SnowballStemmer\n",
+    "    from nltk.corpus import stopwords\n",
+    "    \n",
+    "    def pad_str(s):\n",
+    "        return ' '+s+' '\n",
+    "    \n",
+    "    if pd.isnull(text):\n",
+    "        return ''\n",
+    "\n",
+    "#    stops = set(stopwords.words(\"english\"))\n",
+    "    # Clean the text, with the option to stem words.\n",
+    "    \n",
+    "    # Empty question\n",
+    "    \n",
+    "    if type(text) != str or text=='':\n",
+    "        return ''\n",
+    "\n",
+    "    # Clean the text\n",
+    "    text = re.sub(\"\\'s\", \" \", text) # we have cases like \"Sam is\" or \"Sam's\" (i.e. his) these two cases aren't separable, I choose to compromise are kill \"'s\" directly\n",
+    "    text = re.sub(\" whats \", \" what is \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"\\'ve\", \" have \", text)\n",
+    "    text = re.sub(\"can't\", \"can not\", text)\n",
+    "    text = re.sub(\"n't\", \" not \", text)\n",
+    "    text = re.sub(\"i'm\", \"i am\", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"\\'re\", \" are \", text)\n",
+    "    text = re.sub(\"\\'d\", \" would \", text)\n",
+    "    text = re.sub(\"\\'ll\", \" will \", text)\n",
+    "    text = re.sub(\"e\\.g\\.\", \" eg \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"b\\.g\\.\", \" bg \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"(\\d+)(kK)\", \" \\g<1>000 \", text)\n",
+    "    text = re.sub(\"e-mail\", \" email \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"(the[\\s]+|The[\\s]+)?U\\.S\\.A\\.\", \" America \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"(the[\\s]+|The[\\s]+)?United State(s)?\", \" America \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"\\(s\\)\", \" \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\"[c-fC-F]\\:\\/\", \" disk \", text)\n",
+    "    \n",
+    "    # remove comma between numbers, i.e. 15,000 -> 15000\n",
+    "    \n",
+    "    text = re.sub('(?<=[0-9])\\,(?=[0-9])', \"\", text)\n",
+    "    \n",
+    "#     # all numbers should separate from words, this is too aggressive\n",
+    "    \n",
+    "#     def pad_number(pattern):\n",
+    "#         matched_string = pattern.group(0)\n",
+    "#         return pad_str(matched_string)\n",
+    "#     text = re.sub('[0-9]+', pad_number, text)\n",
+    "    \n",
+    "    # add padding to punctuations and special chars, we still need them later\n",
+    "    \n",
+    "    text = re.sub('\\$', \" dollar \", text)\n",
+    "    text = re.sub('\\%', \" percent \", text)\n",
+    "    text = re.sub('\\&', \" and \", text)\n",
+    "    \n",
+    "#    def pad_pattern(pattern):\n",
+    "#        matched_string = pattern.group(0)\n",
+    "#       return pad_str(matched_string)\n",
+    "#    text = re.sub('[\\!\\?\\@\\^\\+\\*\\/\\,\\~\\|\\`\\=\\:\\;\\.\\#\\\\\\]', pad_pattern, text) \n",
+    "        \n",
+    "    text = re.sub('[^\\x00-\\x7F]+', pad_str(SPECIAL_TOKENS['non-ascii']), text) # replace non-ascii word with special word\n",
+    "    \n",
+    "    # indian dollar\n",
+    "    \n",
+    "    text = re.sub(\"(?<=[0-9])rs \", \" rs \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(\" rs(?=[0-9])\", \" rs \", text, flags=re.IGNORECASE)\n",
+    "    \n",
+    "    # clean text rules get from : https://www.kaggle.com/currie32/the-importance-of-cleaning-text\n",
+    "    text = re.sub(r\" (the[\\s]+|The[\\s]+)?US(A)? \", \" America \", text)\n",
+    "    text = re.sub(r\" UK \", \" England \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" india \", \" India \", text)\n",
+    "    text = re.sub(r\" switzerland \", \" Switzerland \", text)\n",
+    "    text = re.sub(r\" china \", \" China \", text)\n",
+    "    text = re.sub(r\" chinese \", \" Chinese \", text) \n",
+    "    text = re.sub(r\" imrovement \", \" improvement \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" intially \", \" initially \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" quora \", \" Quora \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" dms \", \" direct messages \", text, flags=re.IGNORECASE)  \n",
+    "    text = re.sub(r\" demonitization \", \" demonetization \", text, flags=re.IGNORECASE) \n",
+    "    text = re.sub(r\" actived \", \" active \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" kms \", \" kilometers \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" cs \", \" computer science \", text, flags=re.IGNORECASE) \n",
+    "    text = re.sub(r\" upvote\", \" up vote\", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" iPhone \", \" phone \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" \\0rs \", \" rs \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" calender \", \" calendar \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" ios \", \" operating system \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" gps \", \" GPS \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" gst \", \" GST \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" programing \", \" programming \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" bestfriend \", \" best friend \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" dna \", \" DNA \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" III \", \" 3 \", text)\n",
+    "    text = re.sub(r\" banglore \", \" Banglore \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" J K \", \" JK \", text, flags=re.IGNORECASE)\n",
+    "    text = re.sub(r\" J\\.K\\. \", \" JK \", text, flags=re.IGNORECASE)\n",
+    "    \n",
+    "    # replace the float numbers with a random number, it will be parsed as number afterward, and also been replaced with word \"number\"\n",
+    "    \n",
+    "    text = re.sub('[0-9]+\\.[0-9]+', \" 87 \", text)\n",
+    "  \n",
+    "    \n",
+    "    # Remove punctuation from text\n",
+    "    text = ''.join([c for c in text if c not in punctuation]).lower()\n",
+    "       # Return a list of words\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['question1'] = df['question1'].apply(clean)\n",
+    "df['question2'] = df['question2'].apply(clean)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "what is the step by step guide to invest in share market in india\n",
+      "what is the step by step guide to invest in share market\n",
+      "\n",
+      "what is the story of kohinoor kohinoor diamond\n",
+      "what would happen if the indian government stole the kohinoor kohinoor diamond back\n",
+      "\n",
+      "how can i increase the speed of my internet connection while using a vpn\n",
+      "how can internet speed be increased by hacking through dns\n",
+      "\n",
+      "why am i mentally very lonely how can i solve it\n",
+      "find the remainder when math2324math is divided by 2423\n",
+      "\n",
+      "which one dissolve in water quikly sugar salt methane and carbon di oxide\n",
+      "which fish would survive in salt water\n",
+      "\n",
+      "astrology i am a capricorn sun cap moon and cap risingwhat does that say about me\n",
+      "i am a triple capricorn sun moon and ascendant in capricorn what does this say about me\n",
+      "\n",
+      "should i buy tiago\n",
+      "what keeps childern active and far from phone and video games\n",
+      "\n",
+      "how can i be a good geologist\n",
+      "what should i do to be a great geologist\n",
+      "\n",
+      "when do you use  nonasciiword  instead of  nonasciiword \n",
+      "when do you use  and  instead of and\n",
+      "\n",
+      "motorola company can i hack my charter motorolla dcx3400\n",
+      "how do i hack motorola dcx3400 for free internet\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = 0 \n",
+    "for i in range(a,a+10):\n",
+    "    print(df.question1[i])\n",
+    "    print(df.question2[i])\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### BOW + Xgboost Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "count_vect = CountVectorizer(analyzer='word', token_pattern=r'\\w{1,}')\n",
+    "count_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
+    "trainq1_trans = count_vect.transform(df['question1'].values)\n",
+    "trainq2_trans = count_vect.transform(df['question2'].values)\n",
+    "labels = df['is_duplicate'].values\n",
+    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
+    "y = labels\n",
+    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n",
+    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
+    "xgb_prediction = xgb_model.predict(X_valid)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "training score: 0.6177597850983563\n",
+      "validation score: 0.6154032008574583\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.70      0.95      0.80     84267\n",
+      "           1       0.77      0.30      0.43     49148\n",
+      "\n",
+      "   micro avg       0.71      0.71      0.71    133415\n",
+      "   macro avg       0.73      0.62      0.62    133415\n",
+      "weighted avg       0.72      0.71      0.66    133415\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
+    "\n",
+    "print('training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+    "print('validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
+    "print(classification_report(y_valid, xgb_prediction))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Word level TF-IDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', max_features=5000)\n",
+    "tfidf_vect.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
+    "trainq1_trans = tfidf_vect.transform(df['question1'].values)\n",
+    "trainq2_trans = tfidf_vect.transform(df['question2'].values)\n",
+    "labels = df['is_duplicate'].values\n",
+    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
+    "y = labels\n",
+    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "word level tf-idf training score: 0.8493408114951853\n",
+      "word level tf-idf validation score: 0.7576508867065961\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.79      0.90      0.84     84267\n",
+      "           1       0.77      0.60      0.67     49148\n",
+      "\n",
+      "   micro avg       0.79      0.79      0.79    133415\n",
+      "   macro avg       0.78      0.75      0.76    133415\n",
+      "weighted avg       0.79      0.79      0.78    133415\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
+    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
+    "xgb_prediction = xgb_model.predict(X_valid)\n",
+    "print('word level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+    "print('word level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
+    "print(classification_report(y_valid, xgb_prediction))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### N-gram Level TF-IDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n",
+    "tfidf_vect_ngram.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
+    "trainq1_trans = tfidf_vect_ngram.transform(df['question1'].values)\n",
+    "trainq2_trans = tfidf_vect_ngram.transform(df['question2'].values)\n",
+    "labels = df['is_duplicate'].values\n",
+    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
+    "y = labels\n",
+    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "n-gram level tf-idf training score: 0.7193864239031045\n",
+      "n-gram level tf-idf validation score: 0.67470696099733\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.73      0.92      0.81     84267\n",
+      "           1       0.75      0.42      0.54     49148\n",
+      "\n",
+      "   micro avg       0.73      0.73      0.73    133415\n",
+      "   macro avg       0.74      0.67      0.67    133415\n",
+      "weighted avg       0.74      0.73      0.71    133415\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
+    "xgb_prediction = xgb_model.predict(X_valid)\n",
+    "print('n-gram level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+    "print('n-gram level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
+    "print(classification_report(y_valid, xgb_prediction))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Character Level TF-IDF "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "character level tf-idf training score: 0.9844717869102682\n",
+      "character level tf-idf validation score: 0.8008380950798113\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "           0       0.83      0.91      0.87     84267\n",
+      "           1       0.81      0.67      0.74     49148\n",
+      "\n",
+      "   micro avg       0.82      0.82      0.82    133415\n",
+      "   macro avg       0.82      0.79      0.80    133415\n",
+      "weighted avg       0.82      0.82      0.82    133415\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import f1_score, classification_report, accuracy_score\n",
+    "tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\\w{1,}', ngram_range=(2,3), max_features=5000)\n",
+    "tfidf_vect_ngram_chars.fit(pd.concat((df['question1'],df['question2'])).unique())\n",
+    "trainq1_trans = tfidf_vect_ngram_chars.transform(df['question1'].values)\n",
+    "trainq2_trans = tfidf_vect_ngram_chars.transform(df['question2'].values)\n",
+    "labels = df['is_duplicate'].values\n",
+    "X = scipy.sparse.hstack((trainq1_trans,trainq2_trans))\n",
+    "y = labels\n",
+    "X_train,X_valid,y_train,y_valid = train_test_split(X,y, test_size = 0.33, random_state = 42)\n",
+    "xgb_model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train) \n",
+    "xgb_prediction = xgb_model.predict(X_valid)\n",
+    "print('character level tf-idf training score:', f1_score(y_train, xgb_model.predict(X_train), average='macro'))\n",
+    "print('character level tf-idf validation score:', f1_score(y_valid, xgb_model.predict(X_valid), average='macro'))\n",
+    "print(classification_report(y_valid, xgb_prediction))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}