From c92afcb56efc74689767acb1ad184c43fd52c2e8 Mon Sep 17 00:00:00 2001
From: Susan Li
Date: Mon, 29 Oct 2018 04:50:38 -0400
Subject: [PATCH] Add notebook

---
 Word2vec_xgboost.ipynb | 1262 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1262 insertions(+)
 create mode 100644 Word2vec_xgboost.ipynb

diff --git a/Word2vec_xgboost.ipynb b/Word2vec_xgboost.ipynb
new file mode 100644
index 0000000..2afe164
--- /dev/null
+++ b/Word2vec_xgboost.ipynb
@@ -0,0 +1,1262 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Word2vec & Xgboost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import gensim\n",
+    "from fuzzywuzzy import fuzz\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk import word_tokenize\n",
+    "from tqdm import tqdm_notebook\n",
+    "from scipy.stats import skew, kurtosis\n",
+    "from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis\n",
+    "stop_words = stopwords.words('english')\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.metrics import classification_report\n",
+    "from sklearn.metrics import confusion_matrix\n",
+    "from sklearn.metrics import accuracy_score"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_csv('quora_train.csv')\n",
+    "df = df.dropna(how=\"any\").reset_index(drop=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "What is the step by step guide to invest in share market in india?\n",
+      "What is the step by step guide to invest in share market?\n",
+      "\n",
+      "What is the story of Kohinoor (Koh-i-Noor) Diamond?\n",
+      "What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?\n",
+      "\n",
+      "How can I increase the speed of my internet connection while using a VPN?\n",
+      "How can Internet speed be increased by hacking through DNS?\n",
+      "\n",
+      "Why am I mentally very lonely? 
How can I solve it?\n",
+      "Find the remainder when [math]23^{24}[/math] is divided by 24,23?\n",
+      "\n",
+      "Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?\n",
+      "Which fish would survive in salt water?\n",
+      "\n",
+      "Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?\n",
+      "I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?\n",
+      "\n",
+      "Should I buy tiago?\n",
+      "What keeps childern active and far from phone and video games?\n",
+      "\n",
+      "How can I be a good geologist?\n",
+      "What should I do to be a great geologist?\n",
+      "\n",
+      "When do you use シ instead of し?\n",
+      "When do you use \"&\" instead of \"and\"?\n",
+      "\n",
+      "Motorola (company): Can I hack my Charter Motorolla DCX3400?\n",
+      "How do I hack Motorola DCX3400 for free internet?\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = 0\n",
+    "for i in range(a, a + 10):\n",
+    "    print(df.question1[i])\n",
+    "    print(df.question2[i])\n",
+    "    print()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Computing Word Mover's Distance (WMD)\n",
+    "\n",
+    "WMD measures the dissimilarity between two text documents as the minimum total distance that the embedded words of one document need to \"travel\" to reach the embedded words of the other document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question1 = 'What would a Trump presidency mean for current international master’s students on an F1 visa?'\n",
+    "question2 = 'How will a Trump presidency affect the students presently in US or planning to study in US?'\n",
+    "\n",
+    "question1 = question1.lower().split()\n",
+    "question2 = question2.lower().split()\n",
+    "\n",
+    "question1 = [w for w in question1 if w not in stop_words]\n",
+    "question2 = [w for w in question2 if w not in stop_words]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We will use word2vec embeddings pre-trained on the Google News corpus, loaded into a Gensim KeyedVectors instance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = gensim.models.KeyedVectors.load_word2vec_format('./word2Vec_models/GoogleNews-vectors-negative300.bin.gz', binary=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's compute the WMD of these two sentences with the wmdistance method. The two sentences express the same meaning, and the pair is labeled as a duplicate."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "distance = 1.8293\n"
+     ]
+    }
+   ],
+   "source": [
+    "distance = model.wmdistance(question1, question2)\n",
+    "print('distance = %.4f' % distance)"
+   ]
+  },
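+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To get a sense of scale for this number, here is a minimal sanity check (a sketch, reusing the model and the preprocessed questions above): the WMD between a sentence and itself is 0, which anchors the low end of the scale."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sanity-check sketch: WMD of a sentence with itself should be 0.0,\n",
+    "# so 1.8293 sits well above \"identical\" on this (unnormalized) scale.\n",
+    "print('self distance = %.4f' % model.wmdistance(question1, question1))"
+   ]
+  },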
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This question pair is labeled as a duplicate, yet the distance between the two sentences is fairly large. This brings us to normalized WMD."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Normalizing word2vec vectors\n",
+    "\n",
+    "When using the wmdistance method, it is beneficial to normalize the word2vec vectors first, so they all have equal length. To do this, simply call model.init_sims(replace=True) and Gensim will take care of it for you.\n",
+    "\n",
+    "Usually, one measures the distance between two word2vec vectors with the cosine distance (see cosine similarity), which measures the angle between vectors. WMD, on the other hand, uses the Euclidean distance. The Euclidean distance between two vectors can be large simply because their lengths differ, even when the angle between them (and hence the cosine distance) is small; normalizing the vectors mitigates this."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "normalized distance = 0.7589\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.init_sims(replace=True)\n",
+    "distance = model.wmdistance(question1, question2)\n",
+    "print('normalized distance = %.4f' % distance)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After normalization, the distance became much smaller."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To put this in perspective, let's try one more pair. This time, the two questions are not duplicates."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "distance = 1.2637\n"
+     ]
+    }
+   ],
+   "source": [
+    "question3 = 'Why am I mentally very lonely? How can I solve it?'\n",
+    "question4 = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'\n",
+    "\n",
+    "question3 = question3.lower().split()\n",
+    "question4 = question4.lower().split()\n",
+    "\n",
+    "question3 = [w for w in question3 if w not in stop_words]\n",
+    "question4 = [w for w in question4 if w not in stop_words]\n",
+    "\n",
+    "distance = model.wmdistance(question3, question4)\n",
+    "print('distance = %.4f' % distance)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "normalized distance = 1.2637\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.init_sims(replace=True)\n",
+    "distance = model.wmdistance(question3, question4)\n",
+    "print('normalized distance = %.4f' % distance)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After normalization, the distance remains the same: the vectors were already unit-length after the earlier init_sims call. WMD considers the 2nd pair much less similar than the 1st pair. It worked!"
+   ]
+  },
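+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can verify what init_sims(replace=True) actually did (a minimal sketch; 'student' is just an arbitrary in-vocabulary word): every stored word vector now has unit L2 norm, so WMD depends only on word directions, not vector lengths."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# After init_sims(replace=True), each word vector is L2-normalized,\n",
+    "# so its norm should be ~1.0 ('student' is an arbitrary example word).\n",
+    "print('norm = %.4f' % np.linalg.norm(model['student']))"
+   ]
+  },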
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Soft Cosine Similarity\n",
+    "\n",
+    "Soft cosine generalizes cosine similarity by letting related but non-identical words contribute through a term similarity matrix $S$ built from word embeddings: $\\mathrm{softcos}(a, b) = \\frac{a^T S b}{\\sqrt{a^T S a} \\sqrt{b^T S b}}$, where $a$ and $b$ are the bag-of-words vectors of the two sentences."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from gensim import corpora\n",
+    "documents = [question1, question2, question3, question4]\n",
+    "dictionary = corpora.Dictionary(documents)\n",
+    "corpus = [dictionary.doc2bow(document) for document in documents]\n",
+    "\n",
+    "# Convert the sentences into bag-of-words vectors.\n",
+    "question1 = dictionary.doc2bow(question1)\n",
+    "question2 = dictionary.doc2bow(question2)\n",
+    "question3 = dictionary.doc2bow(question3)\n",
+    "question4 = dictionary.doc2bow(question4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gensim.downloader as api\n",
+    "\n",
+    "# Build the term similarity matrix from the smaller GloVe vectors.\n",
+    "w2v_model = api.load(\"glove-wiki-gigaword-50\")\n",
+    "similarity_matrix = w2v_model.similarity_matrix(dictionary)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "similarity = 0.7611\n"
+     ]
+    }
+   ],
+   "source": [
+    "from gensim.matutils import softcossim\n",
+    "\n",
+    "similarity = softcossim(question1, question2, similarity_matrix)\n",
+    "print('similarity = %.4f' % similarity)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The similarity for the 1st pair is relatively large, meaning soft cosine considers these two sentences very similar."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "similarity = 0.2030\n"
+     ]
+    }
+   ],
+   "source": [
+    "similarity = softcossim(question3, question4, similarity_matrix)\n",
+    "print('similarity = %.4f' % similarity)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "On the other hand, the similarity for the 2nd pair is very small, meaning soft cosine considers this pair dissimilar."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### FuzzyWuzzy\n",
+    "\n",
+    "We have covered some basics of fuzzy string matching in Python; let's take a quick peek at whether FuzzyWuzzy can help with our question dedupe problem."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "53"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from fuzzywuzzy import fuzz\n",
+    "\n",
+    "question1 = 'What would a Trump presidency mean for current international master’s students on an F1 visa?'\n",
+    "question2 = 'How will a Trump presidency affect the students presently in US or planning to study in US?'\n",
+    "fuzz.ratio(question1, question2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "100"
+      ]
+     },
+     "execution_count": 48,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fuzz.partial_token_set_ratio(question1, question2)"
+   ]
+  },
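+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "fuzz.ratio is strict character-level matching (53 for this duplicate pair), while fuzz.partial_token_set_ratio is very lenient (100, because it effectively scores only the tokens the two questions share). The token-based scorers in between are often more informative; a quick sketch:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: token_sort_ratio compares alphabetically sorted tokens;\n",
+    "# token_set_ratio compares intersection/remainder combinations.\n",
+    "# Both are looser than fuzz.ratio but stricter than the\n",
+    "# partial_token_set variant above.\n",
+    "print(fuzz.token_sort_ratio(question1, question2))\n",
+    "print(fuzz.token_set_ratio(question1, question2))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "question3 = 'Why am I mentally very lonely? 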
How can I solve it?'\n",
+    "question4 = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "28"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fuzz.ratio(question3, question4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "37"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "fuzz.partial_token_set_ratio(question3, question4)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "FuzzyWuzzy does not consider these two sentences similar in meaning. That's good.\n",
+    "\n",
+    "The remaining features will be the character length, the number of distinct characters, and the word count of each question; the number of words the two questions have in common; and the difference in length between question1 and question2."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def wmd(q1, q2):\n",
+    "    # WMD between two questions after lowercasing and stopword removal.\n",
+    "    q1 = str(q1).lower().split()\n",
+    "    q2 = str(q2).lower().split()\n",
+    "    stop_words = stopwords.words('english')\n",
+    "    q1 = [w for w in q1 if w not in stop_words]\n",
+    "    q2 = [w for w in q2 if w not in stop_words]\n",
+    "    return model.wmdistance(q1, q2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def norm_wmd(q1, q2):\n",
+    "    # Same as wmd(), but against the L2-normalized copy of the vectors.\n",
+    "    q1 = str(q1).lower().split()\n",
+    "    q2 = str(q2).lower().split()\n",
+    "    stop_words = stopwords.words('english')\n",
+    "    q1 = [w for w in q1 if w not in stop_words]\n",
+    "    q2 = [w for w in q2 if w not in stop_words]\n",
+    "    return norm_model.wmdistance(q1, q2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def sent2vec(s):\n",
+    "    # Sentence embedding: sum the word vectors, then L2-normalize.\n",
+    "    words = str(s).lower()\n",
+    "    words = word_tokenize(words)\n",
+    "    words = [w for w in words if w not in stop_words]\n",
+    "    words = [w for w in words if w.isalpha()]\n",
+    "    M = []\n",
+    "    for w in words:\n",
+    "        try:\n",
+    "            M.append(model[w])\n",
+    "        except KeyError:\n",
+    "            continue  # skip out-of-vocabulary words\n",
+    "    M = np.array(M)\n",
+    "    v = M.sum(axis=0)\n",
+    "    # If no word survives the filters, the division below yields NaN\n",
+    "    # (the source of the RuntimeWarnings later); those rows are handled\n",
+    "    # with np.nan_to_num before computing the distance features.\n",
+    "    return v / np.sqrt((v ** 2).sum())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['len_q1'] = df.question1.apply(lambda x: len(str(x)))\n",
+    "df['len_q2'] = df.question2.apply(lambda x: len(str(x)))\n",
+    "df['diff_len'] = df.len_q1 - df.len_q2\n",
+    "df['len_char_q1'] = df.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))\n",
+    "df['len_char_q2'] = df.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))\n",
+    "df['len_word_q1'] = df.question1.apply(lambda x: len(str(x).split()))\n",
+    "df['len_word_q2'] = df.question2.apply(lambda x: len(str(x).split()))\n",
+    "df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)\n",
+    "df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(str(x['question1']), str(x['question2'])), axis=1)\n",
+    "df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), 
axis=1)\n", + "df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)\n", + "df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)\n", + "df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)\n", + "df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " question1 \\\n", + "0 What is the step by step guide to invest in sh... \n", + "1 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", + "\n", + " question2 is_duplicate len_q1 \\\n", + "0 What is the step by step guide to invest in sh... 0 66 \n", + "1 What would happen if the Indian government sto... 0 51 \n", + "\n", + " len_q2 diff_len len_char_q1 len_char_q2 len_word_q1 len_word_q2 \\\n", + "0 57 9 20 20 14 12 \n", + "1 88 -37 21 29 8 13 \n", + "\n", + " common_words fuzz_ratio fuzz_partial_ratio fuzz_partial_token_set_ratio \\\n", + "0 10 93 98 100 \n", + "1 4 65 73 100 \n", + "\n", + " fuzz_partial_token_sort_ratio fuzz_token_set_ratio fuzz_token_sort_ratio \n", + "0 89 100 93 \n", + "1 75 86 63 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word2vec Modeling" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "model = gensim.models.KeyedVectors.load_word2vec_format('./word2Vec_models/GoogleNews-vectors-negative300.bin.gz', binary=True)\n", + "df['wmd'] = df.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " question1 \\\n", + "0 What is the step by step guide to invest in sh... \n", + "1 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", + "\n", + " question2 is_duplicate len_q1 \\\n", + "0 What is the step by step guide to invest in sh... 0 66 \n", + "1 What would happen if the Indian government sto... 0 51 \n", + "\n", + " len_q2 diff_len len_char_q1 len_char_q2 len_word_q1 len_word_q2 \\\n", + "0 57 9 20 20 14 12 \n", + "1 88 -37 21 29 8 13 \n", + "\n", + " common_words fuzz_ratio fuzz_partial_ratio fuzz_partial_token_set_ratio \\\n", + "0 10 93 98 100 \n", + "1 4 65 73 100 \n", + "\n", + " fuzz_partial_token_sort_ratio fuzz_token_set_ratio fuzz_token_sort_ratio \\\n", + "0 89 100 93 \n", + "1 75 86 63 \n", + "\n", + " wmd \n", + "0 0.564615 \n", + "1 3.772346 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normalized Word2vec Modeling" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "norm_model = gensim.models.KeyedVectors.load_word2vec_format('./word2Vec_models/GoogleNews-vectors-negative300.bin.gz', binary=True)\n", + "norm_model.init_sims(replace=True)\n", + "df['norm_wmd'] = df.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " question1 \\\n", + "0 What is the step by step guide to invest in sh... \n", + "1 What is the story of Kohinoor (Koh-i-Noor) Dia... \n", + "\n", + " question2 is_duplicate len_q1 \\\n", + "0 What is the step by step guide to invest in sh... 0 66 \n", + "1 What would happen if the Indian government sto... 0 51 \n", + "\n", + " len_q2 diff_len len_char_q1 len_char_q2 len_word_q1 len_word_q2 \\\n", + "0 57 9 20 20 14 12 \n", + "1 88 -37 21 29 8 13 \n", + "\n", + " common_words fuzz_ratio fuzz_partial_ratio fuzz_partial_token_set_ratio \\\n", + "0 10 93 98 100 \n", + "1 4 65 73 100 \n", + "\n", + " fuzz_partial_token_sort_ratio fuzz_token_set_ratio fuzz_token_sort_ratio \\\n", + "0 89 100 93 \n", + "1 75 86 63 \n", + "\n", + " wmd norm_wmd \n", + "0 0.564615 0.217555 \n", + "1 3.772346 1.368796 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e1267221391142a996d49c3ab2a4b15a", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "

\n" + ], + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=404287), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in double_scalars\n", + " \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bf1d0b01886541909cee3c550bb1ea3e", + "version_major": 2, + "version_minor": 0 + }, + "text/html": [ + "

\n" + ], + "text/plain": [ + "HBox(children=(IntProgress(value=0, max=404287), HTML(value='')))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "question1_vectors = np.zeros((df.shape[0], 300))\n", + "\n", + "for i, q in enumerate(tqdm_notebook(df.question1.values)):\n", + " question1_vectors[i, :] = sent2vec(q)\n", + " \n", + "question2_vectors = np.zeros((df.shape[0], 300))\n", + "for i, q in enumerate(tqdm_notebook(df.question2.values)):\n", + " question2_vectors[i, :] = sent2vec(q)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:698: RuntimeWarning: invalid value encountered in double_scalars\n", + " dist = 1.0 - uv / np.sqrt(uu * vv)\n", + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:853: RuntimeWarning: invalid value encountered in double_scalars\n", + " dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())\n", + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\scipy\\spatial\\distance.py:1138: RuntimeWarning: invalid value encountered in double_scalars\n", + " return l1_diff.sum() / l1_sum.sum()\n" + ] + } + ], + "source": [ + "df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]\n", + "df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]\n", + "df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]\n", + "df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]\n", + "df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 255024\n", + "1 149263\n", + "Name: is_duplicate, dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['is_duplicate'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "question1 0\n", + "question2 0\n", + "is_duplicate 0\n", + "len_q1 0\n", + "len_q2 0\n", + "diff_len 0\n", + "len_char_q1 0\n", + "len_char_q2 0\n", + "len_word_q1 0\n", + "len_word_q2 0\n", + "common_words 0\n", + "fuzz_ratio 0\n", + "fuzz_partial_ratio 0\n", + "fuzz_partial_token_set_ratio 0\n", + 
"fuzz_partial_token_sort_ratio 0\n", + "fuzz_token_set_ratio 0\n", + "fuzz_token_sort_ratio 0\n", + "wmd 0\n", + "norm_wmd 0\n", + "cosine_distance 1775\n", + "cityblock_distance 0\n", + "jaccard_distance 522\n", + "canberra_distance 0\n", + "euclidean_distance 0\n", + "minkowski_distance 0\n", + "braycurtis_distance 522\n", + "skew_q1vec 0\n", + "skew_q2vec 0\n", + "kur_q1vec 0\n", + "kur_q2vec 0\n", + "dtype: int64" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "df.drop(['question1', 'question2'], axis=1, inplace=True)\n", + "df = df[pd.notnull(df['cosine_distance'])]\n", + "df = df[pd.notnull(df['jaccard_distance'])]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import confusion_matrix \n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "X = df.loc[:, df.columns != 'is_duplicate']\n", + "y = df.loc[:, df.columns == 'is_duplicate']\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[60757 15121]\n", + " [12054 32822]]\n", + "Accuracy 0.7749556950494394\n", + " precision recall f1-score support\n", + "\n", + " 0 0.83 0.80 0.82 75878\n", + " 1 0.68 0.73 0.71 44876\n", + "\n", + " micro avg 0.77 0.77 0.77 120754\n", + " macro avg 0.76 0.77 0.76 120754\n", + "weighted avg 0.78 0.77 0.78 120754\n", + "\n" + ] + } + ], + "source": [ + "import xgboost as xgb\n", + "\n", + "model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1, colsample_bytree=.7, gamma=0, reg_alpha=4, objective='binary:logistic', eta=0.3, silent=1, subsample=0.8).fit(X_train, y_train.values.ravel()) \n", + "prediction = model.predict(X_test)\n", + "cm = confusion_matrix(y_test, prediction) \n", + "print(cm) \n", + "print('Accuracy', accuracy_score(y_test, prediction))\n", + "print(classification_report(y_test, prediction))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}