From 7a37de25d55d5a8fc639405077f8d64e1bc8921f Mon Sep 17 00:00:00 2001 From: Susan Li Date: Mon, 17 Sep 2018 20:01:08 -0400 Subject: [PATCH] Add notebook --- Doc2Vec Consumer Complaint.ipynb | 1065 ++++++++++++++++++++++++++++ Doc2Vec Consumer Complaint_3.ipynb | 804 +++++++++++++++++++++ 2 files changed, 1869 insertions(+) create mode 100644 Doc2Vec Consumer Complaint.ipynb create mode 100644 Doc2Vec Consumer Complaint_3.ipynb diff --git a/Doc2Vec Consumer Complaint.ipynb b/Doc2Vec Consumer Complaint.ipynb new file mode 100644 index 0000000..816fd21 --- /dev/null +++ b/Doc2Vec Consumer Complaint.ipynb @@ -0,0 +1,1065 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Doc2Vec\n", + "\n", + "Doc2vec is an adaptation of Word2Vec that allows us to learn document similarity. Doc2vec model by itself is an unsupervised method." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "tqdm.pandas(desc=\"progress-bar\")\n", + "from gensim.models import Doc2Vec\n", + "from sklearn import utils\n", + "from sklearn.model_selection import train_test_split\n", + "import gensim\n", + "from sklearn.linear_model import LogisticRegression\n", + "from gensim.models.doc2vec import TaggedDocument\n", + "import re\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Consumer complaint narrativeProduct
1When my loan was switched over to Navient i wa...Student loan
2I tried to sign up for a spending monitoring p...Credit card or prepaid card
7My mortgage is with BB & T Bank, recently I ha...Mortgage
14The entire lending experience with Citizens Ba...Mortgage
15My credit score has gone down XXXX points in t...Credit reporting
17I few months back I contacted XXXX in regards...Credit reporting, credit repair services, or o...
28I '' m a victim of fraud and I have a file wit...Credit reporting, credit repair services, or o...
30My mortgage is owned by XXXX, we have painfull...Mortgage
32I have been disputing a Bankruptcy on my credi...Credit reporting, credit repair services, or o...
54Today I received a phone call from a number li...Debt collection
\n", + "
" + ], + "text/plain": [ + " Consumer complaint narrative \\\n", + "1 When my loan was switched over to Navient i wa... \n", + "2 I tried to sign up for a spending monitoring p... \n", + "7 My mortgage is with BB & T Bank, recently I ha... \n", + "14 The entire lending experience with Citizens Ba... \n", + "15 My credit score has gone down XXXX points in t... \n", + "17 I few months back I contacted XXXX in regards... \n", + "28 I '' m a victim of fraud and I have a file wit... \n", + "30 My mortgage is owned by XXXX, we have painfull... \n", + "32 I have been disputing a Bankruptcy on my credi... \n", + "54 Today I received a phone call from a number li... \n", + "\n", + " Product \n", + "1 Student loan \n", + "2 Credit card or prepaid card \n", + "7 Mortgage \n", + "14 Mortgage \n", + "15 Credit reporting \n", + "17 Credit reporting, credit repair services, or o... \n", + "28 Credit reporting, credit repair services, or o... \n", + "30 Mortgage \n", + "32 Credit reporting, credit repair services, or o... 
\n", + "54 Debt collection " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('Consumer_Complaints.csv')\n", + "df = df[['Consumer complaint narrative','Product']]\n", + "df = df[pd.notnull(df['Consumer complaint narrative'])]\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(318718, 2)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Consumer complaint narrative 0\n", + "Product 0\n", + "dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cnt_pro = df['Product'].value_counts()\n", + "\n", + "plt.figure(figsize=(12,4))\n", + "sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8)\n", + "plt.ylabel('Number of Occurrences', fontsize=12)\n", + "plt.xlabel('Product', fontsize=12)\n", + "plt.xticks(rotation=90)\n", + "plt.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The classes are imbalanced. However, a naive classifier that predicts every complaint to be Debt collection would only reach an accuracy of just over 20%." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text preprocessing\n", + "\n", + "Below we define a function that strips HTML markup, replaces `|||` separators with a space, removes URLs, converts the text to lower case and deletes every `x` character (apparently to drop the `XXXX` redaction masks; note that this also removes the letter from ordinary words, e.g. `experience` becomes `eperience`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "def cleanText(text):\n", + " text = BeautifulSoup(text, \"lxml\").text\n", + " text = re.sub(r'\\|\\|\\|', r' ', text) \n", + " text = re.sub(r'http\\S+', r'', text)\n", + " text = text.lower()\n", + " text = text.replace('x', '')\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df['Consumer complaint narrative'] = df['Consumer complaint narrative'].apply(cleanText)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'when my loan was switched over to navient i was never told that i had a deliquint balance because with i did not. when going to purchase a vehicle i discovered my credit score had been dropped from the into the . i have been faithful at paying my student loan. i was told that navient was the company i had delinquency with. i contacted navient to resolve this issue you and kept being told to just contact the credit bureaus and epalin the situation and maybe they could help me. i was so angry that i just hurried and paid the balance off and then after tried to dispute the delinquency with the credit bureaus. i have had so much trouble bringing my credit score back up.'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Consumer complaint narrative'][1]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(318718, 2)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Re-arrange the index of the table. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "df.index = range(318718)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Consumer complaint narrativeProduct
0when my loan was switched over to navient i wa...Student loan
1i tried to sign up for a spending monitoring p...Credit card or prepaid card
2my mortgage is with bb & t bank, recently i ha...Mortgage
3the entire lending eperience with citizens ban...Mortgage
4my credit score has gone down points in the l...Credit reporting
5i few months back i contacted in regards to ...Credit reporting, credit repair services, or o...
6i '' m a victim of fraud and i have a file wit...Credit reporting, credit repair services, or o...
7my mortgage is owned by , we have painfully de...Mortgage
8i have been disputing a bankruptcy on my credi...Credit reporting, credit repair services, or o...
9today i received a phone call from a number li...Debt collection
\n", + "
" + ], + "text/plain": [ + " Consumer complaint narrative \\\n", + "0 when my loan was switched over to navient i wa... \n", + "1 i tried to sign up for a spending monitoring p... \n", + "2 my mortgage is with bb & t bank, recently i ha... \n", + "3 the entire lending eperience with citizens ban... \n", + "4 my credit score has gone down points in the l... \n", + "5 i few months back i contacted in regards to ... \n", + "6 i '' m a victim of fraud and i have a file wit... \n", + "7 my mortgage is owned by , we have painfully de... \n", + "8 i have been disputing a bankruptcy on my credi... \n", + "9 today i received a phone call from a number li... \n", + "\n", + " Product \n", + "0 Student loan \n", + "1 Credit card or prepaid card \n", + "2 Mortgage \n", + "3 Mortgage \n", + "4 Credit reporting \n", + "5 Credit reporting, credit repair services, or o... \n", + "6 Credit reporting, credit repair services, or o... \n", + "7 Mortgage \n", + "8 Credit reporting, credit repair services, or o... \n", + "9 Debt collection " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "df.rename(columns = {'Consumer complaint narrative':'narrative'}, inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
narrativeProduct
0when my loan was switched over to navient i wa...Student loan
1i tried to sign up for a spending monitoring p...Credit card or prepaid card
2my mortgage is with bb & t bank, recently i ha...Mortgage
3the entire lending eperience with citizens ban...Mortgage
4my credit score has gone down points in the l...Credit reporting
\n", + "
" + ], + "text/plain": [ + " narrative \\\n", + "0 when my loan was switched over to navient i wa... \n", + "1 i tried to sign up for a spending monitoring p... \n", + "2 my mortgage is with bb & t bank, recently i ha... \n", + "3 the entire lending eperience with citizens ban... \n", + "4 my credit score has gone down points in the l... \n", + "\n", + " Product \n", + "0 Student loan \n", + "1 Credit card or prepaid card \n", + "2 Mortgage \n", + "3 Mortgage \n", + "4 Credit reporting " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below, we define a function to label each complaint narrative. And the TaggedDocument is an object-type to encapsulate a text-example function that helps to associate a tag/number with each document of the training corpus. In our case, the tag is simply the zero based line number." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.models import doc2vec\n", + "\n", + "def label_sentences(corpus, label_type):\n", + " \"\"\"\n", + " Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.\n", + " We do this by using the TaggedDocument method. 
The format will be \"TRAIN_i\" or \"TEST_i\" where \"i\" is\n", + " a dummy index of the complaint narrative.\n", + " \"\"\"\n", + " labeled = []\n", + " for i, v in enumerate(corpus):\n", + " label = label_type + '_' + str(i)\n", + " labeled.append(doc2vec.TaggedDocument(v.split(), [label]))\n", + " return labeled" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df.narrative, df.Product, random_state=0, test_size=0.3)\n", + "X_train = label_sentences(X_train, 'Train')\n", + "X_test = label_sentences(X_test, 'Test')\n", + "all_data = X_train + X_test" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "318718" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the training corpus." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[TaggedDocument(words=['i', 'am', 'having', 'trouble', 'repaying', 'my', 'student', 'loans', 'for', 'my', 'account', '.', 'due', 'to', 'economic', 'hardship', 'i', 'have', 'asked', 'for', 'an', 'income-driven', 'repayment', 'plan', 'and', 'received', 'an', 'offer', 'of', '{$180.00}', 'on', '2016.', 'however,', 'fed', 'loan', 'servicing', 'on', ',', '2016', 'sent', 'me', 'an', 'invoice', 'in', 'the', 'amount', 'of', '{$1300.00}', 'which', 'includes', 'an', 'amount', 'past', 'due', 'of', '{$610.00}', 'and', 'a', 'current', 'payment', 'of', '{$720.00}.', 'please', 'note', 'that', 'for', 'me', 'it', 'is', 'a', 'stretch', 'for', 'me', 'to', 'pay', 'the', 'income-driven', 'repayment', 'plan', 'and', 'received', 'an', 'offer', 'of', '{$180.00}', 'but', 'i', 'am', 'willing', 'to', 'do', 'the', 'sacrifice.', 'however,', 'it', 'would', 'be', 'a', 'financial', 'burden', 'to', 'pay', 'the', 'amount', 'asked', 'for', 'on', 'the', 'invoice', 'sent', 'by', 'fed', 'loan', 'servicing', 'on', ',', '2016', 'in', 'the', 'amount', 'of', '{$1300.00}.', 'furthermore,', 'reneging', 'on', 'the', 'income-driven', 'repayment', 'plan', 'offer', 'of', '{$180.00}', 'and', 'asking', 'me', 'now', 'to', 'pay', 'the', 'amount', 'of', '{$1300.00}', 'which', 'includes', 'an', 'amount', 'past', 'due', 'of', '{$610.00}', 'and', 'a', 'current', 'payment', 'of', '{$720.00}', 'would', 'be', 'detrimental', 'to', 'me.', 'by', 'reneging', 'fedloan', 'services', 'does', 'not', 'take', 'into', 'account', 'the', 'fact', 'that', 'i', 'have', 'no', 'discretionary', 'income', 'to', 'pay', 'my', 'student', 'loans', 'according', 'to', '.', 'i', 'have', 'asked', 'for', 'a', 'minimum', 'before', 'but', 'the', 'provider', 'has', 'sent', 'collections', 'notices', 'as', 'well', 'as', 'threats', 'of', 'garnishment.', 'likewise,', 'i', 'have', 'asked', 'for', 'as', 'i', 'am', 'a', 'but', 'i', 
'as', 'ill', 'advised', 'by', 'fed', 'loan', 'servicing', 'that', 'i', 'do', 'not', 'qualify', 'even', 'though', 'i', 'work', 'in', 'an', '.'], tags=['Train_0']),\n", + " TaggedDocument(words=['so', 'first', 'off', 'let', 'me', 'start', 'off', 'by', 'saying.', 'i', 'have', 'a', 'so', 'if', 'this', 'is', 'hard', 'to', 'understand', 'thats', 'why.', 'so', 'my', 'story', 'now.', 'i', 'have', 'a', 'credit', 'card', 'with', 'capital', 'one.', 'i', 'am', 'trying', 'to', 'fi', 'my', 'charging', 'privileges', 'and', 'restricted', 'status', 'on', 'my', 'account.', 'i', 'have', 'sent', 'them', 'certified', 'letters,', 'i', 'emailed', 'them,', 'and', 'i', 'have', 'called', 'them.', 'but', 'i', 'feel', 'like', 'i', 'am', 'getting', 'treated', 'with', 'neglection.', 'they', 'told', 'me', 'i', 'was', 'suppose', 'to', 'pay', 'off', 'my', 'bill', 'by', 'a', 'certain', 'date.', 'they', 'also', 'said', 'i', 'have', 'recovered', 'a', 'award', 'letter', 'to', 'get', 'my', 'privileges', 'back.', 'but', 'sense', 'i', 'did', \"n't\", 'pay', 'the', 'bill', 'by', 'the', 'certain', 'date', 'they', 'revoked', 'my', 'privileges.', 'but', 'i', 'am', 'disputing', 'this', 'item', 'for', 'you', 'to', 'review', 'because', 'i', 'did', 'pay', 'this', 'item', 'by', 'a', 'certain', 'date.', 'but', 'the', 'only', 'problem', 'is', 'there', 'posting', 'period', 'took', 'longer', 'then', 'usual.', 'so', 'they', 'revoked', 'my', 'privileges.', 'i', 'am', 'willing', 'to', 'work', 'with', 'this', 'creditor', 'i', 'tried.', 'but', 'they', 'did', \"n't\", 'work', 'with', 'me.', 'so', 'if', 'they', 'do', \"n't\", 'work', 'with', 'me.', 'i', 'will', 'be', 'releasing', 'a', 'public', 'press', 'release', 'online', 'about', 'this', 'company', 'for', 'neglect', 'ion', 'and', 'unable', 'to', 'work', 'with', 'someone', 'that', 'has', 'a', 'that', 'just', 'wants', 'to', 'rebuild', 'there', 'credit.', 'so', 'if', 'you', 'can', 'work', 'with', 'this', 'creditor', 'and', 'get', 'them', 'to', 'work', 'with', 'me', 'that', 
'would', 'be', 'progress.', 'i', 'feel', 'like', 'i', 'have', 'been', 'treated', 'with', 'neglection', 'here.', 'thanks'], tags=['Train_1'])]" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_data[:2]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Training the model\n", + "\n", + "We'll instantiate a Doc2Vec model: Distributed Bag of Words (DBOW). In the Word2Vec architecture, the two algorithm names are “continuous bag of words” (cbow) and “skip-gram” (sg); in the Doc2Vec architecture, the corresponding algorithms are “distributed memory” (dm) and “distributed bag of words” (dbow)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DBOW\n", + "\n", + "DBOW is the Doc2Vec model analogous to the Skip-gram model in Word2Vec. The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph.\n", + "\n", + "Training a Doc2Vec model is rather straightforward in Gensim; we initialize the model and train for 30 epochs:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`dm=0` selects ‘distributed bag of words’ (DBOW); `vector_size=300` is the dimensionality of the generated feature vectors; `negative=5` turns on negative sampling with 5 noise words; `min_count=1` keeps every word (words with a total frequency lower than `min_count` are ignored); `alpha=0.065` is the initial learning rate. Because `min_alpha` is set to the same value as `alpha`, the learning rate does not decay within a single call to `train()`; instead, the training loop below lowers it by 0.002 after each epoch. And then we build a vocabulary." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 318718/318718 [00:00<00:00, 2537947.09it/s]\n" + ] + } + ], + "source": [ + "model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)\n", + "model_dbow.build_vocab([x for x in tqdm(all_data)])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 318718/318718 [00:00<00:00, 2722501.94it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2600616.66it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2375667.86it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3399382.02it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3261864.60it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3449833.63it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2406732.26it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2549515.45it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2722773.66it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2970746.29it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3399338.80it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2088753.55it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3213755.54it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2864054.24it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2957273.91it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2913681.93it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2167860.25it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2060997.66it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2678169.32it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3192213.80it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1018820.26it/s]\n", + 
"100%|██████████| 318718/318718 [00:00<00:00, 3399390.67it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3399312.87it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2913364.43it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2522983.31it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2671185.61it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3399330.16it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3399330.16it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2913713.68it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2913828.00it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 33min 18s\n" + ] + } + ], + "source": [ + "%%time\n", + "for epoch in range(30):\n", + " model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)\n", + " model_dbow.alpha -= 0.002\n", + " model_dbow.min_alpha = model_dbow.alpha" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Define a function to get the vectors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def get_vectors(model, corpus_size, vectors_size, vectors_type):\n", + " \"\"\"\n", + " Get vectors from trained doc2vec model\n", + " :param doc2vec_model: Trained Doc2Vec model\n", + " :param corpus_size: Size of the data\n", + " :param vectors_size: Size of the embedding vectors\n", + " :param vectors_type: Training or Testing vectors\n", + " :return: list of vectors\n", + " \"\"\"\n", + " vectors = np.zeros((corpus_size, vectors_size))\n", + " for i in range(0, corpus_size):\n", + " prefix = vectors_type + '_' + str(i)\n", + " vectors[i] = model.docvecs[prefix]\n", + " return vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')\n", + "test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='multinomial',\n", + " n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',\n", + " tol=0.0001, verbose=0, warm_start=False)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logreg = LogisticRegression(multi_class='multinomial', solver = 'lbfgs')\n", + "logreg.fit(train_vectors_dbow, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6882425535475234" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logreg.score(test_vectors_dbow, y_test)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "model_dbow.save('d2v_model_dbow.doc2vec')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distributed Memory\n", + "\n", + "Distributed Memory (DM) acts as a memory that remembers what is missing from the current context — or as the topic of the paragraph. While the word vectors represent the concept of a word, the document vector intends to represent the concept of a document.\n", + "\n", + "We again instantiate a Doc2Vec model with a vector size with 100 words and iterating over the training corpus 30 times." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 318718/318718 [00:00<00:00, 1621798.90it/s]\n" + ] + } + ], + "source": [ + "model_dm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)\n", + "model_dm.build_vocab([x for x in tqdm(all_data)])" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 318718/318718 [00:00<00:00, 2764131.19it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1066654.90it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1880859.47it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1103720.83it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1070251.82it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 905616.33it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2172341.30it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 875676.71it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1535611.15it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 987147.58it/s] \n", + "100%|██████████| 318718/318718 [00:00<00:00, 1190525.12it/s]\n", + "100%|██████████| 
318718/318718 [00:00<00:00, 1836549.06it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1963555.23it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1123252.95it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1901808.46it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2648902.00it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1399458.12it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1711163.37it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3116360.38it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 3591405.62it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1831448.89it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2176986.26it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1168642.98it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2563414.55it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1858905.35it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1181675.70it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1164360.69it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 769659.58it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 1897995.78it/s]\n", + "100%|██████████| 318718/318718 [00:00<00:00, 2408128.30it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 56min 23s\n" + ] + } + ], + "source": [ + "%%time\n", + "for epoch in range(30):\n", + " model_dm.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)\n", + " model_dm.alpha -= 0.002\n", + " model_dm.min_alpha = model_dm.alpha" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "train_vectors_dm = get_vectors(model_dm, len(X_train), 300, 'Train')\n", + "test_vectors_dm = get_vectors(model_dm, len(X_test), 300, 'Test')" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='multinomial',\n", + " n_jobs=1, penalty='l2', random_state=None, solver='lbfgs',\n", + " tol=0.0001, verbose=0, warm_start=False)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logreg.fit(train_vectors_dm, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6646690930388219" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logreg.score(test_vectors_dm, y_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "model_dm.save('d2v_model_dm.doc2vec')" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# model_dbow = Doc2Vec.load('d2v_model_dbow.doc2vec')\n", + "# model_dm = Doc2Vec.load('d2v_model_dm.doc2vec')\n", + "model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)\n", + "model_dm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "def get_concat_vectors(model1,model2, corpus_size, vectors_size, vectors_type):\n", + " vectors = np.zeros((corpus_size, vectors_size))\n", + " for i in range(0, corpus_size):\n", + " prefix = vectors_type + '_' + str(i)\n", + " vectors[i] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])\n", + " return vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "train_vecs_dbow_dm = get_concat_vectors(model_dbow,model_dm, len(X_train), 600, 'Train')\n", + "test_vecs_dbow_dm = 
get_concat_vectors(model_dbow,model_dm, len(X_test), 600, 'Test')" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 19min 18s\n" + ] + } + ], + "source": [ + "%%time\n", + "logreg = LogisticRegression()\n", + "logreg.fit(train_vecs_dbow_dm, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7055199966532798" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logreg.score(test_vecs_dbow_dm, y_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Doc2Vec Consumer Complaint_3.ipynb b/Doc2Vec Consumer Complaint_3.ipynb new file mode 100644 index 0000000..5c67ae9 --- /dev/null +++ b/Doc2Vec Consumer Complaint_3.ipynb @@ -0,0 +1,804 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from tqdm import tqdm\n", + "tqdm.pandas(desc=\"progress-bar\")\n", + "from gensim.models import Doc2Vec\n", + "from sklearn import utils\n", + "from sklearn.model_selection import train_test_split\n", + "import gensim\n", + "from sklearn.linear_model import LogisticRegression\n", + "from gensim.models.doc2vec import TaggedDocument\n", + "import re\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
narrativeProduct
1When my loan was switched over to Navient i wa...Student loan
2I tried to sign up for a spending monitoring p...Credit card or prepaid card
7My mortgage is with BB & T Bank, recently I ha...Mortgage
14The entire lending experience with Citizens Ba...Mortgage
15My credit score has gone down XXXX points in t...Credit reporting
17I few months back I contacted XXXX in regards...Credit reporting, credit repair services, or o...
28I '' m a victim of fraud and I have a file wit...Credit reporting, credit repair services, or o...
30My mortgage is owned by XXXX, we have painfull...Mortgage
32I have been disputing a Bankruptcy on my credi...Credit reporting, credit repair services, or o...
54Today I received a phone call from a number li...Debt collection
\n", + "
" + ], + "text/plain": [ + " narrative \\\n", + "1 When my loan was switched over to Navient i wa... \n", + "2 I tried to sign up for a spending monitoring p... \n", + "7 My mortgage is with BB & T Bank, recently I ha... \n", + "14 The entire lending experience with Citizens Ba... \n", + "15 My credit score has gone down XXXX points in t... \n", + "17 I few months back I contacted XXXX in regards... \n", + "28 I '' m a victim of fraud and I have a file wit... \n", + "30 My mortgage is owned by XXXX, we have painfull... \n", + "32 I have been disputing a Bankruptcy on my credi... \n", + "54 Today I received a phone call from a number li... \n", + "\n", + " Product \n", + "1 Student loan \n", + "2 Credit card or prepaid card \n", + "7 Mortgage \n", + "14 Mortgage \n", + "15 Credit reporting \n", + "17 Credit reporting, credit repair services, or o... \n", + "28 Credit reporting, credit repair services, or o... \n", + "30 Mortgage \n", + "32 Credit reporting, credit repair services, or o... 
\n", + "54 Debt collection " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('Consumer_Complaints.csv')\n", + "df = df[['Consumer complaint narrative','Product']]\n", + "df = df[pd.notnull(df['Consumer complaint narrative'])]\n", + "df.rename(columns = {'Consumer complaint narrative':'narrative'}, inplace = True)\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(318718, 2)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "df.index = range(318718)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "63420212" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['narrative'].apply(lambda x: len(x.split(' '))).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have over 63 million words, it is not a small data set." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "cnt_pro = df['Product'].value_counts()\n", + "\n", + "plt.figure(figsize=(12,4))\n", + "sns.barplot(cnt_pro.index, cnt_pro.values, alpha=0.8)\n", + "plt.ylabel('Number of Occurrences', fontsize=12)\n", + "plt.xlabel('Product', fontsize=12)\n", + "plt.xticks(rotation=90)\n", + "plt.show();" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The classes are imbalanced. 
However, a naive classifier that predicts everything to be Debt collection will only achieve over 20% accuracy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's have a look at a few examples of complaint narratives and their associated products." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def print_complaint(index):\n", + " example = df[df.index == index][['narrative', 'Product']].values[0]\n", + " if len(example) > 0:\n", + " print(example[0])\n", + " print('Product:', example[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "I APPARENTLY HAVE AN OUTSTANDING DEBT WITH XXXX XXXX. THEY SENT THE ACCOUNT TO XXXX XXXX XXXX. THIS COLLECTION COMPANY CALLS MY WORK AND CELL SEVERAL TIMES A DAY. UP TO 6 TIMES A DAY. I TRY TO TELL THE COMPANY TO STOP CALLING ME AND THEY HANG UP LAUGHING. I HAVE NOT RECEIVED ANY PAPERWORK FROM THIS COMPANY. AND XXXX XXXX REFUSE TO SPEAK WITH ME. I HAVE MAILED IN A CEASE AND DESIST, AND I HAVE CALLED THEM TO GET THEIR FAX NUMBER TO SEND IN THE PAPERWORK-THEY KEEP GIVING ME DIFFERENT ONES ( FAX # XXXX ) I DON'T OWE ANY MONEY TO XXXX, I CLOSED THE ACCOUNT WITH A XXXX BALANCE AND TURNED IN ALL OF MY EQUIPMENT THAT I WAS RENTING. AND SO DID MY WIFE.\n", + "Product: Debt collection\n" + ] + } + ], + "source": [ + "print_complaint(12)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "In late XXXX of 2017 I requested a balance transfer from my Comenity Bank Visa card to pay off a high balance on another credit card I have. On XXXX XXXX, 2017 the charge posted to my Comenity Bank Visa card account however, as of today ( XXXX XXXX, 2017 ) they have never sent the money to pay off the other credit card. 
I have tried to resolve the problem with several phone calls to Comenity Bank but no one has been able to give a reasonable explanation of why they are charging my credit card account without actually sending the money to pay off the other credit card. I have also sent them a written dispute of the charge but have received no response yet.\n", + "Product: Credit card or prepaid card\n" + ] + } + ], + "source": [ + "print_complaint(20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Text preprocessing\n", + "\n", + "Below we define a function to convert text to lower-case and strip punctuation/symbols from words and so on." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "def cleanText(text):\n", + " text = BeautifulSoup(text, \"lxml\").text\n", + " text = re.sub(r'\\|\\|\\|', r' ', text) \n", + " text = re.sub(r'http\\S+', r'', text)\n", + " text = text.lower()\n", + " text = text.replace('x', '')\n", + " return text\n", + "df['narrative'] = df['narrative'].apply(cleanText)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'in late of 2017 i requested a balance transfer from my comenity bank visa card to pay off a high balance on another credit card i have. on , 2017 the charge posted to my comenity bank visa card account however, as of today ( , 2017 ) they have never sent the money to pay off the other credit card. i have tried to resolve the problem with several phone calls to comenity bank but no one has been able to give a reasonable eplanation of why they are charging my credit card account without actually sending the money to pay off the other credit card. 
i have also sent them a written dispute of the charge but have received no response yet.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['narrative'][20]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Train/test split of 70/30." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = train_test_split(df, test_size=0.3, random_state=42)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "import nltk\n", + "from nltk.corpus import stopwords\n", + "def tokenize_text(text):\n", + " tokens = []\n", + " for sent in nltk.sent_tokenize(text):\n", + " for word in nltk.word_tokenize(sent):\n", + " if len(word) < 2:\n", + " continue\n", + " tokens.append(word.lower())\n", + " return tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "train_tagged = train.apply(\n", + " lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)\n", + "test_tagged = test.apply(\n", + " lambda r: TaggedDocument(words=tokenize_text(r['narrative']), tags=[r.Product]), axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is what a training entry looks like - an example complaint narrative tagged by 'Credit reporting'." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TaggedDocument(words=['had', 'bankruptcy', 'years', 'ago', 'and', 'it', 'is', 'still', 'showing', 'up', 'on', 'equifa', 'which', 'is', 'preventing', 'me', 'from', 'buying', 'home', 'at', 'good', 'rate', 'they', 'need', 'to', 'take', 'it', 'off', 'like', 'did', 'so', 'my', 'score', 'will', 'be'], tags=['Credit reporting'])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_tagged.values[30]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training the model\n", + "\n", + "We'll instantiate a Doc2Vec model: Distributed Bag of Words (DBOW). In the Word2Vec architecture, the two algorithm names are “continuous bag of words” (cbow) and “skip-gram” (sg); in the Doc2Vec architecture, the corresponding algorithms are “distributed memory” (dm) and “distributed bag of words” (dbow), respectively." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DBOW\n", + "\n", + "DBOW is the Doc2Vec model analogous to the Skip-gram model in Word2Vec. The paragraph vectors are obtained by training a neural network on the task of predicting a probability distribution of words in a paragraph given a randomly-sampled word from the paragraph.\n", + "\n", + "Training a Doc2Vec model is rather straightforward in Gensim; we initialize the model and train for 30 epochs:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We set the minimum word count to 2 in order to discard words with very few occurrences. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "import multiprocessing\n", + "\n", + "cores = multiprocessing.cpu_count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Build a vocabulary" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 223102/223102 [00:00<00:00, 2855261.21it/s]\n" + ] + } + ], + "source": [ + "model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, hs=0, min_count=2, sample = 0, workers=cores)\n", + "model_dbow.build_vocab([x for x in tqdm(train_tagged.values)])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 223102/223102 [00:00<00:00, 2379615.48it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2849340.50it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2100357.58it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1596907.77it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2576198.16it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1961591.50it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1799270.90it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1216525.65it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1879469.14it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2159058.65it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1851940.21it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1868324.12it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2222950.33it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2035760.37it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1791036.78it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2039549.64it/s]\n", + "100%|██████████| 223102/223102 
[00:00<00:00, 3569332.45it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2855374.47it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2855418.04it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2379567.07it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2039580.76it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2322399.87it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2517534.79it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1888184.11it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 919457.47it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2060277.88it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1885517.07it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1539867.75it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2726717.42it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2033057.94it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 18min 24s\n" + ] + } + ], + "source": [ + "%%time\n", + "for epoch in range(30):\n", + " model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)\n", + " model_dbow.alpha -= 0.002\n", + " model_dbow.min_alpha = model_dbow.alpha" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Building the final vector feature for the classifier." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "def vec_for_learning(model, tagged_docs):\n", + " sents = tagged_docs.values\n", + " targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])\n", + " return targets, regressors" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "y_train, X_train = vec_for_learning(model_dbow, train_tagged)\n", + "y_test, X_test = vec_for_learning(model_dbow, test_tagged)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "logreg = LogisticRegression(n_jobs=1, C=1e5)\n", + "logreg.fit(X_train, y_train)\n", + "y_pred = logreg.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing accuracy 0.6683609437751004\n", + "Testing F1 score: 0.651646431211616\n" + ] + } + ], + "source": [ + "from sklearn.metrics import accuracy_score, f1_score\n", + "\n", + "print('Testing accuracy %s' % accuracy_score(y_test, y_pred))\n", + "print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distributed Memory with Averaging\n", + "\n", + "Distributed Memory (DM) acts as a memory that remembers what is missing from the current context — or as the topic of the paragraph. While the word vectors represent the concept of a word, the document vector intends to represent the concept of a document.\n", + "We again instantiate a Doc2Vec model with a vector size of 300 and iterate over the training corpus 30 times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 223102/223102 [00:00<00:00, 1886113.74it/s]\n" + ] + } + ], + "source": [ + "model_dmm = Doc2Vec(dm=1, dm_mean=1, vector_size=300, window=10, negative=5, min_count=1, workers=5, alpha=0.065, min_alpha=0.065)\n", + "model_dmm.build_vocab([x for x in tqdm(train_tagged.values)])" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 223102/223102 [00:00<00:00, 2855531.31it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1578083.26it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1745168.51it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1854244.79it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1893698.00it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1885653.85it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1869836.11it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1869832.37it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2445159.40it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2022787.35it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1165813.40it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2379542.86it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2379585.22it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1784628.78it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1784628.78it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1940508.05it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2039602.98it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1104039.77it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2039647.44it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2039562.97it/s]\n", + "100%|██████████| 
223102/223102 [00:00<00:00, 2379573.12it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2855418.04it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2379506.56it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2039554.08it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1784632.19it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1751177.79it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1892361.48it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1959722.32it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 1649827.85it/s]\n", + "100%|██████████| 223102/223102 [00:00<00:00, 2778023.03it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 33min 28s\n" + ] + } + ], + "source": [ + "%%time\n", + "for epoch in range(30):\n", + " model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)\n", + " model_dmm.alpha -= 0.002\n", + " model_dmm.min_alpha = model_dmm.alpha" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train Logistic Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing accuracy 0.47498326639892907\n", + "Testing F1 score: 0.4445833078167434\n" + ] + } + ], + "source": [ + "y_train, X_train = vec_for_learning(model_dmm, train_tagged)\n", + "y_test, X_test = vec_for_learning(model_dmm, test_tagged)\n", + "\n", + "logreg.fit(X_train, y_train)\n", + "y_pred = logreg.predict(X_test)\n", + "\n", + "print('Testing accuracy %s' % accuracy_score(y_test, y_pred))\n", + "print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "model_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)\n", 
+ "model_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n", + "new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "def get_vectors(model, tagged_docs):\n", + " sents = tagged_docs.values\n", + " targets, regressors = zip(*[(doc.tags[0], model.infer_vector(doc.words, steps=20)) for doc in sents])\n", + " return targets, regressors" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "y_train, X_train = get_vectors(new_model, train_tagged)\n", + "y_test, X_test = get_vectors(new_model, test_tagged)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing accuracy 0.6778572623828648\n", + "Testing F1 score: 0.664561533967402\n" + ] + } + ], + "source": [ + "logreg.fit(X_train, y_train)\n", + "y_pred = logreg.predict(X_test)\n", + "\n", + "print('Testing accuracy %s' % accuracy_score(y_test, y_pred))\n", + "print('Testing F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}