From a9e16d114886609e4233e0e32249d051d3104f17 Mon Sep 17 00:00:00 2001 From: Susan Li Date: Tue, 25 Sep 2018 01:28:02 -0400 Subject: [PATCH] Add notebook --- Text Classification model selection.ipynb | 1344 +++++++++++++++++++++ 1 file changed, 1344 insertions(+) create mode 100644 Text Classification model selection.ipynb diff --git a/Text Classification model selection.ipynb b/Text Classification model selection.ipynb new file mode 100644 index 0000000..0b7c470 --- /dev/null +++ b/Text Classification model selection.ipynb @@ -0,0 +1,1344 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Model selection is the task of selecting a statistical model from a set of candidate models, given data. In the simplest cases, a pre-existing set of data is considered. Given candidate models of similar predictive or explanatory power, the simplest model is most likely to be the best choice." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The data is available in Google BigQuery that can be downloaded from here. The data is also publicly available at this Cloud Storage URL: https://storage.googleapis.com/tensorflow-workshop-examples/stack-overflow-data.csv." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import pandas as pd\n", + "import numpy as np\n", + "from numpy import random\n", + "import gensim\n", + "import nltk\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", + "from sklearn.metrics import accuracy_score, confusion_matrix\n", + "import matplotlib.pyplot as plt\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "from bs4 import BeautifulSoup\n", + "\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
posttags
0what is causing this behavior in our c# datet...c#
1have dynamic html load as if it was in an ifra...asp.net
2how to convert a float value in to min:sec i ...objective-c
3.net framework 4 redistributable just wonderi....net
4trying to calculate and print the mean and its...python
5how to give alias name for my website i have ...asp.net
6window.open() returns null in angularjs it wo...angularjs
7identifying server timeout quickly in iphone ...iphone
8unknown method key error in rails 2.3.8 unit ...ruby-on-rails
9from the include how to show and hide the con...angularjs
\n", + "
" + ], + "text/plain": [ + " post tags\n", + "0 what is causing this behavior in our c# datet... c#\n", + "1 have dynamic html load as if it was in an ifra... asp.net\n", + "2 how to convert a float value in to min:sec i ... objective-c\n", + "3 .net framework 4 redistributable just wonderi... .net\n", + "4 trying to calculate and print the mean and its... python\n", + "5 how to give alias name for my website i have ... asp.net\n", + "6 window.open() returns null in angularjs it wo... angularjs\n", + "7 identifying server timeout quickly in iphone ... iphone\n", + "8 unknown method key error in rails 2.3.8 unit ... ruby-on-rails\n", + "9 from the include how to show and hide the con... angularjs" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('stack-overflow-data.csv')\n", + "df = df[pd.notnull(df['tags'])]\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "10276752" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['post'].apply(lambda x: len(x.split(' '))).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have over 10 million words in the data." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "my_tags = ['java','html','asp.net','c#','ruby-on-rails','jquery','mysql','php','ios','javascript','python','c','css','android','iphone','sql','objective-c','c++','angularjs','.net']\n", + "plt.figure(figsize=(10,4))\n", + "df.tags.value_counts().plot(kind='bar');" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The classes are very well balanced." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def print_plot(index):\n", + " example = df[df.index == index][['post', 'tags']].values[0]\n", + " if len(example) > 0:\n", + " print(example[0])\n", + " print('Tag:', example[1])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Have a look at a few post and tag pairs." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "when we need interface c#
possible duplicate:
why would i want to use interfaces why i need interface
i want to know where and when to use it for example
interface idemo {  // function prototype  public void show(); }  // first class using the interface class myclass1 : idemo {  public void show()  {   // function body comes here   response.write( i m in myclass );  }  }  // second class using the interface class myclass2 : idemo {  public void show()   {   // function body comes here   response.write( i m in myclass2 );   response.write( so  what  );  } 
these two classes has the same function name with different body. this can be even achieved without interface. then why we need an interface where and when to use it\n", + "Tag: c#\n" + ] + } + ], + "source": [ + "print_plot(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "how to chain expressions inside ngclass when using the {...}[] form how can i add another expression to an ng-class directive that uses this form:
ng-class= {true: loading   false: loading-done }[data.loader===null]  
i d like to add something like this to the list:
{highlight:isspecial} 
is it possible without expanding the first expression thanks.\n", + "Tag: angularjs\n" + ] + } + ], + "source": [ + "print_plot(30)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The text needs to be cleaned up." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\\]\\|@,;]')\n", + "BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')\n", + "STOPWORDS = set(stopwords.words('english'))\n", + "\n", + "def clean_text(text):\n", + " \"\"\"\n", + " text: a string\n", + " \n", + " return: modified initial string\n", + " \"\"\"\n", + " text = BeautifulSoup(text, \"lxml\").text # HTML decoding\n", + " text = text.lower() # lowercase text\n", + " text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text\n", + " text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text\n", + " text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df['post'] = df['post'].apply(clean_text)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "need interface c# possible duplicate would want use interfaces need interface want know use example interface idemo function prototype public void show first class using interface class myclass1 idemo public void show function body comes responsewrite myclass second class using interface class myclass2 idemo public void show function body comes responsewrite myclass2 responsewrite two classes function name different body even achieved without interface need interface use\n", + "Tag: c#\n" + ] + } + ], + "source": [ + "print_plot(10)" + ] + }, + { + "cell_type": "markdown", +
"metadata": {}, + "source": [ + "Way better!" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "3421180" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['post'].apply(lambda x: len(x.split(' '))).sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we have over 3 million words to work with." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "X = df.post\n", + "y = df.tags\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 42)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next step is feature engineering. We will convert our text documents to a matrix of token counts (CountVectorizer), then transform the count matrix to a normalized tf-idf representation (TfidfTransformer). After that, we train several classifiers. 
\n", + "\n", + "### Naive Bayes classifier for multinomial models" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", + " strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.feature_extraction.text import TfidfTransformer\n", + "\n", + "nb = Pipeline([('vect', CountVectorizer()),\n", + " ('tfidf', TfidfTransformer()),\n", + " ('clf', MultinomialNB()),\n", + " ])\n", + "nb.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy 0.7395\n", + " precision recall f1-score support\n", + "\n", + " java 0.63 0.65 0.64 613\n", + " html 0.94 0.86 0.90 620\n", + " asp.net 0.87 0.92 0.90 587\n", + " c# 0.70 0.77 0.73 586\n", + "ruby-on-rails 0.73 0.87 0.79 599\n", + " jquery 0.72 0.51 0.60 589\n", + " mysql 0.77 0.74 0.75 594\n", + " php 0.69 0.89 0.78 610\n", + " ios 0.63 0.59 0.61 617\n", + " javascript 0.57 0.65 0.61 587\n", + " python 0.70 0.50 0.59 611\n", + " c 0.79 0.79 0.79 594\n", + " css 0.84 0.59 0.69 619\n", + " android 0.66 0.84 0.74 574\n", + " iphone 0.64 0.83 0.72 584\n", + " sql 0.66 0.64 0.65 578\n", + " objective-c 0.79 0.77 0.78 591\n", + " c++ 0.89 0.83 0.86 608\n", + " angularjs 0.94 0.89 0.91 638\n", + " .net 0.74 0.66 0.70 601\n", + "\n", + " avg / total 0.75 0.74 0.74 
12000\n", + "\n", + "Wall time: 880 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "from sklearn.metrics import classification_report\n", + "y_pred = nb.predict(X_test)\n", + "\n", + "print('accuracy %s' % accuracy_score(y_pred, y_test))\n", + "print(classification_report(y_test, y_pred,target_names=my_tags))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Linear support vector machine" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", + " strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,\n", + " tol=None, verbose=0, warm_start=False))])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import SGDClassifier\n", + "\n", + "sgd = Pipeline([('vect', CountVectorizer()),\n", + " ('tfidf', TfidfTransformer()),\n", + " ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),\n", + " ])\n", + "sgd.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy 0.7891666666666667\n", + " precision recall f1-score support\n", + "\n", + " java 0.74 0.68 0.71 613\n", + " html 0.85 0.93 0.89 620\n", + " asp.net 0.87 0.95 0.91 587\n", + " c# 0.81 0.80 0.80 586\n", + "ruby-on-rails 0.74 0.88 0.80 599\n", + " jquery 0.77 0.41 0.53 589\n", + " mysql 0.82 0.68 0.74 594\n", + " php 0.70 0.95 0.81 610\n", + " ios 0.82 0.56 0.66 617\n", + " javascript 0.72 0.59 0.65 587\n", + " python 0.71 0.65 0.68 611\n", + " 
c 0.81 0.87 0.84 594\n", + " css 0.77 0.79 0.78 619\n", + " android 0.83 0.86 0.85 574\n", + " iphone 0.81 0.80 0.81 584\n", + " sql 0.71 0.68 0.69 578\n", + " objective-c 0.81 0.90 0.85 591\n", + " c++ 0.84 0.96 0.89 608\n", + " angularjs 0.87 0.95 0.91 638\n", + " .net 0.77 0.89 0.83 601\n", + "\n", + " avg / total 0.79 0.79 0.78 12000\n", + "\n", + "Wall time: 940 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "y_pred = sgd.predict(X_test)\n", + "\n", + "print('accuracy %s' % accuracy_score(y_pred, y_test))\n", + "print(classification_report(y_test, y_pred,target_names=my_tags))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Logistic regression" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Pipeline(memory=None,\n", + " steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',\n", + " dtype=, encoding='utf-8', input='content',\n", + " lowercase=True, max_df=1.0, max_features=None, min_df=1,\n", + " ngram_range=(1, 1), preprocessor=None, stop_words=None,\n", + " strip...ty='l2', random_state=None,\n", + " solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "logreg = Pipeline([('vect', CountVectorizer()),\n", + " ('tfidf', TfidfTransformer()),\n", + " ('clf', LogisticRegression(n_jobs=1, C=1e5)),\n", + " ])\n", + "logreg.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy 0.783\n", + " precision recall f1-score support\n", + "\n", + " java 0.70 0.62 0.66 613\n", + " html 0.91 0.91 0.91 620\n", + " asp.net 0.97 0.94 0.95 587\n", + " c# 0.78 0.77 0.78 586\n", + "ruby-on-rails 0.77 0.81 
0.79 599\n", + " jquery 0.59 0.58 0.58 589\n", + " mysql 0.77 0.76 0.76 594\n", + " php 0.82 0.86 0.84 610\n", + " ios 0.70 0.72 0.71 617\n", + " javascript 0.61 0.59 0.60 587\n", + " python 0.64 0.63 0.64 611\n", + " c 0.83 0.83 0.83 594\n", + " css 0.78 0.78 0.78 619\n", + " android 0.85 0.85 0.85 574\n", + " iphone 0.80 0.83 0.81 584\n", + " sql 0.65 0.65 0.65 578\n", + " objective-c 0.82 0.84 0.83 591\n", + " c++ 0.91 0.91 0.91 608\n", + " angularjs 0.96 0.94 0.95 638\n", + " .net 0.78 0.83 0.80 601\n", + "\n", + " avg / total 0.78 0.78 0.78 12000\n", + "\n", + "Wall time: 883 ms\n" + ] + } + ], + "source": [ + "%%time\n", + "\n", + "y_pred = logreg.predict(X_test)\n", + "\n", + "print('accuracy %s' % accuracy_score(y_pred, y_test))\n", + "print(classification_report(y_test, y_pred,target_names=my_tags))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Word2vec embedding and Logistic Regression" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 2min 11s\n" + ] + } + ], + "source": [ + "%%time\n", + "from gensim.models import Word2Vec\n", + "\n", + "wv = gensim.models.KeyedVectors.load_word2vec_format(\"GoogleNews-vectors-negative300.bin.gz\", binary=True)\n", + "wv.init_sims(replace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Memorial_Hospital',\n", + " 'Seniors',\n", + " 'memorandum',\n", + " 'elephant',\n", + " 'Trump',\n", + " 'Census',\n", + " 'pilgrims',\n", + " 'De',\n", + " 'Dogs',\n", + " '###-####_ext',\n", + " 'chaotic',\n", + " 'forgive',\n", + " 'scholar',\n", + " 'Lottery',\n", + " 'decreasing',\n", + " 'Supervisor',\n", + " 'fundamentally',\n", + " 'Fitness',\n", + " 'abundance',\n", + " 'Hold']" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ 
+ "from itertools import islice\n", + "list(islice(wv.vocab, 13030, 13050))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A common way to represent a document is to average the word2vec vectors of its words. Like other bag-of-words (BOW) approaches, averaging ignores word order." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "def word_averaging(wv, words):\n", + " all_words, mean = set(), []\n", + " \n", + " for word in words:\n", + " if isinstance(word, np.ndarray):\n", + " mean.append(word)\n", + " elif word in wv.vocab:\n", + " mean.append(wv.syn0norm[wv.vocab[word].index])\n", + " all_words.add(wv.vocab[word].index)\n", + "\n", + " if not mean:\n", + " logging.warning(\"cannot compute similarity with no input %s\", words)\n", + " # FIXME: remove these examples in pre-processing\n", + " return np.zeros(wv.vector_size,)\n", + "\n", + " mean = gensim.matutils.unitvec(np.array(mean).mean(axis=0)).astype(np.float32)\n", + " return mean\n", + "\n", + "def word_averaging_list(wv, text_list):\n", + " return np.vstack([word_averaging(wv, post) for post in text_list ])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "def w2v_tokenize_text(text):\n", + " tokens = []\n", + " for sent in nltk.sent_tokenize(text, language='english'):\n", + " for word in nltk.word_tokenize(sent, language='english'):\n", + " if len(word) < 2:\n", + " continue\n", + " tokens.append(word)\n", + " return tokens" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "train, test = train_test_split(df, test_size=0.3, random_state = 42)\n", + "\n", + "test_tokenized = test.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values\n", + "train_tokenized = train.apply(lambda r: w2v_tokenize_text(r['post']), axis=1).values" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stderr", +
"output_type": "stream", + "text": [ + "C:\\Users\\SusanLi\\AppData\\Local\\Continuum\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `syn0norm` (Attribute will be removed in 4.0.0, use self.wv.vectors_norm instead).\n", + " \n", + "WARNING:root:cannot compute similarity with no input []\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 15.8 s\n" + ] + } + ], + "source": [ + "X_train_word_average = word_averaging_list(wv,train_tokenized)\n", + "X_test_word_average = word_averaging_list(wv,test_tokenized)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 1min 7s\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "logreg = LogisticRegression(n_jobs=1, C=1e5)\n", + "logreg = logreg.fit(X_train_word_average, train['tags'])\n", + "y_pred = logreg.predict(X_test_word_average)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy 0.6379166666666667\n", + " precision recall f1-score support\n", + "\n", + " java 0.63 0.59 0.61 613\n", + " html 0.73 0.76 0.75 620\n", + " asp.net 0.65 0.67 0.66 587\n", + " c# 0.53 0.52 0.52 586\n", + "ruby-on-rails 0.70 0.77 0.73 599\n", + " jquery 0.44 0.39 0.41 589\n", + " mysql 0.65 0.61 0.63 594\n", + " php 0.73 0.80 0.76 610\n", + " ios 0.60 0.61 0.61 617\n", + " javascript 0.56 0.52 0.54 587\n", + " python 0.55 0.50 0.52 611\n", + " c 0.61 0.61 0.61 594\n", + " css 0.65 0.65 0.65 619\n", + " android 0.60 0.57 0.59 574\n", + " iphone 0.70 0.71 0.71 584\n", + " sql 0.42 0.42 0.42 578\n", + " objective-c 0.68 0.71 0.70 591\n", + " c++ 0.76 0.78 0.77 608\n", + " angularjs 0.82 0.83 0.82 638\n", + " .net 0.66 0.71 0.68 601\n", + "\n", + " avg / total 0.63 0.64 0.64 12000\n", + "\n" + 
] + } + ], + "source": [ + "print('accuracy %s' % accuracy_score(y_pred, test.tags))\n", + "print(classification_report(test.tags, y_pred,target_names=my_tags))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Doc2vec and Logistic Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In Doc2vec, taking the linear combination of every term in the document creates a biased random walk process in the word2vec space." + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "tqdm.pandas(desc=\"progress-bar\")\n", + "from gensim.models import Doc2Vec\n", + "from sklearn import utils\n", + "import gensim\n", + "from gensim.models.doc2vec import TaggedDocument\n", + "import re" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "def label_sentences(corpus, label_type):\n", + " \"\"\"\n", + " Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.\n", + " We do this by using the TaggedDocument method. 
The format will be \"TRAIN_i\" or \"TEST_i\" where \"i\" is\n", + " a dummy index of the post.\n", + " \"\"\"\n", + " labeled = []\n", + " for i, v in enumerate(corpus):\n", + " label = label_type + '_' + str(i)\n", + " labeled.append(doc2vec.TaggedDocument(v.split(), [label]))\n", + " return labeled" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(df.post, df.tags, random_state=0, test_size=0.3)\n", + "X_train = label_sentences(X_train, 'Train')\n", + "X_test = label_sentences(X_test, 'Test')\n", + "all_data = X_train + X_test" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[TaggedDocument(words=['fulltext', 'search', 'php', 'pdo', 'returning', 'result', 'searched', 'lot', 'matter', 'find', 'wrong', 'setup', 'trying', 'fulltext', 'search', 'using', 'pdo', 'php', 'get', 'results', 'error', 'messages', 'table', 'contains', 'customer', 'details', 'id', 'int', '11', 'auto_increment', 'name', 'varchar', '150', 'lastname', 'varchar', '150', 'company', 'varchar', '250', 'adress', 'varchar', '150', 'postcode', 'int', '5', 'city', 'varchar', '150', 'email', 'varchar', '250', 'phone', 'varchar', '20', 'orgnr', 'varchar', '15', 'timestamp', 'timestamp', 'current_timestamp', 'run', 'sqlquery', 'alter', 'table', 'system_customer', 'add', 'fulltext', 'name', 'lastname', 'except', 'columns', 'id', 'postcode', 'timestamp', 'signs', 'trouble', 'far', 'idea', 'problem', 'lies', 'db', 'configuration', 'php', 'code', 'goes', 'php', 'sth', 'dbhprepare', 'select', 'name', 'lastname', 'company', 'adress', 'city', 'phone', 'email', 'orgnr', 'db_pre', 'customer', 'match', 'name', 'lastname', 'company', 'adress', 'city', 'phone', 'email', 'orgnr', 'search', 'boolean', 'mode', 'bind', 'placeholders', 'sthbindparam', 'search', 'data', 'sthexecute', 'rows', 'sthfetchall', 'testing', 'print_r', 
'dbherrorinfo', 'empty', 'rows', 'echo', 'else', 'echo', 'foreach', 'rows', 'row', 'echo', 'tr', 'datahref', 'new_orderphp', 'cid', 'row', 'id', 'echo', 'td', 'row', 'name', 'td', 'echo', 'td', 'row', 'lastname', 'td', 'echo', 'td', 'row', 'company', 'td', 'echo', 'td', 'row', 'phone', 'td', 'echo', 'td', 'row', 'email', 'td', 'echo', 'td', 'date', 'ymd', 'strtotime', 'row', 'timestamp', 'td', 'echo', 'tr', 'echo', 'tried', 'change', 'parameter', 'searchquery', 'string', 'like', 'testcompany', 'somename', 'boolean', 'mode', 'also', 'read', 'word', 'found', '50', 'rows', 'counts', 'common', 'word', 'pretty', 'sure', 'case', 'uses', 'specific', 'words', 'table', 'uses', 'myisam', 'engine', 'get', 'results', 'error', 'messages', 'please', 'help', 'point', 'wrong', 'thank'], tags=['Train_0']),\n", + " TaggedDocument(words=['select', 'everything', '1', 'table', 'x', 'rows', 'another', 'im', 'making', 'join', 'query', 'like', 'select', 'clothes', 'c', 'join', 'style', 'cstyleid', 'ssylelid', 'clothesid', '19', 'dont', 'want', 'select', 'everything', 'style', 'want', 'select', 'everything', 'clothes', '20', 'rows', 'select', '1', 'row', '10', 'style', 'easyest', 'way', 'without', 'select', 'every', 'row', 'clothes', '20', 'things', 'select', 'like', 'select', 'cid', 'cdescription', 'cname', 'csize', 'cbrand', 'sname', 'clothes', 'c', 'join', 'style', 'cstyleid', 'stsylelid', 'clothesid', '19', 'would', 'fastest', 'way', 'possibillity'], tags=['Train_1'])]" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_data[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 40000/40000 [00:00<00:00, 2559297.07it/s]\n" + ] + } + ], + "source": [ + "model_dbow = Doc2Vec(dm=0, vector_size=300, negative=5, min_count=1, alpha=0.065, min_alpha=0.065)\n", + "model_dbow.build_vocab([x for x in 
tqdm(all_data)])" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 40000/40000 [00:00<00:00, 2560195.33it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 2346627.88it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1163541.14it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1393769.03it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 2526879.43it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1688749.13it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1769393.90it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1734509.44it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1266463.05it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 5424078.11it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1078213.39it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 2491641.07it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1662262.56it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 917093.46it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1036219.09it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 2279574.99it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1734527.37it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1375643.95it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1036219.09it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1022926.13it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1092985.36it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 2417884.36it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1477596.00it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 2156315.92it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1266405.69it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 831221.87it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 6649709.08it/s]\n", + "100%|██████████| 40000/40000 
[00:00<00:00, 2559297.07it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 876772.44it/s]\n", + "100%|██████████| 40000/40000 [00:00<00:00, 1806079.68it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Wall time: 3min 36s\n" + ] + } + ], + "source": [ + "for epoch in range(30):\n", + " model_dbow.train(utils.shuffle([x for x in tqdm(all_data)]), total_examples=len(all_data), epochs=1)\n", + " model_dbow.alpha -= 0.002\n", + " model_dbow.min_alpha = model_dbow.alpha" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [], + "source": [ + "def get_vectors(model, corpus_size, vectors_size, vectors_type):\n", + " \"\"\"\n", + " Get vectors from trained doc2vec model\n", + " :param doc2vec_model: Trained Doc2Vec model\n", + " :param corpus_size: Size of the data\n", + " :param vectors_size: Size of the embedding vectors\n", + " :param vectors_type: Training or Testing vectors\n", + " :return: list of vectors\n", + " \"\"\"\n", + " vectors = np.zeros((corpus_size, vectors_size))\n", + " for i in range(0, corpus_size):\n", + " prefix = vectors_type + '_' + str(i)\n", + " vectors[i] = model.docvecs[prefix]\n", + " return vectors" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [], + "source": [ + "train_vectors_dbow = get_vectors(model_dbow, len(X_train), 300, 'Train')\n", + "test_vectors_dbow = get_vectors(model_dbow, len(X_test), 300, 'Test')" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=100000.0, class_weight=None, dual=False,\n", + " fit_intercept=True, intercept_scaling=1, max_iter=100,\n", + " multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,\n", + " solver='liblinear', tol=0.0001, verbose=0, warm_start=False)" + ] + }, + "execution_count": 114, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "logreg 
= LogisticRegression(n_jobs=1, C=1e5)\n", + "logreg.fit(train_vectors_dbow, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [], + "source": [ + "logreg = logreg.fit(train_vectors_dbow, y_train)\n", + "y_pred = logreg.predict(test_vectors_dbow)" + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "accuracy 0.8045\n", + " precision recall f1-score support\n", + "\n", + " java 0.73 0.68 0.70 589\n", + " html 0.89 0.91 0.90 661\n", + " asp.net 0.93 0.94 0.94 606\n", + " c# 0.80 0.80 0.80 613\n", + "ruby-on-rails 0.83 0.90 0.86 601\n", + " jquery 0.72 0.71 0.72 585\n", + " mysql 0.87 0.81 0.84 621\n", + " php 0.81 0.84 0.82 587\n", + " ios 0.68 0.67 0.67 560\n", + " javascript 0.69 0.63 0.66 611\n", + " python 0.63 0.65 0.64 593\n", + " c 0.81 0.83 0.82 581\n", + " css 0.81 0.77 0.79 608\n", + " android 0.84 0.85 0.84 593\n", + " iphone 0.84 0.82 0.83 592\n", + " sql 0.68 0.65 0.66 597\n", + " objective-c 0.84 0.86 0.85 604\n", + " c++ 0.90 0.95 0.92 610\n", + " angularjs 0.93 0.96 0.95 595\n", + " .net 0.81 0.84 0.82 593\n", + "\n", + " avg / total 0.80 0.80 0.80 12000\n", + "\n" + ] + } + ], + "source": [ + "print('accuracy %s' % accuracy_score(y_pred, y_test))\n", + "print(classification_report(y_test, y_pred,target_names=my_tags))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### BOW with keras" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], + "source": [ + "import itertools\n", + "import os\n", + "\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "\n", + "from sklearn.preprocessing import LabelBinarizer, LabelEncoder\n", + "from sklearn.metrics import confusion_matrix\n", + "\n", + "from tensorflow import keras\n", + "from 
keras.models import Sequential\n", + "from keras.layers import Dense, Activation, Dropout\n", + "from keras.preprocessing import text, sequence\n", + "from keras import utils" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train size: 28000\n", + "Test size: 12000\n" + ] + } + ], + "source": [ + "train_size = int(len(df) * .7)\n", + "print(\"Train size: %d\" % train_size)\n", + "print(\"Test size: %d\" % (len(df) - train_size))" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "train_posts = df['post'].iloc[:train_size]  # positional split: first train_size rows\n", + "train_tags = df['tags'].iloc[:train_size]\n", + "\n", + "test_posts = df['post'].iloc[train_size:]  # remaining rows are held out for testing\n", + "test_tags = df['tags'].iloc[train_size:]" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [], + "source": [ + "max_words = 1000\n", + "tokenize = text.Tokenizer(num_words=max_words, char_level=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [], + "source": [ + "tokenize.fit_on_texts(train_posts) # only fit on train\n", + "x_train = tokenize.texts_to_matrix(train_posts)\n", + "x_test = tokenize.texts_to_matrix(test_posts)" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [], + "source": [ + "encoder = LabelEncoder()\n", + "encoder.fit(train_tags)\n", + "y_train = encoder.transform(train_tags)\n", + "y_test = encoder.transform(test_tags)" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [], + "source": [ + "num_classes = len(encoder.classes_)  # taken from the fitted encoder so this cell can be re-run safely\n", + "y_train = utils.to_categorical(encoder.transform(train_tags), num_classes)\n", + "y_test = utils.to_categorical(encoder.transform(test_tags), num_classes)" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", +
"text": [ + "x_train shape: (28000, 1000)\n", + "x_test shape: (12000, 1000)\n", + "y_train shape: (28000, 20)\n", + "y_test shape: (12000, 20)\n" + ] + } + ], + "source": [ + "print('x_train shape:', x_train.shape)\n", + "print('x_test shape:', x_test.shape)\n", + "print('y_train shape:', y_train.shape)\n", + "print('y_test shape:', y_test.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [], + "source": [ + "batch_size = 32\n", + "epochs = 2" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [], + "source": [ + "# Build the model: a 512-unit ReLU hidden layer with 50% dropout, then a softmax over num_classes outputs\n", + "model = Sequential([\n", + "    Dense(512, input_shape=(max_words,)),\n", + "    Activation('relu'),\n", + "    Dropout(0.5),\n", + "    Dense(num_classes),\n", + "    Activation('softmax')])\n", + "\n", + "model.compile(loss='categorical_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 25200 samples, validate on 2800 samples\n", + "Epoch 1/2\n", + "25200/25200 [==============================] - 11s 442us/step - loss: 1.0261 - acc: 0.7180 - val_loss: 0.6658 - val_acc: 0.7975\n", + "Epoch 2/2\n", + "25200/25200 [==============================] - 11s 434us/step - loss: 0.5675 - acc: 0.8190 - val_loss: 0.6625 - val_acc: 0.7868\n" + ] + } + ], + "source": [ + "history = model.fit(x_train, y_train,\n", + " batch_size=batch_size,\n", + " epochs=epochs,\n", + " verbose=1,\n", + " validation_split=0.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "12000/12000 [==============================] - 1s 76us/step\n", + "Test accuracy: 0.7955833333333333\n" + ] + } + ], + "source": [ + "score = model.evaluate(x_test,
y_test,\n", + " batch_size=batch_size, verbose=1)\n", + "print('Test accuracy:', score[1])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}