diff --git a/Amazon Reviews.ipynb b/Amazon Reviews.ipynb new file mode 100644 index 0000000..53e18b4 --- /dev/null +++ b/Amazon Reviews.ipynb @@ -0,0 +1,775 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Product NameBrand NamePriceRatingReviewsReview Votes
0\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.995I feel so LUCKY to have found this used (phone...1.0
1\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.994nice phone, nice up grade from my pantach revu...0.0
2\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.995Very pleased0.0
3\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.994It works good but it goes slow sometimes but i...0.0
4\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.994Great phone to replace my lost phone. The only...0.0
\n", + "
" + ], + "text/plain": [ + " Product Name Brand Name Price \\\n", + "0 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "1 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "2 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "3 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "4 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "\n", + " Rating Reviews Review Votes \n", + "0 5 I feel so LUCKY to have found this used (phone... 1.0 \n", + "1 4 nice phone, nice up grade from my pantach revu... 0.0 \n", + "2 5 Very pleased 0.0 \n", + "3 4 It works good but it goes slow sometimes but i... 0.0 \n", + "4 4 Great phone to replace my lost phone. The only... 0.0 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "df = pd.read_csv('Amazon_Unlocked_Mobile.csv')\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Product NameBrand NamePriceRatingReviewsReview VotesPositively Rated
0\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.995I feel so LUCKY to have found this used (phone...1.01
1\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.994nice phone, nice up grade from my pantach revu...0.01
2\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.995Very pleased0.01
3\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.994It works good but it goes slow sometimes but i...0.01
4\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.994Great phone to replace my lost phone. The only...0.01
5\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.991I already had a phone with problems... I know ...1.00
6\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.992The charging port was loose. I got that solder...0.00
7\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.992Phone looks good but wouldn't stay charged, ha...0.00
8\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.995I originally was using the Samsung S2 Galaxy f...0.01
9\"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7...Samsung199.993It's battery life is great. It's very responsi...0.00
\n", + "
" + ], + "text/plain": [ + " Product Name Brand Name Price \\\n", + "0 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "1 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "2 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "3 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "4 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "5 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "6 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "7 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "8 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "9 \"CLEAR CLEAN ESN\" Sprint EPIC 4G Galaxy SPH-D7... Samsung 199.99 \n", + "\n", + " Rating Reviews Review Votes \\\n", + "0 5 I feel so LUCKY to have found this used (phone... 1.0 \n", + "1 4 nice phone, nice up grade from my pantach revu... 0.0 \n", + "2 5 Very pleased 0.0 \n", + "3 4 It works good but it goes slow sometimes but i... 0.0 \n", + "4 4 Great phone to replace my lost phone. The only... 0.0 \n", + "5 1 I already had a phone with problems... I know ... 1.0 \n", + "6 2 The charging port was loose. I got that solder... 0.0 \n", + "7 2 Phone looks good but wouldn't stay charged, ha... 0.0 \n", + "8 5 I originally was using the Samsung S2 Galaxy f... 0.0 \n", + "9 3 It's battery life is great. It's very responsi... 0.0 \n", + "\n", + " Positively Rated \n", + "0 1 \n", + "1 1 \n", + "2 1 \n", + "3 1 \n", + "4 1 \n", + "5 0 \n", + "6 0 \n", + "7 0 \n", + "8 1 \n", + "9 0 " + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dropna(inplace=True)\n", + "df[df['Rating'] != 3]\n", + "df['Positively Rated'] = np.where(df['Rating'] > 3, 1, 0)\n", + "df.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6899487041440472" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['Positively Rated'].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively Rated'], random_state = 0)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "X_train first entry: \n", + "\n", + " I feel so LUCKY to have found this used (phone to us & not used hard at all), phone on line from someone who upgraded and sold this one. My Son liked his old one that finally fell apart after 2.5+ years and didn't want an upgrade!! Thank you Seller, we really appreciate it & your honesty re: said used phone.I recommend this seller very highly & would but from them again!!\n", + "\n", + "\n", + "X_train shape: (250751,)\n" + ] + } + ], + "source": [ + "print('X_train first entry: \\n\\n', X_train[0])\n", + "print('\\n\\nX_train shape: ', X_train.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# CountVectorizer\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "vect = CountVectorizer().fit(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['00',\n", + " '858',\n", + " 'approval',\n", + " 'booth',\n", + " 'cmon',\n", + " 'dealsthanks',\n", + " 'eclair',\n", + " 'ff',\n", + " 'gsmpros',\n", + " 'insertion',\n", + " 'linkhttps',\n", + " 'movment',\n", + " 'outmoded',\n", + " 'preset',\n", + " 'reinstallation',\n", + " 'separatingly',\n", + " 'stillnumbers',\n", + " 'todos',\n", + " 'verycool']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vect.get_feature_names()[::3000]" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "56947" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(vect.get_feature_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<250751x56947 sparse matrix of type ''\n", + "\twith 6848632 stored elements in Compressed Sparse Row format>" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# transform the documents in the training data to a document-term matrix\n", + "X_train_vectorized = vect.transform(X_train)\n", + "X_train_vectorized" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", + " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", + " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", + " verbose=0, warm_start=False)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "model = LogisticRegression()\n", + "model.fit(X_train_vectorized, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC: 0.896300276575\n" + ] + } + ], + "source": [ + "from sklearn.metrics import roc_auc_score\n", + "\n", + "predictions = model.predict(vect.transform(X_test))\n", + "\n", + "print('AUC: ', roc_auc_score(y_test, predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Smallest Coefs: \n", + "['raymond' 'stylist' 'mony' 'false' 'worst' 'unsatisfied' 'disconnects'\n", + " 'pos' 'horribly' 'lies']\n", + "\n", + "Largest Coefs: \n", + "['excelent' 'excelente' '4eeeks' 'perfecto' 'exelente' 'loving' 'awsome'\n", + " 'excellent' 'becuse' 'sweet']\n", + "\n" + ] + } + ], + "source": [ + "# get the feature names as numpy array\n", + "feature_names = np.array(vect.get_feature_names())\n", + "\n", + "# Sort the coefficients from the model\n", + "sorted_coef_index = model.coef_[0].argsort()\n", + "\n", + "# Find the 10 smallest and 10 largest coefficients\n", + "# The 10 largest coefficients are being indexed using [:-11:-1] \n", + "# so the list returned is in order of largest to smallest\n", + "print('Smallest Coefs: \\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n", + "print('Largest Coefs: \\n{}\\n'.format(feature_names[sorted_coef_index[:-11:-1]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "18951" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Tfidf\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "\n", + "# Fit the TfidfVectorizer to the training data specifiying a minimum document frequency of 5\n", + "vect = TfidfVectorizer(min_df = 5).fit(X_train)\n", + "len(vect.get_feature_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC: 0.900231756945\n" + ] + } + ], + "source": [ + "X_train_vectorized = vect.transform(X_train)\n", + "\n", + "model = LogisticRegression()\n", + "model.fit(X_train_vectorized, y_train)\n", + "predictions = model.predict(vect.transform(X_test))\n", + "print('AUC: ', roc_auc_score(y_test, predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Smallest Tfidf: \n", + "['brawns' 'messiah' 'excites' 'reading___' 'seizing' 'srgb' 'liquidating'\n", + " '16nm' '1b' '700nits']\n", + "\n", + "Largest Tfidf: \n", + "['excllent' 'ecxelente' 'purchase' 'eh' 'gud' 'looser' 'gucci' 'soo'\n", + " 'unusable' 'lost']\n", + "\n" + ] + } + ], + "source": [ + "feature_names = np.array(vect.get_feature_names())\n", + "\n", + "sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()\n", + "\n", + "print('Smallest Tfidf: \\n{}\\n'.format(feature_names[sorted_tfidf_index[:10]]))\n", + "print('Largest Tfidf: \\n{}\\n'.format(feature_names[sorted_tfidf_index[:-11:-1]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Smallest coef: \n", + "['not' 'worst' 'disappointed' 'waste' 'poor' 'terrible' 'return' 'stopped'\n", + " 'slow' 'returning']\n", + "\n", + "Largest coef: \n", + "['love' 'great' 'amazing' 'excellent' 'perfect' 'loves' 'best' 'awesome'\n", + " 'perfectly' 'easy']\n", + "\n" + ] + } + ], + "source": [ + "sorted_coef_index = model.coef_[0].argsort()\n", + "\n", + "print('Smallest coef: \\n{}\\n'.format(feature_names[sorted_coef_index[:10]]))\n", + "print('Largest coef: \\n{}\\n'.format(feature_names[sorted_coef_index[:-11:-1]]))" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0]\n" + ] + } + ], + "source": [ + "# These reviews are treated the same by our current model\n", + "\n", + "print(model.predict(vect.transform(['Not an issue, phone is working', \n", + " 'an issue, phone is not working'])))" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "217383" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# n-grams\n", + "# Fit the CountVectorizer to the training data specifiying a minimum \n", + "# document frequency of 5 and extracting 1-grams and 2-grams\n", + "vect = CountVectorizer(min_df = 5, ngram_range = (1,2)).fit(X_train)\n", + "X_train_vectorized = vect.transform(X_train)\n", + "len(vect.get_feature_names())" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AUC: 0.948236892649\n" + ] + } + ], + "source": [ + "model = LogisticRegression()\n", + "model.fit(X_train_vectorized, y_train)\n", + "\n", + "predictions = model.predict(vect.transform(X_test))\n", + "print('AUC: ', roc_auc_score(y_test, predictions))" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Smallest Coef: \n", + "['junk' 'no good' 'worst' 'good love' 'horrible' 'nope' 'terrible'\n", + " 'needed good' 'three stars' 'not happy']\n", + "\n", + "Largest Coef: \n", + "['excelent' 'excelente' 'excellent' 'perfect' 'no issues' 'no problems'\n", + " 'perfecto' 'exelente' 'awesome' 'awsome']\n", + "\n" + ] + } + ], + "source": [ + "feature_names = np.array(vect.get_feature_names())\n", + "sorted_coef_index = model.coef_[0].argsort()\n", + "\n", + "print('Smallest Coef: \\n{}\\n'.format(feature_names[sorted_coef_index][:10]))\n", + "print('Largest Coef: \\n{}\\n'.format(feature_names[sorted_coef_index][:-11:-1]))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[0 0]\n" + ] + } + ], + "source": [ + "print(model.predict(vect.transform(['not an issue, phone is working',\n", + " 'an issue, phone is not working'])))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}