diff --git a/ObidroidLearning.ipynb b/ObidroidLearning.ipynb new file mode 100644 index 0000000..570ff2b --- /dev/null +++ b/ObidroidLearning.ipynb @@ -0,0 +1,1421 @@ +{ + "metadata": { + "name": "" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Obidroid Learning Notebook" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Loading App Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I shall be using the previously exported `exports/appFeatures.csv` data for this notebook" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pandas import Series, DataFrame\n", + "\n", + "## Load the appFeatures file\n", + "appData = pd.read_csv('exports/appFeatures.csv')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "\"\"\"\n", + "Lets create a new dataframe for appFeatures and appLabels\n", + "\"\"\"\n", + "\n", + "## for App Labels\n", + "appLabels = appData['appLabel']\n", + "\n", + "\n", + "## for App Features\n", + "appCols = set(appData.columns)\n", + "appCols.remove('appName') # remove app Names column\n", + "appCols.remove('Unnamed: 7') # removing a weird unnamed column\n", + "appCols.remove('appLabel') # removing the label column\n", + "appCols.remove('price') # removing price since most of the apps are free\n", + "appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0\n", + "\n", + "appFeatures = appData[list(appCols)]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, I want to explicitly set types to all my columns as a better practice" + ] + }, + { 
+ "cell_type": "code", + "collapsed": false, + "input": [ + "# Explicitly casting column types in appFeatures dataframe\n", + "\n", + "# -- boolean\n", + "appFeatures['hasPrivacy'].astype(bool)\n", + "appFeatures['hasDeveloperEmail'].astype(bool)\n", + "appFeatures['hasDeveloperWebsite'].astype(bool)\n", + "\n", + "# -- integer\n", + "appFeatures['adjectiveCount'].astype(int)\n", + "appFeatures['countCapital'].astype(int)\n", + "appFeatures['installs'].astype(int)\n", + "appFeatures['revSent'].astype(int)\n", + "appFeatures['revLength'].astype(int)\n", + "\n", + "# -- float\n", + "appFeatures['avgRating'].astype(float)\n", + "\n", + "\n", + "appFeatures" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
adjectiveCounthasPrivacyrevLengthcountCapitalhasDeveloperWebsiteinstallshasDeveloperEmailavgRatingrevSent
0 4 True 601 1 True 30000000 True 4.051 -3
1 13 True 1139 11 True 30000000 True 4.351 2
2 23 True 2223 20 True 3000000 False 4.555 -4
3 10 False 804 5 True 30000000 True 4.623 8
4 22 True 1867 16 True 7500000 False 4.046-11
5 18 False 1162 6 True 30000000 True 4.595 1
6 18 True 1522 60 True 30000000 True 4.526 -4
7 13 False 1895 19 True 30000000 False 4.039 -5
8 11 True 1195 10 True 3000000 True 4.400 -2
9 19 True 1488 11 True 300000 True 3.935 -4
10 18 False 1864 35 True 3000000 True 4.075 -5
11 19 False 2049 14 True 750000 False 3.983 -2
12 8 False 417 2 True 30000000 True 4.238 1
13 16 False 1276 11 True 3000000 True 3.915 -3
14 13 False 1210 12 True 750000 True 4.050 -3
15 20 True 2038 24 True 750000 True 3.795 -7
16 12 False 1044 10 True 7500000 True 3.997 1
17 15 True 1245 13 True 3000000 True 3.212 -5
18 2 False 225 2 True 3000000 True 2.611 -1
19 15 False 1120 10 True 30000000 True 4.547 -3
20 22 True 1406 11 True 3000000 True 2.671 4
21 13 True 1063 10 True 750000 False 4.045 -3
22 7 False 855 10 True 750000 True 3.555 -9
23 17 True 2147 38 True 3000000 True 4.590 -5
24 12 True 1189 13 True 7500000 True 4.258 -9
25 19 False 1804 8 True 750000 True 4.428-10
26 26 False 1514 5 True 3000000 True 4.401 5
27 12 True 1272 12 True 3000000 True 4.275 -6
28 17 True 1413 14 True 750000 True 4.149 -8
29 7 True 610 2 True 300000 True 4.396 -3
30 14 True 1145 5 True 3000000 True 4.113 2
31 14 False 1413 14 True 30000 True 4.240 2
32 2 True 573 14 True 300000 True 4.241 -4
33 12 True 1387 16 True 3000000 True 3.989 -6
34 19 True 1336 7 True 75000 True 4.310 -7
35 7 True 817 5 True 3000000 True 4.451 5
36 29 True 2205 16 True 300000 True 3.916 -9
37 7 False 541 3 True 300000 True 4.761 1
38 6 False 310 1 True 300000 True 4.158 0
39 6 False 880 4 True 300000 True 2.972 -4
40 5 False 583 4 True 300000 True 3.903 -6
41 19 False 1888 20 True 3000000 True 3.433 -9
42 13 False 1122 4 True 3000000 True 4.412 -3
43 15 True 1613 21 True 3000000 False 4.461 15
44 5 False 1014 11 False 300000 True 3.564 -2
45 8 False 1115 6 False 3000000 True 4.131 5
46 6 True 437 8 True 30000 True 3.550 -1
47 2 True 307 0 True 3000000 True 4.435 1
48 13 True 991 4 True 300000 True 4.233 -3
49 7 False 578 6 True 3000000 True 3.975 1
50 10 True 962 7 True 300000 True 3.926 4
51 13 False 1267 6 True 3000000 True 4.590 2
52 9 False 1300 8 True 300000 True 3.601 -9
53 16 False 1051 12 True 300000 True 3.701 3
54 16 True 1822 10 True 300000 True 2.931-13
55 7 False 535 0 True 3000000 True 4.564 0
56 11 False 1075 8 True 7500000 True 4.179 -7
57 6 True 691 4 True 3000000 True 4.466 7
58 7 True 991 5 True 750000 False 4.340 -3
59 7 False 805 14 True 300000 True 4.539 3
...........................
\n", + "

323 rows \u00d7 9 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 3, + "text": [ + " adjectiveCount hasPrivacy revLength countCapital hasDeveloperWebsite \\\n", + "0 4 True 601 1 True \n", + "1 13 True 1139 11 True \n", + "2 23 True 2223 20 True \n", + "3 10 False 804 5 True \n", + "4 22 True 1867 16 True \n", + "5 18 False 1162 6 True \n", + "6 18 True 1522 60 True \n", + "7 13 False 1895 19 True \n", + "8 11 True 1195 10 True \n", + "9 19 True 1488 11 True \n", + "10 18 False 1864 35 True \n", + "11 19 False 2049 14 True \n", + "12 8 False 417 2 True \n", + "13 16 False 1276 11 True \n", + "14 13 False 1210 12 True \n", + "15 20 True 2038 24 True \n", + "16 12 False 1044 10 True \n", + "17 15 True 1245 13 True \n", + "18 2 False 225 2 True \n", + "19 15 False 1120 10 True \n", + "20 22 True 1406 11 True \n", + "21 13 True 1063 10 True \n", + "22 7 False 855 10 True \n", + "23 17 True 2147 38 True \n", + "24 12 True 1189 13 True \n", + "25 19 False 1804 8 True \n", + "26 26 False 1514 5 True \n", + "27 12 True 1272 12 True \n", + "28 17 True 1413 14 True \n", + "29 7 True 610 2 True \n", + "30 14 True 1145 5 True \n", + "31 14 False 1413 14 True \n", + "32 2 True 573 14 True \n", + "33 12 True 1387 16 True \n", + "34 19 True 1336 7 True \n", + "35 7 True 817 5 True \n", + "36 29 True 2205 16 True \n", + "37 7 False 541 3 True \n", + "38 6 False 310 1 True \n", + "39 6 False 880 4 True \n", + "40 5 False 583 4 True \n", + "41 19 False 1888 20 True \n", + "42 13 False 1122 4 True \n", + "43 15 True 1613 21 True \n", + "44 5 False 1014 11 False \n", + "45 8 False 1115 6 False \n", + "46 6 True 437 8 True \n", + "47 2 True 307 0 True \n", + "48 13 True 991 4 True \n", + "49 7 False 578 6 True \n", + "50 10 True 962 7 True \n", + "51 13 False 1267 6 True \n", + "52 9 False 1300 8 True \n", + "53 16 False 1051 12 True \n", + "54 16 True 1822 10 True \n", + "55 7 False 535 0 True \n", + "56 11 False 1075 8 True \n", + "57 6 True 691 4 True \n", + "58 7 True 991 5 
True \n", + "59 7 False 805 14 True \n", + " ... ... ... ... ... \n", + "\n", + " installs hasDeveloperEmail avgRating revSent \n", + "0 30000000 True 4.051 -3 \n", + "1 30000000 True 4.351 2 \n", + "2 3000000 False 4.555 -4 \n", + "3 30000000 True 4.623 8 \n", + "4 7500000 False 4.046 -11 \n", + "5 30000000 True 4.595 1 \n", + "6 30000000 True 4.526 -4 \n", + "7 30000000 False 4.039 -5 \n", + "8 3000000 True 4.400 -2 \n", + "9 300000 True 3.935 -4 \n", + "10 3000000 True 4.075 -5 \n", + "11 750000 False 3.983 -2 \n", + "12 30000000 True 4.238 1 \n", + "13 3000000 True 3.915 -3 \n", + "14 750000 True 4.050 -3 \n", + "15 750000 True 3.795 -7 \n", + "16 7500000 True 3.997 1 \n", + "17 3000000 True 3.212 -5 \n", + "18 3000000 True 2.611 -1 \n", + "19 30000000 True 4.547 -3 \n", + "20 3000000 True 2.671 4 \n", + "21 750000 False 4.045 -3 \n", + "22 750000 True 3.555 -9 \n", + "23 3000000 True 4.590 -5 \n", + "24 7500000 True 4.258 -9 \n", + "25 750000 True 4.428 -10 \n", + "26 3000000 True 4.401 5 \n", + "27 3000000 True 4.275 -6 \n", + "28 750000 True 4.149 -8 \n", + "29 300000 True 4.396 -3 \n", + "30 3000000 True 4.113 2 \n", + "31 30000 True 4.240 2 \n", + "32 300000 True 4.241 -4 \n", + "33 3000000 True 3.989 -6 \n", + "34 75000 True 4.310 -7 \n", + "35 3000000 True 4.451 5 \n", + "36 300000 True 3.916 -9 \n", + "37 300000 True 4.761 1 \n", + "38 300000 True 4.158 0 \n", + "39 300000 True 2.972 -4 \n", + "40 300000 True 3.903 -6 \n", + "41 3000000 True 3.433 -9 \n", + "42 3000000 True 4.412 -3 \n", + "43 3000000 False 4.461 15 \n", + "44 300000 True 3.564 -2 \n", + "45 3000000 True 4.131 5 \n", + "46 30000 True 3.550 -1 \n", + "47 3000000 True 4.435 1 \n", + "48 300000 True 4.233 -3 \n", + "49 3000000 True 3.975 1 \n", + "50 300000 True 3.926 4 \n", + "51 3000000 True 4.590 2 \n", + "52 300000 True 3.601 -9 \n", + "53 300000 True 3.701 3 \n", + "54 300000 True 2.931 -13 \n", + "55 3000000 True 4.564 0 \n", + "56 7500000 True 4.179 -7 \n", + "57 3000000 True 4.466 
7 \n", + "58 750000 False 4.340 -3 \n", + "59 300000 True 4.539 3 \n", + " ... ... ... ... \n", + "\n", + "[323 rows x 9 columns]" + ] + } + ], + "prompt_number": 3 + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Understanding the Data" + ] + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Histograms" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['adjectiveCount'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 4, + "text": [ + "" + ] + } + ], + "prompt_number": 4 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['avgRating'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 6, + "text": [ + "" + ] + } + ], + "prompt_number": 6 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['countCapital'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 7, + "text": [ + "" + ] + } + ], + "prompt_number": 7 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['hasDeveloperEmail'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 8, + "text": [ + "" + ] + } + ], + "prompt_number": 8 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['hasDeveloperWebsite'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 9, + "text": [ + "" + ] + } + ], + "prompt_number": 9 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['hasPrivacy'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ 
+ { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 10, + "text": [ + "" + ] + } + ], + "prompt_number": 10 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['installs'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 11, + "text": [ + "" + ] + } + ], + "prompt_number": 11 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['revSent'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 12, + "text": [ + "" + ] + } + ], + "prompt_number": 12 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures['revLength'].hist()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 13, + "text": [ + "" + ] + } + ], + "prompt_number": 13 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "appFeatures.describe()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
adjectiveCounthasPrivacyrevLengthcountCapitalhasDeveloperWebsiteinstallshasDeveloperEmailavgRatingrevSent
count 323.000000 323 323.000000 323.000000 323 3.230000e+02 323 323.000000 323.000000
mean 12.653251 0.5263158 1114.975232 11.789474 0.9071207 2.634931e+07 0.9287926 4.141879 -2.281734
std 6.520670 0.5000817 498.856574 10.834108 0.2907135 8.654108e+07 0.25757 0.497807 4.932892
min 0.000000 False 17.000000 0.000000 False 3.000000e+02 False 1.000000 -17.000000
25% 8.000000 0 748.500000 6.000000 1 3.000000e+05 1 4.003500 -5.000000
50% 13.000000 1 1123.000000 10.000000 1 3.000000e+06 1 4.242000 -2.000000
75% 17.000000 1 1458.500000 15.000000 1 7.500000e+06 1 4.431000 1.000000
max 41.000000 True 2454.000000 109.000000 True 7.500000e+08 True 4.845000 15.000000
\n", + "

8 rows \u00d7 9 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 14, + "text": [ + " adjectiveCount hasPrivacy revLength countCapital \\\n", + "count 323.000000 323 323.000000 323.000000 \n", + "mean 12.653251 0.5263158 1114.975232 11.789474 \n", + "std 6.520670 0.5000817 498.856574 10.834108 \n", + "min 0.000000 False 17.000000 0.000000 \n", + "25% 8.000000 0 748.500000 6.000000 \n", + "50% 13.000000 1 1123.000000 10.000000 \n", + "75% 17.000000 1 1458.500000 15.000000 \n", + "max 41.000000 True 2454.000000 109.000000 \n", + "\n", + " hasDeveloperWebsite installs hasDeveloperEmail avgRating \\\n", + "count 323 3.230000e+02 323 323.000000 \n", + "mean 0.9071207 2.634931e+07 0.9287926 4.141879 \n", + "std 0.2907135 8.654108e+07 0.25757 0.497807 \n", + "min False 3.000000e+02 False 1.000000 \n", + "25% 1 3.000000e+05 1 4.003500 \n", + "50% 1 3.000000e+06 1 4.242000 \n", + "75% 1 7.500000e+06 1 4.431000 \n", + "max True 7.500000e+08 True 4.845000 \n", + "\n", + " revSent \n", + "count 323.000000 \n", + "mean -2.281734 \n", + "std 4.932892 \n", + "min -17.000000 \n", + "25% -5.000000 \n", + "50% -2.000000 \n", + "75% 1.000000 \n", + "max 15.000000 \n", + "\n", + "[8 rows x 9 columns]" + ] + } + ], + "prompt_number": 14 + }, + { + "cell_type": "heading", + "level": 4, + "metadata": {}, + "source": [ + "Variable Relations" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import seaborn as sns\n", + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt\n", + "\n", + "\n", + "\n", + "sns.set(palette=\"Purples_r\")\n", + "np.random.seed(9221999)\n", + "mpl.rc(\"figure\", figsize=(5, 5))\n", + "\n" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 15 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x = appFeatures['revSent'].astype(float)\n", + "y = appFeatures['avgRating'].astype(float)\n", + "\n", + "sns.regplot(x,y)" + ], + "language": "python", + "metadata": {}, + "outputs": 
#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Clustering Version 2
=====================
Runs after Feature Extraction, which returns data of the format
[(filename, linenum, vote, sentence, feat1, feat2, ...)]

Improving the initial clustering mechanism (via R scripts) to
SciKit based clustering and producing plots

For clustering there shall be no resampling required.
We are looking for clues via an unsupervised learning approach.

Attempting the following clustering operations
- [ ] k-means
- [ ] mini k-means

======================
"""

from __future__ import division
from __future__ import print_function

import sys
from optparse import OptionParser
from pprint import pprint

import pandas as pd


def getUserInput(argv=None):
    """
    Parse the command-line flags of the clustering script.

    The following flags are supported

    -f or --file
        provide the path to the app features file extracted (required)

    -c, --cl or --cluster
        choose a clustering engine (default: km)

        - km (for kmeans)
        - mkm (for minikmeans)

    argv is an optional list of arguments to parse; when it is None the
    arguments are taken from sys.argv[1:], as before.

    Returns a dict {'cluster': ..., 'file': ...}, or None (after printing
    the usage text and this help) when no features file was given.
    """
    usage = "usage: $python clustering_v2.py arg1 arg2"

    optionparser = OptionParser()
    # '--cl' is the spelling the script has always accepted; '--cluster' is
    # the spelling the help text has always advertised. Register both.
    optionparser.add_option('-c', '--cl', '--cluster', dest="cluster",
                            default="km",
                            help="clustering engine: km or mkm")
    optionparser.add_option('-f', '--file', dest="file",
                            help="path to the app features file")

    # optparse falls back to sys.argv[1:] when handed None.
    (option, _positional) = optionparser.parse_args(argv)

    if not option.file:
        print("App Features file not provided")
        print(usage)
        print(getUserInput.__doc__)
        return None

    return {'cluster': option.cluster, 'file': option.file}


def loadData(filepath):
    """
    Load the features from the app features file, where features have
    been stored after extraction.

    filepath is passed straight to pandas.read_csv. The column index of
    the loaded frame is printed, and the frame itself is returned.
    """
    featDframe = pd.read_csv(filepath)

    print(featDframe.columns)

    return featDframe


def main(argv=None):
    """
    Script entry point: print the module banner, parse the flags and load
    the features file.

    argv is forwarded to getUserInput (None means sys.argv[1:]).

    Returns 0 when the features file was loaded, 1 when no features file
    was supplied.
    """
    print(__doc__)

    userInput = getUserInput(argv)
    if userInput is None:
        # getUserInput has already printed the usage text. Indexing the
        # None result here used to end the run with a TypeError.
        return 1

    ## loads a pandas dataframe object; nothing consumes it yet
    loadData(userInput['file'])
    return 0


if __name__ == "__main__":
    sys.exit(main())