diff --git a/ObidroidLearning.ipynb b/ObidroidLearning.ipynb new file mode 100644 index 0000000..570ff2b --- /dev/null +++ b/ObidroidLearning.ipynb @@ -0,0 +1,1421 @@ +{ + "metadata": { + "name": "" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "heading", + "level": 1, + "metadata": {}, + "source": [ + "Obidroid Learning Notebook" + ] + }, + { + "cell_type": "heading", + "level": 2, + "metadata": {}, + "source": [ + "Loading App Data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I shall be using the previously exported `exports/appFeatures.csv` data for this notebook" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import pandas as pd\n", + "import numpy as np\n", + "from pandas import Series, DataFrame\n", + "\n", + "## Load the appFeatures file\n", + "appData = pd.read_csv('exports/appFeatures.csv')" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 1 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "\"\"\"\n", + "Let's create a new dataframe for appFeatures and appLabels\n", + "\"\"\"\n", + "\n", + "## for App Labels\n", + "appLabels = appData['appLabel']\n", + "\n", + "\n", + "## for App Features\n", + "appCols = set(appData.columns)\n", + "appCols.remove('appName') # remove app Names column\n", + "appCols.remove('Unnamed: 7') # removing a weird unnamed column\n", + "appCols.remove('appLabel') # removing the label column\n", + "appCols.remove('price') # removing price since most of the apps are free\n", + "appCols.remove('exclamationCount') # remove exclamation count from features, as all values seemed to be 0\n", + "\n", + "appFeatures = appData[[col for col in appData.columns if col in appCols]] # keep the CSV column order (a set iterates in arbitrary order)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 2 + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, I want to explicitly set types to all my columns as a better practice" + ] + }, + { 
+ "cell_type": "code", + "collapsed": false, + "input": [ + "# Explicitly casting column types in appFeatures dataframe (astype returns a new Series, so each result is assigned back)\n", + "appFeatures = appFeatures.copy() # work on an independent copy rather than a selection of appData\n", + "# -- boolean\n", + "appFeatures['hasPrivacy'] = appFeatures['hasPrivacy'].astype(bool)\n", + "appFeatures['hasDeveloperEmail'] = appFeatures['hasDeveloperEmail'].astype(bool)\n", + "appFeatures['hasDeveloperWebsite'] = appFeatures['hasDeveloperWebsite'].astype(bool)\n", + "\n", + "# -- integer\n", + "appFeatures['adjectiveCount'] = appFeatures['adjectiveCount'].astype(int)\n", + "appFeatures['countCapital'] = appFeatures['countCapital'].astype(int)\n", + "appFeatures['installs'] = appFeatures['installs'].astype(int)\n", + "appFeatures['revSent'] = appFeatures['revSent'].astype(int)\n", + "appFeatures['revLength'] = appFeatures['revLength'].astype(int)\n", + "\n", + "# -- float\n", + "appFeatures['avgRating'] = appFeatures['avgRating'].astype(float)\n", + "\n", + "\n", + "appFeatures" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + " | adjectiveCount | \n", + "hasPrivacy | \n", + "revLength | \n", + "countCapital | \n", + "hasDeveloperWebsite | \n", + "installs | \n", + "hasDeveloperEmail | \n", + "avgRating | \n", + "revSent | \n", + "
---|---|---|---|---|---|---|---|---|---|
0 | \n", + "4 | \n", + "True | \n", + "601 | \n", + "1 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.051 | \n", + "-3 | \n", + "
1 | \n", + "13 | \n", + "True | \n", + "1139 | \n", + "11 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.351 | \n", + "2 | \n", + "
2 | \n", + "23 | \n", + "True | \n", + "2223 | \n", + "20 | \n", + "True | \n", + "3000000 | \n", + "False | \n", + "4.555 | \n", + "-4 | \n", + "
3 | \n", + "10 | \n", + "False | \n", + "804 | \n", + "5 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.623 | \n", + "8 | \n", + "
4 | \n", + "22 | \n", + "True | \n", + "1867 | \n", + "16 | \n", + "True | \n", + "7500000 | \n", + "False | \n", + "4.046 | \n", + "-11 | \n", + "
5 | \n", + "18 | \n", + "False | \n", + "1162 | \n", + "6 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.595 | \n", + "1 | \n", + "
6 | \n", + "18 | \n", + "True | \n", + "1522 | \n", + "60 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.526 | \n", + "-4 | \n", + "
7 | \n", + "13 | \n", + "False | \n", + "1895 | \n", + "19 | \n", + "True | \n", + "30000000 | \n", + "False | \n", + "4.039 | \n", + "-5 | \n", + "
8 | \n", + "11 | \n", + "True | \n", + "1195 | \n", + "10 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.400 | \n", + "-2 | \n", + "
9 | \n", + "19 | \n", + "True | \n", + "1488 | \n", + "11 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "3.935 | \n", + "-4 | \n", + "
10 | \n", + "18 | \n", + "False | \n", + "1864 | \n", + "35 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.075 | \n", + "-5 | \n", + "
11 | \n", + "19 | \n", + "False | \n", + "2049 | \n", + "14 | \n", + "True | \n", + "750000 | \n", + "False | \n", + "3.983 | \n", + "-2 | \n", + "
12 | \n", + "8 | \n", + "False | \n", + "417 | \n", + "2 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.238 | \n", + "1 | \n", + "
13 | \n", + "16 | \n", + "False | \n", + "1276 | \n", + "11 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "3.915 | \n", + "-3 | \n", + "
14 | \n", + "13 | \n", + "False | \n", + "1210 | \n", + "12 | \n", + "True | \n", + "750000 | \n", + "True | \n", + "4.050 | \n", + "-3 | \n", + "
15 | \n", + "20 | \n", + "True | \n", + "2038 | \n", + "24 | \n", + "True | \n", + "750000 | \n", + "True | \n", + "3.795 | \n", + "-7 | \n", + "
16 | \n", + "12 | \n", + "False | \n", + "1044 | \n", + "10 | \n", + "True | \n", + "7500000 | \n", + "True | \n", + "3.997 | \n", + "1 | \n", + "
17 | \n", + "15 | \n", + "True | \n", + "1245 | \n", + "13 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "3.212 | \n", + "-5 | \n", + "
18 | \n", + "2 | \n", + "False | \n", + "225 | \n", + "2 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "2.611 | \n", + "-1 | \n", + "
19 | \n", + "15 | \n", + "False | \n", + "1120 | \n", + "10 | \n", + "True | \n", + "30000000 | \n", + "True | \n", + "4.547 | \n", + "-3 | \n", + "
20 | \n", + "22 | \n", + "True | \n", + "1406 | \n", + "11 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "2.671 | \n", + "4 | \n", + "
21 | \n", + "13 | \n", + "True | \n", + "1063 | \n", + "10 | \n", + "True | \n", + "750000 | \n", + "False | \n", + "4.045 | \n", + "-3 | \n", + "
22 | \n", + "7 | \n", + "False | \n", + "855 | \n", + "10 | \n", + "True | \n", + "750000 | \n", + "True | \n", + "3.555 | \n", + "-9 | \n", + "
23 | \n", + "17 | \n", + "True | \n", + "2147 | \n", + "38 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.590 | \n", + "-5 | \n", + "
24 | \n", + "12 | \n", + "True | \n", + "1189 | \n", + "13 | \n", + "True | \n", + "7500000 | \n", + "True | \n", + "4.258 | \n", + "-9 | \n", + "
25 | \n", + "19 | \n", + "False | \n", + "1804 | \n", + "8 | \n", + "True | \n", + "750000 | \n", + "True | \n", + "4.428 | \n", + "-10 | \n", + "
26 | \n", + "26 | \n", + "False | \n", + "1514 | \n", + "5 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.401 | \n", + "5 | \n", + "
27 | \n", + "12 | \n", + "True | \n", + "1272 | \n", + "12 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.275 | \n", + "-6 | \n", + "
28 | \n", + "17 | \n", + "True | \n", + "1413 | \n", + "14 | \n", + "True | \n", + "750000 | \n", + "True | \n", + "4.149 | \n", + "-8 | \n", + "
29 | \n", + "7 | \n", + "True | \n", + "610 | \n", + "2 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "4.396 | \n", + "-3 | \n", + "
30 | \n", + "14 | \n", + "True | \n", + "1145 | \n", + "5 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.113 | \n", + "2 | \n", + "
31 | \n", + "14 | \n", + "False | \n", + "1413 | \n", + "14 | \n", + "True | \n", + "30000 | \n", + "True | \n", + "4.240 | \n", + "2 | \n", + "
32 | \n", + "2 | \n", + "True | \n", + "573 | \n", + "14 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "4.241 | \n", + "-4 | \n", + "
33 | \n", + "12 | \n", + "True | \n", + "1387 | \n", + "16 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "3.989 | \n", + "-6 | \n", + "
34 | \n", + "19 | \n", + "True | \n", + "1336 | \n", + "7 | \n", + "True | \n", + "75000 | \n", + "True | \n", + "4.310 | \n", + "-7 | \n", + "
35 | \n", + "7 | \n", + "True | \n", + "817 | \n", + "5 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.451 | \n", + "5 | \n", + "
36 | \n", + "29 | \n", + "True | \n", + "2205 | \n", + "16 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "3.916 | \n", + "-9 | \n", + "
37 | \n", + "7 | \n", + "False | \n", + "541 | \n", + "3 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "4.761 | \n", + "1 | \n", + "
38 | \n", + "6 | \n", + "False | \n", + "310 | \n", + "1 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "4.158 | \n", + "0 | \n", + "
39 | \n", + "6 | \n", + "False | \n", + "880 | \n", + "4 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "2.972 | \n", + "-4 | \n", + "
40 | \n", + "5 | \n", + "False | \n", + "583 | \n", + "4 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "3.903 | \n", + "-6 | \n", + "
41 | \n", + "19 | \n", + "False | \n", + "1888 | \n", + "20 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "3.433 | \n", + "-9 | \n", + "
42 | \n", + "13 | \n", + "False | \n", + "1122 | \n", + "4 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.412 | \n", + "-3 | \n", + "
43 | \n", + "15 | \n", + "True | \n", + "1613 | \n", + "21 | \n", + "True | \n", + "3000000 | \n", + "False | \n", + "4.461 | \n", + "15 | \n", + "
44 | \n", + "5 | \n", + "False | \n", + "1014 | \n", + "11 | \n", + "False | \n", + "300000 | \n", + "True | \n", + "3.564 | \n", + "-2 | \n", + "
45 | \n", + "8 | \n", + "False | \n", + "1115 | \n", + "6 | \n", + "False | \n", + "3000000 | \n", + "True | \n", + "4.131 | \n", + "5 | \n", + "
46 | \n", + "6 | \n", + "True | \n", + "437 | \n", + "8 | \n", + "True | \n", + "30000 | \n", + "True | \n", + "3.550 | \n", + "-1 | \n", + "
47 | \n", + "2 | \n", + "True | \n", + "307 | \n", + "0 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.435 | \n", + "1 | \n", + "
48 | \n", + "13 | \n", + "True | \n", + "991 | \n", + "4 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "4.233 | \n", + "-3 | \n", + "
49 | \n", + "7 | \n", + "False | \n", + "578 | \n", + "6 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "3.975 | \n", + "1 | \n", + "
50 | \n", + "10 | \n", + "True | \n", + "962 | \n", + "7 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "3.926 | \n", + "4 | \n", + "
51 | \n", + "13 | \n", + "False | \n", + "1267 | \n", + "6 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.590 | \n", + "2 | \n", + "
52 | \n", + "9 | \n", + "False | \n", + "1300 | \n", + "8 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "3.601 | \n", + "-9 | \n", + "
53 | \n", + "16 | \n", + "False | \n", + "1051 | \n", + "12 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "3.701 | \n", + "3 | \n", + "
54 | \n", + "16 | \n", + "True | \n", + "1822 | \n", + "10 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "2.931 | \n", + "-13 | \n", + "
55 | \n", + "7 | \n", + "False | \n", + "535 | \n", + "0 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.564 | \n", + "0 | \n", + "
56 | \n", + "11 | \n", + "False | \n", + "1075 | \n", + "8 | \n", + "True | \n", + "7500000 | \n", + "True | \n", + "4.179 | \n", + "-7 | \n", + "
57 | \n", + "6 | \n", + "True | \n", + "691 | \n", + "4 | \n", + "True | \n", + "3000000 | \n", + "True | \n", + "4.466 | \n", + "7 | \n", + "
58 | \n", + "7 | \n", + "True | \n", + "991 | \n", + "5 | \n", + "True | \n", + "750000 | \n", + "False | \n", + "4.340 | \n", + "-3 | \n", + "
59 | \n", + "7 | \n", + "False | \n", + "805 | \n", + "14 | \n", + "True | \n", + "300000 | \n", + "True | \n", + "4.539 | \n", + "3 | \n", + "
\n", + " | ... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "... | \n", + "
323 rows \u00d7 9 columns
\n", + "\n", + " | adjectiveCount | \n", + "hasPrivacy | \n", + "revLength | \n", + "countCapital | \n", + "hasDeveloperWebsite | \n", + "installs | \n", + "hasDeveloperEmail | \n", + "avgRating | \n", + "revSent | \n", + "
---|---|---|---|---|---|---|---|---|---|
count | \n", + "323.000000 | \n", + "323 | \n", + "323.000000 | \n", + "323.000000 | \n", + "323 | \n", + "3.230000e+02 | \n", + "323 | \n", + "323.000000 | \n", + "323.000000 | \n", + "
mean | \n", + "12.653251 | \n", + "0.5263158 | \n", + "1114.975232 | \n", + "11.789474 | \n", + "0.9071207 | \n", + "2.634931e+07 | \n", + "0.9287926 | \n", + "4.141879 | \n", + "-2.281734 | \n", + "
std | \n", + "6.520670 | \n", + "0.5000817 | \n", + "498.856574 | \n", + "10.834108 | \n", + "0.2907135 | \n", + "8.654108e+07 | \n", + "0.25757 | \n", + "0.497807 | \n", + "4.932892 | \n", + "
min | \n", + "0.000000 | \n", + "False | \n", + "17.000000 | \n", + "0.000000 | \n", + "False | \n", + "3.000000e+02 | \n", + "False | \n", + "1.000000 | \n", + "-17.000000 | \n", + "
25% | \n", + "8.000000 | \n", + "0 | \n", + "748.500000 | \n", + "6.000000 | \n", + "1 | \n", + "3.000000e+05 | \n", + "1 | \n", + "4.003500 | \n", + "-5.000000 | \n", + "
50% | \n", + "13.000000 | \n", + "1 | \n", + "1123.000000 | \n", + "10.000000 | \n", + "1 | \n", + "3.000000e+06 | \n", + "1 | \n", + "4.242000 | \n", + "-2.000000 | \n", + "
75% | \n", + "17.000000 | \n", + "1 | \n", + "1458.500000 | \n", + "15.000000 | \n", + "1 | \n", + "7.500000e+06 | \n", + "1 | \n", + "4.431000 | \n", + "1.000000 | \n", + "
max | \n", + "41.000000 | \n", + "True | \n", + "2454.000000 | \n", + "109.000000 | \n", + "True | \n", + "7.500000e+08 | \n", + "True | \n", + "4.845000 | \n", + "15.000000 | \n", + "
8 rows \u00d7 9 columns
\n", + "