diff --git a/LUCAS/notebooks/neuralnet_comparison.ipynb b/LUCAS/notebooks/neuralnet_comparison.ipynb new file mode 100644 index 0000000..9a6a1ed --- /dev/null +++ b/LUCAS/notebooks/neuralnet_comparison.ipynb @@ -0,0 +1,1915 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Comparison of Neural Models\n", + "This notebook creates a comparison of how our neural models perform. We will try each model with and without word embeddings, and produce visualisations of model performance. \n", + "\n", + "# Feed-Forward Neural Networks\n", + "First we will find this difference for Feed-Forward Neural Networks (FFNN):" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from tensorflow.keras import Sequential\n", + "from tensorflow.keras.layers import Conv2D, Dense, Dropout, Embedding, Flatten, LSTM, MaxPooling2D\n", + "from tensorflow.keras.activations import relu, sigmoid\n", + "from tensorflow.keras.preprocessing.sequence import pad_sequences\n", + "from tensorflow.keras.preprocessing.text import text_to_word_sequence, Tokenizer\n", + "from tensorflow.keras.regularizers import l2\n", + "from tensorflow.keras.callbacks import EarlyStopping\n", + "from tensorflow.keras.utils import to_categorical\n", + "from scripts import training_helpers\n", + "from sklearn.model_selection import train_test_split, StratifiedKFold\n", + "\n", + "from gensim.models import KeyedVectors\n", + "from seaborn import boxplot\n", + "from pandas import DataFrame" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will cross-validate our model, so let's create a function to handle this for us. 
def run_cross_validate(get_model, X, y, cv=5, categorical=False, add_target_dim=False,
                       epochs=12, batch_size=16, validation_split=0.3, patience=4,
                       random_state=None):
    """Cross-validate a freshly built Keras model using stratified k-fold splits.

    A new model is requested from ``get_model`` for every fold, fitted on the
    fold's training rows (with early stopping on ``val_loss``) and evaluated on
    the held-out rows.

    Args:
        get_model: Zero-argument callable returning a compiled model that
            exposes ``fit`` and ``evaluate``.
        X: Indexable collection of samples (converted with ``np.asarray``).
        y: Indexable collection of integer class labels, one per sample.
        cv: Number of stratified folds.
        categorical: When True, one-hot encode the targets of every fold.
        add_target_dim: When True, insert an extra axis of length 1 after the
            sample axis of the targets.
        epochs: Maximum number of training epochs per fold.
        batch_size: Mini-batch size passed to ``fit``.
        validation_split: Fraction of each fold's training rows that ``fit``
            holds out for validation.
        patience: Number of epochs without ``val_loss`` improvement before
            early stopping ends training.
        random_state: Seed for the fold shuffling; ``None`` (the default)
            keeps the original unseeded behaviour.

    Returns:
        dict: ``{"accuracies": [...]}`` holding, per fold, element ``[1]`` of
        ``model.evaluate(...)`` (the accuracy when the model is compiled with
        ``metrics=['accuracy']``).
    """
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    metrics = {
        "accuracies": [],
    }

    # Convert once so every fold can be gathered with positional fancy indexing
    # instead of a Python-level loop per sample.
    samples = np.asarray(X)
    targets = np.asarray(y)

    # Fix the one-hot width from the full label set so the train and test
    # targets of a fold always have the same number of columns.
    num_classes = int(targets.max()) + 1 if categorical else None

    for train_indices, test_indices in splitter.split(X, y):
        training_X, training_y = samples[train_indices], targets[train_indices]
        test_X, test_y = samples[test_indices], targets[test_indices]

        if categorical:
            training_y = to_categorical(training_y, num_classes=num_classes)
            test_y = to_categorical(test_y, num_classes=num_classes)
        if add_target_dim:
            # (n, ...) -> (n, 1, ...): same result as wrapping each target in a list.
            training_y = np.expand_dims(training_y, axis=1)
            test_y = np.expand_dims(test_y, axis=1)

        model = get_model()
        print("Fitting with: ", training_X.shape, "labels", training_y.shape)
        # NOTE(review): validation_split is presumably taken from the tail of
        # the fold's training rows - confirm the data is not ordered by class.
        model.fit(training_X, training_y, epochs=epochs, batch_size=batch_size,
                  validation_split=validation_split,
                  callbacks=[EarlyStopping(monitor='val_loss', patience=patience)])
        metrics["accuracies"].append(model.evaluate(test_X, test_y)[1])
    return metrics


# First we find results for our Bag of Words (BoW) model:
data_frame = training_helpers.get_data_frame()

predictors_raw = data_frame['review']
num_words = 20000  # vocabulary cap handed to the Keras Tokenizer

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(predictors_raw)
# One TF-IDF weighted row of width num_words per review.
bow_predictors = tokenizer.texts_to_matrix(predictors_raw, mode='tfidf')
labels = list(data_frame['deceptive'])
+ "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6841 - acc: 0.7299 - val_loss: 0.4919 - val_acc: 0.8753\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 979us/step - loss: 0.3207 - acc: 0.9533 - val_loss: 0.4375 - val_acc: 0.8845\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 954us/step - loss: 0.2247 - acc: 0.9861 - val_loss: 0.4551 - val_acc: 0.8684\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 846us/step - loss: 0.1878 - acc: 0.9921 - val_loss: 0.4220 - val_acc: 0.8799\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 996us/step - loss: 0.1701 - acc: 0.9921 - val_loss: 0.4326 - val_acc: 0.8845\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1619 - acc: 0.9960 - val_loss: 0.4388 - val_acc: 0.8707\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 942us/step - loss: 0.1410 - acc: 0.9970 - val_loss: 0.4325 - val_acc: 0.8776\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 995us/step - loss: 0.1366 - acc: 0.9950 - val_loss: 0.4526 - val_acc: 0.8730\n", + "160/160 [==============================] - 0s 289us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7056 - acc: 0.7279 - val_loss: 0.5265 - val_acc: 0.8868\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 829us/step - loss: 0.3381 - acc: 0.9434 - val_loss: 0.4575 - val_acc: 0.8799\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 990us/step - loss: 0.2363 - acc: 0.9831 - val_loss: 0.4420 - 
val_acc: 0.8915\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1894 - acc: 0.9960 - val_loss: 0.4378 - val_acc: 0.8915\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 924us/step - loss: 0.1702 - acc: 0.9940 - val_loss: 0.4364 - val_acc: 0.8845\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 983us/step - loss: 0.1517 - acc: 0.9980 - val_loss: 0.4367 - val_acc: 0.8915\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 807us/step - loss: 0.1410 - acc: 0.9940 - val_loss: 0.4323 - val_acc: 0.8730\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 904us/step - loss: 0.1376 - acc: 0.9960 - val_loss: 0.4444 - val_acc: 0.8776\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 816us/step - loss: 0.1390 - acc: 0.9980 - val_loss: 0.4466 - val_acc: 0.8845\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 1s 804us/step - loss: 0.1469 - acc: 0.9921 - val_loss: 0.4863 - val_acc: 0.8661\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 1s 843us/step - loss: 0.1403 - acc: 0.9960 - val_loss: 0.4829 - val_acc: 0.8730\n", + "160/160 [==============================] - 0s 227us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7498 - acc: 0.7071 - val_loss: 0.5638 - val_acc: 0.8799\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 815us/step - loss: 0.4170 - acc: 0.9126 - val_loss: 0.4938 - val_acc: 0.8499\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 840us/step - loss: 0.2676 - acc: 0.9722 - val_loss: 0.4810 - val_acc: 0.8637\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.2140 - acc: 0.9891 - val_loss: 0.4444 - val_acc: 0.8799\n", + "Epoch 
5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1829 - acc: 0.9950 - val_loss: 0.4518 - val_acc: 0.8822\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 975us/step - loss: 0.1756 - acc: 0.9901 - val_loss: 0.4546 - val_acc: 0.8753\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1630 - acc: 0.9960 - val_loss: 0.4691 - val_acc: 0.8845\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1602 - acc: 0.9970 - val_loss: 0.4792 - val_acc: 0.8891\n", + "160/160 [==============================] - 0s 255us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.6894 - acc: 0.7358 - val_loss: 0.5138 - val_acc: 0.8545\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.3196 - acc: 0.9523 - val_loss: 0.4592 - val_acc: 0.8799\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.2174 - acc: 0.9891 - val_loss: 0.4649 - val_acc: 0.8637\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1838 - acc: 0.9940 - val_loss: 0.4599 - val_acc: 0.8707\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.1663 - acc: 0.9960 - val_loss: 0.4437 - val_acc: 0.8799\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1553 - acc: 0.9950 - val_loss: 0.4822 - val_acc: 0.8614\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1593 - acc: 0.9950 - val_loss: 0.5086 - val_acc: 0.8637\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1509 - acc: 0.9970 - val_loss: 0.4692 - val_acc: 0.8730\n", + "Epoch 9/12\n", + "1007/1007 
[==============================] - 1s 987us/step - loss: 0.1471 - acc: 0.9940 - val_loss: 0.4755 - val_acc: 0.8753\n", + "160/160 [==============================] - 0s 206us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7318 - acc: 0.7080 - val_loss: 0.5465 - val_acc: 0.8753\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 927us/step - loss: 0.3468 - acc: 0.9643 - val_loss: 0.4573 - val_acc: 0.8822\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 916us/step - loss: 0.2312 - acc: 0.9921 - val_loss: 0.4483 - val_acc: 0.8753\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 728us/step - loss: 0.1951 - acc: 0.9960 - val_loss: 0.4571 - val_acc: 0.8753\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 779us/step - loss: 0.1763 - acc: 0.9960 - val_loss: 0.4494 - val_acc: 0.8753\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1657 - acc: 0.9990 - val_loss: 0.4440 - val_acc: 0.8822\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 977us/step - loss: 0.1584 - acc: 0.9980 - val_loss: 0.4273 - val_acc: 0.8799\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1548 - acc: 0.9960 - val_loss: 0.4505 - val_acc: 0.8730\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1544 - acc: 0.9921 - val_loss: 0.4302 - val_acc: 0.8938\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1580 - acc: 0.9940 - val_loss: 0.4619 - val_acc: 0.8707\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1466 - acc: 0.9980 - val_loss: 0.4619 - val_acc: 0.8730\n", + "160/160 [==============================] - 0s 225us/step\n", + 
"Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.7268 - acc: 0.6941 - val_loss: 0.5627 - val_acc: 0.8776\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 837us/step - loss: 0.3848 - acc: 0.9325 - val_loss: 0.4620 - val_acc: 0.8822\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 972us/step - loss: 0.2405 - acc: 0.9881 - val_loss: 0.4566 - val_acc: 0.8868\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 873us/step - loss: 0.1955 - acc: 0.9970 - val_loss: 0.4575 - val_acc: 0.8845\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1771 - acc: 0.9940 - val_loss: 0.4582 - val_acc: 0.8915\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 920us/step - loss: 0.1674 - acc: 0.9940 - val_loss: 0.4646 - val_acc: 0.8776\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 863us/step - loss: 0.1607 - acc: 0.9940 - val_loss: 0.4594 - val_acc: 0.8730\n", + "160/160 [==============================] - 0s 219us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7634 - acc: 0.6634 - val_loss: 0.5894 - val_acc: 0.8822\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.4324 - acc: 0.9166 - val_loss: 0.4643 - val_acc: 0.8938\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.2883 - acc: 0.9742 - val_loss: 0.4305 - val_acc: 0.8776\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.2227 - acc: 0.9841 - val_loss: 0.4194 - val_acc: 0.8938\n", + "Epoch 5/12\n", + 
"1007/1007 [==============================] - 1s 1ms/step - loss: 0.1930 - acc: 0.9911 - val_loss: 0.4176 - val_acc: 0.8799\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1785 - acc: 0.9911 - val_loss: 0.4234 - val_acc: 0.8707\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1646 - acc: 0.9940 - val_loss: 0.4333 - val_acc: 0.8822\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1583 - acc: 0.9960 - val_loss: 0.4301 - val_acc: 0.8730\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 965us/step - loss: 0.1568 - acc: 0.9940 - val_loss: 0.4599 - val_acc: 0.8707\n", + "160/160 [==============================] - 0s 195us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.7641 - acc: 0.6872 - val_loss: 0.5579 - val_acc: 0.8822\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.3955 - acc: 0.9355 - val_loss: 0.5036 - val_acc: 0.8522\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.2586 - acc: 0.9791 - val_loss: 0.4740 - val_acc: 0.8868\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.2081 - acc: 0.9921 - val_loss: 0.4843 - val_acc: 0.8845\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1824 - acc: 0.9921 - val_loss: 0.4691 - val_acc: 0.8730\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1643 - acc: 0.9970 - val_loss: 0.4535 - val_acc: 0.8707\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1504 - acc: 0.9980 - val_loss: 0.4490 - val_acc: 0.8891\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 
1ms/step - loss: 0.1529 - acc: 0.9950 - val_loss: 0.4477 - val_acc: 0.8753\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1434 - acc: 0.9940 - val_loss: 0.4441 - val_acc: 0.8776\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1440 - acc: 0.9950 - val_loss: 0.4579 - val_acc: 0.8707\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1499 - acc: 0.9940 - val_loss: 0.4817 - val_acc: 0.8684\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1531 - acc: 0.9940 - val_loss: 0.5193 - val_acc: 0.8661\n", + "160/160 [==============================] - 0s 230us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.7052 - acc: 0.7130 - val_loss: 0.5127 - val_acc: 0.8753\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 736us/step - loss: 0.3357 - acc: 0.9305 - val_loss: 0.4500 - val_acc: 0.8845\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 724us/step - loss: 0.2225 - acc: 0.9871 - val_loss: 0.4319 - val_acc: 0.8868\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 718us/step - loss: 0.1869 - acc: 0.9871 - val_loss: 0.4384 - val_acc: 0.8799\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 709us/step - loss: 0.1713 - acc: 0.9911 - val_loss: 0.4498 - val_acc: 0.8661\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 718us/step - loss: 0.1615 - acc: 0.9940 - val_loss: 0.4233 - val_acc: 0.8961\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 690us/step - loss: 0.1433 - acc: 0.9970 - val_loss: 0.4405 - val_acc: 0.8799\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 704us/step - loss: 0.1467 - acc: 0.9921 
- val_loss: 0.4298 - val_acc: 0.8868\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 708us/step - loss: 0.1292 - acc: 0.9980 - val_loss: 0.4451 - val_acc: 0.8868\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 1s 715us/step - loss: 0.1257 - acc: 0.9950 - val_loss: 0.4153 - val_acc: 0.8799\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 1s 711us/step - loss: 0.1256 - acc: 0.9970 - val_loss: 0.4726 - val_acc: 0.8776\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 1s 687us/step - loss: 0.1233 - acc: 0.9970 - val_loss: 0.4450 - val_acc: 0.8637\n", + "160/160 [==============================] - 0s 161us/step\n", + "Fitting with: (1440, 20000) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7709 - acc: 0.6614 - val_loss: 0.6258 - val_acc: 0.8476\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 798us/step - loss: 0.4380 - acc: 0.9086 - val_loss: 0.4669 - val_acc: 0.8915\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 705us/step - loss: 0.2870 - acc: 0.9682 - val_loss: 0.4391 - val_acc: 0.8891\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 713us/step - loss: 0.2232 - acc: 0.9881 - val_loss: 0.4446 - val_acc: 0.8891\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 727us/step - loss: 0.1941 - acc: 0.9970 - val_loss: 0.4226 - val_acc: 0.8868\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 736us/step - loss: 0.1772 - acc: 0.9960 - val_loss: 0.4077 - val_acc: 0.8891\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 720us/step - loss: 0.1673 - acc: 0.9950 - val_loss: 0.4217 - val_acc: 0.8961\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 693us/step - loss: 0.1556 - acc: 0.9970 - val_loss: 0.4280 - val_acc: 
def get_ff_bow_model():
    """Build and compile the feed-forward network fed with bag-of-words rows.

    The input width is the module-level ``num_words``; the network ends in a
    single sigmoid unit and is compiled with Adam, binary cross-entropy and
    the accuracy metric.
    """
    network = Sequential()
    # Two L2-regularised hidden layers with dropout in between.
    network.add(Dense(16, activation=relu, input_shape=(num_words,), kernel_regularizer=l2(0.01)))
    network.add(Dropout(0.25))
    network.add(Dense(8, activation=relu, kernel_regularizer=l2(0.01)))
    # Single sigmoid output for the binary target.
    network.add(Dense(1, activation=sigmoid))
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


# 10-fold stratified cross-validation of the bag-of-words model.
ff_bow_scores = run_cross_validate(get_ff_bow_model, bow_predictors, labels, cv=10)
# Word-vector method: first load the word vectors produced by a word
# vectorizing model generated in another experiment.
word_vectors = KeyedVectors.load("opspam_w2v.kv", mmap="r")

predictors_sequences = pad_sequences(tokenizer.texts_to_sequences(predictors_raw))
# pad_sequences returns a 2-D array, so its width is the padded sequence length.
max_sequence_length = predictors_sequences.shape[1]

embedding_length = word_vectors.vector_size

corpus_words = tokenizer.word_index
# +1 because Keras word indices start at 1; row 0 stays all zeros.
corpus_vocab_size = len(corpus_words) + 1
# KeyedVectors already is the vector store: the `.wv` alias and `.vocab` are
# deprecated in gensim 3.x and unavailable in gensim 4, whereas `word in kv`
# and `kv[word]` work in both. The name is kept for any later cell that uses it.
vectorizer_words = word_vectors
embedding_matrix = np.zeros((corpus_vocab_size, embedding_length))
for word, idx in corpus_words.items():
    # Words without a trained vector keep their zero row.
    if word in vectorizer_words:
        embedding_matrix[idx] = np.asarray(vectorizer_words[word], dtype=np.float32)
"Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6990 - acc: 0.6246 - val_loss: 0.7280 - val_acc: 0.6143\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6904 - acc: 0.6157 - val_loss: 0.7217 - val_acc: 0.6212\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6781 - acc: 0.6375 - val_loss: 0.7192 - val_acc: 0.6467\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6694 - acc: 0.6743 - val_loss: 0.7458 - val_acc: 0.6028\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6700 - acc: 0.6713 - val_loss: 0.7373 - val_acc: 0.6189\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6504 - acc: 0.6862 - val_loss: 0.7777 - val_acc: 0.5820\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6576 - acc: 0.6783 - val_loss: 0.7460 - val_acc: 0.6005\n", + "160/160 [==============================] - 0s 294us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 3ms/step - loss: 0.9513 - acc: 0.5084 - val_loss: 0.8599 - val_acc: 0.5081\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.8459 - acc: 0.5204 - val_loss: 0.8152 - val_acc: 0.4873\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7894 - acc: 0.4965 - val_loss: 0.7690 - val_acc: 0.5219\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7602 - acc: 0.5303 - val_loss: 0.7565 - val_acc: 0.5196\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7510 - acc: 0.5611 - val_loss: 0.7703 - val_acc: 0.5681\n", + "Epoch 6/12\n", + "1007/1007 
[==============================] - 2s 2ms/step - loss: 0.7588 - acc: 0.5879 - val_loss: 0.7839 - val_acc: 0.5312\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7422 - acc: 0.6097 - val_loss: 0.7811 - val_acc: 0.5035\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7296 - acc: 0.6346 - val_loss: 0.7702 - val_acc: 0.5520\n", + "160/160 [==============================] - 0s 306us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 4ms/step - loss: 0.9359 - acc: 0.5144 - val_loss: 0.8700 - val_acc: 0.5612\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.8265 - acc: 0.6157 - val_loss: 0.8473 - val_acc: 0.5704\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7612 - acc: 0.6683 - val_loss: 0.8640 - val_acc: 0.5381\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7338 - acc: 0.6733 - val_loss: 0.8086 - val_acc: 0.6305\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7032 - acc: 0.6922 - val_loss: 0.8274 - val_acc: 0.5982\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6995 - acc: 0.6842 - val_loss: 0.8140 - val_acc: 0.6097\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6837 - acc: 0.7160 - val_loss: 0.8143 - val_acc: 0.6443\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6660 - acc: 0.7398 - val_loss: 0.8751 - val_acc: 0.5704\n", + "160/160 [==============================] - 0s 263us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 
4ms/step - loss: 0.8829 - acc: 0.5204 - val_loss: 0.8026 - val_acc: 0.5520\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7805 - acc: 0.5323 - val_loss: 0.7737 - val_acc: 0.5774\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7674 - acc: 0.5482 - val_loss: 0.7544 - val_acc: 0.5912\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7479 - acc: 0.5670 - val_loss: 0.7511 - val_acc: 0.5450\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7369 - acc: 0.5929 - val_loss: 0.7486 - val_acc: 0.5912\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7181 - acc: 0.6077 - val_loss: 0.7357 - val_acc: 0.5912\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6895 - acc: 0.6465 - val_loss: 0.7435 - val_acc: 0.5820\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6723 - acc: 0.6594 - val_loss: 0.7476 - val_acc: 0.6259\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6898 - acc: 0.6415 - val_loss: 0.7639 - val_acc: 0.6259\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6794 - acc: 0.6693 - val_loss: 0.7798 - val_acc: 0.5797\n", + "160/160 [==============================] - 0s 335us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 4ms/step - loss: 0.8804 - acc: 0.5114 - val_loss: 0.8206 - val_acc: 0.5173\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.8086 - acc: 0.4955 - val_loss: 0.7912 - val_acc: 0.5289\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7808 - acc: 0.5134 - val_loss: 0.7772 
- val_acc: 0.5751\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7533 - acc: 0.5432 - val_loss: 0.7505 - val_acc: 0.5658\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7456 - acc: 0.5482 - val_loss: 0.7338 - val_acc: 0.5958\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6944 - acc: 0.6216 - val_loss: 0.7413 - val_acc: 0.6051\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6902 - acc: 0.6495 - val_loss: 0.7583 - val_acc: 0.5958\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6705 - acc: 0.6604 - val_loss: 0.7632 - val_acc: 0.6443\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6630 - acc: 0.7021 - val_loss: 0.7724 - val_acc: 0.6628\n", + "160/160 [==============================] - 0s 367us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 5ms/step - loss: 0.8874 - acc: 0.5214 - val_loss: 0.8097 - val_acc: 0.5012\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7886 - acc: 0.5144 - val_loss: 0.7705 - val_acc: 0.5012\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7706 - acc: 0.5015 - val_loss: 0.7667 - val_acc: 0.5035\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7432 - acc: 0.5730 - val_loss: 0.7355 - val_acc: 0.5358\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7331 - acc: 0.5571 - val_loss: 0.7236 - val_acc: 0.5612\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7277 - acc: 0.5809 - val_loss: 0.7189 - val_acc: 0.5912\n", + "Epoch 7/12\n", + "1007/1007 
[==============================] - 2s 2ms/step - loss: 0.7032 - acc: 0.5799 - val_loss: 0.7121 - val_acc: 0.6166\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7005 - acc: 0.5968 - val_loss: 0.7159 - val_acc: 0.5751\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6993 - acc: 0.6127 - val_loss: 0.7485 - val_acc: 0.5150\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7053 - acc: 0.6117 - val_loss: 0.7538 - val_acc: 0.5866\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7019 - acc: 0.6256 - val_loss: 0.7142 - val_acc: 0.6097\n", + "160/160 [==============================] - 0s 492us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 5ms/step - loss: 0.8920 - acc: 0.5055 - val_loss: 0.8175 - val_acc: 0.4873\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7954 - acc: 0.5154 - val_loss: 0.7949 - val_acc: 0.5242\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7746 - acc: 0.5174 - val_loss: 0.7670 - val_acc: 0.5035\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7491 - acc: 0.5283 - val_loss: 0.7389 - val_acc: 0.5473\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7377 - acc: 0.5323 - val_loss: 0.7341 - val_acc: 0.5219\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7330 - acc: 0.5561 - val_loss: 0.7375 - val_acc: 0.5704\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7317 - acc: 0.5124 - val_loss: 0.7280 - val_acc: 0.5012\n", + "Epoch 
8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7158 - acc: 0.5313 - val_loss: 0.7052 - val_acc: 0.5774\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7047 - acc: 0.5998 - val_loss: 0.7271 - val_acc: 0.5450\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7022 - acc: 0.6058 - val_loss: 0.7436 - val_acc: 0.5912\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7008 - acc: 0.6296 - val_loss: 0.7374 - val_acc: 0.5658\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6995 - acc: 0.6068 - val_loss: 0.7685 - val_acc: 0.5797\n", + "160/160 [==============================] - 0s 350us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 5ms/step - loss: 0.8891 - acc: 0.5164 - val_loss: 0.8056 - val_acc: 0.4919\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7825 - acc: 0.5045 - val_loss: 0.7710 - val_acc: 0.5196\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7575 - acc: 0.5641 - val_loss: 0.7588 - val_acc: 0.5566\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7454 - acc: 0.5710 - val_loss: 0.7511 - val_acc: 0.5704\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7276 - acc: 0.6087 - val_loss: 0.7639 - val_acc: 0.5127\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7200 - acc: 0.6197 - val_loss: 0.7439 - val_acc: 0.6212\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7128 - acc: 0.6246 - val_loss: 0.7711 - val_acc: 0.5335\n", + "Epoch 8/12\n", + "1007/1007 [==============================] 
- 2s 2ms/step - loss: 0.7019 - acc: 0.6385 - val_loss: 0.7634 - val_acc: 0.5473\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6985 - acc: 0.6455 - val_loss: 0.7276 - val_acc: 0.6328\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6783 - acc: 0.6683 - val_loss: 0.7609 - val_acc: 0.5797\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6801 - acc: 0.6663 - val_loss: 0.7424 - val_acc: 0.6143\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6818 - acc: 0.6763 - val_loss: 0.7433 - val_acc: 0.6282\n", + "160/160 [==============================] - 0s 299us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 5ms/step - loss: 0.9225 - acc: 0.5074 - val_loss: 0.8355 - val_acc: 0.5058\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.8099 - acc: 0.5462 - val_loss: 0.7916 - val_acc: 0.5912\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7566 - acc: 0.6197 - val_loss: 0.7677 - val_acc: 0.6074\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7362 - acc: 0.6296 - val_loss: 0.7969 - val_acc: 0.5335\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7294 - acc: 0.6326 - val_loss: 0.7467 - val_acc: 0.6467\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7109 - acc: 0.6683 - val_loss: 0.7903 - val_acc: 0.5589\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6913 - acc: 0.6753 - val_loss: 0.7964 - val_acc: 0.6236\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6834 - acc: 0.6902 - val_loss: 
0.7506 - val_acc: 0.6582\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.6810 - acc: 0.6931 - val_loss: 0.7542 - val_acc: 0.6536\n", + "160/160 [==============================] - 0s 293us/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 5ms/step - loss: 0.8913 - acc: 0.5362 - val_loss: 0.8622 - val_acc: 0.4919\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7880 - acc: 0.5849 - val_loss: 0.7844 - val_acc: 0.5889\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7606 - acc: 0.6028 - val_loss: 0.7727 - val_acc: 0.6005\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7388 - acc: 0.6346 - val_loss: 0.7571 - val_acc: 0.5958\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7332 - acc: 0.6435 - val_loss: 0.7682 - val_acc: 0.6282\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7039 - acc: 0.6773 - val_loss: 0.7803 - val_acc: 0.6143\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7175 - acc: 0.6673 - val_loss: 0.7860 - val_acc: 0.5866\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.7136 - acc: 0.6544 - val_loss: 0.7704 - val_acc: 0.6189\n", + "160/160 [==============================] - 0s 338us/step\n" + ] + } + ], + "source": [ + "def get_ff_wv_model():\n", + " model_ff_wv = Sequential([\n", + " Embedding(corpus_vocab_size, embedding_length, weights=[embedding_matrix], trainable=False,\n", + " input_length=max_sequence_length),\n", + " Flatten(),\n", + " Dense(16, activation=relu, kernel_regularizer=l2(0.01)), #, input_shape=(num_words,)\n", + " Dropout(0.25),\n", + " Dense(8, activation=relu, 
kernel_regularizer=l2(0.01)),\n", + " Dense(1, activation=sigmoid)\n", + " ])\n", + " model_ff_wv.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])\n", + " return model_ff_wv\n", + "\n", + "ff_wv_scores = run_cross_validate(get_ff_wv_model, predictors_sequences, labels, cv=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Bag of words: [0.875, 0.85625, 0.88125, 0.8875, 0.84375, 0.89375, 0.91875, 0.8375, 0.86875, 0.86875]\n", + "Word vectors: [0.625, 0.63125, 0.54375, 0.50625, 0.5625, 0.5875, 0.5625, 0.625, 0.54375, 0.55625]\n" + ] + } + ], + "source": [ + "print (\"Bag of words: \", ff_bow_scores['accuracies'])\n", + "print (\"Word vectors: \", ff_wv_scores['accuracies'])\n", + "\n", + "ff_scores_entries =[('Bag of Words', x) for x in ff_bow_scores['accuracies']] + [('Word Vectors', x) for x in ff_wv_scores['accuracies']]\n", + "ff_scores_data_frame = DataFrame(ff_scores_entries, columns=['input type', 'accuracy'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEKCAYAAAD9xUlFAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4xLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvDW2N/gAAFrBJREFUeJzt3X20XXV95/H3hyAQREAlsurFEDRRpB2rktJBq6LVLrQVap1RqF31oUprhxjb0Y7WDmXoqg/VqZOJjFNkqdUqiFolakaKiE+AkvD8XO5C0QQfIiIPAmLCd/7Y+24OJze5J5idc5P7fq111917n9/Z+5ubc+/n/Pbv7N9OVSFJEsBu4y5AkjR7GAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnqGAqSpI6hIEnq7D7uArbVAQccUIsWLRp3GZK0U7nkkkt+XFULZmq304XCokWLWLt27bjLkKSdSpKbR2nn6SNJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUmenu05hV7By5UomJyfHXQbr168HYGJiYqx1LF68mGXLlo21BkkNQ2EOu+eee8ZdgqRZxlAYg9nyrnj58uUArFixYsyVSJotHFOQJHUMBUlSx1CQJHUMBUlSp9dQSHJ0khuSTCZ5yzSPH5zkvCRXJvlKkoP6rEeStHW9hUKSecCpwAuBw4Djkxw21Ow9wEeq6inAKcA7+qpHkjSzPnsKRwCTVXVTVd0HnAkcO9TmMOC8dvn8aR6XJO1AfYbCBPC9gfV17bZBVwAvbZdfAjwiyaOHd5TkhCRrk6zdsGFDL8VKkvoNhUyzrYbW3wQ8J8llwHOA9cDGzZ5UdVpVLa2qpQsWzHiLUUnSQ9TnFc3rgMcNrB8E3DLYoKpuAf4AIMk+wEur6vYea5IkbUWfPYU1wJIkhyTZAzgOWDXYIMkBSaZqeCvwwR7rkSTNoLdQqKqNwInAOcB1wFlVdU2SU5Ic0zY7Crghyb8DBwJ/31c9kqSZ9TohXlWtBlYPbTtpYPlTwKf6rEGSNLo5N0vqbLmXwWww9XOYmi11rvO+DtIcDIXJyUkuv/o6Nu39qHGXMna73dd8GOySm3445krGb97dPxl3CdKsMOdCAWDT3o/inkNfNO4yNIvMv371zI2kOcAJ8SRJHUNBktQxFCRJHUNBktSZcwPN69evZ97dtzuwqAeZd/etrF+/2bRb0pxjT0GS1JlzPYWJiQl+8PPd/UiqHmT+9auZmDhw3GVIY2dPQZLUMRQkSR1DQZLUmXNjCtDMc+Onj2C3e+8A4P699h1zJePXzH3kmII050Jh8eLF4y5h1picvBOAxY/3jyEc6GtDYg6GglMjP2BqyuwVK1aMuRJJs4VjCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkzpy7TmE2WLlyJZOTk+Muo6th6nqFcVm8eLHXj0izhKEwh82fP3/cJUiaZQyFMfBdsaTZyjEFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdQwFSVLHUJAkdXoNhSRHJ7khyWSSt0zz+MIk5ye5LMmVSV7UZz2SpK3rLRSSzANOBV4IHAYcn+SwoWZ/A5xVVU8DjgP+T1/1SJJm1mdP4Qhgsqpuqqr7gDOBY4faFLBvu7wfcEuP9UiSZtDn/RQmgO8NrK8DfnOozcnAvyVZBjwceH6P9UiSZtBnTyHTbKuh9eOBD1fVQcCLgI8m2aymJCckWZtk7YYNG3ooVZIE/YbCOuBxA+sHsfnpoT8BzgKoqouAvYADhndUVadV1dKqWrpgwYKeypUk9RkKa4AlSQ5JsgfNQPKqoTbfBX4bIMmTaULBroAkjUlvoVBVG4ETgXOA62g+ZXRNklOSHNM2+6/A65JcAZwBvKqqhk8xSZJ2kD4Hmqmq1cDqoW0nDSxfCzyzzxokSaPzimZJUsdQkCR1DAVJUsd
QkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUsdQkCR1DAVJUmekUEjy6SS/m8QQkaRd2Kh/5N8P/CFwY5J3Jjm0x5okSWMyUihU1Zeq6hXA04HvAOcmuTDJq5M8rM8CJUk7zsing5I8GngV8FrgMmAFTUic20tlkqQdbvdRGiX5V+BQ4KPAi6vq++1Dn0iytq/iJEk71kihALyvqr483QNVtXQ71iNJGqNRTx89Ocn+UytJHpnkz3uqSZI0JqOGwuuq6qdTK1V1G/C6fkqSJI3LqKGwW5JMrSSZB+zRT0mSpHEZdUzhHOCsJP8XKODPgC/2VpUkaSxGDYX/Bvwp8HogwL8Bp/dVlCRpPEYKhaq6n+aq5vf3W44kaZxGvU5hCfAO4DBgr6ntVfX4nuqSJI3BqAPNH6LpJWwEngt8hOZCNknSLmTUUJhfVecBqaqbq+pk4Hn9lSVJGodRB5rvbafNvjHJicB64DH9lSVJGodRewpvBPYG3gAcDvwR8Mq+ipIkjceMPYX2QrWXVdWbgbuAV/delSRpLGbsKVTVJuDwwSuaJUm7plHHFC4Dzk7ySeBnUxur6l97qUqSNBajhsKjgFt58CeOCjAUJGkXMuoVzY4jSNIcMOoVzR+i6Rk8SFW9ZobnHU1z2855wOlV9c6hx99LczEcNJ9uekxV7Y8kaSxGPX30+YHlvYCXALds7Qntp5ZOBV4ArAPWJFlVVddOtamqvxhovwx42oj1SJJ6MOrpo08Pric5A/jSDE87Apisqpva55wJHAtcu4X2xwN/O0o9kqR+jHrx2rAlwMIZ2kwA3xtYX9du20ySg4FDgGnvAy1J2jFGHVO4kwePKfyA5h4LW33aNNs2G5doHQd8qr0mYrrjnwCcALBw4UxZJEl6qEY9ffSIh7DvdcDjBtYPYsvjEMcB/2Urxz8NOA1g6dKlWwoWSdIvaaTTR0lekmS/gfX9k/z+DE9bAyxJckiSPWj+8K+aZt9PAh4JXDR62ZKkPow6pvC3VXX71EpV/ZQZBoWraiNwIs39na8Dzqqqa5KckuSYgabHA2dWlT0ASRqzUT+SOl14zPjcqloNrB7adtLQ+skj1iBJ6tmoPYW1Sf4xyROSPL696OySPguTJO14o4bCMuA+4BPAWcA9bGVgWJK0cxr100c/A97Scy2SpDEb9dNH5ybZf2D9kUnO6a8sSdI4jHr66ID2E0cAVNVteI9mSdrljBoK9yfpLiVOsogtX50sSdpJjfqR1LcB30jy1Xb92bTTTkiSdh2jDjR/MclSmiC4HDib5hNIkqRdyKgT4r0WWE4zf9HlwH+kmZbieVt7niRp5zLqmMJy4DeAm6vquTQ3w9nQW1WSpLEYNRTurap7AZLsWVXXA0/qryxJ0jiMOtC8rr1O4bPAuUluY4bbcUqSdj6jDjS/pF08Ocn5wH7AF3urSpI0FqP2FDpV9dWZW0mSdkYP9R7NkqRdkKEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkjqEgSeoYCpKkzjbPkipp17Zy5UomJyfHWsP69esBmJiYGGsdAIsXL2bZsmXjLmOHMRQkzTr33HPPuEuYswwFSQ8yG94VL1++HIAVK1aMuZK5xzEFSVLHUJAkdQwFSVLHUJAkdQwFSVLHTx9Js8RsuD5gtpj6OUx9Cmmu25HXShgK0iwxOTnJjddcxsJ9No27lLHb4xfNSYyf37x2zJWM33fvmrdDj2coSLPIwn028ddPv2PcZWgWeful++7Q4zmmIEnq9BoKSY5OckOSySRv2UKblyW5Nsk1ST7eZz2SpK3r7fRRknnAqcALgHXAmiSrquragTZLgLcCz6yq25I8pq96JEkz67OncAQwWVU3VdV9wJnAsUNtXgecWlW3AVTVj3qsR5I0gz5DYQL43sD6unbboCcCT0xyQZJvJjm6x3okSTP
o89NHmWZbTXP8JcBRwEHA15P8WlX99EE7Sk4ATgBYuHDh9q9UkgT021NYBzxuYP0g4JZp2pxdVb+oqm8DN9CExINU1WlVtbSqli5YsKC3giVpruszFNYAS5IckmQP4Dhg1VCbzwLPBUhyAM3ppJt6rEmStBW9hUJVbQROBM4BrgPOqqprkpyS5Ji22TnArUmuBc4H3lxVt/ZVkyRp63q9ormqVgOrh7adNLBcwF+2X5KkMfOKZklSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHUMBUlSx1CQJHV6nRBP0ujWr1/Pz+6cx9sv3XfcpWgWufnOeTx8/foddjx7CpKkjj0FaZaYmJjg5xu/z18//Y5xl6JZ5O2X7sueE8O3t++PPQVJUsdQkCR1DAVJUsdQkCR1HGiWZpHv3uVHUgF+eHfzfvXAve8fcyXj99275rFkBx7PUJBmicWLF4+7hFnjvslJAPY82J/JEnbsa8NQkGaJZcuWjbuEWWP58uUArFixYsyVzD2OKUiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKljKEiSOr3eeS3J0cAKYB5welW9c+jxVwHvBta3m95XVaf3WZOkrVu5ciWT7e0wx2Xq+FN3YBunxYsXz6m74vUWCknmAacCLwDWAWuSrKqqa4eafqKqTuyrDkk7n/nz54+7hDmrz57CEcBkVd0EkORM4FhgOBQkzSJz6V2xNtfnmMIE8L2B9XXttmEvTXJlkk8leVyP9UiSZtBnKGSabTW0/jlgUVU9BfgS8M/T7ig5IcnaJGs3bNiwncuUJE3pMxTWAYPv/A8CbhlsUFW3VtXP29UPAIdPt6OqOq2qllbV0gULFvRSrCSp31BYAyxJckiSPYDjgFWDDZL8ysDqMcB1PdYjSZpBbwPNVbUxyYnAOTQfSf1gVV2T5BRgbVWtAt6Q5BhgI/AT4FV91SNJmlmqhk/zz25Lly6ttWvXjrsMSdqpJLmkqpbO1M4rmiVJHUNBktTZ6U4fJdkA3DzuOnYhBwA/HncR0jR8bW5fB1fVjB/f3OlCQdtXkrWjnGeUdjRfm+Ph6SNJUsdQkCR1DAWdNu4CpC3wtTkGjilIkjr2FCRJHUNhjJJsSnJ5kiuSXJrkGT0fb0GSbyW5LMmzBrYfm+SzA+tvTTI5sP7iJKuG97cNxz0qyecfeuXa0ZK8N8kbB9bPSXL6wPr/TPKXv8T+T07ypqFtRyW5aGjb7kl+ODRP2ij73z/Jnz/U+uYyQ2G87qmqp1bVrwNvBd7R8/F+G7i+qp5WVV8f2H4hcOTA+pHAHUke064/A7hg1IO0d93Tzu1Cmv93kuxGc83Arw48PvJrYhteD18DDkqyaGDb84Grq+r7I+5jyv7ANoWCr9uGoTB77AvcBpBknyTntb2Hq5IcO9UoyX9Pcn2Sc5OcMfxuq21zcPv8K9vvC5M8FfgH4EVt76S732FVbQBuT7K43TQBfJr2j0L7/cJ238e3NV2d5F0Dx7wrySlJvgUcmeTots5vAH8w0O457fEvb3ssj9guPz1tbxfwwP//rwJXA3cmeWSSPYEnA5el8e729XBVkpdD967//CQfB65qt70tyQ1JvgQ8afiAVXU/8Eng5QObjwPOaJ//hCRfTHJJkq8nObTdfmCSz7Q97ivaHvc7gSe0r7N3j1pnkocn+UK7n6un2s0pVeXXmL6ATcDlwPXA7cDh7fbdgX3b5QOASZqbFi1t288HHgHcCLxpmv1+Dnhlu/wa4LPt8quA922hlg8Df0zzy3omTa/iH9pabgP2Ah4LfBdY0G7/MvD77fMLeFm7vBfNXfeWtHWfBXx+oLZntsv7ALuP+//Bry2+Pr8DLAT+FPgz4O+AFwHPBL7WtnkpcC7NTMgHtq+PXwGOAn4GHNK2O5wmHPameQM0uYX
X7m8Al7XLewI/Ah7Zrp8HLGmXfxP4crv8CeCN7fI8YD9gEU0Pg22s86XABwaet9+4/x929Jc9hfGaOn10KHA08JEkoflD+vYkV9LckW6C5oX8W8DZVXVPVd1J8wd2OkcCH2+XP9o+byZT7wyfAVwEXEzzi/c04IaqupfmF/YrVbWhqjYCHwOe3T5/E03vAuBQ4NtVdWM1v1n/MnScf0zyBmD/dj+anYZfExcNrF/Ytvkt4Iyq2lRVPwS+SvM6Abi4qr7dLj8L+ExV3V1VdzB0b5UpVbUG2CfJk4AXAt+sqtuS7NMe95NJLgf+ieaPOsDzgPe3z99UVbdPs+tR67wKeH6SdyV51hb2tUszFGaJqrqIplewAHhF+/3wqnoq8EOad9/T3eJ0pN2P0GbqHPIzgIva0NmL5p3U1LnjrR3/3qraNNMxq+qdwGtpejvfnDoFoFlp6jXxH2hOH32T5g3H4HjC1l4TPxtaH/Xz72fSnDbqTh3R/K36afsmaurrySPub+Q6q+rfeaBX844kJ23DMXYJhsIs0f5xnAfcStP9/VFV/SLJc4GD22bfAF6cZK/2ndPvbmF3F9L8QkETMN8YoYRraU4PPQu4rN12Oc1pg6l3hd8CnpPkgHZQ7niad1zDrgcOSfKEdv34gX/nE6rqqqp6F7CWpleh2ekC4PeAn7TvsH9CM4B7JE2vAZrB4ZcnmZdkAU3P8eJp9vU14CVJ5rfjSC/eynHPAP6IpgewCqDtXXw7yX8GaMcIfr1tfx7w+nb7vCT7AnfSnGIdPP6MdSZ5LHB3Vf0L8B7g6Vupc5fU253XNJL5bVcYmncyr6yqTUk+BnwuyVoeGHOgqtak+WjoFTQzxa6lGYsY9gbgg0neDGwAXj1TIVVV7SDxflX1i3bzRcAJtKFQVd9P8lbg/Lbe1VV19jT7ujfJCcAXkvyYJpR+rX34jW3QbaIJov83U20am6toeq8fH9q2T1VNzV76GZqQuIKmJ/BXVfWD4R5gVV2a5BM0r+ebgcFPvzHU9tokdwOXVNVgb+MVwPuT/A3wMJoexRXAcuC0JH9C87p6fVVdlOSCJFfTvMb+apQ6aXpF705yP/AL2rCZS7yieSeTZJ+quivJ3jTvfk6oqkvHXZekXYM9hZ3PaUkOoznf/88GgqTtyZ6CJKnjQLMkqWMoSJI6hoIkqWMoaM5IcuHMrbZ5n4uS/OG2PibNVoaC5oyq6mNq8kXAlv7wb+0xaVYyFDRnJLmr/X5Ukq8k+VQ7k+vH2jmnSPKddt6bi9uvxe32Dyf5T8P7opmN81ntbJx/MXTIBz3Wzuz51IF9XJDkKWnuLfDRJF9OcmOS1w20eXOSNWlmvP0f/fxkpAcYCpqrnga8ETgMeDzNzJ9T7qiqI4D3Af9rhv28Bfh6OxfPe2d47HSamWpJ8kRgz6q6sm37FJppS44ETkry2CS/QzPT7BHAU4HDkzwbqUeGguaqi6tqXTVz+F9Oc6pnyhkD348cfuIv4ZPA7yV5GM2U5h8eeGxq9tsf00wjcgTwO+3XZcClNPNELdmO9Uib8YpmzVU/H1jexIN/F2qa5Y20b6LaU017bOsBq+ruJOcCxwIvo7k/xnTHnFoP8I6q+qdtPZb0UNlTkDb38oHvU7OBfodmSmVo/qg/rF0eno1z0HSPnQ78b2BNO+volGPb2W8fTTNd+RrgHOA17Yy4JJnIA7dIlXphT0Ha3J7tjLG78cC03x8Azk5yMc1UzVOzd14JbExyBfDhoXGFzR6rqkuS3AF8aOiYFwNfoLnT2d9V1S3ALUmeDFzUjoPfRTOl9I+2879X6jj3kTQgyXeApQNTQ2/v/T8W+ApwaDueQZKTgbuq6j19HFPaFp4+knaQJH9Mc6Oit00FgjTb2FOQJHXsKUiSOoaCJKljKEiSOoaCJKljKEiSOoaCJKnz/wHJ4W1qEkm2AgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "boxplot(x='input type', y='accuracy', data=ff_scores_data_frame)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The results are very strange! How could word vectors be less accurate than bag of words? This is known to occur when two conditions are met:\n", + " \n", + "* The dataset is small\n", + "* The dataset is very domain specific\n", + "\n", + "It is possible that these conditions actually are met here. The problem however is that running these models over our full dataset will take much longer, and will require a committed experiment to complete the investigation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Convolutional Network\n", + "Now let's try this with a convolutional network. It has been shown that word vectors perform better for text classification than Bag of Words. If BoW is more accurate, it is a clear sign that we should investigate why. 
First we find the Bag of Words result:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1600, 20000)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bow_predictors.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1600, 1, 20000, 1)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batches = 1600\n", + "convolutional_data = np.array(np.split(np.array([[[y] for y in z] for z in bow_predictors]), batches))\n", + "convolutional_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def get_conv_bow_model():\n", + " model = Sequential([\n", + " Conv2D(\n", + " filters=50,\n", + " kernel_size=(1, 10),\n", + " data_format=\"channels_last\",\n", + " input_shape=(1, 20000, 1),\n", + " activation=relu),\n", + " MaxPooling2D(pool_size=(1, 10)),\n", + " Dropout(0.2),\n", + " Flatten(),\n", + " Dense(2, activation='softmax')\n", + " ])\n", + " model.compile(\n", + " loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.4104 - acc: 0.8133 - val_loss: 0.6877 - val_acc: 0.6420\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.1270 - acc: 0.9563 - val_loss: 0.7839 - val_acc: 0.6305\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 
0.0611 - acc: 0.9891 - val_loss: 0.9292 - val_acc: 0.6120\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0412 - acc: 0.9911 - val_loss: 1.1034 - val_acc: 0.5889\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0321 - acc: 0.9930 - val_loss: 1.1769 - val_acc: 0.5912\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 16s 15ms/step - loss: 0.4176 - acc: 0.8064 - val_loss: 0.7610 - val_acc: 0.6189\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.1339 - acc: 0.9494 - val_loss: 0.8414 - val_acc: 0.6328\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0693 - acc: 0.9861 - val_loss: 0.9440 - val_acc: 0.6212\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0391 - acc: 0.9940 - val_loss: 1.1303 - val_acc: 0.6028\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0212 - acc: 0.9970 - val_loss: 1.2155 - val_acc: 0.6143\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 16s 15ms/step - loss: 0.4728 - acc: 0.7716 - val_loss: 0.6186 - val_acc: 0.6674\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.1369 - acc: 0.9503 - val_loss: 0.6578 - val_acc: 0.6697\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0724 - acc: 0.9821 - val_loss: 0.9531 - val_acc: 0.6351\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 
0.0431 - acc: 0.9940 - val_loss: 0.8991 - val_acc: 0.6328\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0247 - acc: 0.9970 - val_loss: 1.0405 - val_acc: 0.6212\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.3978 - acc: 0.8113 - val_loss: 0.6811 - val_acc: 0.6328\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.1221 - acc: 0.9623 - val_loss: 0.7995 - val_acc: 0.6305\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0695 - acc: 0.9801 - val_loss: 0.9517 - val_acc: 0.6051\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0386 - acc: 0.9930 - val_loss: 1.0806 - val_acc: 0.6212\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0240 - acc: 0.9950 - val_loss: 1.1951 - val_acc: 0.6120\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 17s 16ms/step - loss: 0.4336 - acc: 0.7786 - val_loss: 0.6395 - val_acc: 0.6697\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 21s 21ms/step - loss: 0.1405 - acc: 0.9434 - val_loss: 0.7227 - val_acc: 0.6582\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 21s 21ms/step - loss: 0.0678 - acc: 0.9871 - val_loss: 0.8461 - val_acc: 0.6536\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.0361 - acc: 0.9950 - val_loss: 1.0363 - val_acc: 0.6236\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 
0.0234 - acc: 0.9960 - val_loss: 1.1762 - val_acc: 0.6143\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.4447 - acc: 0.7805 - val_loss: 0.6711 - val_acc: 0.6282\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 19s 19ms/step - loss: 0.1323 - acc: 0.9573 - val_loss: 0.7492 - val_acc: 0.6351\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 17s 16ms/step - loss: 0.0769 - acc: 0.9782 - val_loss: 0.9064 - val_acc: 0.6189\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 15s 15ms/step - loss: 0.0408 - acc: 0.9930 - val_loss: 1.0671 - val_acc: 0.6028\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.0253 - acc: 1.0000 - val_loss: 1.1682 - val_acc: 0.6120\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.4321 - acc: 0.7776 - val_loss: 0.6643 - val_acc: 0.6467\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.1234 - acc: 0.9623 - val_loss: 0.8116 - val_acc: 0.6282\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 19s 19ms/step - loss: 0.0605 - acc: 0.9881 - val_loss: 0.9559 - val_acc: 0.6097\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.0384 - acc: 0.9930 - val_loss: 1.0549 - val_acc: 0.6120\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.0208 - acc: 0.9990 - val_loss: 1.1966 - val_acc: 0.6074\n", + "160/160 [==============================] - 1s 8ms/step\n", + "Fitting with: (1440, 1, 
20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.4149 - acc: 0.8054 - val_loss: 0.7225 - val_acc: 0.6490\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 18s 17ms/step - loss: 0.1227 - acc: 0.9682 - val_loss: 0.7774 - val_acc: 0.6490\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 20s 20ms/step - loss: 0.0639 - acc: 0.9841 - val_loss: 0.9796 - val_acc: 0.6166\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 19s 19ms/step - loss: 0.0328 - acc: 0.9950 - val_loss: 1.0511 - val_acc: 0.6305\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.0191 - acc: 0.9980 - val_loss: 1.1928 - val_acc: 0.6236\n", + "160/160 [==============================] - 1s 7ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 17s 16ms/step - loss: 0.4449 - acc: 0.7954 - val_loss: 0.6790 - val_acc: 0.6397\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.1311 - acc: 0.9543 - val_loss: 0.7680 - val_acc: 0.6467\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.0736 - acc: 0.9811 - val_loss: 0.9357 - val_acc: 0.6467\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 16s 16ms/step - loss: 0.0440 - acc: 0.9901 - val_loss: 1.1653 - val_acc: 0.6005\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.0265 - acc: 0.9960 - val_loss: 1.1660 - val_acc: 0.6259\n", + "160/160 [==============================] - 2s 11ms/step\n", + "Fitting with: (1440, 1, 20000, 1) labels (1440, 2)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 
21s 21ms/step - loss: 0.4229 - acc: 0.8024 - val_loss: 0.6728 - val_acc: 0.6605\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 20s 20ms/step - loss: 0.1182 - acc: 0.9583 - val_loss: 0.7547 - val_acc: 0.6467\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 17s 17ms/step - loss: 0.0661 - acc: 0.9791 - val_loss: 0.9196 - val_acc: 0.6490\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.0412 - acc: 0.9901 - val_loss: 1.1109 - val_acc: 0.6120\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 18s 18ms/step - loss: 0.0232 - acc: 0.9970 - val_loss: 1.1591 - val_acc: 0.6305\n", + "160/160 [==============================] - 1s 7ms/step\n" + ] + } + ], + "source": [ + "conv_bow_scores = run_cross_validate(get_conv_bow_model, convolutional_data, labels, cv=10, categorical=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And our word vector result:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "def vectorize_review(review_words):\n", + " sentence = []\n", + " for word in review_words:\n", + " if word in word_vectors.wv:\n", + " sentence.append(word_vectors.wv[word])\n", + " return np.array(sentence, np.float64)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def pad_vectorized_review(vectorized_review, length):\n", + " return np.concatenate((vectorized_review, np.zeros((length - len(vectorized_review), 100))))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "vectorized_reviews = [vectorize_review(text_to_word_sequence(x)) for x in predictors_raw]\n", + "pad_length = max([x.shape[0] for x in vectorized_reviews])\n", + "vectorized_reviews = np.array([[pad_vectorized_review(x, pad_length)] for x in vectorized_reviews])" + ] + }, + { + "cell_type": 
"code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1600, 1, 381, 100)\n" + ] + } + ], + "source": [ + "print(vectorized_reviews.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "def get_conv_wv_model():\n", + " model = Sequential([\n", + " Conv2D(\n", + " filters=50,\n", + " kernel_size=(10, 100),\n", + " data_format=\"channels_first\",\n", + " input_shape=(1, 381, 100),\n", + " activation=relu),\n", + " MaxPooling2D(strides=(1, 1), pool_size=(2, 1), data_format=\"channels_first\"),\n", + " Dropout(0.2),\n", + " Flatten(),\n", + " Dense(2, activation='softmax')\n", + " ])\n", + " model.compile(\n", + " loss='binary_crossentropy',\n", + " optimizer='adam',\n", + " metrics=['accuracy'])\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting with: (1332, 1, 381, 100) labels (1332, 2)\n", + "Train on 932 samples, validate on 400 samples\n", + "Epoch 1/12\n", + "932/932 [==============================] - 6s 7ms/step - loss: 0.6975 - acc: 0.4764 - val_loss: 0.6934 - val_acc: 0.4475\n", + "Epoch 2/12\n", + "932/932 [==============================] - 5s 5ms/step - loss: 0.6915 - acc: 0.5300 - val_loss: 0.6942 - val_acc: 0.5025\n", + "Epoch 3/12\n", + "932/932 [==============================] - 4s 5ms/step - loss: 0.6849 - acc: 0.5590 - val_loss: 0.6958 - val_acc: 0.4925\n", + "Epoch 4/12\n", + "932/932 [==============================] - 4s 5ms/step - loss: 0.6753 - acc: 0.5869 - val_loss: 0.7030 - val_acc: 0.5225\n", + "Epoch 5/12\n", + "932/932 [==============================] - 4s 5ms/step - loss: 0.6682 - acc: 0.5912 - val_loss: 0.7110 - val_acc: 0.5000\n", + "268/268 [==============================] - 1s 2ms/step\n", + "Fitting with: (1332, 1, 381, 100) labels (1332, 2)\n", + "Train on 
932 samples, validate on 400 samples\n", + "Epoch 1/12\n", + "932/932 [==============================] - 6s 7ms/step - loss: 0.6989 - acc: 0.4775 - val_loss: 0.6931 - val_acc: 0.4975\n", + "Epoch 2/12\n", + "932/932 [==============================] - 5s 5ms/step - loss: 0.6932 - acc: 0.4710 - val_loss: 0.6932 - val_acc: 0.4975\n", + "Epoch 3/12\n", + "932/932 [==============================] - 5s 5ms/step - loss: 0.6932 - acc: 0.5011 - val_loss: 0.6932 - val_acc: 0.4975\n", + "Epoch 4/12\n", + "932/932 [==============================] - 5s 5ms/step - loss: 0.6934 - acc: 0.4775 - val_loss: 0.6932 - val_acc: 0.4975\n", + "Epoch 5/12\n", + "932/932 [==============================] - 5s 5ms/step - loss: 0.6932 - acc: 0.5011 - val_loss: 0.6932 - val_acc: 0.4975\n", + "268/268 [==============================] - 1s 2ms/step\n", + "Fitting with: (1334, 1, 381, 100) labels (1334, 2)\n", + "Train on 933 samples, validate on 401 samples\n", + "Epoch 1/12\n", + "933/933 [==============================] - 6s 7ms/step - loss: 0.6957 - acc: 0.4759 - val_loss: 0.6931 - val_acc: 0.4938\n", + "Epoch 2/12\n", + "933/933 [==============================] - 5s 6ms/step - loss: 0.6919 - acc: 0.5252 - val_loss: 0.6923 - val_acc: 0.5037\n", + "Epoch 3/12\n", + "933/933 [==============================] - 5s 5ms/step - loss: 0.6842 - acc: 0.5659 - val_loss: 0.6956 - val_acc: 0.5012\n", + "Epoch 4/12\n", + "933/933 [==============================] - 5s 6ms/step - loss: 0.6716 - acc: 0.5884 - val_loss: 0.6993 - val_acc: 0.5112\n", + "Epoch 5/12\n", + "933/933 [==============================] - 5s 5ms/step - loss: 0.6600 - acc: 0.5949 - val_loss: 0.7082 - val_acc: 0.5187\n", + "Epoch 6/12\n", + "933/933 [==============================] - 5s 5ms/step - loss: 0.6565 - acc: 0.5949 - val_loss: 0.7069 - val_acc: 0.5187\n", + "266/266 [==============================] - 1s 3ms/step\n", + "Fitting with: (1334, 1, 381, 100) labels (1334, 2)\n", + "Train on 933 samples, validate on 401 samples\n", + 
"Epoch 1/12\n", + "933/933 [==============================] - 6s 6ms/step - loss: 0.6965 - acc: 0.4952 - val_loss: 0.6932 - val_acc: 0.5137\n", + "Epoch 2/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6920 - acc: 0.5359 - val_loss: 0.6932 - val_acc: 0.5212\n", + "Epoch 3/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6865 - acc: 0.5627 - val_loss: 0.6960 - val_acc: 0.5237\n", + "Epoch 4/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6786 - acc: 0.5756 - val_loss: 0.6971 - val_acc: 0.5287\n", + "Epoch 5/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6761 - acc: 0.5713 - val_loss: 0.7037 - val_acc: 0.5087\n", + "266/266 [==============================] - 1s 2ms/step\n", + "Fitting with: (1334, 1, 381, 100) labels (1334, 2)\n", + "Train on 933 samples, validate on 401 samples\n", + "Epoch 1/12\n", + "933/933 [==============================] - 6s 7ms/step - loss: 0.6979 - acc: 0.5091 - val_loss: 0.6924 - val_acc: 0.5112\n", + "Epoch 2/12\n", + "933/933 [==============================] - 5s 5ms/step - loss: 0.6928 - acc: 0.5177 - val_loss: 0.6922 - val_acc: 0.5287\n", + "Epoch 3/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6823 - acc: 0.5852 - val_loss: 0.6925 - val_acc: 0.5287\n", + "Epoch 4/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6680 - acc: 0.5927 - val_loss: 0.6970 - val_acc: 0.5237\n", + "Epoch 5/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6621 - acc: 0.5916 - val_loss: 0.6982 - val_acc: 0.5387\n", + "Epoch 6/12\n", + "933/933 [==============================] - 4s 5ms/step - loss: 0.6552 - acc: 0.5981 - val_loss: 0.7037 - val_acc: 0.5411\n", + "266/266 [==============================] - 1s 3ms/step\n", + "Fitting with: (1334, 1, 381, 100) labels (1334, 2)\n", + "Train on 933 samples, validate on 401 samples\n", + "Epoch 1/12\n", + "933/933 
# Cross-validate the convolutional model on the word-vector input.
# categorical=True makes run_cross_validate one-hot encode the labels,
# which is what the model's 2-unit softmax output expects.
conv_wv_scores = run_cross_validate(
    get_conv_wv_model,
    vectorized_reviews,
    labels,
    cv=6,
    categorical=True,
)

# Per-fold accuracies for both input representations.
print("Bag of words: ", conv_bow_scores['accuracies'])
print("Word vectors: ", conv_wv_scores['accuracies'])

# Long-format table (one row per fold) so both inputs can share one box plot.
conv_scores_entries = [
    (input_type, accuracy)
    for input_type, scores in (
        ('Bag of Words', conv_bow_scores),
        ('Word Vectors', conv_wv_scores),
    )
    for accuracy in scores['accuracies']
]
conv_scores_data_frame = DataFrame(
    conv_scores_entries,
    columns=['input type', 'accuracy'],
)
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "boxplot(x='input type', y='accuracy', data=conv_scores_data_frame)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Just as before our results are alarming. We need to investigate why this can occur, it may be that we have made a mistake in generating our word vectors" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Recurrent Neural Networks\n", + "Let's also try this same experiment for Recurrent Neural Networks. We will use LSTM since this is known to be a good option for text classification. First with Bag of Words:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1600, 1, 20000)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "batches = 1600\n", + "rnn_bow_data = np.array(np.split(bow_predictors, batches))\n", + "rnn_bow_data.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1600, 1)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rnn_bow_targets = np.array([[x] for x in labels])\n", + "rnn_bow_targets.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.5428 - acc: 0.7637 - val_loss: 0.3637 - val_acc: 0.8799\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1343 - acc: 0.9861 - val_loss: 0.3267 - val_acc: 0.8753\n", + "Epoch 3/12\n", + 
"1007/1007 [==============================] - 1s 1ms/step - loss: 0.0618 - acc: 0.9990 - val_loss: 0.3146 - val_acc: 0.8891\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0396 - acc: 1.0000 - val_loss: 0.3132 - val_acc: 0.8915\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0288 - acc: 1.0000 - val_loss: 0.3127 - val_acc: 0.8868\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0224 - acc: 1.0000 - val_loss: 0.3150 - val_acc: 0.8868\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0180 - acc: 1.0000 - val_loss: 0.3180 - val_acc: 0.8891\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0150 - acc: 1.0000 - val_loss: 0.3211 - val_acc: 0.8938\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0127 - acc: 1.0000 - val_loss: 0.3256 - val_acc: 0.8938\n", + "160/160 [==============================] - 0s 237us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.5319 - acc: 0.7706 - val_loss: 0.3679 - val_acc: 0.8730\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1252 - acc: 0.9861 - val_loss: 0.3113 - val_acc: 0.8799\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0609 - acc: 0.9970 - val_loss: 0.3081 - val_acc: 0.8868\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0391 - acc: 1.0000 - val_loss: 0.3008 - val_acc: 0.8891\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.0281 - acc: 1.0000 - val_loss: 0.2969 - val_acc: 0.8915\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 2s 
2ms/step - loss: 0.0216 - acc: 1.0000 - val_loss: 0.2971 - val_acc: 0.8891\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.0174 - acc: 1.0000 - val_loss: 0.2988 - val_acc: 0.8961\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.0144 - acc: 1.0000 - val_loss: 0.3019 - val_acc: 0.8984\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 2s 1ms/step - loss: 0.0121 - acc: 1.0000 - val_loss: 0.3039 - val_acc: 0.8984\n", + "160/160 [==============================] - 0s 229us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.5196 - acc: 0.7607 - val_loss: 0.3464 - val_acc: 0.8915\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1028 - acc: 0.9791 - val_loss: 0.3072 - val_acc: 0.8938\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0453 - acc: 0.9990 - val_loss: 0.3003 - val_acc: 0.8915\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0286 - acc: 1.0000 - val_loss: 0.3003 - val_acc: 0.8891\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0203 - acc: 1.0000 - val_loss: 0.3056 - val_acc: 0.8868\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0156 - acc: 1.0000 - val_loss: 0.3105 - val_acc: 0.8845\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0125 - acc: 1.0000 - val_loss: 0.3130 - val_acc: 0.8845\n", + "160/160 [==============================] - 0s 224us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 3s 3ms/step - loss: 0.5540 - 
acc: 0.7686 - val_loss: 0.4088 - val_acc: 0.8753\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1803 - acc: 0.9772 - val_loss: 0.3416 - val_acc: 0.8776\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0933 - acc: 0.9980 - val_loss: 0.3257 - val_acc: 0.8684\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0619 - acc: 0.9990 - val_loss: 0.3187 - val_acc: 0.8637\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0456 - acc: 1.0000 - val_loss: 0.3161 - val_acc: 0.8661\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0356 - acc: 1.0000 - val_loss: 0.3161 - val_acc: 0.8614\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0287 - acc: 1.0000 - val_loss: 0.3183 - val_acc: 0.8637\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0238 - acc: 1.0000 - val_loss: 0.3198 - val_acc: 0.8637\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0201 - acc: 1.0000 - val_loss: 0.3231 - val_acc: 0.8637\n", + "160/160 [==============================] - 0s 183us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 4ms/step - loss: 0.5290 - acc: 0.7696 - val_loss: 0.3542 - val_acc: 0.8822\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1171 - acc: 0.9861 - val_loss: 0.3179 - val_acc: 0.8707\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0541 - acc: 0.9990 - val_loss: 0.3179 - val_acc: 0.8753\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0348 - acc: 1.0000 - val_loss: 0.3170 - val_acc: 
0.8684\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0251 - acc: 1.0000 - val_loss: 0.3195 - val_acc: 0.8684\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0194 - acc: 1.0000 - val_loss: 0.3243 - val_acc: 0.8684\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0156 - acc: 1.0000 - val_loss: 0.3283 - val_acc: 0.8661\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0130 - acc: 1.0000 - val_loss: 0.3334 - val_acc: 0.8684\n", + "160/160 [==============================] - 0s 214us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 4ms/step - loss: 0.5361 - acc: 0.7537 - val_loss: 0.3424 - val_acc: 0.8753\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1170 - acc: 0.9811 - val_loss: 0.3131 - val_acc: 0.8776\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0500 - acc: 0.9990 - val_loss: 0.3046 - val_acc: 0.8915\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0318 - acc: 1.0000 - val_loss: 0.3040 - val_acc: 0.8891\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0230 - acc: 1.0000 - val_loss: 0.3056 - val_acc: 0.8915\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0176 - acc: 1.0000 - val_loss: 0.3096 - val_acc: 0.8915\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0142 - acc: 1.0000 - val_loss: 0.3137 - val_acc: 0.8891\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0117 - acc: 1.0000 - val_loss: 0.3174 - val_acc: 0.8891\n", + "160/160 
[==============================] - 0s 239us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 4ms/step - loss: 0.5679 - acc: 0.7428 - val_loss: 0.4064 - val_acc: 0.8684\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 2s 2ms/step - loss: 0.1829 - acc: 0.9762 - val_loss: 0.3367 - val_acc: 0.8799\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0931 - acc: 0.9990 - val_loss: 0.3038 - val_acc: 0.8915\n", + "Epoch 4/12\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0615 - acc: 0.9990 - val_loss: 0.2955 - val_acc: 0.8915\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0450 - acc: 1.0000 - val_loss: 0.2853 - val_acc: 0.8961\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0350 - acc: 1.0000 - val_loss: 0.2822 - val_acc: 0.8984\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0282 - acc: 1.0000 - val_loss: 0.2824 - val_acc: 0.8961\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0234 - acc: 1.0000 - val_loss: 0.2832 - val_acc: 0.8961\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0198 - acc: 1.0000 - val_loss: 0.2830 - val_acc: 0.8984\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0170 - acc: 1.0000 - val_loss: 0.2855 - val_acc: 0.8938\n", + "160/160 [==============================] - 0s 230us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 4ms/step - loss: 0.5634 - acc: 0.7458 - 
val_loss: 0.4024 - val_acc: 0.8684\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1787 - acc: 0.9821 - val_loss: 0.3468 - val_acc: 0.8915\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0907 - acc: 0.9980 - val_loss: 0.3273 - val_acc: 0.8845\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0602 - acc: 1.0000 - val_loss: 0.3214 - val_acc: 0.8776\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0443 - acc: 1.0000 - val_loss: 0.3196 - val_acc: 0.8776\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0344 - acc: 1.0000 - val_loss: 0.3198 - val_acc: 0.8730\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0278 - acc: 1.0000 - val_loss: 0.3222 - val_acc: 0.8776\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0230 - acc: 1.0000 - val_loss: 0.3260 - val_acc: 0.8776\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0195 - acc: 1.0000 - val_loss: 0.3297 - val_acc: 0.8753\n", + "160/160 [==============================] - 0s 198us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 5s 5ms/step - loss: 0.5229 - acc: 0.7746 - val_loss: 0.3663 - val_acc: 0.8661\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1295 - acc: 0.9821 - val_loss: 0.3030 - val_acc: 0.9007\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0640 - acc: 0.9970 - val_loss: 0.3018 - val_acc: 0.8938\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0416 - acc: 1.0000 - val_loss: 0.2985 - val_acc: 0.8891\n", + "Epoch 
5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0301 - acc: 1.0000 - val_loss: 0.2996 - val_acc: 0.8915\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0234 - acc: 1.0000 - val_loss: 0.3038 - val_acc: 0.8868\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0188 - acc: 1.0000 - val_loss: 0.3071 - val_acc: 0.8915\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0156 - acc: 1.0000 - val_loss: 0.3115 - val_acc: 0.8915\n", + "160/160 [==============================] - 0s 209us/step\n", + "Fitting with: (1440, 1, 20000) labels (1440, 1)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 4s 4ms/step - loss: 0.5431 - acc: 0.7468 - val_loss: 0.3233 - val_acc: 0.8961\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.1069 - acc: 0.9821 - val_loss: 0.3107 - val_acc: 0.8684\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0441 - acc: 0.9990 - val_loss: 0.2851 - val_acc: 0.8845\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0272 - acc: 1.0000 - val_loss: 0.2836 - val_acc: 0.8961\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0198 - acc: 1.0000 - val_loss: 0.2866 - val_acc: 0.8868\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0154 - acc: 1.0000 - val_loss: 0.2890 - val_acc: 0.8915\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0124 - acc: 1.0000 - val_loss: 0.2920 - val_acc: 0.8915\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 1s 1ms/step - loss: 0.0104 - acc: 1.0000 - val_loss: 0.2961 - val_acc: 0.8915\n", + "160/160 [==============================] - 0s 
def get_rnn_bow_model(input_shape=(1, 20000), units=8):
    """Build and compile the LSTM classifier used on bag-of-words input.

    The defaults reproduce the original hard-coded architecture, so the function
    can still be passed to ``run_cross_validate`` and called with no arguments.

    Args:
        input_shape: ``(timesteps, features)`` of one sample. The default matches
            ``rnn_bow_data`` without its leading sample axis: one timestep of
            20000 features (the tokenizer's ``num_words``).
        units: Dimensionality of the LSTM output.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output, trained
        with binary cross-entropy against 0/1 targets.
    """
    # NOTE: with a single timestep the LSTM has no sequence to recur over, so it
    # sees each review as one flat feature vector.
    model = Sequential([
        LSTM(units, input_shape=input_shape),
        Dense(1, activation='sigmoid')
    ])
    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])
    return model


rnn_bow_scores = run_cross_validate(get_rnn_bow_model, rnn_bow_data, rnn_bow_targets, cv=10)
"Epoch 9/12\n", + "1007/1007 [==============================] - 64s 63ms/step - loss: 0.6282 - acc: 0.6445 - val_loss: 0.6301 - val_acc: 0.6582\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 65s 65ms/step - loss: 0.6379 - acc: 0.6256 - val_loss: 0.6290 - val_acc: 0.6513\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 58s 58ms/step - loss: 0.6169 - acc: 0.6465 - val_loss: 0.6272 - val_acc: 0.6467\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 58s 58ms/step - loss: 0.6149 - acc: 0.6534 - val_loss: 0.6222 - val_acc: 0.6605\n", + "160/160 [==============================] - 0s 3ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 67s 67ms/step - loss: 0.6884 - acc: 0.5353 - val_loss: 0.6811 - val_acc: 0.6005\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 60s 59ms/step - loss: 0.6717 - acc: 0.5968 - val_loss: 0.6534 - val_acc: 0.6189\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6515 - acc: 0.6157 - val_loss: 0.6452 - val_acc: 0.6189\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 64s 63ms/step - loss: 0.6432 - acc: 0.6226 - val_loss: 0.6378 - val_acc: 0.6305\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6430 - acc: 0.6157 - val_loss: 0.6500 - val_acc: 0.5889\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 91s 90ms/step - loss: 0.6342 - acc: 0.6326 - val_loss: 0.6359 - val_acc: 0.6097\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 65s 64ms/step - loss: 0.6276 - acc: 0.6395 - val_loss: 0.6327 - val_acc: 0.6536\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 67s 66ms/step - loss: 0.6332 - acc: 0.6346 - val_loss: 0.6309 - val_acc: 0.6420\n", + "Epoch 9/12\n", + "1007/1007 
[==============================] - 71s 71ms/step - loss: 0.6214 - acc: 0.6316 - val_loss: 0.6358 - val_acc: 0.6166\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 72s 72ms/step - loss: 0.6120 - acc: 0.6524 - val_loss: 0.6467 - val_acc: 0.6420\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 70s 70ms/step - loss: 0.6185 - acc: 0.6524 - val_loss: 0.6225 - val_acc: 0.6559\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 71s 70ms/step - loss: 0.6102 - acc: 0.6475 - val_loss: 0.6225 - val_acc: 0.6467\n", + "160/160 [==============================] - 1s 3ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 71s 70ms/step - loss: 0.6903 - acc: 0.5223 - val_loss: 0.6819 - val_acc: 0.5196\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 75s 74ms/step - loss: 0.6711 - acc: 0.5869 - val_loss: 0.6466 - val_acc: 0.6236\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 69s 69ms/step - loss: 0.6464 - acc: 0.6177 - val_loss: 0.6375 - val_acc: 0.6328\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 70s 69ms/step - loss: 0.6456 - acc: 0.6316 - val_loss: 0.6343 - val_acc: 0.6305\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 66s 66ms/step - loss: 0.6339 - acc: 0.6236 - val_loss: 0.6691 - val_acc: 0.5982\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 65s 65ms/step - loss: 0.6251 - acc: 0.6504 - val_loss: 0.6209 - val_acc: 0.6721\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 70s 69ms/step - loss: 0.6196 - acc: 0.6405 - val_loss: 0.6202 - val_acc: 0.6674\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 63s 63ms/step - loss: 0.6187 - acc: 0.6624 - val_loss: 0.6122 - val_acc: 0.6582\n", + "Epoch 9/12\n", + "1007/1007 [==============================] 
- 83s 82ms/step - loss: 0.6152 - acc: 0.6673 - val_loss: 0.6368 - val_acc: 0.6420\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 75s 74ms/step - loss: 0.6272 - acc: 0.6524 - val_loss: 0.6318 - val_acc: 0.6259\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 75s 74ms/step - loss: 0.6005 - acc: 0.6713 - val_loss: 0.6140 - val_acc: 0.6628\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 85s 84ms/step - loss: 0.5985 - acc: 0.6842 - val_loss: 0.6047 - val_acc: 0.6836\n", + "160/160 [==============================] - 0s 3ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 88s 87ms/step - loss: 0.6866 - acc: 0.5362 - val_loss: 0.6726 - val_acc: 0.5635\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 83s 83ms/step - loss: 0.6628 - acc: 0.5988 - val_loss: 0.6554 - val_acc: 0.6051\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 71s 70ms/step - loss: 0.6488 - acc: 0.6087 - val_loss: 0.6522 - val_acc: 0.5935\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 75s 75ms/step - loss: 0.6505 - acc: 0.6077 - val_loss: 0.6587 - val_acc: 0.5958\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 71s 70ms/step - loss: 0.6428 - acc: 0.6266 - val_loss: 0.6429 - val_acc: 0.6051\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 67s 67ms/step - loss: 0.6415 - acc: 0.6276 - val_loss: 0.6421 - val_acc: 0.6074\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 68s 68ms/step - loss: 0.6374 - acc: 0.6286 - val_loss: 0.6412 - val_acc: 0.6236\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 71s 70ms/step - loss: 0.6351 - acc: 0.6266 - val_loss: 0.6380 - val_acc: 0.6305\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 64s 63ms/step - loss: 0.6293 - 
acc: 0.6346 - val_loss: 0.6298 - val_acc: 0.6328\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 60s 60ms/step - loss: 0.6218 - acc: 0.6425 - val_loss: 0.6497 - val_acc: 0.6143\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 66s 66ms/step - loss: 0.6171 - acc: 0.6524 - val_loss: 0.6061 - val_acc: 0.6767\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 72s 72ms/step - loss: 0.6143 - acc: 0.6524 - val_loss: 0.6134 - val_acc: 0.6628\n", + "160/160 [==============================] - 0s 3ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 82s 81ms/step - loss: 0.6987 - acc: 0.4826 - val_loss: 0.6887 - val_acc: 0.5727\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 67s 67ms/step - loss: 0.6840 - acc: 0.5770 - val_loss: 0.6715 - val_acc: 0.6028\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 67s 66ms/step - loss: 0.6641 - acc: 0.6018 - val_loss: 0.6541 - val_acc: 0.6005\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 66s 66ms/step - loss: 0.6446 - acc: 0.6266 - val_loss: 0.6315 - val_acc: 0.6490\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 67s 67ms/step - loss: 0.6442 - acc: 0.6137 - val_loss: 0.6301 - val_acc: 0.6420\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6468 - acc: 0.6216 - val_loss: 0.6516 - val_acc: 0.6328\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6385 - acc: 0.6216 - val_loss: 0.6304 - val_acc: 0.6328\n", + "Epoch 8/12\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1007/1007 [==============================] - 65s 64ms/step - loss: 0.6331 - acc: 0.6485 - val_loss: 0.6256 - val_acc: 0.6582\n", + "Epoch 9/12\n", + "1007/1007 
[==============================] - 65s 64ms/step - loss: 0.6318 - acc: 0.6435 - val_loss: 0.6302 - val_acc: 0.6282\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 66s 65ms/step - loss: 0.6230 - acc: 0.6455 - val_loss: 0.6141 - val_acc: 0.6697\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6309 - acc: 0.6475 - val_loss: 0.6288 - val_acc: 0.6374\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 65s 64ms/step - loss: 0.6283 - acc: 0.6395 - val_loss: 0.6318 - val_acc: 0.6236\n", + "160/160 [==============================] - 0s 3ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 69s 68ms/step - loss: 0.6893 - acc: 0.5392 - val_loss: 0.6756 - val_acc: 0.6005\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 65s 65ms/step - loss: 0.6760 - acc: 0.5859 - val_loss: 0.6496 - val_acc: 0.6212\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6484 - acc: 0.6127 - val_loss: 0.6219 - val_acc: 0.6490\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 63s 62ms/step - loss: 0.6364 - acc: 0.6445 - val_loss: 0.6187 - val_acc: 0.6536\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 61s 61ms/step - loss: 0.6387 - acc: 0.6286 - val_loss: 0.6261 - val_acc: 0.6212\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 61s 61ms/step - loss: 0.6376 - acc: 0.6375 - val_loss: 0.6137 - val_acc: 0.6490\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 62s 61ms/step - loss: 0.6285 - acc: 0.6385 - val_loss: 0.6234 - val_acc: 0.6513\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 63s 63ms/step - loss: 0.6304 - acc: 0.6455 - val_loss: 0.6402 - val_acc: 0.6120\n", + "Epoch 9/12\n", + "1007/1007 [==============================] 
- 63s 63ms/step - loss: 0.6238 - acc: 0.6326 - val_loss: 0.6097 - val_acc: 0.6721\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 67s 67ms/step - loss: 0.6117 - acc: 0.6554 - val_loss: 0.6064 - val_acc: 0.6697\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 69s 68ms/step - loss: 0.6045 - acc: 0.6634 - val_loss: 0.6313 - val_acc: 0.6328\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 64s 64ms/step - loss: 0.6021 - acc: 0.6643 - val_loss: 0.5917 - val_acc: 0.6790\n", + "160/160 [==============================] - 0s 2ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 72s 71ms/step - loss: 0.6910 - acc: 0.5194 - val_loss: 0.6821 - val_acc: 0.5635\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 68s 67ms/step - loss: 0.6755 - acc: 0.5899 - val_loss: 0.6486 - val_acc: 0.6189\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 59s 59ms/step - loss: 0.6484 - acc: 0.6356 - val_loss: 0.6686 - val_acc: 0.5982\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 61s 61ms/step - loss: 0.6367 - acc: 0.6375 - val_loss: 0.6529 - val_acc: 0.5912\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 66s 65ms/step - loss: 0.6339 - acc: 0.6524 - val_loss: 0.6297 - val_acc: 0.6513\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 63s 63ms/step - loss: 0.6282 - acc: 0.6415 - val_loss: 0.6270 - val_acc: 0.6420\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 64s 63ms/step - loss: 0.6245 - acc: 0.6465 - val_loss: 0.6343 - val_acc: 0.6490\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 59s 58ms/step - loss: 0.6247 - acc: 0.6564 - val_loss: 0.6331 - val_acc: 0.6536\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 57s 56ms/step - loss: 0.6143 - 
acc: 0.6673 - val_loss: 0.6305 - val_acc: 0.6605\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 72s 72ms/step - loss: 0.6135 - acc: 0.6653 - val_loss: 0.6333 - val_acc: 0.6374\n", + "160/160 [==============================] - 1s 5ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 96s 95ms/step - loss: 0.6945 - acc: 0.5134 - val_loss: 0.6855 - val_acc: 0.5820\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 80s 80ms/step - loss: 0.6778 - acc: 0.5770 - val_loss: 0.6553 - val_acc: 0.6212\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 59s 59ms/step - loss: 0.6453 - acc: 0.6147 - val_loss: 0.6244 - val_acc: 0.6536\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 65s 64ms/step - loss: 0.6669 - acc: 0.6068 - val_loss: 0.6432 - val_acc: 0.6212\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 82s 82ms/step - loss: 0.6385 - acc: 0.6276 - val_loss: 0.6249 - val_acc: 0.6443\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 83s 82ms/step - loss: 0.6359 - acc: 0.6266 - val_loss: 0.6222 - val_acc: 0.6628\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 85s 84ms/step - loss: 0.6291 - acc: 0.6346 - val_loss: 0.6182 - val_acc: 0.6744\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 88s 88ms/step - loss: 0.6293 - acc: 0.6425 - val_loss: 0.6231 - val_acc: 0.6582\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 85s 84ms/step - loss: 0.6234 - acc: 0.6266 - val_loss: 0.6308 - val_acc: 0.6236\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 81s 80ms/step - loss: 0.6269 - acc: 0.6336 - val_loss: 0.6256 - val_acc: 0.6282\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 83s 82ms/step - loss: 0.6180 - acc: 0.6435 - val_loss: 0.6045 - 
val_acc: 0.6767\n", + "Epoch 12/12\n", + "1007/1007 [==============================] - 78s 78ms/step - loss: 0.6172 - acc: 0.6455 - val_loss: 0.5987 - val_acc: 0.6651\n", + "160/160 [==============================] - 0s 2ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 75s 75ms/step - loss: 0.6898 - acc: 0.5392 - val_loss: 0.6768 - val_acc: 0.6120\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 67s 66ms/step - loss: 0.6667 - acc: 0.5958 - val_loss: 0.6455 - val_acc: 0.6143\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 66s 65ms/step - loss: 0.6447 - acc: 0.6028 - val_loss: 0.6323 - val_acc: 0.6282\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 72s 71ms/step - loss: 0.6346 - acc: 0.6286 - val_loss: 0.6324 - val_acc: 0.6328\n", + "Epoch 5/12\n", + "1007/1007 [==============================] - 79s 79ms/step - loss: 0.6277 - acc: 0.6296 - val_loss: 0.6252 - val_acc: 0.6374\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 76s 76ms/step - loss: 0.6265 - acc: 0.6365 - val_loss: 0.6373 - val_acc: 0.6328\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 68s 67ms/step - loss: 0.6225 - acc: 0.6336 - val_loss: 0.6263 - val_acc: 0.6513\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 83s 83ms/step - loss: 0.6069 - acc: 0.6435 - val_loss: 0.6160 - val_acc: 0.6674\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 74s 73ms/step - loss: 0.6150 - acc: 0.6495 - val_loss: 0.6174 - val_acc: 0.6236\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 71s 71ms/step - loss: 0.6167 - acc: 0.6395 - val_loss: 0.6224 - val_acc: 0.6490\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 74s 73ms/step - loss: 0.6047 - acc: 0.6733 - val_loss: 0.6486 - val_acc: 0.6120\n", + "Epoch 
12/12\n", + "1007/1007 [==============================] - 69s 68ms/step - loss: 0.6026 - acc: 0.6624 - val_loss: 0.6058 - val_acc: 0.6559\n", + "160/160 [==============================] - 0s 3ms/step\n", + "Fitting with: (1440, 784) labels (1440,)\n", + "Train on 1007 samples, validate on 433 samples\n", + "Epoch 1/12\n", + "1007/1007 [==============================] - 81s 80ms/step - loss: 0.6927 - acc: 0.5074 - val_loss: 0.6830 - val_acc: 0.5820\n", + "Epoch 2/12\n", + "1007/1007 [==============================] - 79s 78ms/step - loss: 0.6829 - acc: 0.5591 - val_loss: 0.6628 - val_acc: 0.6212\n", + "Epoch 3/12\n", + "1007/1007 [==============================] - 74s 74ms/step - loss: 0.6681 - acc: 0.5958 - val_loss: 0.6378 - val_acc: 0.6536\n", + "Epoch 4/12\n", + "1007/1007 [==============================] - 73s 72ms/step - loss: 0.6509 - acc: 0.6236 - val_loss: 0.6282 - val_acc: 0.6443\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 5/12\n", + "1007/1007 [==============================] - 66s 65ms/step - loss: 0.6414 - acc: 0.6207 - val_loss: 0.6306 - val_acc: 0.6374\n", + "Epoch 6/12\n", + "1007/1007 [==============================] - 77s 77ms/step - loss: 0.6386 - acc: 0.6286 - val_loss: 0.6189 - val_acc: 0.6420\n", + "Epoch 7/12\n", + "1007/1007 [==============================] - 80s 79ms/step - loss: 0.6361 - acc: 0.6395 - val_loss: 0.6182 - val_acc: 0.6513\n", + "Epoch 8/12\n", + "1007/1007 [==============================] - 81s 80ms/step - loss: 0.6230 - acc: 0.6614 - val_loss: 0.6216 - val_acc: 0.6282\n", + "Epoch 9/12\n", + "1007/1007 [==============================] - 78s 78ms/step - loss: 0.6183 - acc: 0.6475 - val_loss: 0.6271 - val_acc: 0.6536\n", + "Epoch 10/12\n", + "1007/1007 [==============================] - 78s 77ms/step - loss: 0.6245 - acc: 0.6475 - val_loss: 0.6002 - val_acc: 0.6697\n", + "Epoch 11/12\n", + "1007/1007 [==============================] - 82s 81ms/step - loss: 0.6083 - acc: 0.6683 - val_loss: 
# %% Recurrent model on word-vector input
def get_rnn_wv_model():
    """Build and compile a fresh LSTM classifier fed through a frozen embedding layer.

    The embedding layer is initialised with ``embedding_matrix`` and created
    with ``trainable=False``. An 8-unit LSTM and a single sigmoid output unit
    follow. The model is compiled with binary cross-entropy loss, the Adam
    optimizer and accuracy as the reported metric.

    Relies on the notebook-level names ``corpus_vocab_size``,
    ``embedding_length``, ``embedding_matrix`` and ``max_sequence_length``,
    which are defined in earlier cells (presumably the word-vector setup;
    confirm against those cells).

    Returns:
        The compiled ``Sequential`` model.
    """
    frozen_embedding = Embedding(
        corpus_vocab_size,
        embedding_length,
        weights=[embedding_matrix],
        input_length=max_sequence_length,
        trainable=False,
    )
    recurrent_layer = LSTM(8)
    output_layer = Dense(1, activation='sigmoid')

    network = Sequential([frozen_embedding, recurrent_layer, output_layer])
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network


# run_cross_validate calls the factory once per fold, so every one of the
# 10 folds trains a newly built model.
rnn_wv_scores = run_cross_validate(get_rnn_wv_model, predictors_sequences, labels, cv=10)

# %% Per-fold accuracies of the recurrent model for both input types
print("Bag of words: ", rnn_bow_scores['accuracies'])
print("Word vectors: ", rnn_wv_scores['accuracies'])

# One (input type, accuracy) row per fold, bag-of-words rows first.
rnn_scores_entries = [
    (input_type, accuracy)
    for input_type, fold_scores in (('Bag of Words', rnn_bow_scores), ('Word Vectors', rnn_wv_scores))
    for accuracy in fold_scores['accuracies']
]
rnn_scores_data_frame = DataFrame(rnn_scores_entries, columns=['input type', 'accuracy'])
# %% Recurrent model: accuracy by input type
# seaborn's boxplot returns the matplotlib Axes it drew on. Use it to title the
# figure so it can be identified when the notebook is skimmed.
rnn_ax = boxplot(x='input type', y='accuracy', data=rnn_scores_data_frame)
rnn_ax.set_title('Recurrent network: cross-validated accuracy by input type')
plt.show()

# %% [markdown]
# # Comparison of all models:
# %% All models on bag-of-words input
# One (model, accuracy) row per cross-validation fold, keeping the original
# model order: Feed-Forward, Convolutional, Recurrent.
bow_scores = [
    (model_name, accuracy)
    for model_name, model_scores in (
        ("Feed-Forward", ff_bow_scores),
        ("Convolutional", conv_bow_scores),
        ("Recurrent", rnn_bow_scores),
    )
    for accuracy in model_scores['accuracies']
]

bow_ax = boxplot(x='model', y='accuracy', data=DataFrame(bow_scores, columns=["model", "accuracy"]))
# The word-vector comparison figure uses identical axes, so without a title
# the two figures cannot be told apart.
bow_ax.set_title('Bag of Words input: cross-validated accuracy by model')
plt.show()
# %% All models on word-vector input
# One (model, accuracy) row per cross-validation fold, keeping the original
# model order: Feed-Forward, Convolutional, Recurrent.
wv_scores = [
    (model_name, accuracy)
    for model_name, model_scores in (
        ("Feed-Forward", ff_wv_scores),
        ("Convolutional", conv_wv_scores),
        ("Recurrent", rnn_wv_scores),
    )
    for accuracy in model_scores['accuracies']
]

wv_ax = boxplot(x='model', y='accuracy', data=DataFrame(wv_scores, columns=["model", "accuracy"]))
# The bag-of-words comparison figure uses identical axes, so without a title
# the two figures cannot be told apart.
wv_ax.set_title('Word Vectors input: cross-validated accuracy by model')
plt.show()

# %% [markdown]
# From these results we can see that more work is needed to investigate why our embeddings do not perform better than bag of words. There are a number of possibilities, already suggested.