diff --git a/Toxic Comments LSTM GloVe.ipynb b/Toxic Comments LSTM GloVe.ipynb
new file mode 100644
index 0000000..17f13bc
--- /dev/null
+++ b/Toxic Comments LSTM GloVe.ipynb
@@ -0,0 +1,693 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Using TensorFlow backend.\n"
+ ]
+ }
+ ],
+ "source": [
+ "import re\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import tensorflow as tf\n",
+ "from tqdm import tqdm_notebook\n",
+ "\n",
+ "from nltk.corpus import stopwords\n",
+ "\n",
+ "from tensorflow.keras import regularizers, initializers, optimizers, callbacks\n",
+ "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
+ "from tensorflow.keras.preprocessing.text import Tokenizer\n",
+ "from keras.utils.np_utils import to_categorical\n",
+ "from tensorflow.keras.layers import *\n",
+ "from tensorflow.keras.models import Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "MAX_NB_WORDS = 100000 # max no. of words for tokenizer\n",
+ "MAX_SEQUENCE_LENGTH = 200 # max length of each entry (sentence), including padding\n",
+ "VALIDATION_SPLIT = 0.2 # data for validation (not used in training)\n",
+ "EMBEDDING_DIM = 100 # embedding dimensions for word vectors (word2vec/GloVe)\n",
+ "GLOVE_DIR = \"glove/glove.6B.\"+str(EMBEDDING_DIM)+\"d.txt\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "train = pd.read_csv('data/toxic_train.csv')\n",
+ "test = pd.read_csv('data/toxic_test.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " comment_text | \n",
+ " toxic | \n",
+ " severe_toxic | \n",
+ " obscene | \n",
+ " threat | \n",
+ " insult | \n",
+ " identity_hate | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0000997932d777bf | \n",
+ " Explanation\\nWhy the edits made under my usern... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 000103f0d9cfb60f | \n",
+ " D'aww! He matches this background colour I'm s... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 000113f07ec002fd | \n",
+ " Hey man, I'm really not trying to edit war. It... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 0001b41b1c6bb37e | \n",
+ " \"\\nMore\\nI can't make any real suggestions on ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 0001d958c54c6e35 | \n",
+ " You, sir, are my hero. Any chance you remember... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id comment_text toxic \\\n",
+ "0 0000997932d777bf Explanation\\nWhy the edits made under my usern... 0 \n",
+ "1 000103f0d9cfb60f D'aww! He matches this background colour I'm s... 0 \n",
+ "2 000113f07ec002fd Hey man, I'm really not trying to edit war. It... 0 \n",
+ "3 0001b41b1c6bb37e \"\\nMore\\nI can't make any real suggestions on ... 0 \n",
+ "4 0001d958c54c6e35 You, sir, are my hero. Any chance you remember... 0 \n",
+ "\n",
+ " severe_toxic obscene threat insult identity_hate \n",
+ "0 0 0 0 0 0 \n",
+ "1 0 0 0 0 0 \n",
+ "2 0 0 0 0 0 \n",
+ "3 0 0 0 0 0 \n",
+ "4 0 0 0 0 0 "
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "comment_text 0\n",
+ "toxic 0\n",
+ "severe_toxic 0\n",
+ "obscene 0\n",
+ "threat 0\n",
+ "insult 0\n",
+ "identity_hate 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "comment_text 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']\n",
+ "y = train[labels].values\n",
+ "comments_train = train['comment_text']\n",
+ "comments_test = test['comment_text']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "comments_train = list(comments_train)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def clean_text(text, remove_stopwords = True):\n",
+ "    \"\"\"Normalise one comment for tokenisation.\n",
+ "\n",
+ "    Lowercases `text`, removes every character that is neither a word character\n",
+ "    nor whitespace, and collapses whitespace runs to single spaces. When\n",
+ "    `remove_stopwords` is true, NLTK's English stopwords are dropped as well.\n",
+ "    Returns the cleaned string.\n",
+ "    \"\"\"\n",
+ "    # Newlines become spaces so words on adjacent lines are not fused together.\n",
+ "    words = re.sub(r'[^\\w\\s]', '', str(text).replace(\"\\n\", \" \")).lower().split()\n",
+ "    if remove_stopwords:\n",
+ "        # Build the stopword set once and reuse it, instead of calling\n",
+ "        # stopwords.words() again for every single token.\n",
+ "        if not hasattr(clean_text, \"_stop_words\"):\n",
+ "            clean_text._stop_words = frozenset(stopwords.words(\"english\"))\n",
+ "        words = [word for word in words if word not in clean_text._stop_words]\n",
+ "    # Return the whole cleaned string: the previous [1:-3] slice silently dropped\n",
+ "    # the first character and the last three characters of every comment.\n",
+ "    return \" \".join(words)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.jupyter.widget-view+json": {
+ "model_id": "fb26a74c31cf451dac8bbe6515492e2d",
+ "version_major": 2,
+ "version_minor": 0
+ },
+ "text/plain": [
+ "HBox(children=(IntProgress(value=0, max=159571), HTML(value='')))"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Clean every training comment. The progress-bar total is taken from the data\n",
+ "# itself rather than a hard-coded row count, so subsets or other datasets\n",
+ "# report progress correctly.\n",
+ "progress = tqdm_notebook(comments_train, total=len(comments_train))\n",
+ "texts = [clean_text(line) for line in progress]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Sample data: aww matches background colour im seemingly stuck thanks talk 2151 january 11 2016 [0 0 0 0 0 0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Sample data:', texts[1], y[1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tokenizer = Tokenizer(num_words=MAX_NB_WORDS)\n",
+ "tokenizer.fit_on_texts(texts)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Vocabulary size: 324669\n"
+ ]
+ }
+ ],
+ "source": [
+ "sequences = tokenizer.texts_to_sequences(texts)\n",
+ "word_index = tokenizer.word_index\n",
+ "print('Vocabulary size:', len(word_index))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Shape of data tensor: (159571, 200)\n",
+ "Shape of label tensor: (159571, 6)\n"
+ ]
+ }
+ ],
+ "source": [
+ "data = pad_sequences(sequences, padding = 'post', maxlen = MAX_SEQUENCE_LENGTH)\n",
+ "\n",
+ "print('Shape of data tensor:', data.shape)\n",
+ "print('Shape of label tensor:', y.shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "indices = np.arange(data.shape[0])\n",
+ "np.random.shuffle(indices)\n",
+ "data = data[indices]\n",
+ "labels = y[indices]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Hold out the last VALIDATION_SPLIT fraction of the shuffled rows for validation.\n",
+ "num_validation_samples = int(VALIDATION_SPLIT*data.shape[0])\n",
+ "# Slice at a non-negative index: with zero validation samples, data[:-0] would\n",
+ "# be empty and data[-0:] would be the whole array.\n",
+ "split_at = data.shape[0] - num_validation_samples\n",
+ "x_train, y_train = data[:split_at], labels[:split_at]\n",
+ "x_val, y_val = data[split_at:], labels[split_at:]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of entries in each category:\n",
+ "training: [12259 1274 6777 389 6288 1136]\n",
+ "validation: [3035 321 1672 89 1589 269]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Number of entries in each category:')\n",
+ "print('training: ', y_train.sum(axis=0))\n",
+ "print('validation: ', y_val.sum(axis=0))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Tokenized sentences: \n",
+ " [ 870 136 3 136 3 119 613 242 27 442 34 1281\n",
+ " 19 406 59 10384 164 103 121 98 9 2 16 358\n",
+ " 42 98 118 32 1006 42 2 2081 42 5177 736 2266\n",
+ " 1873 242 1040 31 193 810 1469 1763 2540 35 42 164\n",
+ " 1068 98 164 4782 1161 166 300 5765 3933 300 485 743\n",
+ " 3053 300 5821 4058 7204 375 300 239 302 42 553 3\n",
+ " 1068 42 395 2409 300 5568 1937 1204 2603 8 348 732\n",
+ " 9 33 343 644 605 1184 1876 527 317 4707 2174 1729\n",
+ " 2073 300 171 45 3314 11 4 6 679 193 32 127\n",
+ " 2357 486 24 1380 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0 0 0 0 0\n",
+ " 0 0 0 0 0 0 0 0]\n",
+ "One hot label: \n",
+ " [0 0 0 0 0 0]\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Tokenized sentences: \\n', data[10])\n",
+ "print('One hot label: \\n', labels[10])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Loading GloVe from: glove/glove.6B.100d.txt ...Done.\n",
+ " Proceeding with Embedding Matrix... Completed!\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Parse the GloVe text file into {word: vector}, then build the embedding matrix\n",
+ "# whose row i holds the vector for the word with tokenizer index i.\n",
+ "embeddings_index = {}\n",
+ "print('Loading GloVe from:', GLOVE_DIR,'...', end='')\n",
+ "# Open with an explicit UTF-8 encoding instead of the platform default, and use a\n",
+ "# context manager so the file is closed even if a line fails to parse.\n",
+ "with open(GLOVE_DIR, encoding='utf-8') as f:\n",
+ "    for line in f:\n",
+ "        values = line.split()\n",
+ "        word = values[0]\n",
+ "        embeddings_index[word] = np.asarray(values[1:], dtype='float32')\n",
+ "print(\"Done.\\n Proceeding with Embedding Matrix...\", end=\"\")\n",
+ "\n",
+ "# Rows start as uniform random values; words found in GloVe are overwritten with\n",
+ "# their pre-trained vector, all other rows keep the random initialisation.\n",
+ "embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))\n",
+ "for word, i in word_index.items():\n",
+ "    embedding_vector = embeddings_index.get(word)\n",
+ "    if embedding_vector is not None:\n",
+ "        embedding_matrix[i] = embedding_vector\n",
+ "print(\" Completed!\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')\n",
+ "embedding_layer = Embedding(len(word_index) + 1,\n",
+ " EMBEDDING_DIM,\n",
+ " weights = [embedding_matrix],\n",
+ " input_length = MAX_SEQUENCE_LENGTH,\n",
+ " trainable=False,\n",
+ " name = 'embeddings')\n",
+ "embedded_sequences = embedding_layer(sequence_input)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "x = LSTM(60, return_sequences=True,name='lstm_layer')(embedded_sequences)\n",
+ "x = GlobalMaxPool1D()(x)\n",
+ "x = Dropout(0.1)(x)\n",
+ "x = Dense(50, activation=\"relu\")(x)\n",
+ "x = Dropout(0.1)(x)\n",
+ "preds = Dense(6, activation=\"sigmoid\")(x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model: \"model\"\n",
+ "_________________________________________________________________\n",
+ "Layer (type) Output Shape Param # \n",
+ "=================================================================\n",
+ "input_1 (InputLayer) [(None, 200)] 0 \n",
+ "_________________________________________________________________\n",
+ "embeddings (Embedding) (None, 200, 100) 32467000 \n",
+ "_________________________________________________________________\n",
+ "lstm_layer (UnifiedLSTM) (None, 200, 60) 38640 \n",
+ "_________________________________________________________________\n",
+ "global_max_pooling1d (Global (None, 60) 0 \n",
+ "_________________________________________________________________\n",
+ "dropout (Dropout) (None, 60) 0 \n",
+ "_________________________________________________________________\n",
+ "dense (Dense) (None, 50) 3050 \n",
+ "_________________________________________________________________\n",
+ "dropout_1 (Dropout) (None, 50) 0 \n",
+ "_________________________________________________________________\n",
+ "dense_1 (Dense) (None, 6) 306 \n",
+ "=================================================================\n",
+ "Total params: 32,508,996\n",
+ "Trainable params: 41,996\n",
+ "Non-trainable params: 32,467,000\n",
+ "_________________________________________________________________\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = Model(sequence_input, preds)\n",
+ "model.compile(loss = 'binary_crossentropy',\n",
+ " optimizer='adam',\n",
+ " metrics = ['accuracy'])\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tf.keras.utils.plot_model(model)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Training progress:\n",
+ "Train on 127657 samples, validate on 31914 samples\n",
+ "Epoch 1/2\n",
+ "127657/127657 [==============================] - 537s 4ms/sample - loss: 0.1277 - accuracy: 0.9650 - val_loss: 0.1040 - val_accuracy: 0.9699\n",
+ "Epoch 2/2\n",
+ "127657/127657 [==============================] - 533s 4ms/sample - loss: 0.0967 - accuracy: 0.9720 - val_loss: 0.0890 - val_accuracy: 0.9734\n"
+ ]
+ }
+ ],
+ "source": [
+ "print('Training progress:')\n",
+ "history = model.fit(x_train, y_train, epochs = 2, batch_size=32, validation_data=(x_val, y_val))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "