Add notebook

susanli2016 · Jul 22, 2020 · 0015983 · 0015983
1 parent b303790
commit 0015983
Showing 1 changed file with 378 additions and 0 deletions.
diff --git a/Fake_News_LogReg.ipynb b/Fake_News_LogReg.ipynb
@@ -0,0 +1,378 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 99,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "import pickle\n",
+    "from sklearn.linear_model import LogisticRegressionCV\n",
+    "\n",
+    "df = pd.read_csv('data/corona_fake.csv')\n",
+    "\n",
+    "df.loc[df['label'] == 'Fake', ['label']] = 'FAKE'\n",
+    "df.loc[df['label'] == 'fake', ['label']] = 'FAKE'\n",
+    "df.loc[df['source'] == 'facebook', ['source']] = 'Facebook'\n",
+    "df.text.fillna(df.title, inplace=True)\n",
+    "\n",
+    "df.loc[5]['label'] = 'FAKE'\n",
+    "df.loc[15]['label'] = 'TRUE'\n",
+    "df.loc[43]['label'] = 'FAKE'\n",
+    "df.loc[131]['label'] = 'TRUE'\n",
+    "df.loc[242]['label'] = 'FAKE'\n",
+    "\n",
+    "df = df.sample(frac=1).reset_index(drop=True)\n",
+    "df.title.fillna('missing', inplace=True)\n",
+    "df.source.fillna('missing', inplace=True)\n",
+    "\n",
+    "df['title_text'] = df['title'] + ' ' + df['text']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "TRUE    586\n",
+       "FAKE    578\n",
+       "Name: label, dtype: int64"
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['label'].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 101,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>title</th>\n",
+       "      <th>text</th>\n",
+       "      <th>source</th>\n",
+       "      <th>label</th>\n",
+       "      <th>title_text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <td>0</td>\n",
+       "      <td>What precautions can I take when grocery shopp...</td>\n",
+       "      <td>The coronavirus that causes COVID-19 is primar...</td>\n",
+       "      <td>https://www.health.harvard.edu/</td>\n",
+       "      <td>TRUE</td>\n",
+       "      <td>What precautions can I take when grocery shopp...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>1</td>\n",
+       "      <td>BREAKING: New Evidence Based on Cell Phone Dat...</td>\n",
+       "      <td>Bartiromo broke news this morning that cell ph...</td>\n",
+       "      <td>https://www.thegatewaypundit.com/</td>\n",
+       "      <td>FAKE</td>\n",
+       "      <td>BREAKING: New Evidence Based on Cell Phone Dat...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>2</td>\n",
+       "      <td>COVID-19 and the CIA’s Biological Warfare on Cuba</td>\n",
+       "      <td>Maybe it was a plan that went horribly wrong, ...</td>\n",
+       "      <td>https://www.globalresearch.ca</td>\n",
+       "      <td>FAKE</td>\n",
+       "      <td>COVID-19 and the CIA’s Biological Warfare on C...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>3</td>\n",
+       "      <td>missing</td>\n",
+       "      <td>Donating blood requires that you be administer...</td>\n",
+       "      <td>missing</td>\n",
+       "      <td>FAKE</td>\n",
+       "      <td>missing Donating blood requires that you be ad...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <td>4</td>\n",
+       "      <td>Is it safe to donate blood during the outbreak...</td>\n",
+       "      <td>COVID-19 doesn’t pose any known risk to blood ...</td>\n",
+       "      <td>https://www.globalhealthnow.org/</td>\n",
+       "      <td>TRUE</td>\n",
+       "      <td>Is it safe to donate blood during the outbreak...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                               title  \\\n",
+       "0  What precautions can I take when grocery shopp...   \n",
+       "1  BREAKING: New Evidence Based on Cell Phone Dat...   \n",
+       "2  COVID-19 and the CIA’s Biological Warfare on Cuba   \n",
+       "3                                            missing   \n",
+       "4  Is it safe to donate blood during the outbreak...   \n",
+       "\n",
+       "                                                text  \\\n",
+       "0  The coronavirus that causes COVID-19 is primar...   \n",
+       "1  Bartiromo broke news this morning that cell ph...   \n",
+       "2  Maybe it was a plan that went horribly wrong, ...   \n",
+       "3  Donating blood requires that you be administer...   \n",
+       "4  COVID-19 doesn’t pose any known risk to blood ...   \n",
+       "\n",
+       "                              source label  \\\n",
+       "0    https://www.health.harvard.edu/  TRUE   \n",
+       "1  https://www.thegatewaypundit.com/  FAKE   \n",
+       "2      https://www.globalresearch.ca  FAKE   \n",
+       "3                            missing  FAKE   \n",
+       "4   https://www.globalhealthnow.org/  TRUE   \n",
+       "\n",
+       "                                          title_text  \n",
+       "0  What precautions can I take when grocery shopp...  \n",
+       "1  BREAKING: New Evidence Based on Cell Phone Dat...  \n",
+       "2  COVID-19 and the CIA’s Biological Warfare on C...  \n",
+       "3  missing Donating blood requires that you be ad...  \n",
+       "4  Is it safe to donate blood during the outbreak...  "
+      ]
+     },
+     "execution_count": 101,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 152,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'CORONAVIRUS: A WUHAN LABORATORY SPONSORED BY SOROS, VIRUS AFFECTS ONLY MONGOLOID RACE There is a biolaboratory in Wuhan – until recently, nothing was known about it. Its address is Gaoxin, three sixes – the number mentioned in the Bible, under which the name of the beast of the Apocalypse is hidden. But it’s even more symbolic that it exists thanks to the money of the famous banker George Soros, who shares the globalist ideas of Bill Gates. This could be part of a cunning plan.The coronavirus affects only the representatives of the Mongoloid race, which is very suspicious and raises questions.'"
+      ]
+     },
+     "execution_count": 152,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['title_text'][50]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 153,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def preprocessor(text):\n",
+    "    \n",
+    "    text = re.sub('<[^>]*>', '', text)\n",
+    "    text = re.sub(r'[^\\w\\s]','', text)\n",
+    "    text = text.lower()\n",
+    "\n",
+    "    return text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 154,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['title_text'] = df['title_text'].apply(preprocessor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 156,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'coronavirus a wuhan laboratory sponsored by soros virus affects only mongoloid race there is a biolaboratory in wuhan  until recently nothing was known about it its address is gaoxin three sixes  the number mentioned in the bible under which the name of the beast of the apocalypse is hidden but its even more symbolic that it exists thanks to the money of the famous banker george soros who shares the globalist ideas of bill gates this could be part of a cunning planthe coronavirus affects only the representatives of the mongoloid race which is very suspicious and raises questions'"
+      ]
+     },
+     "execution_count": 156,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['title_text'][50]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 157,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "porter = PorterStemmer()\n",
+    "\n",
+    "def tokenizer_porter(text):\n",
+    "    return [porter.stem(word) for word in text.split()]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 158,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tfidf = TfidfVectorizer(strip_accents=None,\n",
+    "                        lowercase=False,\n",
+    "                        preprocessor=None,\n",
+    "                        tokenizer=tokenizer_porter,\n",
+    "                        use_idf=True,\n",
+    "                        norm='l2',\n",
+    "                        smooth_idf=True)\n",
+    "X = tfidf.fit_transform(df['title_text'])\n",
+    "y = df.label.values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 159,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
+      "[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   14.8s remaining:   22.2s\n",
+      "[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   18.1s finished\n"
+     ]
+    }
+   ],
+   "source": [
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)\n",
+    "\n",
+    "clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)\n",
+    "\n",
+    "fake_news_model = open('fake_news_model.sav', 'wb')\n",
+    "pickle.dump(clf, fake_news_model)\n",
+    "fake_news_model.close()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 160,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.9347079037800687"
+      ]
+     },
+     "execution_count": 160,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "filename = 'fake_news_model.sav'\n",
+    "saved_clf = pickle.load(open(filename, 'rb'))\n",
+    "\n",
+    "saved_clf.score(X_test, y_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 161,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---Test Set Results---\n",
+      "Accuracy with logreg: 0.9347079037800687\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "        FAKE       0.93      0.94      0.93       281\n",
+      "        TRUE       0.94      0.93      0.94       301\n",
+      "\n",
+      "    accuracy                           0.93       582\n",
+      "   macro avg       0.93      0.93      0.93       582\n",
+      "weighted avg       0.93      0.93      0.93       582\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "from sklearn.metrics import classification_report, accuracy_score\n",
+    "y_pred = clf.predict(X_test)\n",
+    "print(\"---Test Set Results---\")\n",
+    "print(\"Accuracy with logreg: {}\".format(accuracy_score(y_test, y_pred)))\n",
+    "print(classification_report(y_test, y_pred))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}