Skip to content

Commit

Permalink
Add notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
susanli2016 authored Jul 22, 2020
1 parent b303790 commit 0015983
Showing 1 changed file with 378 additions and 0 deletions.
378 changes: 378 additions & 0 deletions Fake_News_LogReg.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,378 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
"import pickle\n",
"import re\n",
"\n",
"import pandas as pd\n",
"from nltk.stem.porter import PorterStemmer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.linear_model import LogisticRegressionCV\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"# Load the labelled COVID-19 articles; the columns used below are title, text, source and label.\n",
"df = pd.read_csv('data/corona_fake.csv')\n",
"\n",
"# Collapse the inconsistent spellings of label/source values into one canonical form.\n",
"df.loc[df['label'].isin(['Fake', 'fake']), 'label'] = 'FAKE'\n",
"df.loc[df['source'] == 'facebook', 'source'] = 'Facebook'\n",
"\n",
"# Rows without body text fall back to their title. The result is assigned back instead of\n",
"# calling fillna(inplace=True) on an attribute-accessed column, which is not guaranteed to\n",
"# write through to the frame (and never does under pandas copy-on-write).\n",
"df['text'] = df['text'].fillna(df['title'])\n",
"\n",
"# Manual label corrections for specific rows. A single .loc[rows, column] assignment is used\n",
"# because the chained form df.loc[i]['label'] = ... may write to a temporary copy and be lost.\n",
"df.loc[[5, 43, 242], 'label'] = 'FAKE'\n",
"df.loc[[15, 131], 'label'] = 'TRUE'\n",
"\n",
"# Shuffle the rows; seeded so that the later unshuffled train/test split is reproducible.\n",
"df = df.sample(frac=1, random_state=0).reset_index(drop=True)\n",
"df['title'] = df['title'].fillna('missing')\n",
"df['source'] = df['source'].fillna('missing')\n",
"\n",
"# Single text feature: title and body joined by one space.\n",
"df['title_text'] = df['title'] + ' ' + df['text']"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"TRUE 586\n",
"FAKE 578\n",
"Name: label, dtype: int64"
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Class balance: number of rows per label value.\n",
"df.label.value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>title</th>\n",
" <th>text</th>\n",
" <th>source</th>\n",
" <th>label</th>\n",
" <th>title_text</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>What precautions can I take when grocery shopp...</td>\n",
" <td>The coronavirus that causes COVID-19 is primar...</td>\n",
" <td>https://www.health.harvard.edu/</td>\n",
" <td>TRUE</td>\n",
" <td>What precautions can I take when grocery shopp...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>BREAKING: New Evidence Based on Cell Phone Dat...</td>\n",
" <td>Bartiromo broke news this morning that cell ph...</td>\n",
" <td>https://www.thegatewaypundit.com/</td>\n",
" <td>FAKE</td>\n",
" <td>BREAKING: New Evidence Based on Cell Phone Dat...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>COVID-19 and the CIA’s Biological Warfare on Cuba</td>\n",
" <td>Maybe it was a plan that went horribly wrong, ...</td>\n",
" <td>https://www.globalresearch.ca</td>\n",
" <td>FAKE</td>\n",
" <td>COVID-19 and the CIA’s Biological Warfare on C...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>missing</td>\n",
" <td>Donating blood requires that you be administer...</td>\n",
" <td>missing</td>\n",
" <td>FAKE</td>\n",
" <td>missing Donating blood requires that you be ad...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>Is it safe to donate blood during the outbreak...</td>\n",
" <td>COVID-19 doesn’t pose any known risk to blood ...</td>\n",
" <td>https://www.globalhealthnow.org/</td>\n",
" <td>TRUE</td>\n",
" <td>Is it safe to donate blood during the outbreak...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" title \\\n",
"0 What precautions can I take when grocery shopp... \n",
"1 BREAKING: New Evidence Based on Cell Phone Dat... \n",
"2 COVID-19 and the CIA’s Biological Warfare on Cuba \n",
"3 missing \n",
"4 Is it safe to donate blood during the outbreak... \n",
"\n",
" text \\\n",
"0 The coronavirus that causes COVID-19 is primar... \n",
"1 Bartiromo broke news this morning that cell ph... \n",
"2 Maybe it was a plan that went horribly wrong, ... \n",
"3 Donating blood requires that you be administer... \n",
"4 COVID-19 doesn’t pose any known risk to blood ... \n",
"\n",
" source label \\\n",
"0 https://www.health.harvard.edu/ TRUE \n",
"1 https://www.thegatewaypundit.com/ FAKE \n",
"2 https://www.globalresearch.ca FAKE \n",
"3 missing FAKE \n",
"4 https://www.globalhealthnow.org/ TRUE \n",
"\n",
" title_text \n",
"0 What precautions can I take when grocery shopp... \n",
"1 BREAKING: New Evidence Based on Cell Phone Dat... \n",
"2 COVID-19 and the CIA’s Biological Warfare on C... \n",
"3 missing Donating blood requires that you be ad... \n",
"4 Is it safe to donate blood during the outbreak... "
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows of the cleaned frame.\n",
"df.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 152,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"'CORONAVIRUS: A WUHAN LABORATORY SPONSORED BY SOROS, VIRUS AFFECTS ONLY MONGOLOID RACE There is a biolaboratory in Wuhan – until recently, nothing was known about it. Its address is Gaoxin, three sixes – the number mentioned in the Bible, under which the name of the beast of the Apocalypse is hidden. But it’s even more symbolic that it exists thanks to the money of the famous banker George Soros, who shares the globalist ideas of Bill Gates. This could be part of a cunning plan.The coronavirus affects only the representatives of the Mongoloid race, which is very suspicious and raises questions.'"
]
},
"execution_count": 152,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the combined title + text of the row labelled 50.\n",
"df.loc[50, 'title_text']"
]
},
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"def preprocessor(text):\n",
"    \"\"\"Return ``text`` with tags and punctuation removed, in lowercase.\n",
"\n",
"    Anything of the form ``<...>`` is dropped first, then every character that is\n",
"    neither a word character nor whitespace is deleted (nothing is inserted in its\n",
"    place), and finally the result is lowercased.\n",
"    \"\"\"\n",
"    without_tags = re.sub('<[^>]*>', '', text)\n",
"    without_punctuation = re.sub(r'[^\\w\\s]', '', without_tags)\n",
"    return without_punctuation.lower()"
]
},
{
"cell_type": "code",
"execution_count": 154,
"metadata": {},
"outputs": [],
"source": [
"# Clean every combined title/text string with preprocessor, element by element.\n",
"df['title_text'] = df['title_text'].map(preprocessor)"
]
},
{
"cell_type": "code",
"execution_count": 156,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'coronavirus a wuhan laboratory sponsored by soros virus affects only mongoloid race there is a biolaboratory in wuhan until recently nothing was known about it its address is gaoxin three sixes the number mentioned in the bible under which the name of the beast of the apocalypse is hidden but its even more symbolic that it exists thanks to the money of the famous banker george soros who shares the globalist ideas of bill gates this could be part of a cunning planthe coronavirus affects only the representatives of the mongoloid race which is very suspicious and raises questions'"
]
},
"execution_count": 156,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the row labelled 50 again to inspect the cleaned title_text value.\n",
"df.at[50, 'title_text']"
]
},
{
"cell_type": "code",
"execution_count": 157,
"metadata": {},
"outputs": [],
"source": [
"porter = PorterStemmer()\n",
"\n",
"\n",
"def tokenizer_porter(text):\n",
"    \"\"\"Split ``text`` on whitespace and return the Porter stem of every token.\"\"\"\n",
"    stems = []\n",
"    for word in text.split():\n",
"        stems.append(porter.stem(word))\n",
"    return stems"
]
},
{
"cell_type": "code",
"execution_count": 158,
"metadata": {},
"outputs": [],
"source": [
"# Vectorizer settings: no accent stripping, lowercasing or built-in preprocessing;\n",
"# tokens come from tokenizer_porter and are weighted with smoothed, L2-normalised TF-IDF.\n",
"tfidf_params = dict(\n",
"    strip_accents=None,\n",
"    lowercase=False,\n",
"    preprocessor=None,\n",
"    tokenizer=tokenizer_porter,\n",
"    use_idf=True,\n",
"    norm='l2',\n",
"    smooth_idf=True,\n",
")\n",
"tfidf = TfidfVectorizer(**tfidf_params)\n",
"\n",
"# NOTE(review): the vectorizer is fitted on every row before the train/test split, so the\n",
"# vocabulary and IDF weights also reflect the held-out documents - confirm this is intended.\n",
"X = tfidf.fit_transform(df['title_text'])\n",
"y = df['label'].values"
]
},
{
"cell_type": "code",
"execution_count": 159,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n",
"[Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 14.8s remaining: 22.2s\n",
"[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 18.1s finished\n"
]
}
],
"source": [
"# Hold out the second half of the rows for testing. shuffle=False keeps the existing row\n",
"# order (the frame was shuffled when it was loaded), so random_state has no effect here.\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)\n",
"\n",
"# Logistic regression whose regularisation strength is picked by 5-fold cross-validated accuracy.\n",
"clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)\n",
"\n",
"# Persist the fitted model; the context manager closes the file even if pickling raises.\n",
"with open('fake_news_model.sav', 'wb') as fake_news_model:\n",
"    pickle.dump(clf, fake_news_model)"
]
},
{
"cell_type": "code",
"execution_count": 160,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.9347079037800687"
]
},
"execution_count": 160,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filename = 'fake_news_model.sav'\n",
"\n",
"# Reload the pickled classifier. pickle.load can execute arbitrary code, so only use it on\n",
"# files produced by this notebook or another trusted source. The context manager makes\n",
"# sure the file handle is closed, which the bare open() call did not.\n",
"with open(filename, 'rb') as model_file:\n",
"    saved_clf = pickle.load(model_file)\n",
"\n",
"# Score the reloaded model on the held-out split.\n",
"saved_clf.score(X_test, y_test)"
]
},
{
"cell_type": "code",
"execution_count": 161,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"---Test Set Results---\n",
"Accuracy with logreg: 0.9347079037800687\n",
" precision recall f1-score support\n",
"\n",
" FAKE 0.93 0.94 0.93 281\n",
" TRUE 0.94 0.93 0.94 301\n",
"\n",
" accuracy 0.93 582\n",
" macro avg 0.93 0.93 0.93 582\n",
"weighted avg 0.93 0.93 0.93 582\n",
"\n"
]
}
],
"source": [
"from sklearn.metrics import accuracy_score, classification_report\n",
"\n",
"# Predict on the held-out split, then compute both summaries before printing them.\n",
"y_pred = clf.predict(X_test)\n",
"test_accuracy = accuracy_score(y_test, y_pred)\n",
"per_class_report = classification_report(y_test, y_pred)\n",
"\n",
"print(\"---Test Set Results---\")\n",
"print(\"Accuracy with logreg: {}\".format(test_accuracy))\n",
"print(per_class_report)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit 0015983

Please sign in to comment.