-
Notifications
You must be signed in to change notification settings - Fork 2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
b303790
commit 0015983
Showing
1 changed file
with
378 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,378 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 99, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import re\n", | ||
"from nltk.stem.porter import PorterStemmer\n", | ||
"from sklearn.feature_extraction.text import TfidfVectorizer\n", | ||
"from sklearn.model_selection import train_test_split\n", | ||
"import pickle\n", | ||
"from sklearn.linear_model import LogisticRegressionCV\n", | ||
"\n", | ||
"df = pd.read_csv('data/corona_fake.csv')\n", | ||
"\n", | ||
"df.loc[df['label'] == 'Fake', ['label']] = 'FAKE'\n", | ||
"df.loc[df['label'] == 'fake', ['label']] = 'FAKE'\n", | ||
"df.loc[df['source'] == 'facebook', ['source']] = 'Facebook'\n", | ||
"df.text.fillna(df.title, inplace=True)\n", | ||
"\n", | ||
"df.loc[5]['label'] = 'FAKE'\n", | ||
"df.loc[15]['label'] = 'TRUE'\n", | ||
"df.loc[43]['label'] = 'FAKE'\n", | ||
"df.loc[131]['label'] = 'TRUE'\n", | ||
"df.loc[242]['label'] = 'FAKE'\n", | ||
"\n", | ||
"df = df.sample(frac=1).reset_index(drop=True)\n", | ||
"df.title.fillna('missing', inplace=True)\n", | ||
"df.source.fillna('missing', inplace=True)\n", | ||
"\n", | ||
"df['title_text'] = df['title'] + ' ' + df['text']" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 100, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"TRUE 586\n", | ||
"FAKE 578\n", | ||
"Name: label, dtype: int64" | ||
] | ||
}, | ||
"execution_count": 100, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df['label'].value_counts()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 101, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/html": [ | ||
"<div>\n", | ||
"<style scoped>\n", | ||
" .dataframe tbody tr th:only-of-type {\n", | ||
" vertical-align: middle;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe tbody tr th {\n", | ||
" vertical-align: top;\n", | ||
" }\n", | ||
"\n", | ||
" .dataframe thead th {\n", | ||
" text-align: right;\n", | ||
" }\n", | ||
"</style>\n", | ||
"<table border=\"1\" class=\"dataframe\">\n", | ||
" <thead>\n", | ||
" <tr style=\"text-align: right;\">\n", | ||
" <th></th>\n", | ||
" <th>title</th>\n", | ||
" <th>text</th>\n", | ||
" <th>source</th>\n", | ||
" <th>label</th>\n", | ||
" <th>title_text</th>\n", | ||
" </tr>\n", | ||
" </thead>\n", | ||
" <tbody>\n", | ||
" <tr>\n", | ||
" <td>0</td>\n", | ||
" <td>What precautions can I take when grocery shopp...</td>\n", | ||
" <td>The coronavirus that causes COVID-19 is primar...</td>\n", | ||
" <td>https://www.health.harvard.edu/</td>\n", | ||
" <td>TRUE</td>\n", | ||
" <td>What precautions can I take when grocery shopp...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>1</td>\n", | ||
" <td>BREAKING: New Evidence Based on Cell Phone Dat...</td>\n", | ||
" <td>Bartiromo broke news this morning that cell ph...</td>\n", | ||
" <td>https://www.thegatewaypundit.com/</td>\n", | ||
" <td>FAKE</td>\n", | ||
" <td>BREAKING: New Evidence Based on Cell Phone Dat...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>2</td>\n", | ||
" <td>COVID-19 and the CIA’s Biological Warfare on Cuba</td>\n", | ||
" <td>Maybe it was a plan that went horribly wrong, ...</td>\n", | ||
" <td>https://www.globalresearch.ca</td>\n", | ||
" <td>FAKE</td>\n", | ||
" <td>COVID-19 and the CIA’s Biological Warfare on C...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>3</td>\n", | ||
" <td>missing</td>\n", | ||
" <td>Donating blood requires that you be administer...</td>\n", | ||
" <td>missing</td>\n", | ||
" <td>FAKE</td>\n", | ||
" <td>missing Donating blood requires that you be ad...</td>\n", | ||
" </tr>\n", | ||
" <tr>\n", | ||
" <td>4</td>\n", | ||
" <td>Is it safe to donate blood during the outbreak...</td>\n", | ||
" <td>COVID-19 doesn’t pose any known risk to blood ...</td>\n", | ||
" <td>https://www.globalhealthnow.org/</td>\n", | ||
" <td>TRUE</td>\n", | ||
" <td>Is it safe to donate blood during the outbreak...</td>\n", | ||
" </tr>\n", | ||
" </tbody>\n", | ||
"</table>\n", | ||
"</div>" | ||
], | ||
"text/plain": [ | ||
" title \\\n", | ||
"0 What precautions can I take when grocery shopp... \n", | ||
"1 BREAKING: New Evidence Based on Cell Phone Dat... \n", | ||
"2 COVID-19 and the CIA’s Biological Warfare on Cuba \n", | ||
"3 missing \n", | ||
"4 Is it safe to donate blood during the outbreak... \n", | ||
"\n", | ||
" text \\\n", | ||
"0 The coronavirus that causes COVID-19 is primar... \n", | ||
"1 Bartiromo broke news this morning that cell ph... \n", | ||
"2 Maybe it was a plan that went horribly wrong, ... \n", | ||
"3 Donating blood requires that you be administer... \n", | ||
"4 COVID-19 doesn’t pose any known risk to blood ... \n", | ||
"\n", | ||
" source label \\\n", | ||
"0 https://www.health.harvard.edu/ TRUE \n", | ||
"1 https://www.thegatewaypundit.com/ FAKE \n", | ||
"2 https://www.globalresearch.ca FAKE \n", | ||
"3 missing FAKE \n", | ||
"4 https://www.globalhealthnow.org/ TRUE \n", | ||
"\n", | ||
" title_text \n", | ||
"0 What precautions can I take when grocery shopp... \n", | ||
"1 BREAKING: New Evidence Based on Cell Phone Dat... \n", | ||
"2 COVID-19 and the CIA’s Biological Warfare on C... \n", | ||
"3 missing Donating blood requires that you be ad... \n", | ||
"4 Is it safe to donate blood during the outbreak... " | ||
] | ||
}, | ||
"execution_count": 101, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df.head()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 152, | ||
"metadata": { | ||
"scrolled": false | ||
}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'CORONAVIRUS: A WUHAN LABORATORY SPONSORED BY SOROS, VIRUS AFFECTS ONLY MONGOLOID RACE There is a biolaboratory in Wuhan – until recently, nothing was known about it. Its address is Gaoxin, three sixes – the number mentioned in the Bible, under which the name of the beast of the Apocalypse is hidden. But it’s even more symbolic that it exists thanks to the money of the famous banker George Soros, who shares the globalist ideas of Bill Gates. This could be part of a cunning plan.The coronavirus affects only the representatives of the Mongoloid race, which is very suspicious and raises questions.'" | ||
] | ||
}, | ||
"execution_count": 152, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df['title_text'][50]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 153, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def preprocessor(text):\n", | ||
" \n", | ||
" text = re.sub('<[^>]*>', '', text)\n", | ||
" text = re.sub(r'[^\\w\\s]','', text)\n", | ||
" text = text.lower()\n", | ||
"\n", | ||
" return text" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 154, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"df['title_text'] = df['title_text'].apply(preprocessor)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 156, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"'coronavirus a wuhan laboratory sponsored by soros virus affects only mongoloid race there is a biolaboratory in wuhan until recently nothing was known about it its address is gaoxin three sixes the number mentioned in the bible under which the name of the beast of the apocalypse is hidden but its even more symbolic that it exists thanks to the money of the famous banker george soros who shares the globalist ideas of bill gates this could be part of a cunning planthe coronavirus affects only the representatives of the mongoloid race which is very suspicious and raises questions'" | ||
] | ||
}, | ||
"execution_count": 156, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"df['title_text'][50]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 157, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"porter = PorterStemmer()\n", | ||
"\n", | ||
"def tokenizer_porter(text):\n", | ||
" return [porter.stem(word) for word in text.split()]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 158, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"tfidf = TfidfVectorizer(strip_accents=None,\n", | ||
" lowercase=False,\n", | ||
" preprocessor=None,\n", | ||
" tokenizer=tokenizer_porter,\n", | ||
" use_idf=True,\n", | ||
" norm='l2',\n", | ||
" smooth_idf=True)\n", | ||
"X = tfidf.fit_transform(df['title_text'])\n", | ||
"y = df.label.values" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 159, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stderr", | ||
"output_type": "stream", | ||
"text": [ | ||
"[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.\n", | ||
"[Parallel(n_jobs=-1)]: Done 2 out of 5 | elapsed: 14.8s remaining: 22.2s\n", | ||
"[Parallel(n_jobs=-1)]: Done 5 out of 5 | elapsed: 18.1s finished\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.5, shuffle=False)\n", | ||
"\n", | ||
"clf = LogisticRegressionCV(cv=5, scoring='accuracy', random_state=0, n_jobs=-1, verbose=3, max_iter=300).fit(X_train, y_train)\n", | ||
"\n", | ||
"fake_news_model = open('fake_news_model.sav', 'wb')\n", | ||
"pickle.dump(clf, fake_news_model)\n", | ||
"fake_news_model.close()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 160, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"0.9347079037800687" | ||
] | ||
}, | ||
"execution_count": 160, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"filename = 'fake_news_model.sav'\n", | ||
"saved_clf = pickle.load(open(filename, 'rb'))\n", | ||
"\n", | ||
"saved_clf.score(X_test, y_test)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 161, | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"---Test Set Results---\n", | ||
"Accuracy with logreg: 0.9347079037800687\n", | ||
" precision recall f1-score support\n", | ||
"\n", | ||
" FAKE 0.93 0.94 0.93 281\n", | ||
" TRUE 0.94 0.93 0.94 301\n", | ||
"\n", | ||
" accuracy 0.93 582\n", | ||
" macro avg 0.93 0.93 0.93 582\n", | ||
"weighted avg 0.93 0.93 0.93 582\n", | ||
"\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"from sklearn.metrics import classification_report, accuracy_score\n", | ||
"y_pred = clf.predict(X_test)\n", | ||
"print(\"---Test Set Results---\")\n", | ||
"print(\"Accuracy with logreg: {}\".format(accuracy_score(y_test, y_pred)))\n", | ||
"print(classification_report(y_test, y_pred))" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.6.7" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |