Commit 8d63013 ("Add notebook")
susanli2016 authored Aug 3, 2017 · 1 parent 868da20
Showing 1 changed file with 158 additions and 0 deletions: Cleaning Text.ipynb

@@ -0,0 +1,158 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"raw_docs = [\"Here are some very simple basic sentences.\",\n",
"\"They won't be very interesting, I'm afraid.\",\n",
"\"The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.\"]"
]
},
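{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: the cells below assume the required NLTK data packages are already present. A minimal one-time setup sketch, assuming a default NLTK install:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-time downloads used below: 'punkt' for word_tokenize,\n",
"# 'stopwords' for the stopword list, 'wordnet' for the lemmatizer\n",
"import nltk\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')"
]
},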
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', \"n't\", 'be', 'very', 'interesting', ',', 'I', \"'m\", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*very', 'simple*', 'data', '.']]\n"
]
}
],
"source": [
"# Tokenizing text into bags of words\n",
"from nltk.tokenize import word_tokenize\n",
"tokenized_docs = [word_tokenize(doc) for doc in raw_docs]\n",
"print(tokenized_docs)"
]
},
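{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that `word_tokenize` splits contractions (\"won't\" becomes \"wo\" and \"n't\") and keeps punctuation as separate tokens. NLTK can also split raw text into sentences first; a minimal `sent_tokenize` sketch on a hypothetical two-sentence string:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sentence splitting uses the same 'punkt' models as word_tokenize\n",
"from nltk.tokenize import sent_tokenize\n",
"text = \"Here is one sentence. Here is another.\"  # hypothetical example\n",
"print(sent_tokenize(text))"
]
},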
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]\n"
]
}
],
"source": [
"# Removing punctuation\n",
"import re\n",
"import string\n",
"regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
"\n",
"tokenized_docs_no_punctuation = []\n",
"\n",
"for review in tokenized_docs:\n",
" new_review = []\n",
" for token in review:\n",
" new_token = regex.sub(u'', token)\n",
" if not new_token == u'':\n",
" new_review.append(new_token)\n",
" \n",
" tokenized_docs_no_punctuation.append(new_review)\n",
" \n",
"print(tokenized_docs_no_punctuation)"
]
},
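{
"cell_type": "markdown",
"metadata": {},
"source": [
"An alternative sketch: NLTK's `RegexpTokenizer` can tokenize and strip punctuation in a single pass. Results differ slightly from the two-step approach above, since it also splits on apostrophes (\"won't\" becomes \"won\", \"t\"):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only runs of word characters, discarding punctuation while tokenizing\n",
"from nltk.tokenize import RegexpTokenizer\n",
"tokenizer = RegexpTokenizer(r'\\w+')\n",
"print([tokenizer.tokenize(doc) for doc in raw_docs])"
]
},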
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['Here', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'examples', 'learn', 'basic', 'text', 'cleaning', 'works', 'simple', 'data']]\n"
]
}
],
"source": [
"# Cleaning text of stopwords\n",
"from nltk.corpus import stopwords\n",
"\n",
"tokenized_docs_no_stopwords = []\n",
"\n",
"for doc in tokenized_docs_no_punctuation:\n",
" new_term_vector = []\n",
" for word in doc:\n",
" if not word in stopwords.words('english'):\n",
" new_term_vector.append(word)\n",
" \n",
" tokenized_docs_no_stopwords.append(new_term_vector)\n",
"\n",
"print(tokenized_docs_no_stopwords)"
]
},
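{
"cell_type": "markdown",
"metadata": {},
"source": [
"A performance note: `stopwords.words('english')` returns a list and is re-evaluated on every inner-loop iteration above. Building a set once makes the membership test constant-time; an equivalent sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build the stopword set once; matching is case-sensitive,\n",
"# which is why 'The', 'They' and 'I' survive above\n",
"stop_words = set(stopwords.words('english'))\n",
"filtered = [[word for word in doc if word not in stop_words]\n",
"            for doc in tokenized_docs_no_punctuation]\n",
"print(filtered)"
]
},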
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[['here', 'simpl', 'basic', 'sentenc'], ['they', 'wo', 'nt', 'interest', 'I', 'afraid'], ['the', 'point', 'exampl', 'learn', 'basic', 'text', 'clean', 'work', 'simpl', 'data']]\n"
]
}
],
"source": [
"# Stemming and Lemmatizing\n",
"from nltk.stem.porter import PorterStemmer\n",
"from nltk.stem.snowball import SnowballStemmer\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"\n",
"porter = PorterStemmer()\n",
"snowball = SnowballStemmer('english')\n",
"wordnet = WordNetLemmatizer()\n",
"\n",
"preprocessed_docs = []\n",
"\n",
"for doc in tokenized_docs_no_stopwords:\n",
" final_doc = []\n",
" for word in doc:\n",
" final_doc.append(porter.stem(word))\n",
" #final_doc.append(snowball.stem(word))\n",
" #final_doc.append(wordnet.lemmatize(word))\n",
" \n",
" preprocessed_docs.append(final_doc)\n",
"\n",
"print(preprocessed_docs)"
]
}
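,
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Only the Porter stemmer is applied above; the Snowball and WordNet lines are commented out. One caveat if you switch to the lemmatizer: `lemmatize` defaults to noun part-of-speech, so verbs often pass through unchanged unless you pass `pos='v'`. A small sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# WordNetLemmatizer defaults to pos='n' (noun)\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"wnl = WordNetLemmatizer()\n",
"print(wnl.lemmatize('are'))                # 'are' with the noun default\n",
"print(wnl.lemmatize('are', pos='v'))       # 'be'\n",
"print(wnl.lemmatize('cleaning', pos='v'))  # 'clean'"
]
}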
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
