Add notebook

susanli2016 · Aug 3, 2017 · 8d63013 · 8d63013
1 parent 868da20
commit 8d63013
Showing 1 changed file with 158 additions and 0 deletions.
diff --git a/Cleaning Text.ipynb b/Cleaning Text.ipynb
@@ -0,0 +1,158 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Sample documents for the text-cleaning walkthrough.\n",
+    "raw_docs = [\n",
+    "    \"Here are some very simple basic sentences.\",\n",
+    "    \"They won't be very interesting, I'm afraid.\",\n",
+    "    \"The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', \"n't\", 'be', 'very', 'interesting', ',', 'I', \"'m\", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*very', 'simple*', 'data', '.']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tokenizing text into bags of words: one token list per raw document.\n",
+    "# word_tokenize needs the NLTK tokenizer data to be installed (see nltk.download).\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "\n",
+    "tokenized_docs = list(map(word_tokenize, raw_docs))\n",
+    "print(tokenized_docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Removing punctuation: strip punctuation characters from every token and\n",
+    "# drop any token that is left empty (i.e. tokens that were pure punctuation).\n",
+    "import re\n",
+    "import string\n",
+    "\n",
+    "# Character class matching any single character listed in string.punctuation;\n",
+    "# see https://docs.python.org/3/library/string.html#string.punctuation\n",
+    "regex = re.compile('[%s]' % re.escape(string.punctuation))\n",
+    "\n",
+    "tokenized_docs_no_punctuation = [\n",
+    "    [stripped for stripped in (regex.sub('', token) for token in doc) if stripped != '']\n",
+    "    for doc in tokenized_docs\n",
+    "]\n",
+    "\n",
+    "print(tokenized_docs_no_punctuation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['Here', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'examples', 'learn', 'basic', 'text', 'cleaning', 'works', 'simple', 'data']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Cleaning text of stopwords\n",
+    "from nltk.corpus import stopwords\n",
+    "\n",
+    "# Build the English stopword set once. Calling stopwords.words('english') for\n",
+    "# every token rebuilds the whole word list each time and then scans it linearly;\n",
+    "# a set built up front gives constant-time membership tests instead.\n",
+    "english_stopwords = set(stopwords.words('english'))\n",
+    "\n",
+    "# The membership test is case-sensitive, so capitalised tokens such as 'The'\n",
+    "# are kept (the stopword list is matched exactly as spelled).\n",
+    "tokenized_docs_no_stopwords = [\n",
+    "    [word for word in doc if word not in english_stopwords]\n",
+    "    for doc in tokenized_docs_no_punctuation\n",
+    "]\n",
+    "\n",
+    "print(tokenized_docs_no_stopwords)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['here', 'simpl', 'basic', 'sentenc'], ['they', 'wo', 'nt', 'interest', 'I', 'afraid'], ['the', 'point', 'exampl', 'learn', 'basic', 'text', 'clean', 'work', 'simpl', 'data']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Stemming and Lemmatizing\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from nltk.stem.snowball import SnowballStemmer\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "\n",
+    "# Three normalizers are instantiated; only the Porter stemmer is applied below.\n",
+    "porter = PorterStemmer()\n",
+    "snowball = SnowballStemmer('english')\n",
+    "wordnet = WordNetLemmatizer()\n",
+    "\n",
+    "# To compare results, swap porter.stem for snowball.stem or wordnet.lemmatize.\n",
+    "normalize = porter.stem\n",
+    "\n",
+    "preprocessed_docs = [\n",
+    "    [normalize(word) for word in doc]\n",
+    "    for doc in tokenized_docs_no_stopwords\n",
+    "]\n",
+    "\n",
+    "print(preprocessed_docs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}