From 8d630136cb177fd7a6795f574b1ec7ff5757072d Mon Sep 17 00:00:00 2001
From: Susan Li
Date: Thu, 3 Aug 2017 00:25:19 -0400
Subject: [PATCH] Add notebook

---
 Cleaning Text.ipynb | 158 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 158 insertions(+)
 create mode 100644 Cleaning Text.ipynb

diff --git a/Cleaning Text.ipynb b/Cleaning Text.ipynb
new file mode 100644
index 0000000..b28b9cb
--- /dev/null
+++ b/Cleaning Text.ipynb
@@ -0,0 +1,158 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "raw_docs = [\"Here are some very simple basic sentences.\",\n",
+    "\"They won't be very interesting, I'm afraid.\",\n",
+    "\"The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', \"n't\", 'be', 'very', 'interesting', ',', 'I', \"'m\", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*very', 'simple*', 'data', '.']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Tokenizing text into bags of words\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "tokenized_docs = [word_tokenize(doc) for doc in raw_docs]\n",
+    "print(tokenized_docs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Removing punctuation\n",
+    "import re\n",
+    "import string\n",
+    "regex = re.compile('[%s]' % re.escape(string.punctuation)) #see documentation here: http://docs.python.org/2/library/string.html\n",
+    "\n",
+    "tokenized_docs_no_punctuation = []\n",
+    "\n",
+    "for review in tokenized_docs:\n",
+    "    new_review = []\n",
+    "    for token in review:\n",
+    "        new_token = regex.sub(u'', token)\n",
+    "        if not new_token == u'':\n",
+    "            new_review.append(new_token)\n",
+    "    \n",
+    "    tokenized_docs_no_punctuation.append(new_review)\n",
+    "    \n",
+    "print(tokenized_docs_no_punctuation)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['Here', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'examples', 'learn', 'basic', 'text', 'cleaning', 'works', 'simple', 'data']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Cleaning text of stopwords\n",
+    "from nltk.corpus import stopwords\n",
+    "\n",
+    "tokenized_docs_no_stopwords = []\n",
+    "\n",
+    "for doc in tokenized_docs_no_punctuation:\n",
+    "    new_term_vector = []\n",
+    "    for word in doc:\n",
+    "        if not word in stopwords.words('english'):\n",
+    "            new_term_vector.append(word)\n",
+    "    \n",
+    "    tokenized_docs_no_stopwords.append(new_term_vector)\n",
+    "\n",
+    "print(tokenized_docs_no_stopwords)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[['here', 'simpl', 'basic', 'sentenc'], ['they', 'wo', 'nt', 'interest', 'I', 'afraid'], ['the', 'point', 'exampl', 'learn', 'basic', 'text', 'clean', 'work', 'simpl', 'data']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Stemming and Lemmatizing\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from nltk.stem.snowball import SnowballStemmer\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "\n",
+    "porter = PorterStemmer()\n",
+    "snowball = SnowballStemmer('english')\n",
+    "wordnet = WordNetLemmatizer()\n",
+    "\n",
+    "preprocessed_docs = []\n",
+    "\n",
+    "for doc in tokenized_docs_no_stopwords:\n",
+    "    final_doc = []\n",
+    "    for word in doc:\n",
+    "        final_doc.append(porter.stem(word))\n",
+    "        #final_doc.append(snowball.stem(word))\n",
+    "        #final_doc.append(wordnet.lemmatize(word))\n",
+    "    \n",
+    "    preprocessed_docs.append(final_doc)\n",
+    "\n",
+    "print(preprocessed_docs)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
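
Note: running the notebook requires the NLTK data packages that word_tokenize, the stopword list, and WordNetLemmatizer depend on. A minimal setup sketch (the download calls below are standard NLTK resources and are an addition here, not part of the patch):

    import nltk
    nltk.download('punkt')      # tokenizer models used by word_tokenize
    nltk.download('stopwords')  # stopword lists for nltk.corpus.stopwords
    nltk.download('wordnet')    # WordNet data used by WordNetLemmatizer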