Add notebooks
susanli2016 authored Aug 3, 2017
1 parent 878fd42 commit 868da20
Showing 5 changed files with 1,358 additions and 0 deletions.
85 changes: 85 additions & 0 deletions Advanced NLP Tasks with NLTK.ipynb
@@ -0,0 +1,85 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MD: modal auxiliary\n",
" can cannot could couldn't dare may might must need ought shall should\n",
" shouldn't will would\n"
]
}
],
"source": [
"import nltk\n",
"nltk.help.upenn_tagset('MD')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Children', 'NNP'),\n",
" (\"should't\", 'VBZ'),\n",
" ('drink', 'VB'),\n",
" ('a', 'DT'),\n",
" ('sugary', 'JJ'),\n",
" ('drink', 'NN'),\n",
" ('before', 'IN'),\n",
" ('bed', 'NN'),\n",
" ('.', '.')]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# POS tagging with NLTK\n",
"text11 = \"Children should't drink a sugary drink before bed.\"\n",
"text13 = nltk.word_tokenize(text11)\n",
"nltk.pos_tag(text13)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
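
A natural follow-up to the POS-tagging cell above, sketched here rather than taken from the notebook, is to map the Penn Treebank tags onto WordNet's coarser categories so WordNetLemmatizer lemmatizes each token with the right part of speech (assumes NLTK's WordNet data is installed):

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # Penn tags start with J/V/R for adjectives, verbs, adverbs; default to noun
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize("Children shouldn't drink a sugary drink before bed.")
lemmas = [lemmatizer.lemmatize(w, penn_to_wordnet(t)) for w, t in nltk.pos_tag(tokens)]
print(lemmas)  # e.g. ['Children', 'should', "n't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed', '.']
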
416 changes: 416 additions & 0 deletions Basic NLP Tasks with NLTK.ipynb
@@ -0,0 +1,416 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"*** Introductory Examples for the NLTK Book ***\n",
"Loading text1, ..., text9 and sent1, ..., sent9\n",
"Type the name of the text or sentence to view it.\n",
"Type: 'texts()' or 'sents()' to list the materials.\n",
"text1: Moby Dick by Herman Melville 1851\n",
"text2: Sense and Sensibility by Jane Austen 1811\n",
"text3: The Book of Genesis\n",
"text4: Inaugural Address Corpus\n",
"text5: Chat Corpus\n",
"text6: Monty Python and the Holy Grail\n",
"text7: Wall Street Journal\n",
"text8: Personals Corpus\n",
"text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
]
}
],
"source": [
"import nltk\n",
"from nltk.book import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Text: Moby Dick by Herman Melville 1851>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sent1: Call me Ishmael .\n",
"sent2: The family of Dashwood had long been settled in Sussex .\n",
"sent3: In the beginning God created the heaven and the earth .\n",
"sent4: Fellow - Citizens of the Senate and of the House of Representatives :\n",
"sent5: I have a problem with people PMing me to lol JOIN\n",
"sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !\n",
"sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .\n",
"sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .\n",
"sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .\n"
]
}
],
"source": [
"sents()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Call', 'me', 'Ishmael', '.']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sent1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Text: Wall Street Journal> 100676\n"
]
}
],
"source": [
"print(text7, len(text7))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18\n"
]
}
],
"source": [
"print(sent7, len(sent7))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Foster',\n",
" 'tallies',\n",
" 'rejected',\n",
" 'budding',\n",
" 'Ratings',\n",
" 'earns',\n",
" 'Raton',\n",
" '8.70',\n",
" 'Carnival',\n",
" 'Driscoll']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(set(text7))[:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12408"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Frequency of words\n",
"dist = FreqDist(text7)\n",
"len(dist)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab1 = list(dist.keys())\n",
"vocab1[:10]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dist['Vinken']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['billion',\n",
" 'company',\n",
" 'president',\n",
" 'because',\n",
" 'market',\n",
" 'million',\n",
" 'shares',\n",
" 'trading',\n",
" 'program']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n",
"freqwords"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['list', 'listed', 'lists', 'listing', 'listings']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# different forms of the same \"word\"\n",
"input1 = 'List listed lists listing listings'\n",
"words1 = input1.lower().split(' ')\n",
"words1"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['list', 'list', 'list', 'list', 'list']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter = nltk.PorterStemmer()\n",
"[porter.stem(t) for t in words1]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tokenization\n",
"text11 = \"Children shouldn't drink a sugary drink before bed.\"\n",
"text11.split(' ')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Children',\n",
" 'should',\n",
" \"n't\",\n",
" 'drink',\n",
" 'a',\n",
" 'sugary',\n",
" 'drink',\n",
" 'before',\n",
" 'bed',\n",
" '.']"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.word_tokenize(text11)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sentence splitting\n",
"text12 = 'This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'\n",
"sentences = nltk.sent_tokenize(text12)\n",
"len(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['This is the first sentence.',\n",
" 'A gallon of milk in the U.S. costs $2.99.',\n",
" 'Is this the third sentence?',\n",
" 'Yes, it is!']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
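
Two quick extensions of the cells above, sketched here rather than taken from the notebook: FreqDist.most_common for the highest-frequency tokens in text7, and a WordNet lemmatizer for contrast with the Porter stemmer (assumes the NLTK book corpora and WordNet data are installed):

import nltk
from nltk.book import text7

dist = nltk.FreqDist(text7)
print(dist.most_common(10))  # top tokens with counts, punctuation included

porter = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
words = ['list', 'listed', 'lists', 'listing', 'listings']
print([porter.stem(w) for w in words])           # every form collapses to 'list'
print([lemmatizer.lemmatize(w) for w in words])  # noun lemmas stay valid words, e.g. 'listing'
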
127 changes: 127 additions & 0 deletions LDA.ipynb
@@ -0,0 +1,127 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import RegexpTokenizer\n",
"from stop_words import get_stop_words\n",
"from nltk.stem.porter import PorterStemmer\n",
"from gensim import corpora, models\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenizer = RegexpTokenizer(r'\\w+')\n",
"\n",
"# create English stop words list\n",
"en_stop = get_stop_words('en')\n",
"\n",
"# Create p_stemmer of class PorterStemmer\n",
"p_stemmer = PorterStemmer()\n",
" \n",
"# create sample documents\n",
"doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n",
"doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n",
"doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n",
"doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n",
"doc_e = \"Health professionals say that brocolli is good for your health.\" \n",
"\n",
"# compile sample documents into a list\n",
"doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n",
"\n",
"# list for tokenized documents in loop\n",
"texts = []\n",
"\n",
"# loop through document list\n",
"for i in doc_set:\n",
" \n",
" # clean and tokenize document string\n",
" raw = i.lower()\n",
" tokens = tokenizer.tokenize(raw)\n",
"\n",
" # remove stop words from tokens\n",
" stopped_tokens = [i for i in tokens if not i in en_stop]\n",
" \n",
" # stem tokens\n",
" stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n",
" \n",
" # add tokens to list\n",
" texts.append(stemmed_tokens)\n",
"\n",
"# turn our tokenized documents into a id <-> term dictionary\n",
"dictionary = corpora.Dictionary(texts)\n",
" \n",
"# convert tokenized documents into a document-term matrix\n",
"corpus = [dictionary.doc2bow(text) for text in texts]\n",
"\n",
"# generate LDA model\n",
"ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n"
]
}
],
"source": [
"print(ldamodel.print_topics(num_topics=2, num_words=4))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n"
]
}
],
"source": [
"print(ldamodel.print_topics(num_topics=3, num_words=3))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
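
The notebook stops at printing topics; a minimal sketch of applying the trained model to unseen text, assuming the tokenizer, en_stop, p_stemmer, dictionary, and ldamodel objects from the cells above, looks like this:

unseen = "My brother drives to baseball practice with my mother."
raw_tokens = tokenizer.tokenize(unseen.lower())
stems = [p_stemmer.stem(t) for t in raw_tokens if t not in en_stop]
bow = dictionary.doc2bow(stems)
print(ldamodel.get_document_topics(bow))  # topic mixture, e.g. [(0, 0.1...), (1, 0.8...)]
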
104 changes: 104 additions & 0 deletions Topic Modeling.ipynb
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"doc1 = \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\"\n",
"doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n",
"doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n",
"doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n",
"doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n",
"\n",
"doc_complete = [doc1, doc2, doc3, doc4, doc5]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"import string\n",
"stop = set(stopwords.words('english'))\n",
"exclude = set(string.punctuation)\n",
"lemma = WordNetLemmatizer()\n",
"\n",
"def clean(doc):\n",
" stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])\n",
" punc_free = ''.join([ch for ch in stop_free if ch not in exclude])\n",
" normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())\n",
" return normalized\n",
"doc_clean = [clean(doc).split() for doc in doc_complete]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"from gensim import corpora\n",
"dictionary = corpora.Dictionary(doc_clean)\n",
"doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Lda = gensim.models.ldamodel.LdaModel\n",
"ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
]
}
],
"source": [
"print(ldamodel.print_topics(num_topics=3, num_words=4))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
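
To see which topic each training document lands in, a short sketch reusing the doc_term_matrix and ldamodel built above (the exact probabilities vary between runs):

for i, bow in enumerate(doc_term_matrix):
    # pick the topic with the highest probability for this document
    topic_id, prob = max(ldamodel.get_document_topics(bow), key=lambda pair: pair[1])
    print('doc%d -> topic %d (p=%.2f)' % (i + 1, topic_id, prob))
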
626 changes: 626 additions & 0 deletions Working with Text Data in Pandas.ipynb

Large diffs are not rendered by default.
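
Since the diff for this notebook is not rendered, the following is only an illustrative sketch of the vectorized pandas .str operations its title suggests, not the notebook's actual contents:

import pandas as pd

s = pd.Series(['Monday: The doctor appointment is at 2:45pm.',
               'Tuesday: The dentist appointment is at 11:30 am.'])
print(s.str.len())                    # characters per entry
print(s.str.split().str.len())        # whitespace tokens per entry
print(s.str.contains('appointment'))  # boolean mask
print(s.str.findall(r'\d?\d:\d\d'))   # regex match, pulls out the times
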
