Add notebooks
susanli2016 authored Aug 3, 2017
1 parent 878fd42 commit 868da20
Showing 5 changed files with 1,358 additions and 0 deletions.
85 changes: 85 additions & 0 deletions Advanced NLP Tasks with NLTK.ipynb
@@ -0,0 +1,85 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"MD: modal auxiliary\n",
" can cannot could couldn't dare may might must need ought shall should\n",
" shouldn't will would\n"
]
}
],
"source": [
"import nltk\n",
"nltk.help.upenn_tagset('MD')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('Children', 'NNP'),\n",
" (\"should't\", 'VBZ'),\n",
" ('drink', 'VB'),\n",
" ('a', 'DT'),\n",
" ('sugary', 'JJ'),\n",
" ('drink', 'NN'),\n",
" ('before', 'IN'),\n",
" ('bed', 'NN'),\n",
" ('.', '.')]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# POS tagging with NLTK\n",
"text11 = \"Children should't drink a sugary drink before bed.\"\n",
"text13 = nltk.word_tokenize(text11)\n",
"nltk.pos_tag(text13)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
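
A natural follow-up to the POS-tagging cell above, sketched here rather than taken from the notebook, is to map the Penn Treebank tags onto WordNet's coarser categories so WordNetLemmatizer lemmatizes each token with the right part of speech (assumes NLTK's WordNet data is installed):

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # Penn tags start with J/V/R for adjectives, verbs, adverbs; default to noun
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize("Children shouldn't drink a sugary drink before bed.")
lemmas = [lemmatizer.lemmatize(w, penn_to_wordnet(t)) for w, t in nltk.pos_tag(tokens)]
print(lemmas)  # e.g. ['Children', 'should', "n't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed', '.']
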
416 changes: 416 additions & 0 deletions Basic NLP Tasks with NLTK.ipynb
@@ -0,0 +1,416 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"*** Introductory Examples for the NLTK Book ***\n",
"Loading text1, ..., text9 and sent1, ..., sent9\n",
"Type the name of the text or sentence to view it.\n",
"Type: 'texts()' or 'sents()' to list the materials.\n",
"text1: Moby Dick by Herman Melville 1851\n",
"text2: Sense and Sensibility by Jane Austen 1811\n",
"text3: The Book of Genesis\n",
"text4: Inaugural Address Corpus\n",
"text5: Chat Corpus\n",
"text6: Monty Python and the Holy Grail\n",
"text7: Wall Street Journal\n",
"text8: Personals Corpus\n",
"text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
]
}
],
"source": [
"import nltk\n",
"from nltk.book import *"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<Text: Moby Dick by Herman Melville 1851>"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"text1"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sent1: Call me Ishmael .\n",
"sent2: The family of Dashwood had long been settled in Sussex .\n",
"sent3: In the beginning God created the heaven and the earth .\n",
"sent4: Fellow - Citizens of the Senate and of the House of Representatives :\n",
"sent5: I have a problem with people PMing me to lol JOIN\n",
"sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !\n",
"sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .\n",
"sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .\n",
"sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .\n"
]
}
],
"source": [
"sents()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Call', 'me', 'Ishmael', '.']"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sent1"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<Text: Wall Street Journal> 100676\n"
]
}
],
"source": [
"print(text7, len(text7))"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18\n"
]
}
],
"source": [
"print(sent7, len(sent7))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Foster',\n",
" 'tallies',\n",
" 'rejected',\n",
" 'budding',\n",
" 'Ratings',\n",
" 'earns',\n",
" 'Raton',\n",
" '8.70',\n",
" 'Carnival',\n",
" 'Driscoll']"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(set(text7))[:10]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"12408"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Frequency of words\n",
"dist = FreqDist(text7)\n",
"len(dist)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab1 = list(dist.keys())\n",
"vocab1[:10]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"2"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dist['Vinken']"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['billion',\n",
" 'company',\n",
" 'president',\n",
" 'because',\n",
" 'market',\n",
" 'million',\n",
" 'shares',\n",
" 'trading',\n",
" 'program']"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n",
"freqwords"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['list', 'listed', 'lists', 'listing', 'listings']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# different forms of the same \"word\"\n",
"input1 = 'List listed lists listing listings'\n",
"words1 = input1.lower().split(' ')\n",
"words1"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['list', 'list', 'list', 'list', 'list']"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"porter = nltk.PorterStemmer()\n",
"[porter.stem(t) for t in words1]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tokenization\n",
"text11 = \"Children shouldn't drink a sugary drink before bed.\"\n",
"text11.split(' ')"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['Children',\n",
" 'should',\n",
" \"n't\",\n",
" 'drink',\n",
" 'a',\n",
" 'sugary',\n",
" 'drink',\n",
" 'before',\n",
" 'bed',\n",
" '.']"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nltk.word_tokenize(text11)"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# sentence splitting\n",
"text12 = 'This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'\n",
"sentences = nltk.sent_tokenize(text12)\n",
"len(sentences)"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['This is the first sentence.',\n",
" 'A gallon of milk in the U.S. costs $2.99.',\n",
" 'Is this the third sentence?',\n",
" 'Yes, it is!']"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sentences"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
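
Two quick extensions of the cells above, sketched here rather than taken from the notebook: FreqDist.most_common for the highest-frequency tokens in text7, and a WordNet lemmatizer for contrast with the Porter stemmer (assumes the NLTK book corpora and WordNet data are installed):

import nltk
from nltk.book import text7

dist = nltk.FreqDist(text7)
print(dist.most_common(10))  # top tokens with counts, punctuation included

porter = nltk.PorterStemmer()
lemmatizer = nltk.WordNetLemmatizer()
words = ['list', 'listed', 'lists', 'listing', 'listings']
print([porter.stem(w) for w in words])           # every form collapses to 'list'
print([lemmatizer.lemmatize(w) for w in words])  # noun lemmas stay valid words, e.g. 'listing'
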
127 changes: 127 additions & 0 deletions LDA.ipynb
@@ -0,0 +1,127 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import RegexpTokenizer\n",
"from stop_words import get_stop_words\n",
"from nltk.stem.porter import PorterStemmer\n",
"from gensim import corpora, models\n",
"import gensim"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"tokenizer = RegexpTokenizer(r'\\w+')\n",
"\n",
"# create English stop words list\n",
"en_stop = get_stop_words('en')\n",
"\n",
"# Create p_stemmer of class PorterStemmer\n",
"p_stemmer = PorterStemmer()\n",
" \n",
"# create sample documents\n",
"doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n",
"doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n",
"doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n",
"doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n",
"doc_e = \"Health professionals say that brocolli is good for your health.\" \n",
"\n",
"# compile sample documents into a list\n",
"doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n",
"\n",
"# list for tokenized documents in loop\n",
"texts = []\n",
"\n",
"# loop through document list\n",
"for i in doc_set:\n",
" \n",
" # clean and tokenize document string\n",
" raw = i.lower()\n",
" tokens = tokenizer.tokenize(raw)\n",
"\n",
" # remove stop words from tokens\n",
" stopped_tokens = [i for i in tokens if not i in en_stop]\n",
" \n",
" # stem tokens\n",
" stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n",
" \n",
" # add tokens to list\n",
" texts.append(stemmed_tokens)\n",
"\n",
"# turn our tokenized documents into a id <-> term dictionary\n",
"dictionary = corpora.Dictionary(texts)\n",
" \n",
"# convert tokenized documents into a document-term matrix\n",
"corpus = [dictionary.doc2bow(text) for text in texts]\n",
"\n",
"# generate LDA model\n",
"ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n"
]
}
],
"source": [
"print(ldamodel.print_topics(num_topics=2, num_words=4))"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n"
]
}
],
"source": [
"print(ldamodel.print_topics(num_topics=3, num_words=3))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
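
The notebook stops at printing topics; a minimal sketch of applying the trained model to unseen text, assuming the tokenizer, en_stop, p_stemmer, dictionary, and ldamodel objects from the cells above, looks like this:

unseen = "My brother drives to baseball practice with my mother."
raw_tokens = tokenizer.tokenize(unseen.lower())
stems = [p_stemmer.stem(t) for t in raw_tokens if t not in en_stop]
bow = dictionary.doc2bow(stems)
print(ldamodel.get_document_topics(bow))  # topic mixture, e.g. [(0, 0.1...), (1, 0.8...)]
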
104 changes: 104 additions & 0 deletions Topic Modeling.ipynb
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"doc1 = \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\"\n",
"doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n",
"doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n",
"doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n",
"doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n",
"\n",
"doc_complete = [doc1, doc2, doc3, doc4, doc5]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"from nltk.stem.wordnet import WordNetLemmatizer\n",
"import string\n",
"stop = set(stopwords.words('english'))\n",
"exclude = set(string.punctuation)\n",
"lemma = WordNetLemmatizer()\n",
"\n",
"def clean(doc):\n",
" stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])\n",
" punc_free = ''.join([ch for ch in stop_free if ch not in exclude])\n",
" normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())\n",
" return normalized\n",
"doc_clean = [clean(doc).split() for doc in doc_complete]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import gensim\n",
"from gensim import corpora\n",
"dictionary = corpora.Dictionary(doc_clean)\n",
"doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"Lda = gensim.models.ldamodel.LdaModel\n",
"ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
]
}
],
"source": [
"print(ldamodel.print_topics(num_topics=3, num_words=4))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
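
To see which topic each training document lands in, a short sketch reusing the doc_term_matrix and ldamodel built above (the exact probabilities vary between runs):

for i, bow in enumerate(doc_term_matrix):
    # pick the topic with the highest probability for this document
    topic_id, prob = max(ldamodel.get_document_topics(bow), key=lambda pair: pair[1])
    print('doc%d -> topic %d (p=%.2f)' % (i + 1, topic_id, prob))
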
626 changes: 626 additions & 0 deletions Working with Text Data in Pandas.ipynb

Large diffs are not rendered by default.
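
Since the diff for this notebook is not rendered, the following is only an illustrative sketch of the vectorized pandas .str operations its title suggests, not the notebook's actual contents:

import pandas as pd

s = pd.Series(['Monday: The doctor appointment is at 2:45pm.',
               'Tuesday: The dentist appointment is at 11:30 am.'])
print(s.str.len())                    # characters per entry
print(s.str.split().str.len())        # whitespace tokens per entry
print(s.str.contains('appointment'))  # boolean mask
print(s.str.findall(r'\d?\d:\d\d'))   # regex match, pulls out the times
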
