diff --git a/Advanced NLP Tasks with NLTK.ipynb b/Advanced NLP Tasks with NLTK.ipynb
new file mode 100644
index 0000000..5f3fe5d
--- /dev/null
+++ b/Advanced NLP Tasks with NLTK.ipynb	
@@ -0,0 +1,85 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MD: modal auxiliary\n",
+      "    can cannot could couldn't dare may might must need ought shall should\n",
+      "    shouldn't will would\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "nltk.help.upenn_tagset('MD')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[('Children', 'NNP'),\n",
+       " (\"should't\", 'VBZ'),\n",
+       " ('drink', 'VB'),\n",
+       " ('a', 'DT'),\n",
+       " ('sugary', 'JJ'),\n",
+       " ('drink', 'NN'),\n",
+       " ('before', 'IN'),\n",
+       " ('bed', 'NN'),\n",
+       " ('.', '.')]"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# POS tagging with NLTK\n",
+    "text11 = \"Children should't drink a sugary drink before bed.\"\n",
+    "text13 = nltk.word_tokenize(text11)\n",
+    "nltk.pos_tag(text13)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Basic NLP Tasks with NLTK.ipynb b/Basic NLP Tasks with NLTK.ipynb
new file mode 100644
index 0000000..d5ca6a4
--- /dev/null
+++ b/Basic NLP Tasks with NLTK.ipynb	
@@ -0,0 +1,416 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "*** Introductory Examples for the NLTK Book ***\n",
+      "Loading text1, ..., text9 and sent1, ..., sent9\n",
+      "Type the name of the text or sentence to view it.\n",
+      "Type: 'texts()' or 'sents()' to list the materials.\n",
+      "text1: Moby Dick by Herman Melville 1851\n",
+      "text2: Sense and Sensibility by Jane Austen 1811\n",
+      "text3: The Book of Genesis\n",
+      "text4: Inaugural Address Corpus\n",
+      "text5: Chat Corpus\n",
+      "text6: Monty Python and the Holy Grail\n",
+      "text7: Wall Street Journal\n",
+      "text8: Personals Corpus\n",
+      "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
+     ]
+    }
+   ],
+   "source": [
+    "import nltk\n",
+    "from nltk.book import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<Text: Moby Dick by Herman Melville 1851>"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "text1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "sent1: Call me Ishmael .\n",
+      "sent2: The family of Dashwood had long been settled in Sussex .\n",
+      "sent3: In the beginning God created the heaven and the earth .\n",
+      "sent4: Fellow - Citizens of the Senate and of the House of Representatives :\n",
+      "sent5: I have a problem with people PMing me to lol JOIN\n",
+      "sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !\n",
+      "sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .\n",
+      "sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .\n",
+      "sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .\n"
+     ]
+    }
+   ],
+   "source": [
+    "sents()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Call', 'me', 'Ishmael', '.']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sent1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<Text: Wall Street Journal> 100676\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(text7, len(text7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(sent7, len(sent7))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Foster',\n",
+       " 'tallies',\n",
+       " 'rejected',\n",
+       " 'budding',\n",
+       " 'Ratings',\n",
+       " 'earns',\n",
+       " 'Raton',\n",
+       " '8.70',\n",
+       " 'Carnival',\n",
+       " 'Driscoll']"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "list(set(text7))[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "12408"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Frequency of words\n",
+    "dist = FreqDist(text7)\n",
+    "len(dist)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "vocab1 = list(dist.keys())\n",
+    "vocab1[:10]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "dist['Vinken']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['billion',\n",
+       " 'company',\n",
+       " 'president',\n",
+       " 'because',\n",
+       " 'market',\n",
+       " 'million',\n",
+       " 'shares',\n",
+       " 'trading',\n",
+       " 'program']"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n",
+    "freqwords"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['list', 'listed', 'lists', 'listing', 'listings']"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# different forms of the same \"word\"\n",
+    "input1 = 'List listed lists listing listings'\n",
+    "words1 = input1.lower().split(' ')\n",
+    "words1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['list', 'list', 'list', 'list', 'list']"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "porter = nltk.PorterStemmer()\n",
+    "[porter.stem(t) for t in words1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# tokenization\n",
+    "text11 = \"Children shouldn't drink a sugary drink before bed.\"\n",
+    "text11.split(' ')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['Children',\n",
+       " 'should',\n",
+       " \"n't\",\n",
+       " 'drink',\n",
+       " 'a',\n",
+       " 'sugary',\n",
+       " 'drink',\n",
+       " 'before',\n",
+       " 'bed',\n",
+       " '.']"
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "nltk.word_tokenize(text11)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "4"
+      ]
+     },
+     "execution_count": 36,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# sentence splitting\n",
+    "text12 = 'This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'\n",
+    "sentences = nltk.sent_tokenize(text12)\n",
+    "len(sentences)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['This is the first sentence.',\n",
+       " 'A gallon of milk in the U.S. costs $2.99.',\n",
+       " 'Is this the third sentence?',\n",
+       " 'Yes, it is!']"
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sentences"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/LDA.ipynb b/LDA.ipynb
new file mode 100644
index 0000000..11e5952
--- /dev/null
+++ b/LDA.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.tokenize import RegexpTokenizer\n",
+    "from stop_words import get_stop_words\n",
+    "from nltk.stem.porter import PorterStemmer\n",
+    "from gensim import corpora, models\n",
+    "import gensim"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer = RegexpTokenizer(r'\\w+')\n",
+    "\n",
+    "# create English stop words list\n",
+    "en_stop = get_stop_words('en')\n",
+    "\n",
+    "# Create p_stemmer of class PorterStemmer\n",
+    "p_stemmer = PorterStemmer()\n",
+    "    \n",
+    "# create sample documents\n",
+    "doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n",
+    "doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n",
+    "doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n",
+    "doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n",
+    "doc_e = \"Health professionals say that brocolli is good for your health.\" \n",
+    "\n",
+    "# compile sample documents into a list\n",
+    "doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n",
+    "\n",
+    "# list for tokenized documents in loop\n",
+    "texts = []\n",
+    "\n",
+    "# loop through document list\n",
+    "for i in doc_set:\n",
+    "    \n",
+    "    # clean and tokenize document string\n",
+    "    raw = i.lower()\n",
+    "    tokens = tokenizer.tokenize(raw)\n",
+    "\n",
+    "    # remove stop words from tokens\n",
+    "    stopped_tokens = [i for i in tokens if not i in en_stop]\n",
+    "    \n",
+    "    # stem tokens\n",
+    "    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n",
+    "    \n",
+    "    # add tokens to list\n",
+    "    texts.append(stemmed_tokens)\n",
+    "\n",
+    "# turn our tokenized documents into an id <-> term dictionary\n",
+    "dictionary = corpora.Dictionary(texts)\n",
+    "    \n",
+    "# convert tokenized documents into a document-term matrix\n",
+    "corpus = [dictionary.doc2bow(text) for text in texts]\n",
+    "\n",
+    "# generate LDA model\n",
+    "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ldamodel.print_topics(num_topics=2, num_words=4))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ldamodel.print_topics(num_topics=2, num_words=3))  # the model above was built with num_topics=2"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Topic Modeling.ipynb b/Topic Modeling.ipynb
new file mode 100644
index 0000000..269a6be
--- /dev/null
+++ b/Topic Modeling.ipynb	
@@ -0,0 +1,104 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "doc1 = \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\"\n",
+    "doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n",
+    "doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n",
+    "doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n",
+    "doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n",
+    "\n",
+    "doc_complete = [doc1, doc2, doc3, doc4, doc5]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from nltk.corpus import stopwords\n",
+    "from nltk.stem.wordnet import WordNetLemmatizer\n",
+    "import string\n",
+    "stop = set(stopwords.words('english'))\n",
+    "exclude = set(string.punctuation)\n",
+    "lemma = WordNetLemmatizer()\n",
+    "\n",
+    "def clean(doc):\n",
+    "    # lowercase, drop stopwords, strip punctuation characters, then lemmatize each remaining token\n",
+    "    kept = ' '.join(tok for tok in doc.lower().split() if tok not in stop)\n",
+    "    stripped = ''.join(c for c in kept if c not in exclude)\n",
+    "    return ' '.join(lemma.lemmatize(tok) for tok in stripped.split())\n",
+    "doc_clean = [clean(doc).split() for doc in doc_complete]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import gensim\n",
+    "from gensim import corpora\n",
+    "dictionary = corpora.Dictionary(doc_clean)\n",
+    "doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "Lda = gensim.models.ldamodel.LdaModel\n",
+    "ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(ldamodel.print_topics(num_topics=3, num_words=4))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/Working with Text Data in Pandas.ipynb b/Working with Text Data in Pandas.ipynb
new file mode 100644
index 0000000..3828c79
--- /dev/null
+++ b/Working with Text Data in Pandas.ipynb	
@@ -0,0 +1,626 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>text</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Monday: The doctor's appointment is at 2:45pm.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Tuesday: The dentist's appointment is at 11:30...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Wednesday: At 7:00pm, there is a basketball game!</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Thursday: Be back home by 11:15 pm at the latest.</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Friday: Take the train at 08:10 am, arrive at ...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                text\n",
+       "0     Monday: The doctor's appointment is at 2:45pm.\n",
+       "1  Tuesday: The dentist's appointment is at 11:30...\n",
+       "2  Wednesday: At 7:00pm, there is a basketball game!\n",
+       "3  Thursday: Be back home by 11:15 pm at the latest.\n",
+       "4  Friday: Take the train at 08:10 am, arrive at ..."
+      ]
+     },
+     "execution_count": 1,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "time_sentences = [\"Monday: The doctor's appointment is at 2:45pm.\", \n",
+    "                  \"Tuesday: The dentist's appointment is at 11:30 am.\",\n",
+    "                  \"Wednesday: At 7:00pm, there is a basketball game!\",\n",
+    "                  \"Thursday: Be back home by 11:15 pm at the latest.\",\n",
+    "                  \"Friday: Take the train at 08:10 am, arrive at 09:00am.\"]\n",
+    "df = pd.DataFrame(time_sentences, columns = ['text'])\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    46\n",
+       "1    50\n",
+       "2    49\n",
+       "3    49\n",
+       "4    54\n",
+       "Name: text, dtype: int64"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# find the number of characters for each string in df['text']\n",
+    "df['text'].str.len()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0     7\n",
+       "1     8\n",
+       "2     8\n",
+       "3    10\n",
+       "4    10\n",
+       "Name: text, dtype: int64"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# find the number of tokens for each string in df['text']\n",
+    "df['text'].str.split().str.len()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0     True\n",
+       "1     True\n",
+       "2    False\n",
+       "3    False\n",
+       "4    False\n",
+       "Name: text, dtype: bool"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# find which entries contain the word 'appointment'\n",
+    "df['text'].str.contains('appointment')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0    3\n",
+       "1    4\n",
+       "2    3\n",
+       "3    4\n",
+       "4    8\n",
+       "Name: text, dtype: int64"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# find how many times a digit occurs in each string\n",
+    "df['text'].str.count(r'\\d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0                   [2, 4, 5]\n",
+       "1                [1, 1, 3, 0]\n",
+       "2                   [7, 0, 0]\n",
+       "3                [1, 1, 1, 5]\n",
+       "4    [0, 8, 1, 0, 0, 9, 0, 0]\n",
+       "Name: text, dtype: object"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# find all occurrences of the digits\n",
+    "df['text'].str.findall(r'\\d')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0               [(2, 45)]\n",
+       "1              [(11, 30)]\n",
+       "2               [(7, 00)]\n",
+       "3              [(11, 15)]\n",
+       "4    [(08, 10), (09, 00)]\n",
+       "Name: text, dtype: object"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# group and find the hours and minutes\n",
+    "df['text'].str.findall(r'(\\d?\\d):(\\d\\d)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0          ???: The doctor's appointment is at 2:45pm.\n",
+       "1       ???: The dentist's appointment is at 11:30 am.\n",
+       "2          ???: At 7:00pm, there is a basketball game!\n",
+       "3         ???: Be back home by 11:15 pm at the latest.\n",
+       "4    ???: Take the train at 08:10 am, arrive at 09:...\n",
+       "Name: text, dtype: object"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['text'].str.replace(r'\\w+day\\b', '???')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0          Mon: The doctor's appointment is at 2:45pm.\n",
+       "1       Tue: The dentist's appointment is at 11:30 am.\n",
+       "2          Wed: At 7:00pm, there is a basketball game!\n",
+       "3         Thu: Be back home by 11:15 pm at the latest.\n",
+       "4    Fri: Take the train at 08:10 am, arrive at 09:...\n",
+       "Name: text, dtype: object"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['text'].str.replace(r'(\\w+day\\b)', lambda x: x.groups()[0][:3])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "C:\\Users\\Susan\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)\n",
+      "  from ipykernel import kernelapp as app\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2</td>\n",
+       "      <td>45</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>11</td>\n",
+       "      <td>30</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>7</td>\n",
+       "      <td>00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>11</td>\n",
+       "      <td>15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>08</td>\n",
+       "      <td>10</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    0   1\n",
+       "0   2  45\n",
+       "1  11  30\n",
+       "2   7  00\n",
+       "3  11  15\n",
+       "4  08  10"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# create new columns from first match of extracted groups\n",
+    "df['text'].str.extract(r'(\\d?\\d):(\\d\\d)')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th>match</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <th>0</th>\n",
+       "      <td>2:45pm</td>\n",
+       "      <td>2</td>\n",
+       "      <td>45</td>\n",
+       "      <td>pm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <th>0</th>\n",
+       "      <td>11:30 am</td>\n",
+       "      <td>11</td>\n",
+       "      <td>30</td>\n",
+       "      <td>am</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <th>0</th>\n",
+       "      <td>7:00pm</td>\n",
+       "      <td>7</td>\n",
+       "      <td>00</td>\n",
+       "      <td>pm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <th>0</th>\n",
+       "      <td>11:15 pm</td>\n",
+       "      <td>11</td>\n",
+       "      <td>15</td>\n",
+       "      <td>pm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">4</th>\n",
+       "      <th>0</th>\n",
+       "      <td>08:10 am</td>\n",
+       "      <td>08</td>\n",
+       "      <td>10</td>\n",
+       "      <td>am</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>09:00am</td>\n",
+       "      <td>09</td>\n",
+       "      <td>00</td>\n",
+       "      <td>am</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                0   1   2   3\n",
+       "  match                      \n",
+       "0 0        2:45pm   2  45  pm\n",
+       "1 0      11:30 am  11  30  am\n",
+       "2 0        7:00pm   7  00  pm\n",
+       "3 0      11:15 pm  11  15  pm\n",
+       "4 0      08:10 am  08  10  am\n",
+       "  1       09:00am  09  00  am"
+      ]
+     },
+     "execution_count": 15,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# extract the entire time, the hours, the minutes, and the period\n",
+    "df['text'].str.extractall(r'((\\d?\\d):(\\d\\d) ?([ap]m))')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th>time</th>\n",
+       "      <th>hour</th>\n",
+       "      <th>minute</th>\n",
+       "      <th>period</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th>match</th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <th>0</th>\n",
+       "      <td>2:45pm</td>\n",
+       "      <td>2</td>\n",
+       "      <td>45</td>\n",
+       "      <td>pm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <th>0</th>\n",
+       "      <td>11:30 am</td>\n",
+       "      <td>11</td>\n",
+       "      <td>30</td>\n",
+       "      <td>am</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <th>0</th>\n",
+       "      <td>7:00pm</td>\n",
+       "      <td>7</td>\n",
+       "      <td>00</td>\n",
+       "      <td>pm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <th>0</th>\n",
+       "      <td>11:15 pm</td>\n",
+       "      <td>11</td>\n",
+       "      <td>15</td>\n",
+       "      <td>pm</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">4</th>\n",
+       "      <th>0</th>\n",
+       "      <td>08:10 am</td>\n",
+       "      <td>08</td>\n",
+       "      <td>10</td>\n",
+       "      <td>am</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>09:00am</td>\n",
+       "      <td>09</td>\n",
+       "      <td>00</td>\n",
+       "      <td>am</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "             time hour minute period\n",
+       "  match                             \n",
+       "0 0        2:45pm    2     45     pm\n",
+       "1 0      11:30 am   11     30     am\n",
+       "2 0        7:00pm    7     00     pm\n",
+       "3 0      11:15 pm   11     15     pm\n",
+       "4 0      08:10 am   08     10     am\n",
+       "  1       09:00am   09     00     am"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df['text'].str.extractall(r'(?P<time>(?P<hour>\\d?\\d):(?P<minute>\\d\\d) ?(?P<period>[ap]m))')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}