From 868da20b6784c737501c05c3c08a8b296cfa447e Mon Sep 17 00:00:00 2001 From: Susan Li Date: Thu, 3 Aug 2017 00:22:47 -0400 Subject: [PATCH] Add notebooks --- Advanced NLP Tasks with NLTK.ipynb | 85 ++++ Basic NLP Tasks with NLTK.ipynb | 416 ++++++++++++++++ LDA.ipynb | 127 +++++ Topic Modeling.ipynb | 104 ++++ Working with Text Data in Pandas.ipynb | 626 +++++++++++++++++++++++++ 5 files changed, 1358 insertions(+) create mode 100644 Advanced NLP Tasks with NLTK.ipynb create mode 100644 Basic NLP Tasks with NLTK.ipynb create mode 100644 LDA.ipynb create mode 100644 Topic Modeling.ipynb create mode 100644 Working with Text Data in Pandas.ipynb diff --git a/Advanced NLP Tasks with NLTK.ipynb b/Advanced NLP Tasks with NLTK.ipynb new file mode 100644 index 0000000..5f3fe5d --- /dev/null +++ b/Advanced NLP Tasks with NLTK.ipynb @@ -0,0 +1,85 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MD: modal auxiliary\n", + " can cannot could couldn't dare may might must need ought shall should\n", + " shouldn't will would\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.help.upenn_tagset('MD')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Children', 'NNP'),\n", + " (\"should't\", 'VBZ'),\n", + " ('drink', 'VB'),\n", + " ('a', 'DT'),\n", + " ('sugary', 'JJ'),\n", + " ('drink', 'NN'),\n", + " ('before', 'IN'),\n", + " ('bed', 'NN'),\n", + " ('.', '.')]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# POS tagging with NLTK\n", + "text11 = \"Children should't drink a sugary drink before bed.\"\n", + "text13 = nltk.word_tokenize(text11)\n", + "nltk.pos_tag(text13)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], 
+ "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Basic NLP Tasks with NLTK.ipynb b/Basic NLP Tasks with NLTK.ipynb new file mode 100644 index 0000000..d5ca6a4 --- /dev/null +++ b/Basic NLP Tasks with NLTK.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*** Introductory Examples for the NLTK Book ***\n", + "Loading text1, ..., text9 and sent1, ..., sent9\n", + "Type the name of the text or sentence to view it.\n", + "Type: 'texts()' or 'sents()' to list the materials.\n", + "text1: Moby Dick by Herman Melville 1851\n", + "text2: Sense and Sensibility by Jane Austen 1811\n", + "text3: The Book of Genesis\n", + "text4: Inaugural Address Corpus\n", + "text5: Chat Corpus\n", + "text6: Monty Python and the Holy Grail\n", + "text7: Wall Street Journal\n", + "text8: Personals Corpus\n", + "text9: The Man Who Was Thursday by G . K . 
Chesterton 1908\n" + ] + } + ], + "source": [ + "import nltk\n", + "from nltk.book import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sent1: Call me Ishmael .\n", + "sent2: The family of Dashwood had long been settled in Sussex .\n", + "sent3: In the beginning God created the heaven and the earth .\n", + "sent4: Fellow - Citizens of the Senate and of the House of Representatives :\n", + "sent5: I have a problem with people PMing me to lol JOIN\n", + "sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !\n", + "sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .\n", + "sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .\n", + "sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .\n" + ] + } + ], + "source": [ + "sents()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Call', 'me', 'Ishmael', '.']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sent1" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 100676\n" + ] + } + ], + "source": [ + "print(text7, len(text7))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 
'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18\n" + ] + } + ], + "source": [ + "print(sent7, len(sent7))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Foster',\n", + " 'tallies',\n", + " 'rejected',\n", + " 'budding',\n", + " 'Ratings',\n", + " 'earns',\n", + " 'Raton',\n", + " '8.70',\n", + " 'Carnival',\n", + " 'Driscoll']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(set(text7))[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12408" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Frequency of words\n", + "dist = FreqDist(text7)\n", + "len(dist)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab1 = list(dist.keys())\n", + "vocab1[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dist['Vinken']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['billion',\n", + " 'company',\n", + " 'president',\n", + " 'because',\n", + " 'market',\n", + " 'million',\n", + " 'shares',\n", + " 'trading',\n", + " 'program']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n", + 
"freqwords" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['list', 'listed', 'lists', 'listing', 'listings']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# different forms of the same \"word\"\n", + "input1 = 'List listed lists listing listings'\n", + "words1 = input1.lower().split(' ')\n", + "words1" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['list', 'list', 'list', 'list', 'list']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "porter = nltk.PorterStemmer()\n", + "[porter.stem(t) for t in words1]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# tokenization\n", + "text11 = \"Children shouldn't drink a sugary drink before bed.\"\n", + "text11.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Children',\n", + " 'should',\n", + " \"n't\",\n", + " 'drink',\n", + " 'a',\n", + " 'sugary',\n", + " 'drink',\n", + " 'before',\n", + " 'bed',\n", + " '.']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.word_tokenize(text11)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sentence splitting\n", + "text12 = 'This is the first sentence. 
A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'\n", + "sentences = nltk.sent_tokenize(text12)\n", + "len(sentences)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['This is the first sentence.',\n", + " 'A gallon of milk in the U.S. costs $2.99.',\n", + " 'Is this the third sentence?',\n", + " 'Yes, it is!']" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentences" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/LDA.ipynb b/LDA.ipynb new file mode 100644 index 0000000..11e5952 --- /dev/null +++ b/LDA.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.tokenize import RegexpTokenizer\n", + "from stop_words import get_stop_words\n", + "from nltk.stem.porter import PorterStemmer\n", + "from gensim import corpora, models\n", + "import gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tokenizer = RegexpTokenizer(r'\\w+')\n", + "\n", + "# create English stop words list\n", + "en_stop = get_stop_words('en')\n", + "\n", + "# Create p_stemmer of class PorterStemmer\n", + "p_stemmer = PorterStemmer()\n", + " \n", + "# create sample documents\n", + "doc_a = \"Brocolli is good to eat. 
My brother likes to eat good brocolli, but not my mother.\"\n", + "doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n", + "doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n", + "doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n", + "doc_e = \"Health professionals say that brocolli is good for your health.\" \n", + "\n", + "# compile sample documents into a list\n", + "doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n", + "\n", + "# list for tokenized documents in loop\n", + "texts = []\n", + "\n", + "# loop through document list\n", + "for i in doc_set:\n", + " \n", + " # clean and tokenize document string\n", + " raw = i.lower()\n", + " tokens = tokenizer.tokenize(raw)\n", + "\n", + " # remove stop words from tokens\n", + " stopped_tokens = [i for i in tokens if not i in en_stop]\n", + " \n", + " # stem tokens\n", + " stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n", + " \n", + " # add tokens to list\n", + " texts.append(stemmed_tokens)\n", + "\n", + "# turn our tokenized documents into an id <-> term dictionary\n", + "dictionary = corpora.Dictionary(texts)\n", + " \n", + "# convert tokenized documents into a document-term matrix\n", + "corpus = [dictionary.doc2bow(text) for text in texts]\n", + "\n", + "# generate LDA model\n", + "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n" + ] + } + ], + "source": [ + "print(ldamodel.print_topics(num_topics=2, num_words=4))" + ] + }, + { + "cell_type": "code", + 
"execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n" + ] + } + ], + "source": [ + "print(ldamodel.print_topics(num_topics=3, num_words=3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Topic Modeling.ipynb b/Topic Modeling.ipynb new file mode 100644 index 0000000..269a6be --- /dev/null +++ b/Topic Modeling.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "doc1 = \"Sugar is bad to consume. 
My sister likes to have sugar, but not my father.\"\n", + "doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n", + "doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n", + "doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n", + "doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n", + "\n", + "doc_complete = [doc1, doc2, doc3, doc4, doc5]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "import string\n", + "stop = set(stopwords.words('english'))\n", + "exclude = set(string.punctuation)\n", + "lemma = WordNetLemmatizer()\n", + "\n", + "def clean(doc):\n", + " stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])\n", + " punc_free = ''.join([ch for ch in stop_free if ch not in exclude])\n", + " normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())\n", + " return normalized\n", + "doc_clean = [clean(doc).split() for doc in doc_complete]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "from gensim import corpora\n", + "dictionary = corpora.Dictionary(doc_clean)\n", + "doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "Lda = gensim.models.ldamodel.LdaModel\n", + "ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 
0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n" + ] + } + ], + "source": [ + "print(ldamodel.print_topics(num_topics=3, num_words=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Working with Text Data in Pandas.ipynb b/Working with Text Data in Pandas.ipynb new file mode 100644 index 0000000..3828c79 --- /dev/null +++ b/Working with Text Data in Pandas.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0Monday: The doctor's appointment is at 2:45pm.
1Tuesday: The dentist's appointment is at 11:30...
2Wednesday: At 7:00pm, there is a basketball game!
3Thursday: Be back home by 11:15 pm at the latest.
4Friday: Take the train at 08:10 am, arrive at ...
\n", + "
" + ], + "text/plain": [ + " text\n", + "0 Monday: The doctor's appointment is at 2:45pm.\n", + "1 Tuesday: The dentist's appointment is at 11:30...\n", + "2 Wednesday: At 7:00pm, there is a basketball game!\n", + "3 Thursday: Be back home by 11:15 pm at the latest.\n", + "4 Friday: Take the train at 08:10 am, arrive at ..." + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "time_sentences = [\"Monday: The doctor's appointment is at 2:45pm.\", \n", + " \"Tuesday: The dentist's appointment is at 11:30 am.\",\n", + " \"Wednesday: At 7:00pm, there is a basketball game!\",\n", + " \"Thursday: Be back home by 11:15 pm at the latest.\",\n", + " \"Friday: Take the train at 08:10 am, arrive at 09:00am.\"]\n", + "df = pd.DataFrame(time_sentences, columns = ['text'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 46\n", + "1 50\n", + "2 49\n", + "3 49\n", + "4 54\n", + "Name: text, dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find the number of characters for each string in df['text']\n", + "df['text'].str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 7\n", + "1 8\n", + "2 8\n", + "3 10\n", + "4 10\n", + "Name: text, dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find the number of tokens for each string in df['text']\n", + "df['text'].str.split().str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 True\n", + "2 False\n", + "3 False\n", + "4 False\n", + "Name: text, dtype: bool" + ] + }, + "execution_count": 4, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "# find which entries contain the word 'appointment'\n", + "df['text'].str.contains('appointment')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 4\n", + "2 3\n", + "3 4\n", + "4 8\n", + "Name: text, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find how many times a digit occurs in each string\n", + "df['text'].str.count(r'\\d')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [2, 4, 5]\n", + "1 [1, 1, 3, 0]\n", + "2 [7, 0, 0]\n", + "3 [1, 1, 1, 5]\n", + "4 [0, 8, 1, 0, 0, 9, 0, 0]\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find all occurances of the digits\n", + "df['text'].str.findall(r'\\d')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [(2, 45)]\n", + "1 [(11, 30)]\n", + "2 [(7, 00)]\n", + "3 [(11, 15)]\n", + "4 [(08, 10), (09, 00)]\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# group and find the hours and minutes\n", + "df['text'].str.findall(r'(\\d?\\d):(\\d\\d)')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 ???: The doctor's appointment is at 2:45pm.\n", + "1 ???: The dentist's appointment is at 11:30 am.\n", + "2 ???: At 7:00pm, there is a basketball game!\n", + "3 ???: Be back home by 11:15 pm at the latest.\n", + "4 ???: Take the train at 08:10 am, arrive at 09:...\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "df['text'].str.replace(r'\\w+day\\b', '???')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Mon: The doctor's appointment is at 2:45pm.\n", + "1 Tue: The dentist's appointment is at 11:30 am.\n", + "2 Wed: At 7:00pm, there is a basketball game!\n", + "3 Thu: Be back home by 11:15 pm at the latest.\n", + "4 Fri: Take the train at 08:10 am, arrive at 09:...\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['text'].str.replace(r'(\\w+day\\b)', lambda x: x.groups()[0][:3])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Susan\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)\n", + " from ipykernel import kernelapp as app\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0245
11130
2700
31115
40810
\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "0 2 45\n", + "1 11 30\n", + "2 7 00\n", + "3 11 15\n", + "4 08 10" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create new columns from first match of extracted groups\n", + "df['text'].str.extract(r'(\\d?\\d):(\\d\\d)')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
match
002:45pm245pm
1011:30 am1130am
207:00pm700pm
3011:15 pm1115pm
4008:10 am0810am
109:00am0900am
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + " match \n", + "0 0 2:45pm 2 45 pm\n", + "1 0 11:30 am 11 30 am\n", + "2 0 7:00pm 7 00 pm\n", + "3 0 11:15 pm 11 15 pm\n", + "4 0 08:10 am 08 10 am\n", + " 1 09:00am 09 00 am" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extract the entire time, the hours, the minutes, and the period\n", + "df['text'].str.extractall(r'((\\d?\\d):(\\d\\d) ?([ap]m))')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timehourminuteperiod
match
002:45pm245pm
1011:30 am1130am
207:00pm700pm
3011:15 pm1115pm
4008:10 am0810am
109:00am0900am
\n", + "
" + ], + "text/plain": [ + " time hour minute period\n", + " match \n", + "0 0 2:45pm 2 45 pm\n", + "1 0 11:30 am 11 30 am\n", + "2 0 7:00pm 7 00 pm\n", + "3 0 11:15 pm 11 15 pm\n", + "4 0 08:10 am 08 10 am\n", + " 1 09:00am 09 00 am" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['text'].str.extractall(r'(?P