diff --git a/Advanced NLP Tasks with NLTK.ipynb b/Advanced NLP Tasks with NLTK.ipynb new file mode 100644 index 0000000..5f3fe5d --- /dev/null +++ b/Advanced NLP Tasks with NLTK.ipynb @@ -0,0 +1,85 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MD: modal auxiliary\n", + " can cannot could couldn't dare may might must need ought shall should\n", + " shouldn't will would\n" + ] + } + ], + "source": [ + "import nltk\n", + "nltk.help.upenn_tagset('MD')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('Children', 'NNP'),\n", + " (\"should't\", 'VBZ'),\n", + " ('drink', 'VB'),\n", + " ('a', 'DT'),\n", + " ('sugary', 'JJ'),\n", + " ('drink', 'NN'),\n", + " ('before', 'IN'),\n", + " ('bed', 'NN'),\n", + " ('.', '.')]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# POS tagging with NLTK\n", + "text11 = \"Children should't drink a sugary drink before bed.\"\n", + "text13 = nltk.word_tokenize(text11)\n", + "nltk.pos_tag(text13)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Basic NLP Tasks with NLTK.ipynb b/Basic NLP Tasks with NLTK.ipynb new file mode 100644 index 0000000..d5ca6a4 --- /dev/null +++ b/Basic NLP Tasks with NLTK.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "*** Introductory Examples for the NLTK Book ***\n", + "Loading text1, ..., text9 and sent1, ..., sent9\n", + "Type the name of the text or sentence to view it.\n", + "Type: 'texts()' or 'sents()' to list the materials.\n", + "text1: Moby Dick by Herman Melville 1851\n", + "text2: Sense and Sensibility by Jane Austen 1811\n", + "text3: The Book of Genesis\n", + "text4: Inaugural Address Corpus\n", + "text5: Chat Corpus\n", + "text6: Monty Python and the Holy Grail\n", + "text7: Wall Street Journal\n", + "text8: Personals Corpus\n", + "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n" + ] + } + ], + "source": [ + "import nltk\n", + "from nltk.book import *" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "sent1: Call me Ishmael .\n", + "sent2: The family of Dashwood had long been settled in Sussex .\n", + "sent3: In the beginning God created the heaven and the earth .\n", + "sent4: Fellow - Citizens of the Senate and of the House of Representatives :\n", + "sent5: I have a problem with people PMing me to lol JOIN\n", + "sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !\n", + "sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .\n", + "sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .\n", + "sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .\n" + ] + } + ], + "source": [ + "sents()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Call', 'me', 'Ishmael', '.']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sent1" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 100676\n" + ] + } + ], + "source": [ + "print(text7, len(text7))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18\n" + ] + } + ], + "source": [ + "print(sent7, len(sent7))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Foster',\n", + " 'tallies',\n", + " 'rejected',\n", + " 'budding',\n", + " 'Ratings',\n", + " 'earns',\n", + " 'Raton',\n", + " '8.70',\n", + " 'Carnival',\n", + " 'Driscoll']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "list(set(text7))[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "12408" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Frequency of words\n", + "dist = FreqDist(text7)\n", + "len(dist)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab1 = list(dist.keys())\n", + "vocab1[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dist['Vinken']" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['billion',\n", + " 'company',\n", + " 'president',\n", + " 'because',\n", + " 'market',\n", + " 'million',\n", + " 'shares',\n", + " 'trading',\n", + " 'program']" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n", + "freqwords" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['list', 'listed', 'lists', 'listing', 'listings']" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# different forms of the same \"word\"\n", + "input1 = 'List listed lists listing listings'\n", + "words1 = input1.lower().split(' ')\n", + "words1" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['list', 'list', 'list', 'list', 'list']" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "porter = nltk.PorterStemmer()\n", + "[porter.stem(t) for t in words1]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# tokenization\n", + "text11 = \"Children shouldn't drink a sugary drink before bed.\"\n", + "text11.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Children',\n", + " 'should',\n", + " \"n't\",\n", + " 'drink',\n", + " 'a',\n", + " 'sugary',\n", + " 'drink',\n", + " 'before',\n", + " 'bed',\n", + " '.']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "nltk.word_tokenize(text11)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sentence splitting\n", + "text12 = 'This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'\n", + "sentences = nltk.sent_tokenize(text12)\n", + "len(sentences)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['This is the first sentence.',\n", + " 'A gallon of milk in the U.S. costs $2.99.',\n", + " 'Is this the third sentence?',\n", + " 'Yes, it is!']" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sentences" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/LDA.ipynb b/LDA.ipynb new file mode 100644 index 0000000..11e5952 --- /dev/null +++ b/LDA.ipynb @@ -0,0 +1,127 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.tokenize import RegexpTokenizer\n", + "from stop_words import get_stop_words\n", + "from nltk.stem.porter import PorterStemmer\n", + "from gensim import corpora, models\n", + "import gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tokenizer = RegexpTokenizer(r'\\w+')\n", + "\n", + "# create English stop words list\n", + "en_stop = get_stop_words('en')\n", + "\n", + "# Create p_stemmer of class PorterStemmer\n", + "p_stemmer = PorterStemmer()\n", + " \n", + "# create sample documents\n", + "doc_a = \"Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother.\"\n", + "doc_b = \"My mother spends a lot of time driving my brother around to baseball practice.\"\n", + "doc_c = \"Some health experts suggest that driving may cause increased tension and blood pressure.\"\n", + "doc_d = \"I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better.\"\n", + "doc_e = \"Health professionals say that brocolli is good for your health.\" \n", + "\n", + "# compile sample documents into a list\n", + "doc_set = [doc_a, doc_b, doc_c, doc_d, doc_e]\n", + "\n", + "# list for tokenized documents in loop\n", + "texts = []\n", + "\n", + "# loop through document list\n", + "for i in doc_set:\n", + " \n", + " # clean and tokenize document string\n", + " raw = i.lower()\n", + " tokens = tokenizer.tokenize(raw)\n", + "\n", + " # remove stop words from tokens\n", + " stopped_tokens = [i for i in tokens if not i in en_stop]\n", + " \n", + " # stem tokens\n", + " stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]\n", + " \n", + " # add tokens to list\n", + " texts.append(stemmed_tokens)\n", + "\n", + "# turn our tokenized documents into a id <-> term dictionary\n", + "dictionary = corpora.Dictionary(texts)\n", + " \n", + "# convert tokenized documents into a document-term matrix\n", + "corpus = [dictionary.doc2bow(text) for text in texts]\n", + "\n", + "# generate LDA model\n", + "ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word = dictionary, passes=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\" + 0.043*\"caus\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\" + 0.059*\"mother\"')]\n" + ] + } + ], + "source": [ + "print(ldamodel.print_topics(num_topics=2, num_words=4))" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.072*\"drive\" + 0.043*\"health\" + 0.043*\"pressur\"'), (1, '0.081*\"brocolli\" + 0.081*\"good\" + 0.059*\"brother\"')]\n" + ] + } + ], + "source": [ + "print(ldamodel.print_topics(num_topics=3, num_words=3))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Topic Modeling.ipynb b/Topic Modeling.ipynb new file mode 100644 index 0000000..269a6be --- /dev/null +++ b/Topic Modeling.ipynb @@ -0,0 +1,104 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "doc1 = \"Sugar is bad to consume. My sister likes to have sugar, but not my father.\"\n", + "doc2 = \"My father spends a lot of time driving my sister around to dance practice.\"\n", + "doc3 = \"Doctors suggest that driving may cause increased stress and blood pressure.\"\n", + "doc4 = \"Sometimes I feel pressure to perform well at school, but my father never seems to drive my sister to do better.\"\n", + "doc5 = \"Health experts say that Sugar is not good for your lifestyle.\"\n", + "\n", + "doc_complete = [doc1, doc2, doc3, doc4, doc5]" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "import string\n", + "stop = set(stopwords.words('english'))\n", + "exclude = set(string.punctuation)\n", + "lemma = WordNetLemmatizer()\n", + "\n", + "def clean(doc):\n", + " stop_free = ' '.join([i for i in doc.lower().split() if i not in stop])\n", + " punc_free = ''.join([ch for ch in stop_free if ch not in exclude])\n", + " normalized = ' '.join(lemma.lemmatize(word) for word in punc_free.split())\n", + " return normalized\n", + "doc_clean = [clean(doc).split() for doc in doc_complete]" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim\n", + "from gensim import corpora\n", + "dictionary = corpora.Dictionary(doc_clean)\n", + "doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "Lda = gensim.models.ldamodel.LdaModel\n", + "ldamodel = Lda(doc_term_matrix, num_topics = 3, id2word = dictionary, passes=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[(0, '0.135*\"sugar\" + 0.054*\"like\" + 0.054*\"consume\" + 0.054*\"bad\"'), (1, '0.056*\"father\" + 0.056*\"sister\" + 0.056*\"pressure\" + 0.056*\"driving\"'), (2, '0.029*\"sister\" + 0.029*\"father\" + 0.029*\"blood\" + 0.029*\"may\"')]\n" + ] + } + ], + "source": [ + "print(ldamodel.print_topics(num_topics=3, num_words=4))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/Working with Text Data in Pandas.ipynb b/Working with Text Data in Pandas.ipynb new file mode 100644 index 0000000..3828c79 --- /dev/null +++ b/Working with Text Data in Pandas.ipynb @@ -0,0 +1,626 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
text
0Monday: The doctor's appointment is at 2:45pm.
1Tuesday: The dentist's appointment is at 11:30...
2Wednesday: At 7:00pm, there is a basketball game!
3Thursday: Be back home by 11:15 pm at the latest.
4Friday: Take the train at 08:10 am, arrive at ...
\n", + "
" + ], + "text/plain": [ + " text\n", + "0 Monday: The doctor's appointment is at 2:45pm.\n", + "1 Tuesday: The dentist's appointment is at 11:30...\n", + "2 Wednesday: At 7:00pm, there is a basketball game!\n", + "3 Thursday: Be back home by 11:15 pm at the latest.\n", + "4 Friday: Take the train at 08:10 am, arrive at ..." + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "time_sentences = [\"Monday: The doctor's appointment is at 2:45pm.\", \n", + " \"Tuesday: The dentist's appointment is at 11:30 am.\",\n", + " \"Wednesday: At 7:00pm, there is a basketball game!\",\n", + " \"Thursday: Be back home by 11:15 pm at the latest.\",\n", + " \"Friday: Take the train at 08:10 am, arrive at 09:00am.\"]\n", + "df = pd.DataFrame(time_sentences, columns = ['text'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 46\n", + "1 50\n", + "2 49\n", + "3 49\n", + "4 54\n", + "Name: text, dtype: int64" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find the number of characters for each string in df['text']\n", + "df['text'].str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 7\n", + "1 8\n", + "2 8\n", + "3 10\n", + "4 10\n", + "Name: text, dtype: int64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find the number of tokens for each string in df['text']\n", + "df['text'].str.split().str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 True\n", + "1 True\n", + "2 False\n", + "3 False\n", + "4 False\n", + "Name: text, dtype: bool" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find which entries contain the word 'appointment'\n", + "df['text'].str.contains('appointment')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 3\n", + "1 4\n", + "2 3\n", + "3 4\n", + "4 8\n", + "Name: text, dtype: int64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find how many times a digit occurs in each string\n", + "df['text'].str.count(r'\\d')" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [2, 4, 5]\n", + "1 [1, 1, 3, 0]\n", + "2 [7, 0, 0]\n", + "3 [1, 1, 1, 5]\n", + "4 [0, 8, 1, 0, 0, 9, 0, 0]\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find all occurances of the digits\n", + "df['text'].str.findall(r'\\d')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 [(2, 45)]\n", + "1 [(11, 30)]\n", + "2 [(7, 00)]\n", + "3 [(11, 15)]\n", + "4 [(08, 10), (09, 00)]\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# group and find the hours and minutes\n", + "df['text'].str.findall(r'(\\d?\\d):(\\d\\d)')" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 ???: The doctor's appointment is at 2:45pm.\n", + "1 ???: The dentist's appointment is at 11:30 am.\n", + "2 ???: At 7:00pm, there is a basketball game!\n", + "3 ???: Be back home by 11:15 pm at the latest.\n", + "4 ???: Take the train at 08:10 am, arrive at 09:...\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['text'].str.replace(r'\\w+day\\b', '???')" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 Mon: The doctor's appointment is at 2:45pm.\n", + "1 Tue: The dentist's appointment is at 11:30 am.\n", + "2 Wed: At 7:00pm, there is a basketball game!\n", + "3 Thu: Be back home by 11:15 pm at the latest.\n", + "4 Fri: Take the train at 08:10 am, arrive at 09:...\n", + "Name: text, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['text'].str.replace(r'(\\w+day\\b)', lambda x: x.groups()[0][:3])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\Susan\\Anaconda3\\lib\\site-packages\\ipykernel\\__main__.py:2: FutureWarning: currently extract(expand=None) means expand=False (return Index/Series/DataFrame) but in a future version of pandas this will be changed to expand=True (return DataFrame)\n", + " from ipykernel import kernelapp as app\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
01
0245
11130
2700
31115
40810
\n", + "
" + ], + "text/plain": [ + " 0 1\n", + "0 2 45\n", + "1 11 30\n", + "2 7 00\n", + "3 11 15\n", + "4 08 10" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create new columns from first match of extracted groups\n", + "df['text'].str.extract(r'(\\d?\\d):(\\d\\d)')" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123
match
002:45pm245pm
1011:30 am1130am
207:00pm700pm
3011:15 pm1115pm
4008:10 am0810am
109:00am0900am
\n", + "
" + ], + "text/plain": [ + " 0 1 2 3\n", + " match \n", + "0 0 2:45pm 2 45 pm\n", + "1 0 11:30 am 11 30 am\n", + "2 0 7:00pm 7 00 pm\n", + "3 0 11:15 pm 11 15 pm\n", + "4 0 08:10 am 08 10 am\n", + " 1 09:00am 09 00 am" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# extract the entire time, the hours, the minutes, and the period\n", + "df['text'].str.extractall(r'((\\d?\\d):(\\d\\d) ?([ap]m))')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timehourminuteperiod
match
002:45pm245pm
1011:30 am1130am
207:00pm700pm
3011:15 pm1115pm
4008:10 am0810am
109:00am0900am
\n", + "
" + ], + "text/plain": [ + " time hour minute period\n", + " match \n", + "0 0 2:45pm 2 45 pm\n", + "1 0 11:30 am 11 30 am\n", + "2 0 7:00pm 7 00 pm\n", + "3 0 11:15 pm 11 15 pm\n", + "4 0 08:10 am 08 10 am\n", + " 1 09:00am 09 00 am" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['text'].str.extractall(r'(?P