diff --git a/Working with Text.ipynb b/Working with Text.ipynb new file mode 100644 index 0000000..912963b --- /dev/null +++ b/Working with Text.ipynb @@ -0,0 +1,405 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "76" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text1 = \"Ethics are built right into the ideals and objectives of the United Nations \"\n", + "\n", + "len(text1) # The length of text1" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text2 = text1.split(' ') # Return a list of the words in text1, separated by ' '.\n", + "\n", + "len(text2)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ethics',\n", + " 'are',\n", + " 'built',\n", + " 'right',\n", + " 'into',\n", + " 'the',\n", + " 'ideals',\n", + " 'and',\n", + " 'objectives',\n", + " 'of',\n", + " 'the',\n", + " 'United',\n", + " 'Nations',\n", + " '']" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "text2" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ethics',\n", + " 'built',\n", + " 'right',\n", + " 'into',\n", + " 'ideals',\n", + " 'objectives',\n", + " 'United',\n", + " 'Nations']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# List comprehension\n", + "[w for w in text2 if len(w) > 3] # Words that are greater than 3 letters long in text2" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { +
"text/plain": [ + "['Ethics', 'United', 'Nations']" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[w for w in text2 if w.istitle()] # Capitalized words in text2" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Ethics', 'ideals', 'objectives', 'Nations']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[w for w in text2 if w.endswith('s')] # Words in text2 that end in 's'" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# find unique words using set()\n", + "text3 = 'To be or not to be'\n", + "text4 = text3.split(' ')\n", + "\n", + "len(text4)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "5" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(set(text4))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'To', 'be', 'not', 'or', 'to'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set(text4)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "4" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(set([w.lower() for w in text4])) # .lower converts the string to lowercase." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'be', 'not', 'or', 'to'}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set([w.lower() for w in text4])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['\"Ethics',\n", + " 'are',\n", + " 'built',\n", + " 'right',\n", + " 'into',\n", + " 'the',\n", + " 'ideals',\n", + " 'and',\n", + " 'objectives',\n", + " 'of',\n", + " 'the',\n", + " 'United',\n", + " 'Nations\"',\n", + " '#UNSG',\n", + " '@',\n", + " 'NY',\n", + " 'Society',\n", + " 'for',\n", + " 'Ethical',\n", + " 'Culture',\n", + " 'bit.ly/2guVelr']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Processing free-text\n", + "text5 = '\"Ethics are built right into the ideals and objectives of the United Nations\" \\\n", + "#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'\n", + "text6 = text5.split(' ')\n", + "\n", + "text6" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['#UNSG']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# finding hashtags\n", + "[w for w in text6 if w.startswith('#')]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['@']" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# finding callouts\n", + "[w for w in text6 if w.startswith('@')]" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "text7 = '@UN @UN_Women \"Ethics are built right into the ideals and objectives of 
the United Nations\" \\\n", + "#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'\n", + "text8 = text7.split(' ')" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['@UN', '@UN_Women']" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import re # import re - a module that provides support for regular expressions\n", + "\n", + "[w for w in text8 if re.search('@[A-Za-z0-9_]+', w)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}