diff --git a/Topic Modeling for Data Preprocessing.ipynb b/Topic Modeling for Data Preprocessing.ipynb new file mode 100644 index 0000000..cad58fa --- /dev/null +++ b/Topic Modeling for Data Preprocessing.ipynb @@ -0,0 +1,3858 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from nltk.corpus import stopwords\n", + "from sklearn.metrics.pairwise import linear_kernel\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.decomposition import LatentDirichletAllocation\n", + "import random\n", + "import re, nltk, spacy, gensim\n", + "import pyLDAvis\n", + "import pyLDAvis.sklearn\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "\n", + "pd.set_option('display.max_columns', 50)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('data/Seattle_Hotels_dirty.csv', encoding=\"latin-1\")\n", + "df.set_index('name', inplace = True)\n", + "tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')\n", + "tfidf_matrix = tf.fit_transform(df['desc'])\n", + "cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "indices = pd.Series(df.index)\n", + "def recommendations(name, cosine_similarities = cosine_similarities):\n", + " \n", + " recommended_hotels = []\n", + " \n", + " # getting the index of the hotel that matches the name\n", + " idx = indices[indices == name].index[0]\n", + "\n", + " # creating a Series with the similarity scores in descending order\n", + " score_series = pd.Series(cosine_similarities[idx]).sort_values(ascending = False)\n", + "\n", + " # getting the indexes of the 5 most similar hotels except itself\n", + " top_10_indexes = 
list(score_series.iloc[1:6].index)\n", + " \n", + " # populating the list with the names of the top 5 matching hotels\n", + " for i in top_10_indexes:\n", + " recommended_hotels.append(list(df.index)[i])\n", + " \n", + " return recommended_hotels" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Hilton Seattle',\n", + " \"Mildred's Bed and Breakfast\",\n", + " 'Seattle Airport Marriott',\n", + " 'Days Inn by Wyndham Seattle North of Downtown',\n", + " 'Holiday Inn Express & Suites North Seattle - Shoreline']" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('Hilton Garden Inn Seattle Downtown')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Located on the southern tip of Lake Union, the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure. Non-Smoking\\nHotel is 100% non-smoking, including e-cigarettes, in all guest rooms and public areas. A fee of up to $250 USD will be assessed for smoking in a non-smoking room. Please ask the Front Desk for locations of designated outdoor smoking areas. Check-in: 4:00 pm. Check-out: 12:00 pm. Cancellation policies may vary depending on the rate or dates of your reservation. Please refer to your reservation confirmation to verify your cancellation policy.\\n'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc['Hilton Garden Inn Seattle Downtown'].desc" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'A rare find in the heart of Seattle. 100% non-smoking. Check-in: 4:00 pm. Check-out: 12:00 pm. 
Cancellation policies may vary depending on the rate or dates of your reservation.'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[\"Mildred's Bed and Breakfast\"].desc" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'We streamline your travel routine. Explore the local area, brimming with alluring attractions. Reserve one of our 14 versatile event spaces for your next business meeting or wedding reception. We also feature an outdoor atrium, which provides a gorgeous backdrop for intimate gatherings. Check-in: 4:00 PM, Check-out: 12:00 PM. We are committed to providing our guests and associates with a smoke-free environment, and are proud to boast one of the most comprehensive smoke-free hotel policies in the industry. Although smoking is not permitted within hotel buildings themselves, guests who smoke are permitted to do so outside in designated areas.'" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[\"Seattle Airport Marriott\"].desc" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "df.reset_index(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We have 1208 sentences in total\n" + ] + } + ], + "source": [ + "df = pd.concat([pd.Series(str(row['name']), str(row['desc']).split('. ')) \n", + " for _, row in df.iterrows()]).reset_index()\n", + "df.columns = ['sentence', 'name']\n", + "df['sentence'] = df['sentence'].map(lambda x: re.sub(r'\\W+', ' ', x))\n", + "print('We have ', len(df), 'sentences in total')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencename
0Located on the southern tip of Lake Union the ...Hilton Garden Inn Seattle Downtown
1Non Smoking Hotel is 100 non smoking including...Hilton Garden Inn Seattle Downtown
2A fee of up to 250 USD will be assessed for sm...Hilton Garden Inn Seattle Downtown
3Please ask the Front Desk for locations of des...Hilton Garden Inn Seattle Downtown
4Check in 4 00 pmHilton Garden Inn Seattle Downtown
5Check out 12 00 pmHilton Garden Inn Seattle Downtown
6Cancellation policies may vary depending on th...Hilton Garden Inn Seattle Downtown
7Please refer to your reservation confirmation ...Hilton Garden Inn Seattle Downtown
\n", + "
" + ], + "text/plain": [ + " sentence \\\n", + "0 Located on the southern tip of Lake Union the ... \n", + "1 Non Smoking Hotel is 100 non smoking including... \n", + "2 A fee of up to 250 USD will be assessed for sm... \n", + "3 Please ask the Front Desk for locations of des... \n", + "4 Check in 4 00 pm \n", + "5 Check out 12 00 pm \n", + "6 Cancellation policies may vary depending on th... \n", + "7 Please refer to your reservation confirmation ... \n", + "\n", + " name \n", + "0 Hilton Garden Inn Seattle Downtown \n", + "1 Hilton Garden Inn Seattle Downtown \n", + "2 Hilton Garden Inn Seattle Downtown \n", + "3 Hilton Garden Inn Seattle Downtown \n", + "4 Hilton Garden Inn Seattle Downtown \n", + "5 Hilton Garden Inn Seattle Downtown \n", + "6 Hilton Garden Inn Seattle Downtown \n", + "7 Hilton Garden Inn Seattle Downtown " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.loc[df['name'] == 'Hilton Garden Inn Seattle Downtown']" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Located on the southern tip of Lake Union the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure\n", + "\n", + "Non Smoking Hotel is 100 non smoking including e cigarettes in all guest rooms and public areas\n", + "\n", + "A fee of up to 250 USD will be assessed for smoking in a non smoking room\n", + "\n", + "Please ask the Front Desk for locations of designated outdoor smoking areas\n", + "\n", + "Check in 4 00 pm\n", + "\n", + "Check out 12 00 pm\n", + "\n", + "Cancellation policies may vary depending on the rate or dates of your reservation\n", + "\n", + "Please refer to your reservation confirmation to verify your cancellation policy \n", + "\n" + ] + } + ], + "source": [ + "a = 0\n", + "for i in range(a,a+8):\n", + " print(df.sentence[i])\n", + " print()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "vectorizer = CountVectorizer(analyzer='word', \n", + " min_df=3, # minimum required occurrences of a word \n", + " stop_words='english', # remove stop words\n", + " lowercase=True, # convert all words to lowercase\n", + " token_pattern='[a-zA-Z0-9]{3,}', # num chars >= 3\n", + " max_features=3000, # max number of unique words\n", + " )\n", + "\n", + "data_vectorized = vectorizer.fit_transform(df['sentence'])" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,\n", + " evaluate_every=-1, learning_decay=0.7,\n", + " learning_method='online', learning_offset=10.0,\n", + " max_doc_update_iter=100, max_iter=10,\n", + " mean_change_tol=0.001, n_components=40, n_jobs=-1,\n", + " perp_tol=0.1, random_state=0, topic_word_prior=None,\n", + " total_samples=1000000.0, verbose=0)\n" + ] + } + ], + "source": [ + "lda_model = LatentDirichletAllocation(n_components=40, # Number of topics\n", + " learning_method='online',\n", + " random_state=0, \n", + " n_jobs = -1 # Use all available CPUs\n", + " )\n", + "lda_output = lda_model.fit_transform(data_vectorized)\n", + "\n", + "print(lda_model)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/tljh/user/lib/python3.6/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. 
A future version\n", + "of pandas will change to not sort by default.\n", + "\n", + "To accept the future behavior, pass 'sort=False'.\n", + "\n", + "To retain the current behavior and silence the warning, pass 'sort=True'.\n", + "\n", + " return pd.concat([default_term_info] + list(topic_dfs))\n" + ] + }, + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "PreparedData(topic_coordinates= x y topics cluster Freq\n", + "topic \n", + "23 52.810181 107.997879 1 1 23.309673\n", + "0 61.669094 -23.845095 2 1 7.135673\n", + "21 94.504219 85.496475 3 1 6.933787\n", + "22 8.152671 -61.162136 4 1 5.901442\n", + "5 -20.222895 60.945469 5 1 5.199544\n", + "39 130.094681 47.707428 6 1 5.174447\n", + "37 112.205765 -46.932274 7 1 3.641742\n", + "34 20.300003 123.798866 8 1 3.371417\n", + "10 35.295795 -98.459053 9 1 3.215031\n", + "24 33.399498 -40.309830 10 1 3.051413\n", + "38 -39.223328 -32.901543 11 1 2.941073\n", + "2 -76.319992 101.096680 12 1 2.704279\n", + "20 -76.351448 -23.054537 13 1 2.609571\n", + "1 104.032692 22.052193 14 1 2.491596\n", + "3 94.061157 -11.908739 15 1 2.380405\n", + "26 -29.729797 -61.914070 16 1 1.923049\n", + "36 -81.753860 55.740067 17 1 1.821930\n", + "17 -115.261147 0.317844 18 1 1.643399\n", + "33 71.415016 33.707039 19 1 1.489288\n", + "14 -79.805733 19.744102 20 1 1.457323\n", + "9 -71.979454 -66.767677 21 1 1.403683\n", + "25 64.263123 -65.771851 22 1 1.378312\n", + "4 -26.498144 111.649246 23 1 1.286898\n", + "32 31.737236 77.611855 24 1 0.892748\n", + "13 61.515476 63.975536 25 1 0.817660\n", + "28 -49.619919 5.555991 26 1 0.756039\n", + "7 53.294022 7.669420 27 1 0.707738\n", + "15 -0.307224 84.005600 28 1 0.542800\n", + "12 -48.525143 71.959526 29 1 0.531173\n", + "30 -22.527716 -7.611402 30 1 0.459936\n", + "29 -11.223989 -90.792847 31 1 0.448935\n", + "19 -39.896980 36.212303 32 1 0.360965\n", + "35 36.864887 39.051250 33 1 0.301659\n", + "6 3.264976 31.994232 34 1 0.297041\n", + "18 12.080495 53.989876 35 1 0.273443\n", + "27 0.843159 -24.976843 36 1 0.272854\n", + "8 -17.605000 20.573502 37 1 0.218008\n", + "11 24.873323 17.042814 38 1 0.218008\n", + "31 2.724974 4.337633 39 1 0.218008\n", + "16 25.646587 -8.449744 40 1 0.218008, topic_info= Category Freq Term Total loglift logprob\n", + "763 Default 474.000000 seattle 474.000000 30.0000 30.0000\n", 
+ "350 Default 120.000000 free 120.000000 29.0000 29.0000\n", + "436 Default 271.000000 hotel 271.000000 28.0000 28.0000\n", + "744 Default 99.000000 rooms 99.000000 27.0000 27.0000\n", + "844 Default 99.000000 stay 99.000000 26.0000 26.0000\n", + ".. ... ... ... ... ... ...\n", + "323 Topic40 0.026550 favorite 6.858633 0.5742 -6.8886\n", + "973 Topic40 0.026543 winning 10.604204 0.1381 -6.8889\n", + "844 Topic40 0.026540 stay 99.792137 -2.1038 -6.8890\n", + "223 Topic40 0.026537 culture 5.883395 0.7271 -6.8891\n", + "847 Topic40 0.026537 step 10.568538 0.1413 -6.8891\n", + "\n", + "[2028 rows x 6 columns], token_table= Topic Freq Term\n", + "term \n", + "0 15 0.940261 000\n", + "1 18 0.621097 00pm\n", + "2 23 0.909527 100\n", + "3 29 0.831653 12pm\n", + "4 5 0.277086 200\n", + "... ... ... ...\n", + "982 4 0.246350 worldmark\n", + "982 10 0.492700 worldmark\n", + "983 2 0.848519 worth\n", + "984 9 0.827556 year\n", + "985 29 0.846565 years\n", + "\n", + "[1722 rows x 3 columns], R=30, lambda_step=0.01, plot_opts={'xlab': 'PC1', 'ylab': 'PC2'}, topic_order=[24, 1, 22, 23, 6, 40, 38, 35, 11, 25, 39, 3, 21, 2, 4, 27, 37, 18, 34, 15, 10, 26, 5, 33, 14, 29, 8, 16, 13, 31, 30, 20, 36, 7, 19, 28, 9, 12, 32, 17])" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pyLDAvis.enable_notebook()\n", + "pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Word 0Word 1Word 2Word 3Word 4Word 5Word 6Word 7Word 8Word 9Word 10Word 11Word 12Word 13Word 14Word 15Word 16Word 17Word 18Word 19
Topic 0seattlehotelinnlakeunionstayexperienceofferssouthguestssuitesdowntownoffermoderncomforttodayoriginalalfreduniqueurban
Topic 1staymakeneedsurecomfortabletimeroadextraconvenientnightextendedenjoyableofferingdesignedhitpridestayinglandingsoftcreature
Topic 2freewifiguestssmokepropertyparkingproudamenitiesfurnisheddesignedmotelairportstylishhotellifestylebudgetindustryprovidingchoicefeature
Topic 3eventmeetingspacefeetsquareeventsmeetings000specialplanningpeopleteamcateringhostspacesmatterdedicatedgrillboardroomprivate
Topic 4smokingnon100areasaccessiblehotelsuitereservationtellupgradepublicclubattentivecancellationincludingguestsneedratevarypolicies
Topic 5businessavailablefreehighaccessinternetspeedcentercomplimentaryparkingguesthotelstayroomswirelesstravelroomofferserviceslaundry
Topic 6namedgatewaygrandsheratonprovidesdiversecorevibrantcitypacificlocatednorthwestseattlekitchenettebasketballbaconsmallspendathleticnearby
Topic 7viewscomfortsskylinestunninglookscenichome2diamondtechnologydestinationslandmarksenjoylikespaceseattleiconicstylishneedlesettlesituated
Topic 8accommodationparkingbeervarymileregencycoffeestudiosquiethistoryavenueenjoy2018newsconvenientlymeetingsstoryaccommodationsworldoverlooking
Topic 9queenhillannecomecapitolsuiteneighborhooddoorknowjustvolunteerfireplacesdistinctivesharedbathroomskimptonshowsbaconletlike
Topic 10pooldayoutdoorindoorrelaxheatedfitnesslongswimmingseasonalamenitiesspaincluderoomhotstopstarttubfitbusy
Topic 11locatedunionlakedowntownseattleleisurereservationperfectlycancellationinngardenhiltonhotelbusinessdestinationdiningcitymodernentertainmentneighborhood
Topic 12check12pmyearsguestroomssmokingfloorapartmentmasonuse500majortimeheadingpropertyinternet00pmofferwaitingrenton
Topic 13restaurantsshopsculturaluniquehubgalleriesmuseumsboutiquesspendsitesentertainmentadvantagechicactivitiesattractionssophisticatedsouthportcafincludingnear
Topic 14northwestpacificrenovatednaturalnewlylocallyinspiredurbansourcedmodernbeautifullyoutdoorsmenuparkbountybeautypreparedseasonalareaculinary
Topic 15hotelsnorthsevenguestsserviceamericaavailablegrandsheratonearnbookdoorrangekindwacnotchearlystylishminutes4th
Topic 16bistrowednesdaycancersoundculturaltablenewslaundrycouldnbeautifullyannestaffservicespugetfurrywineinnwalkingmealsfavorite
Topic 17perfecttimehistorichyatthousehospitalitymansiontripgrandhotelgetawaysouthportnationalromanticbaillienicholspensioneshaferapartmentera
Topic 18earlyavailabilitycheckworthknownsuretrainaccommodationtrulydeliverspoliciesstaysoakrestaurantdedicatedwestlakeofficesincrediblesportsjoin
Topic 19cooltouchesawesomesooncomfortableinsidesetstepwatertowntellfloorshavenartguestroomsliketodaykidsloftelements
Topic 20roomsguestoptionsfeaturinggreatcomfortvarietymindmodernprivatefreshdesignedfeatureuserecentlyintimatesheratonironingclubexpanded
Topic 21bedsuiteshomespaciouslivingflatroombedsroomsscreenequippedfamilystudiohotelmicrowaveamenitiesqueenfridgebathroomcomfortable
Topic 22spaceneedlecenterwashingtonuniversityseattlepikeplacemarketconventionstateawayincludingattractionsmedicalclosemasonhospitalcampusvirginia
Topic 23seattlehotellocateddowntownairportplacejustmarketpikeawayminutesbestnearcenterinncityheartwaterfrontfieldeasy
Topic 24cityamenitiesbuildinglikefurnishingsseattleemeraldenjoyhelpcontemporaryviewlandmarksignaturemodernfloorsclassicharborwindowskindthings
Topic 25diningentertainmentgardenglassdoorsworldbarsclasspayfrenchfactexcitingattentionlocalexperiencesfavoritestaircaseknownescapechihuly
Topic 26travelersincludingamazonbusinessclosetownmicrosoftstarbucksbusinessesboeingheadquartersproximitynordstromfacebookgooglecompanies500hipgatesshopping
Topic 27societymloftceilingsfriendstodayelementsroomslikeguestfloorsarttrulyconferencefeaturetripstunningcorporatealexisrooflong
Topic 28artsrichspendmusicthrivingafternoonwayperiodlandscapelightsoakareamuseumssceneinterestinghavendetailsrooftop250hall
Topic 29aircablebalconyconditionedcrownconditioningroomresortguestdaysoasissmokinghospitality500feefeaturegardengrandinvitingfit
Topic 30wantrightrentalbreezedestinationcoveredpossiblerefreshingheartseattlecamlinappreciateworldmarkwelcomingstepsjustwatertownawesomehoteldowntown
Topic 31bountytourspacificcitiesterminalsfreecoolnumberholdresortgameawaybusinessmobilevolunteerneighborhoodsdetailsroundoutsidelook
Topic 32featurescloudsilverkitchenbroadwayrockjimmyhotellatestadiumchefbarcasualexpressoptionsincludingproudsuccessupscalecaf
Topic 33neighborhoodbelltownlivemusicvibrantalikefoodneighborhoodscorehistorysceneareawaterfrontguestroomsnestledvenueslocalsdiversewestgeorgetown
Topic 34hotelseattlefriendlywelcomesoundpugetdowntownawardpetwinningdonoldworldbuiltgoodcomfortablestyleforgetrestaurantsmall
Topic 35outsiderunsshuttlepridepermittedguestsareasdesignatedsmokingbuildingshotelsmokeinvitingdeeplobbyamericanspotshitcollegework
Topic 36roomviewsdesklargestyleenjoydiningofferscomfortlevelprivategorgeousfeewindowsvietnameseupscalerooftopclubincredibleservice
Topic 37breakfasthourfitnesscenterservicecomplimentaryenjoyshuttlesitemorningrestaurantconveniencefacilitycontinentaldailyofferhotstarthappybar
Topic 38artmuseumworkexploreareanearbyseattleattractionsflightcenterlocaleasilystateneedshostelstationpikemarketcatchready
Topic 39coffeelocallobbybarenjoybreakfastexperiencefreshloungedayrestaurantfamilyfriendscomplimentaryhotfoodluxurysnackswatermeet
\n", + "
" + ], + "text/plain": [ + " Word 0 Word 1 Word 2 Word 3 Word 4 \\\n", + "Topic 0 seattle hotel inn lake union \n", + "Topic 1 stay make need sure comfortable \n", + "Topic 2 free wifi guests smoke property \n", + "Topic 3 event meeting space feet square \n", + "Topic 4 smoking non 100 areas accessible \n", + "Topic 5 business available free high access \n", + "Topic 6 named gateway grand sheraton provides \n", + "Topic 7 views comforts skyline stunning look \n", + "Topic 8 accommodation parking beer vary mile \n", + "Topic 9 queen hill anne come capitol \n", + "Topic 10 pool day outdoor indoor relax \n", + "Topic 11 located union lake downtown seattle \n", + "Topic 12 check 12pm years guest rooms \n", + "Topic 13 restaurants shops cultural unique hub \n", + "Topic 14 northwest pacific renovated natural newly \n", + "Topic 15 hotels north seven guests service \n", + "Topic 16 bistro wednesday cancer sound cultural \n", + "Topic 17 perfect time historic hyatt house \n", + "Topic 18 early availability check worth known \n", + "Topic 19 cool touches awesome soon comfortable \n", + "Topic 20 rooms guest options featuring great \n", + "Topic 21 bed suites home spacious living \n", + "Topic 22 space needle center washington university \n", + "Topic 23 seattle hotel located downtown airport \n", + "Topic 24 city amenities building like furnishings \n", + "Topic 25 dining entertainment garden glass doors \n", + "Topic 26 travelers including amazon business close \n", + "Topic 27 societym loft ceilings friends today \n", + "Topic 28 arts rich spend music thriving \n", + "Topic 29 air cable balcony conditioned crown \n", + "Topic 30 want right rental breeze destination \n", + "Topic 31 bounty tours pacific cities terminals \n", + "Topic 32 features cloud silver kitchen broadway \n", + "Topic 33 neighborhood belltown live music vibrant \n", + "Topic 34 hotel seattle friendly welcome sound \n", + "Topic 35 outside runs shuttle pride permitted \n", + "Topic 36 room views desk 
large style \n", + "Topic 37 breakfast hour fitness center service \n", + "Topic 38 art museum work explore area \n", + "Topic 39 coffee local lobby bar enjoy \n", + "\n", + " Word 5 Word 6 Word 7 Word 8 \\\n", + "Topic 0 stay experience offers south \n", + "Topic 1 time road extra convenient \n", + "Topic 2 parking proud amenities furnished \n", + "Topic 3 events meetings 000 special \n", + "Topic 4 hotel suite reservation tell \n", + "Topic 5 internet speed center complimentary \n", + "Topic 6 diverse core vibrant city \n", + "Topic 7 scenic home2 diamond technology \n", + "Topic 8 regency coffee studios quiet \n", + "Topic 9 suite neighborhood door know \n", + "Topic 10 heated fitness long swimming \n", + "Topic 11 leisure reservation perfectly cancellation \n", + "Topic 12 smoking floor apartment mason \n", + "Topic 13 galleries museums boutiques spend \n", + "Topic 14 locally inspired urban sourced \n", + "Topic 15 america available grand sheraton \n", + "Topic 16 table news laundry couldn \n", + "Topic 17 hospitality mansion trip grand \n", + "Topic 18 sure train accommodation truly \n", + "Topic 19 inside set step watertown \n", + "Topic 20 comfort variety mind modern \n", + "Topic 21 flat room beds rooms \n", + "Topic 22 seattle pike place market \n", + "Topic 23 place just market pike \n", + "Topic 24 seattle emerald enjoy help \n", + "Topic 25 world bars class pay \n", + "Topic 26 town microsoft starbucks businesses \n", + "Topic 27 elements rooms like guest \n", + "Topic 28 afternoon way period landscape \n", + "Topic 29 conditioning room resort guest \n", + "Topic 30 covered possible refreshing heart \n", + "Topic 31 free cool number hold \n", + "Topic 32 rock jimmy hotel late \n", + "Topic 33 alike food neighborhoods core \n", + "Topic 34 puget downtown award pet \n", + "Topic 35 guests areas designated smoking \n", + "Topic 36 enjoy dining offers comfort \n", + "Topic 37 complimentary enjoy shuttle site \n", + "Topic 38 nearby seattle attractions 
flight \n", + "Topic 39 breakfast experience fresh lounge \n", + "\n", + " Word 9 Word 10 Word 11 Word 12 \\\n", + "Topic 0 guests suites downtown offer \n", + "Topic 1 night extended enjoyable offering \n", + "Topic 2 designed motel airport stylish \n", + "Topic 3 planning people team catering \n", + "Topic 4 upgrade public club attentive \n", + "Topic 5 parking guest hotel stay \n", + "Topic 6 pacific located northwest seattle \n", + "Topic 7 destinations landmarks enjoy like \n", + "Topic 8 history avenue enjoy 2018 \n", + "Topic 9 just volunteer fireplaces distinctive \n", + "Topic 10 seasonal amenities spa include \n", + "Topic 11 inn garden hilton hotel \n", + "Topic 12 use 500 major time \n", + "Topic 13 sites entertainment advantage chic \n", + "Topic 14 modern beautifully outdoors menu \n", + "Topic 15 earn book door range \n", + "Topic 16 beautifully anne staff services \n", + "Topic 17 hotel getaway southport national \n", + "Topic 18 delivers policies stay soak \n", + "Topic 19 tell floors haven art \n", + "Topic 20 private fresh designed feature \n", + "Topic 21 screen equipped family studio \n", + "Topic 22 convention state away including \n", + "Topic 23 away minutes best near \n", + "Topic 24 contemporary view landmark signature \n", + "Topic 25 french fact exciting attention \n", + "Topic 26 boeing headquarters proximity nordstrom \n", + "Topic 27 floors art truly conference \n", + "Topic 28 lights oak area museums \n", + "Topic 29 days oasis smoking hospitality \n", + "Topic 30 seattle camlin appreciate worldmark \n", + "Topic 31 resort game away business \n", + "Topic 32 stadium chef bar casual \n", + "Topic 33 history scene area waterfront \n", + "Topic 34 winning don old world \n", + "Topic 35 buildings hotel smoke inviting \n", + "Topic 36 level private gorgeous fee \n", + "Topic 37 morning restaurant convenience facility \n", + "Topic 38 center local easily state \n", + "Topic 39 day restaurant family friends \n", + "\n", + " Word 13 Word 14 
Word 15 Word 16 \\\n", + "Topic 0 modern comfort today original \n", + "Topic 1 designed hit pride staying \n", + "Topic 2 hotel lifestyle budget industry \n", + "Topic 3 host spaces matter dedicated \n", + "Topic 4 cancellation including guests need \n", + "Topic 5 rooms wireless travel room \n", + "Topic 6 kitchenette basketball bacon small \n", + "Topic 7 space seattle iconic stylish \n", + "Topic 8 news conveniently meetings story \n", + "Topic 9 shared bathrooms kimpton shows \n", + "Topic 10 room hot stop start \n", + "Topic 11 business destination dining city \n", + "Topic 12 heading property internet 00pm \n", + "Topic 13 activities attractions sophisticated southport \n", + "Topic 14 park bounty beauty prepared \n", + "Topic 15 kind wac notch early \n", + "Topic 16 puget furry wine inn \n", + "Topic 17 romantic baillie nichols pensione \n", + "Topic 18 restaurant dedicated westlake offices \n", + "Topic 19 guest rooms like today \n", + "Topic 20 use recently intimate sheraton \n", + "Topic 21 hotel microwave amenities queen \n", + "Topic 22 attractions medical close mason \n", + "Topic 23 center inn city heart \n", + "Topic 24 modern floors classic harbor \n", + "Topic 25 local experiences favorite staircase \n", + "Topic 26 facebook google companies 500 \n", + "Topic 27 feature trip stunning corporate \n", + "Topic 28 scene interesting haven details \n", + "Topic 29 500 fee feature garden \n", + "Topic 30 welcoming steps just watertown \n", + "Topic 31 mobile volunteer neighborhoods details \n", + "Topic 32 express options including proud \n", + "Topic 33 guestrooms nestled venues locals \n", + "Topic 34 built good comfortable style \n", + "Topic 35 deep lobby american spots \n", + "Topic 36 windows vietnamese upscale rooftop \n", + "Topic 37 continental daily offer hot \n", + "Topic 38 needs hostel station pike \n", + "Topic 39 complimentary hot food luxury \n", + "\n", + " Word 17 Word 18 Word 19 \n", + "Topic 0 alfred unique urban \n", + "Topic 1 
landing soft creature \n", + "Topic 2 providing choice feature \n", + "Topic 3 grill boardroom private \n", + "Topic 4 rate vary policies \n", + "Topic 5 offer services laundry \n", + "Topic 6 spend athletic nearby \n", + "Topic 7 needle settle situated \n", + "Topic 8 accommodations world overlooking \n", + "Topic 9 bacon let like \n", + "Topic 10 tub fit busy \n", + "Topic 11 modern entertainment neighborhood \n", + "Topic 12 offer waiting renton \n", + "Topic 13 caf including near \n", + "Topic 14 seasonal area culinary \n", + "Topic 15 stylish minutes 4th \n", + "Topic 16 walking meals favorite \n", + "Topic 17 shafer apartment era \n", + "Topic 18 incredible sports join \n", + "Topic 19 kids loft elements \n", + "Topic 20 ironing club expanded \n", + "Topic 21 fridge bathroom comfortable \n", + "Topic 22 hospital campus virginia \n", + "Topic 23 waterfront field easy \n", + "Topic 24 windows kind things \n", + "Topic 25 known escape chihuly \n", + "Topic 26 hip gates shopping \n", + "Topic 27 alexis roof long \n", + "Topic 28 rooftop 250 hall \n", + "Topic 29 grand inviting fit \n", + "Topic 30 awesome hotel downtown \n", + "Topic 31 round outside look \n", + "Topic 32 success upscale caf \n", + "Topic 33 diverse west georgetown \n", + "Topic 34 forget restaurant small \n", + "Topic 35 hit college work \n", + "Topic 36 club incredible service \n", + "Topic 37 start happy bar \n", + "Topic 38 market catch ready \n", + "Topic 39 snacks water meet " + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Show top 20 keywords for each topic\n", + "def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):\n", + " keywords = np.array(vectorizer.get_feature_names())\n", + " topic_keywords = []\n", + " for topic_weights in lda_model.components_:\n", + " top_keyword_locs = (-topic_weights).argsort()[:n_words]\n", + " topic_keywords.append(keywords.take(top_keyword_locs))\n", + " return 
topic_keywords\n", + "\n", + "topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20) \n", + "\n", + "# Topic - Keywords Dataframe\n", + "df_topic_keywords = pd.DataFrame(topic_keywords)\n", + "df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]\n", + "df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]\n", + "df_topic_keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "# Create Document - Topic Matrix\n", + "lda_output = lda_model.transform(data_vectorized)\n", + "\n", + "# column names\n", + "topicnames = [\"Topic\" + str(i) for i in range(40)]\n", + "\n", + "# index names\n", + "docnames = [\"Doc\" + str(i) for i in range(len(data))]\n", + "\n", + "# Make the pandas dataframe\n", + "df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)\n", + "\n", + "# Get dominant topic for each document\n", + "dominant_topic = np.argmax(df_document_topic.values, axis=1)\n", + "df_document_topic['dominant_topic'] = dominant_topic" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Topic0Topic1Topic2Topic3Topic4Topic5Topic6Topic7Topic8Topic9Topic10Topic11Topic12Topic13Topic14Topic15Topic16Topic17Topic18Topic19Topic20Topic21Topic22Topic23Topic24Topic25Topic26Topic27Topic28Topic29Topic30Topic31Topic32Topic33Topic34Topic35Topic36Topic37Topic38Topic39dominant_topic
Doc00.470.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.390.000.070.000.000.000.000.000.000.000.000.000.000.000.000.000.000
Doc10.000.000.000.000.750.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.170.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.004
Doc20.000.000.000.000.430.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.150.000.000.000.000.000.000.000.290.000.000.004
Doc30.000.000.000.000.340.000.000.000.000.000.170.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.340.000.000.004
Doc40.010.010.010.010.010.010.010.010.010.010.010.010.510.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.0112
Doc50.010.010.010.010.010.010.010.010.010.010.010.010.510.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.0112
Doc60.000.000.000.000.840.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.004
Doc70.010.010.010.010.670.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.014
Doc80.000.000.000.000.000.000.300.000.000.000.000.000.000.000.170.000.000.000.000.000.000.000.000.250.000.000.000.000.000.000.000.000.000.220.000.000.000.000.000.006
Doc90.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.320.060.000.580.000.000.000.000.000.000.000.000.000.000.000.000.000.0025
\n", + "
" + ], + "text/plain": [ + " Topic0 Topic1 Topic2 Topic3 Topic4 Topic5 Topic6 Topic7 Topic8 \\\n", + "Doc0 0.47 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc1 0.00 0.00 0.00 0.00 0.75 0.00 0.00 0.00 0.00 \n", + "Doc2 0.00 0.00 0.00 0.00 0.43 0.00 0.00 0.00 0.00 \n", + "Doc3 0.00 0.00 0.00 0.00 0.34 0.00 0.00 0.00 0.00 \n", + "Doc4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc6 0.00 0.00 0.00 0.00 0.84 0.00 0.00 0.00 0.00 \n", + "Doc7 0.01 0.01 0.01 0.01 0.67 0.01 0.01 0.01 0.01 \n", + "Doc8 0.00 0.00 0.00 0.00 0.00 0.00 0.30 0.00 0.00 \n", + "Doc9 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic9 Topic10 Topic11 Topic12 Topic13 Topic14 Topic15 Topic16 \\\n", + "Doc0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc3 0.00 0.17 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc4 0.01 0.01 0.01 0.51 0.01 0.01 0.01 0.01 \n", + "Doc5 0.01 0.01 0.01 0.51 0.01 0.01 0.01 0.01 \n", + "Doc6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc8 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.00 \n", + "Doc9 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic17 Topic18 Topic19 Topic20 Topic21 Topic22 Topic23 Topic24 \\\n", + "Doc0 0.00 0.00 0.00 0.00 0.00 0.00 0.39 0.00 \n", + "Doc1 0.00 0.00 0.00 0.17 0.00 0.00 0.00 0.00 \n", + "Doc2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc3 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc8 0.00 0.00 0.00 0.00 0.00 0.00 0.25 0.00 \n", + "Doc9 0.00 0.00 0.00 0.00 0.00 0.32 0.06 0.00 \n", + "\n", + " Topic25 Topic26 Topic27 Topic28 Topic29 Topic30 Topic31 Topic32 \\\n", + "Doc0 0.07 
0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc2 0.00 0.00 0.00 0.15 0.00 0.00 0.00 0.00 \n", + "Doc3 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc8 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc9 0.58 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic33 Topic34 Topic35 Topic36 Topic37 Topic38 Topic39 \\\n", + "Doc0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc2 0.00 0.00 0.00 0.29 0.00 0.00 0.00 \n", + "Doc3 0.00 0.00 0.00 0.34 0.00 0.00 0.00 \n", + "Doc4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "Doc8 0.22 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "Doc9 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " dominant_topic \n", + "Doc0 0 \n", + "Doc1 4 \n", + "Doc2 4 \n", + "Doc3 4 \n", + "Doc4 12 \n", + "Doc5 12 \n", + "Doc6 4 \n", + "Doc7 4 \n", + "Doc8 6 \n", + "Doc9 25 " + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_document_topic.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "df_document_topic.reset_index(inplace=True)\n", + "df_sent_topic= pd.merge(df, df_document_topic, left_index=True, right_index=True)\n", + "df_sent_topic.drop('index', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencenameTopic0Topic1Topic2Topic3Topic4Topic5Topic6Topic7Topic8Topic9Topic10Topic11Topic12Topic13Topic14Topic15Topic16Topic17Topic18Topic19Topic20Topic21Topic22Topic23Topic24Topic25Topic26Topic27Topic28Topic29Topic30Topic31Topic32Topic33Topic34Topic35Topic36Topic37Topic38Topic39dominant_topic
0Located on the southern tip of Lake Union the ...Hilton Garden Inn Seattle Downtown0.470.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.390.000.070.000.000.000.000.000.000.000.000.000.000.000.000.000.000
1Non Smoking Hotel is 100 non smoking including...Hilton Garden Inn Seattle Downtown0.000.000.000.000.750.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.170.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.004
2A fee of up to 250 USD will be assessed for sm...Hilton Garden Inn Seattle Downtown0.000.000.000.000.430.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.150.000.000.000.000.000.000.000.290.000.000.004
3Please ask the Front Desk for locations of des...Hilton Garden Inn Seattle Downtown0.000.000.000.000.340.000.000.000.000.000.170.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.340.000.000.004
4Check in 4 00 pmHilton Garden Inn Seattle Downtown0.010.010.010.010.010.010.010.010.010.010.010.010.510.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.0112
5Check out 12 00 pmHilton Garden Inn Seattle Downtown0.010.010.010.010.010.010.010.010.010.010.010.010.510.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.0112
6Cancellation policies may vary depending on th...Hilton Garden Inn Seattle Downtown0.000.000.000.000.840.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.004
7Please refer to your reservation confirmation ...Hilton Garden Inn Seattle Downtown0.010.010.010.010.670.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.010.014
8Located in the city s vibrant core the Sherato...Sheraton Grand Seattle0.000.000.000.000.000.000.300.000.000.000.000.000.000.000.170.000.000.000.000.000.000.000.000.250.000.000.000.000.000.000.000.000.000.220.000.000.000.000.000.006
9Step out of our front doors to find gourmet di...Sheraton Grand Seattle0.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.320.060.000.580.000.000.000.000.000.000.000.000.000.000.000.000.000.0025
\n", + "
" + ], + "text/plain": [ + " sentence \\\n", + "0 Located on the southern tip of Lake Union the ... \n", + "1 Non Smoking Hotel is 100 non smoking including... \n", + "2 A fee of up to 250 USD will be assessed for sm... \n", + "3 Please ask the Front Desk for locations of des... \n", + "4 Check in 4 00 pm \n", + "5 Check out 12 00 pm \n", + "6 Cancellation policies may vary depending on th... \n", + "7 Please refer to your reservation confirmation ... \n", + "8 Located in the city s vibrant core the Sherato... \n", + "9 Step out of our front doors to find gourmet di... \n", + "\n", + " name Topic0 Topic1 Topic2 Topic3 Topic4 \\\n", + "0 Hilton Garden Inn Seattle Downtown 0.47 0.00 0.00 0.00 0.00 \n", + "1 Hilton Garden Inn Seattle Downtown 0.00 0.00 0.00 0.00 0.75 \n", + "2 Hilton Garden Inn Seattle Downtown 0.00 0.00 0.00 0.00 0.43 \n", + "3 Hilton Garden Inn Seattle Downtown 0.00 0.00 0.00 0.00 0.34 \n", + "4 Hilton Garden Inn Seattle Downtown 0.01 0.01 0.01 0.01 0.01 \n", + "5 Hilton Garden Inn Seattle Downtown 0.01 0.01 0.01 0.01 0.01 \n", + "6 Hilton Garden Inn Seattle Downtown 0.00 0.00 0.00 0.00 0.84 \n", + "7 Hilton Garden Inn Seattle Downtown 0.01 0.01 0.01 0.01 0.67 \n", + "8 Sheraton Grand Seattle 0.00 0.00 0.00 0.00 0.00 \n", + "9 Sheraton Grand Seattle 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic5 Topic6 Topic7 Topic8 Topic9 Topic10 Topic11 Topic12 Topic13 \\\n", + "0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "3 0.00 0.00 0.00 0.00 0.00 0.17 0.00 0.00 0.00 \n", + "4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.51 0.01 \n", + "5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.51 0.01 \n", + "6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "8 0.00 0.30 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "9 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic14 Topic15 Topic16 Topic17 
Topic18 Topic19 Topic20 Topic21 \\\n", + "0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "1 0.00 0.00 0.00 0.00 0.00 0.00 0.17 0.00 \n", + "2 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "3 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "8 0.17 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "9 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic22 Topic23 Topic24 Topic25 Topic26 Topic27 Topic28 Topic29 \\\n", + "0 0.00 0.39 0.00 0.07 0.00 0.00 0.00 0.00 \n", + "1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "2 0.00 0.00 0.00 0.00 0.00 0.00 0.15 0.00 \n", + "3 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "8 0.00 0.25 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "9 0.32 0.06 0.00 0.58 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic30 Topic31 Topic32 Topic33 Topic34 Topic35 Topic36 Topic37 \\\n", + "0 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "2 0.00 0.00 0.00 0.00 0.00 0.00 0.29 0.00 \n", + "3 0.00 0.00 0.00 0.00 0.00 0.00 0.34 0.00 \n", + "4 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "5 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "6 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "7 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.01 \n", + "8 0.00 0.00 0.00 0.22 0.00 0.00 0.00 0.00 \n", + "9 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 \n", + "\n", + " Topic38 Topic39 dominant_topic \n", + "0 0.00 0.00 0 \n", + "1 0.00 0.00 4 \n", + "2 0.00 0.00 4 \n", + "3 0.00 0.00 4 \n", + "4 0.01 0.01 12 \n", + "5 0.01 0.01 12 \n", + "6 0.00 0.00 4 \n", + "7 0.01 0.01 4 \n", + "8 0.00 0.00 6 \n", + "9 0.00 0.00 25 " + ] + }, + 
"execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sent_topic.head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencedominant_topic
0Located on the southern tip of Lake Union the ...0
1Non Smoking Hotel is 100 non smoking including...4
2A fee of up to 250 USD will be assessed for sm...4
3Please ask the Front Desk for locations of des...4
4Check in 4 00 pm12
5Check out 12 00 pm12
6Cancellation policies may vary depending on th...4
7Please refer to your reservation confirmation ...4
\n", + "
" + ], + "text/plain": [ + " sentence dominant_topic\n", + "0 Located on the southern tip of Lake Union the ... 0\n", + "1 Non Smoking Hotel is 100 non smoking including... 4\n", + "2 A fee of up to 250 USD will be assessed for sm... 4\n", + "3 Please ask the Front Desk for locations of des... 4\n", + "4 Check in 4 00 pm 12\n", + "5 Check out 12 00 pm 12\n", + "6 Cancellation policies may vary depending on th... 4\n", + "7 Please refer to your reservation confirmation ... 4" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sent_topic.loc[df_sent_topic['name'] == 'Hilton Garden Inn Seattle Downtown'][['sentence', 'dominant_topic']]" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencedominant_topic
1053The Spa at the WAC will spoil you with a compl...4
697Our hotel is completely non smoking4
70Please ask the Front Desk for locations of des...4
7Please refer to your reservation confirmation ...4
23We do not allow smoking in our rooms public ar...4
1144100 non smoking4
605Non Smoking Hotel4
69A fee of up to 250 USD will be assessed for sm...4
102The characters Attentive staff members who alw...4
431100 non smoking and accessible accommodations ...4
6Cancellation policies may vary depending on th...4
68Non Smoking Hotel is 100 non smoking including...4
737The accommodations are romantic and timeless a...4
3Please ask the Front Desk for locations of des...4
1147Cancellation policies may vary depending on th...4
2A fee of up to 250 USD will be assessed for sm...4
81All guestrooms are non smoking4
263Upgrade to a Premium Suite to enjoy compliment...4
479Our SeaTac hotel is a non smoking facility4
1Non Smoking Hotel is 100 non smoking including...4
\n", + "
" + ], + "text/plain": [ + " sentence dominant_topic\n", + "1053 The Spa at the WAC will spoil you with a compl... 4\n", + "697 Our hotel is completely non smoking 4\n", + "70 Please ask the Front Desk for locations of des... 4\n", + "7 Please refer to your reservation confirmation ... 4\n", + "23 We do not allow smoking in our rooms public ar... 4\n", + "1144 100 non smoking 4\n", + "605 Non Smoking Hotel 4\n", + "69 A fee of up to 250 USD will be assessed for sm... 4\n", + "102 The characters Attentive staff members who alw... 4\n", + "431 100 non smoking and accessible accommodations ... 4\n", + "6 Cancellation policies may vary depending on th... 4\n", + "68 Non Smoking Hotel is 100 non smoking including... 4\n", + "737 The accommodations are romantic and timeless a... 4\n", + "3 Please ask the Front Desk for locations of des... 4\n", + "1147 Cancellation policies may vary depending on th... 4\n", + "2 A fee of up to 250 USD will be assessed for sm... 4\n", + "81 All guestrooms are non smoking 4\n", + "263 Upgrade to a Premium Suite to enjoy compliment... 4\n", + "479 Our SeaTac hotel is a non smoking facility 4\n", + "1 Non Smoking Hotel is 100 non smoking including... 4" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sent_topic.loc[df_sent_topic['dominant_topic'] == 4][['sentence', 'dominant_topic']].sample(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sentencedominant_topic
604Check in 15 00 check out 11 0012
24Check in is at 3pm and check out is at 12pm12
608Check in 15 00 check out 11 0012
78Check in begins at 4 00pm12
546Check emails in the Work Zone12
82Late check out is subject to availability12
1146Check out 12 00 pm12
4Check in 4 00 pm12
5Check out 12 00 pm12
71Check in 4 00 pm Check out 12 00 pm12
\n", + "
" + ], + "text/plain": [ + " sentence dominant_topic\n", + "604 Check in 15 00 check out 11 00 12\n", + "24 Check in is at 3pm and check out is at 12pm 12\n", + "608 Check in 15 00 check out 11 00 12\n", + "78 Check in begins at 4 00pm 12\n", + "546 Check emails in the Work Zone 12\n", + "82 Late check out is subject to availability 12\n", + "1146 Check out 12 00 pm 12\n", + "4 Check in 4 00 pm 12\n", + "5 Check out 12 00 pm 12\n", + "71 Check in 4 00 pm Check out 12 00 pm 12" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_sent_topic.loc[df_sent_topic['dominant_topic'] == 12][['sentence', 'dominant_topic']].sample(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There are 20 sentences that belong to topic 4 and we will remove\n", + "There are 19 sentences that belong to topic 12 and we will remove\n" + ] + } + ], + "source": [ + "print('There are', len(df_sent_topic.loc[df_sent_topic['dominant_topic'] == 4]), 'sentences that belong to topic 4 and we will remove')\n", + "print('There are', len(df_sent_topic.loc[df_sent_topic['dominant_topic'] == 12]), 'sentences that belong to topic 12 and we will remove')" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Topic #Num Sentences
023302
10129
22187
3572
43968
52256
63746
71044
83440
92437
10337
11232
12131
132028
143826
153621
16420
171219
181717
193315
202614
21914
221411
23257
24136
25286
26325
27294
28304
2973
3062
31351
32271
33151
34191
35181
\n", + "
" + ], + "text/plain": [ + " Topic # Num Sentences\n", + "0 23 302\n", + "1 0 129\n", + "2 21 87\n", + "3 5 72\n", + "4 39 68\n", + "5 22 56\n", + "6 37 46\n", + "7 10 44\n", + "8 34 40\n", + "9 24 37\n", + "10 3 37\n", + "11 2 32\n", + "12 1 31\n", + "13 20 28\n", + "14 38 26\n", + "15 36 21\n", + "16 4 20\n", + "17 12 19\n", + "18 17 17\n", + "19 33 15\n", + "20 26 14\n", + "21 9 14\n", + "22 14 11\n", + "23 25 7\n", + "24 13 6\n", + "25 28 6\n", + "26 32 5\n", + "27 29 4\n", + "28 30 4\n", + "29 7 3\n", + "30 6 2\n", + "31 35 1\n", + "32 27 1\n", + "33 15 1\n", + "34 19 1\n", + "35 18 1" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name=\"Num Sentences\")\n", + "df_topic_distribution.columns = ['Topic #', 'Num Sentences']\n", + "df_topic_distribution" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "df_sent_topic_clean = df_sent_topic.drop(df_sent_topic[(df_sent_topic.dominant_topic == 4) | (df_sent_topic.dominant_topic == 12)].index)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "df_description = df_sent_topic_clean[['sentence','name']]\n", + "df_description = df_description.groupby('name')['sentence'].agg(lambda col: ' '.join(col)).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namesentence
011th Avenue Inn Bed and BreakfastWalk to the Pike Place Market and to the other...
1Ace Hotel SeattleWe fell in love with a former maritime workers...
2Aloft Seattle RedmondCelebrate your style at Aloft Seattle Redmond ...
3Americas Best Value Inn Shoreline / Seattle NorthAmericas Best Value Inn Shoreline Seattle Nort...
4Ballard InnHistoric Style with Modern Amenities The Balla...
\n", + "
" + ], + "text/plain": [ + " name \\\n", + "0 11th Avenue Inn Bed and Breakfast \n", + "1 Ace Hotel Seattle \n", + "2 Aloft Seattle Redmond \n", + "3 Americas Best Value Inn Shoreline / Seattle North \n", + "4 Ballard Inn \n", + "\n", + " sentence \n", + "0 Walk to the Pike Place Market and to the other... \n", + "1 We fell in love with a former maritime workers... \n", + "2 Celebrate your style at Aloft Seattle Redmond ... \n", + "3 Americas Best Value Inn Shoreline Seattle Nort... \n", + "4 Historic Style with Modern Amenities The Balla... " + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_description.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Located on the southern tip of Lake Union the Hilton Garden Inn Seattle Downtown hotel is perfectly located for business and leisure'" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_description['sentence'][45]" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "df_description.set_index('name', inplace = True)\n", + "tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0, stop_words='english')\n", + "tfidf_matrix = tf.fit_transform(df_description['sentence'])\n", + "cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "indices = pd.Series(df_description.index)\n", + "def recommendations(name, cosine_similarities = cosine_similarities):\n", + " \n", + " recommended_hotels = []\n", + " \n", + " # gettin the index of the hotel that matches the name\n", + " idx = indices[indices == name].index[0]\n", + "\n", + " # creating a Series with the similarity scores in descending order\n", + " score_series = 
pd.Series(cosine_similarities[idx]).sort_values(ascending = False)\n", + "\n", + " # getting the indexes of the 5 most similar hotels except itself\n", + " top_10_indexes = list(score_series.iloc[1:6].index)\n", + " \n", + " # populating the list with the names of the top 5 matching hotels\n", + " for i in top_10_indexes:\n", + " recommended_hotels.append(list(df_description.index)[i])\n", + " \n", + " return recommended_hotels" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['Silver Cloud Inn - Seattle Lake Union',\n", + " 'Residence Inn by Marriott Seattle Downtown/Lake Union',\n", + " 'Staybridge Suites Seattle Downtown - Lake Union',\n", + " 'Homewood Suites by Hilton Seattle Downtown',\n", + " 'Days Inn by Wyndham Seattle North of Downtown']" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "recommendations('Hilton Garden Inn Seattle Downtown')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}