-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdb_main.py
168 lines (142 loc) · 5.79 KB
/
imdb_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# the data is from: https://www.kaggle.com/deepmatrix/imdb-5000-movie-dataset
import pandas as pd
from collections import Counter
import numpy as np
import imdb_plot as iplot
import math
import imdb_scraping as iscrap
import imdb_util as iutil
import imdb_machine_learning as iml
data_path = 'C:/Users/rodri/OneDrive/Documentos/python/data/'
def keywords_weight(df):
    """Assign each movie a weighted-average score derived from its plot keywords.

    For every keyword the function computes (a) its occurrence count across
    the dataset and (b) the mean imdb_score of the movies it appears in.
    Both measures are min-max rescaled to a common 0-100 scale and combined
    as 25% count / 75% imdb score.  Each movie then receives the average
    weight of its own keywords (0 when it has none).

    Side effects: mutates `df` in place (strips '\xa0' from titles, sets
    'movie_title' as the index) and pickles both the keyword-weight table
    ('weighted_plot_keywords.pickle') and the result ('imdb.pickle') via iutil.

    Args:
        df: DataFrame with at least 'plot_keywords' ('|'-separated string
            or NaN), 'imdb_score' and 'movie_title' columns.

    Returns:
        DataFrame: the input columns joined with a new 'plot_keywords_weight'
        column, with 'movie_title' restored as a regular column.
    """
    # keyword -> occurrence count across the whole dataset (Counter)
    plot_count = get_plot_keywords_count(df)
    df2 = df[['plot_keywords', 'imdb_score']]
    # keyword -> SUM of the imdb scores of the movies it appears in
    plot_imdb = {}
    plot_keywords = df2['plot_keywords'].tolist()
    imdb_score = df2['imdb_score'].tolist()
    for pltws, imdb in zip(plot_keywords, imdb_score):
        # BUG FIX: the old `pltws is not np.nan` is an identity test that
        # relies on NaN being a singleton, which pandas does not guarantee;
        # a non-singleton NaN slipped through and crashed on .split().
        # Testing for an actual string is robust for both NaN and None.
        if isinstance(pltws, str):
            for w in pltws.split('|'):
                # BUG FIX: `if plot_imdb.get(w):` treated a stored 0 as
                # "missing" and reset the accumulator; get() with a default
                # accumulates correctly for any value.
                plot_imdb[w] = plot_imdb.get(w, 0) + imdb
    # keyword -> MEAN imdb score (sum divided by occurrence count)
    for w, s in plot_imdb.items():
        plot_imdb[w] = round(s / plot_count[w], 1)
    # convert the Counter (plot_count) to a DataFrame indexed by keyword
    df3 = pd.DataFrame.from_dict(plot_count, orient='index').reset_index()
    df3.columns = ['plot_keyword', 'count']
    df3.set_index('plot_keyword', inplace=True)
    # convert the dict (plot_imdb) to a DataFrame indexed by keyword
    df4 = pd.DataFrame.from_dict(plot_imdb, orient='index').reset_index()
    df4.columns = ['plot_keyword', 'imdb_score_mean']
    df4.set_index('plot_keyword', inplace=True)
    # one row per keyword: count + mean imdb score
    df5 = df3.join(df4)
    # Rescale both criteria to a common 0-100 scale (min-max normalisation)
    # so they can be combined with explicit importance weights.
    countMax = df5['count'].max()
    countMin = df5['count'].min()
    imdbMax = df5['imdb_score_mean'].max()
    imdbMin = df5['imdb_score_mean'].min()
    df5['count_weight'] = (df5['count'] - countMin) / (countMax - countMin) * 100
    df5['imdb_weight'] = (df5['imdb_score_mean'] - imdbMin) / (imdbMax - imdbMin) * 100
    # Weighted average: imdb score counts 75%, raw frequency 25%.
    df5['weighted_average'] = round((.25 * df5['count_weight'] + .75 * df5['imdb_weight']), 1)
    df5 = df5['weighted_average']
    # persist keyword -> weight so assign_plot_keywords_weight() can reuse it
    weighted_avg = df5.to_dict()
    iutil.save_pickle(weighted_avg, 'weighted_plot_keywords.pickle')
    # strip the trailing non-breaking-space character the titles carry
    df['movie_title'] = df.apply(lambda x: x['movie_title'].replace('\xa0', ''), axis=1)
    movie_titles = df['movie_title'].tolist()
    df.set_index('movie_title', inplace=True)
    plot_keywords_weight = {}
    # movie -> average weight of its keywords (0 when keywords are missing)
    for mt, pltws in zip(movie_titles, plot_keywords):
        # same string check as above (NaN-safe)
        if isinstance(pltws, str):
            words = pltws.split('|')
            weight = sum(weighted_avg[w] for w in words)
            plot_keywords_weight[mt] = round(weight / len(words), 1)
        else:
            plot_keywords_weight[mt] = 0
    # dict -> DataFrame indexed by movie title
    df6 = pd.DataFrame.from_dict(plot_keywords_weight, orient='index')
    df6.columns = ['plot_keywords_weight']
    # attach the per-movie weight to the original data
    df7 = df.join(df6)
    df7.reset_index(inplace=True)
    iutil.save_pickle(df7, 'imdb.pickle')
    return df7
def assign_plot_keywords_weight(df):
    """Score new (unseen) movies with the previously learned keyword weights.

    Loads the keyword -> weight table pickled by keywords_weight(), building
    it first if the pickle does not exist yet, then gives each movie the
    mean weight of those of its keywords that appear in the table (0 when
    no keyword is known or the keywords are missing).

    Args:
        df: DataFrame with at least 'movie_title', 'movie_facebook_likes',
            'cast_total_facebook_likes' and 'plot_keywords' ('|'-separated
            string or NaN) columns.

    Returns:
        DataFrame with columns: movie_title, movie_facebook_likes,
        cast_total_facebook_likes, plot_keywords, plot_keywords_weight.
    """
    filename = 'weighted_plot_keywords.pickle'
    # build the keyword-weight table on first use
    if not iutil.file_exists(filename):
        df_temp = iutil.get_dataset()
        keywords_weight(df_temp)
    keywords_dict = iutil.get_pickle(filename)
    plot_keywords = df['plot_keywords'].tolist()
    movies_name = df['movie_title'].tolist()
    # assign a weight to the plot keywords of each movie
    new_keywords_dict = {}
    for words, mname in zip(plot_keywords, movies_name):
        # BUG FIX: missing keywords arrive from pandas as NaN (a float),
        # not None, so the old `words is not None` check let NaN through
        # and crashed on .split().  Test for a real string instead.
        if isinstance(words, str):
            # keep only keywords present in the table (truthy weight,
            # matching the original `if keywords_dict.get(w)` behaviour)
            known = [keywords_dict[w] for w in words.split('|') if keywords_dict.get(w)]
            new_keywords_dict[mname] = sum(known) / len(known) if known else 0
        else:
            new_keywords_dict[mname] = 0
    # dict -> DataFrame indexed by movie title
    df2 = pd.DataFrame.from_dict(new_keywords_dict, orient='index')
    df2.columns = ['plot_keywords_weight']
    df3 = df.set_index('movie_title').join(df2).reset_index()
    df3 = df3[['movie_title', 'movie_facebook_likes', 'cast_total_facebook_likes', 'plot_keywords', 'plot_keywords_weight']]
    return df3
# Preparing data for machine learning
# Reuse the cached, pre-weighted dataset if a previous run produced it;
# otherwise build it from scratch: load the raw dataset (presumably the
# Kaggle CSV -- see header link), keep only the ML-relevant columns,
# de-duplicate by title, and compute per-movie keyword weights.
if iutil.file_exists('imdb.pickle'):
    df = iutil.get_pickle('imdb.pickle')
else:
    df = iutil.get_dataset()
    df = df[['movie_title', 'movie_facebook_likes', 'cast_total_facebook_likes', 'plot_keywords', 'imdb_score']]
    df.drop_duplicates(['movie_title'], inplace=True)
    df = keywords_weight(df)
# Scrape upcoming releases and score their keywords with the learned weights
df2 = iscrap.get_new_movies_coming_soon()
df3 = assign_plot_keywords_weight(df2)
#print(df.head())
#print(df3.head())
# Hand off to the ML module: df is the labelled data, df3 the unseen movies
iml.start(df, df3)
# Data Analysis
# (OK) imdb vs country | (OK) imdb vs movie year | (OK) imdb vs facebook popularity
# (OK) imdb vs director facebook popularity | (OK) imdb vs cast facebook popularity
# (OK) imdb vs genre | (OK) genre vs movies count | (OK) director vs number of movies | (OK) director vs imdb score
# (OK) comparison between budget and gross
# Data to be scraped
# movie_title (original title), movie_year, plot_keywords, movie_facebook_likes, director_name, director_facebook_likes, actor_1_name
# actor_1_facebook_likes (the same for actor 2 and 3)
# Machine Learning
# Features: movie_facebook_likes, cast_facebook_likes? (director, actor1, actor2, actor3), plot_keywords? (assign a weight to each word?)
# Label: imdb_score