#!/usr/bin/env python2
# PYTHON_ARGCOMPLETE_OK
# imdb-api-scrapper.py -- retrieves movie data from IMDb via imdbpie and stores it as JSON.
import argcomplete
import argparse
from imdbpie import Imdb
import json
import logging
import os
import sys


# Writes the movies dictionary to "movies.json" (overwrites the file on every call)
def gen_json(movies):
    with open('movies.json', mode='w') as moviesjson:
        json.dump(movies, moviesjson)


# Requests the IMDb data for a given numeric movie id
def do_query(movie_id):
    # Lists for fields which can hold more than one entry
    actors = list()
    writers = list()
    directors = list()
    genres = list()
    # Build the IMDb id string and the next id for the log messages
    imdb_id = 'tt' + '%07d' % movie_id
    following_id = 'tt' + '%07d' % (movie_id + 1)
    if not imdb.title_exists(imdb_id):
        logger.info('Movie with id ' + imdb_id + ' does not exist. Continue with ' + following_id)
        return False
    try:
        # Retrieve the movie as an object
        movie = imdb.get_title_by_id(imdb_id)
        # Store the persons categorized by their credit position
        for person in movie.credits:
            if person.token == 'directors':
                directors.append(person.name)
            elif person.token == 'writers':
                writers.append(person.name)
            else:
                actors.append(person.name)
        # Store the genres
        for genre in movie.genres:
            genres.append(genre)
    except TypeError:
        logger.info('Movie with id ' + imdb_id + ' has inconsistent data. Skip and continue with ' + following_id)
        return False
    except AttributeError:
        logger.info('Movie with id ' + imdb_id + ' is an episode. Skip and continue with ' + following_id)
        return False
    except Exception:
        logger.info('An error occurred while processing movie with id ' + imdb_id + '. Skip and continue with ' + following_id)
        return False
    # Store the movie data in a dictionary
    movie_data = {
        "actor": actors,
        "certification": movie.certification,
        "director": directors,
        "genre": genres,
        "id": movie.imdb_id,
        "name": str(movie.title),
        "plot": movie.plot_outline,
        "poster_url": movie.poster_url,
        "cover_url": movie.cover_url,
        "release_date": movie.release_date,
        "runtime": str(movie.runtime) + ' min.',
        "year": movie.year
    }
    logger.info('Movie with id ' + movie.imdb_id + ' retrieved.')
    return movie_data
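

# A successful do_query() call returns a dict shaped like the one below
# (field values are illustrative, not real IMDb data):
#   {
#       "actor": ["..."], "certification": "PG-13", "director": ["..."],
#       "genre": ["Drama"], "id": "tt0000001", "name": "...",
#       "plot": "...", "poster_url": "...", "cover_url": "...",
#       "release_date": "...", "runtime": "90 min.", "year": 1990
#   }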


# Removes the wrapping dictionary around the data in the json file
def clean_json():
    # Load the dictionary from the json file
    with open('movies.json') as f:
        movies = json.load(f)
    # Get the value of the wrapping dictionary
    movies_temp = movies['movies']
    # Store the unwrapped list of movie dictionaries
    with open('movies.json', mode='w') as moviesjson:
        json.dump(movies_temp, moviesjson)
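
# clean_json() turns {"movies": [{...}, {...}]} into a bare list [{...}, {...}],
# which is the shape a later run with --overwrite no loads back in.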


# Main
if __name__ == '__main__':
    # Python 2 only: reload() restores setdefaultencoding(), which site.py removes
    reload(sys)
    # Set the default encoding to UTF-8
    sys.setdefaultencoding('utf-8')
    # Define the command-line arguments
    parser = argparse.ArgumentParser(description='retrieve films from IMDb', usage='python imdb-api-scrapper.py 10000 save')
    parser.add_argument('number', type=int, help='number of movies to request')
    parser.add_argument('storing', choices=['save', 'unsave'],
                        help='[save] store movie data after each request, [unsave] store movie data after all requests were executed')
    parser.add_argument('--start', type=int, help='the movie id to start with')
    parser.add_argument('--overwrite', default='yes', choices=['yes', 'no'], help='[yes] overwrite the json file, [no] append to the json file')
    parser.add_argument('--episodes', default='no', choices=['yes', 'no'],
                        help='[yes] retrieve movies and episodes, [no] retrieve movies only')
    # autocomplete must be wired up before parse_args() to have any effect
    argcomplete.autocomplete(parser)
    args = parser.parse_args()
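
    # Example invocations:
    #   python imdb-api-scrapper.py 100 save            # ids tt0000001..tt0000100, saved after each request
    #   python imdb-api-scrapper.py 50 unsave --start 500 --overwrite no --episodes yes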
    # Set up a specific logger with the desired output level
    if not os.path.isdir('./logs'):
        os.makedirs('./logs')
    logging.basicConfig(filename='./logs/imdb-scrapper.log', level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    logger = logging.getLogger(__name__)
    # Also echo log messages to the console
    logging.getLogger().addHandler(logging.StreamHandler())
    # Only show warnings for the requests library
    logging.getLogger('requests').setLevel(logging.WARNING)
    if args.start is not None:
        START_ID = args.start
    else:
        START_ID = 1
    MAX_ITERATIONS = args.number
    exclude_episodes = args.episodes != 'yes'
    # Proxy the requests and optionally exclude episodes
    imdb = Imdb(anonymize=True, exclude_episodes=exclude_episodes)
    if args.overwrite == 'yes':
        # Create a clean json file with an empty wrapping dictionary
        with open('movies.json', mode='w') as moviesjson:
            json.dump({'movies': []}, moviesjson)
        logger.info('JSON file "movies.json" created.')
        # Create a dictionary for the movies
        movies = {'movies': []}
    else:
        # Load the previously stored (unwrapped) movie list from the json file
        with open('movies.json') as f:
            movies_temp = json.load(f)
        # Re-wrap the loaded data in the movies dictionary
        movies = {'movies': []}
        for entry in movies_temp:
            movies['movies'].append(entry)
    # Process N films of IMDb
    logger.info('Movie retrieval started.')
    for i in range(START_ID, START_ID + MAX_ITERATIONS):
        # Get the movie data
        movie = do_query(i)
        if movie is False:
            continue
        # Append the movie data dictionary to the movies dictionary
        movies['movies'].append(movie)
        # Store the updated movies dictionary in the json file (after each request)
        if args.storing == 'save':
            gen_json(movies)
            logger.info('Movie with id ' + movie['id'] + ' stored in "movies.json".')
    # Store the updated movies dictionary in the json file (after all movies were retrieved)
    if args.storing == 'unsave':
        gen_json(movies)
        logger.info('All retrieved movies were stored in "movies.json".')
    # Remove the wrapping dictionary
    clean_json()
    logger.info('"movies.json" cleaned up.')
    logger.info('Movie retrieval finished.')
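
    # Illustrative console output for "python imdb-api-scrapper.py 2 unsave"
    # (messages as logged above; actual ids and skips depend on the IMDb data):
    #   JSON file "movies.json" created.
    #   Movie retrieval started.
    #   Movie with id tt0000001 retrieved.
    #   Movie with id tt0000002 retrieved.
    #   All retrieved movies were stored in "movies.json".
    #   "movies.json" cleaned up.
    #   Movie retrieval finished.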