crawler.py
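# Crawl noonsite.com: build data/countries.json, then mirror each country's
# profile, formalities, general-info and city pages (plus their images)
# under data/<country-slug>/.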
import requests, json, os
from bs4 import BeautifulSoup
from slugify import slugify
# DOWNLOAD IMAGE
def downloadImage(url, path):
    # make relative noonsite URLs absolute
    if not url.startswith("http://www.noonsite.com"):
        if url.startswith("/"):
            url = "http://www.noonsite.com" + url
        else:
            url = "http://www.noonsite.com/" + url
    r = requests.get(url, stream=True)
    if r.status_code == 200:
        # create the parent directories of the target file,
        # then download the image only if it is not already on disk
        os.makedirs(os.path.dirname(path), exist_ok=True)
        if not os.path.exists(path):
            with open(path, "wb") as f:
                for chunk in r:
                    f.write(chunk)
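# Illustrative call (hypothetical paths): downloadImage("/images/flags/xy.gif",
# "data/somecountry/images/flags/xy.gif") mirrors the image inside the country
# folder, which is how getSections below uses it.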
# GET SECTIONS
def getSections(country, html):
    soup = BeautifulSoup(html, "html.parser")
    sections = soup.find(id="noonsite-sections")
    # download images and rewrite their URLs to relative paths
    folder = "data/" + country["slug"]
    sectionStr = str(sections)
    for img in sections.find_all("img"):
        if img.get("src") is not None:
            fileName = img.get("src").replace("http://www.noonsite.com", "")
            downloadImage(img.get("src"), folder + fileName)
    sectionStr = sectionStr.replace("src=\"http://www.noonsite.com", "src=\"")
    # collect links to city pages that sit below this country's URL
    cities = []
    for a in sections.find_all("a"):
        if a.get("href") is not None and a.get("href").startswith(country["url"]) and len(a.get("href")) < 90:
            cities.append({
                "name": a.get_text().replace("*", "").strip(),
                "url": a.get("href"),
                "slug": slugify(a.get_text().replace("*", "").strip())
            })
    return sectionStr, cities
def downloadSection(country, section):
    p = requests.get(country["url"] + "?rc=" + section)
    return getSections(country, p.text)
# DOWNLOAD PROFILE
def downloadProfile(country):
    return downloadSection(country, "CountryProfile")
# DOWNLOAD FORMALITIES
def downloadFormalities(country):
    return downloadSection(country, "Formalities")
# DOWNLOAD GENERALINFO
def downloadGeneralInfo(country):
    return downloadSection(country, "GeneralInfo")
# DOWNLOAD CITY
def downloadCity(city):
    c = requests.get(city["url"])
    return getSections(city, c.text)
# DOWNLOAD COUNTRIES
def downloadCountries(getFlag=False):
    countries = {}
    c = requests.get("http://www.noonsite.com/Countries")
    soup = BeautifulSoup(c.text, "html.parser")
    countryListing = soup.find(id="noonsite-countries-listing")
    currentArea = None
    for child in countryListing.contents:
        # an <h2> starts a new area
        if child.name == "h2":
            currentArea = child.get_text().strip()
        if child.name == "p":
            for a in child.find_all("a"):
                if currentArea not in countries:
                    countries[currentArea] = []
                flag = None
                if getFlag:
                    # fetch the country page and look for its flag image
                    countryHtml = requests.get(a.get("href")).text
                    countrySoup = BeautifulSoup(countryHtml, "html.parser")
                    for img in countrySoup.find_all("img"):
                        src = img.get("src") or ""
                        if "flags" in src and ".gif/image" in src:
                            flag = src.replace("http://www.noonsite.com", "")
                print(a.get_text())
                countries[currentArea].append({
                    "name": a.get_text(),
                    "url": a.get("href"),
                    "slug": slugify(a.get_text()),
                    "flag": flag
                })
    return countries
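# Shape of the returned mapping: area name -> list of country dicts, i.e.
# {"<area>": [{"name": ..., "url": ..., "slug": ..., "flag": ...}, ...]}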
# download countries
print("BUILD COUNTRIES.JSON")
countries = downloadCountries(True)
# store countries in json file
os.makedirs("data", exist_ok=True)
with open("data/countries.json", "w") as f:
    f.write(json.dumps(countries))
print("\n\n")
print("FETCH INDIVIDUAL COUNTRIES")
for area in countries:
    for country in countries[area]:
        print(country["name"])
        profile, cities = downloadProfile(country)
        formalities, _ = downloadFormalities(country)
        generalinfo, _ = downloadGeneralInfo(country)
        folder = "data/" + country["slug"]
        # download cities
        for city in cities:
            try:
                print("- " + city["name"])
            except UnicodeEncodeError:
                # some city names cannot be printed on every console
                pass
            cityInfo, _ = downloadCity(city)
            os.makedirs(folder + "/city", exist_ok=True)
            with open(folder + "/city/" + city["slug"] + ".html", "w", encoding="utf-8") as f:
                f.write(str(cityInfo))
        os.makedirs(folder, exist_ok=True)
        with open(folder + "/profile.html", "w", encoding="utf-8") as f:
            f.write(str(profile))
        with open(folder + "/formalities.html", "w", encoding="utf-8") as f:
            f.write(str(formalities))
        with open(folder + "/general.html", "w", encoding="utf-8") as f:
            f.write(str(generalinfo))
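# Running the script (assuming Python 3 with requests, beautifulsoup4 and a
# slugify package providing `from slugify import slugify` installed):
#   python crawler.py
# Everything is written below data/: countries.json plus one folder per country
# with profile.html, formalities.html, general.html and a city/ subfolder.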