-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
33 lines (26 loc) · 931 Bytes
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
from webpage import Webpage
from queue import Queue
class Scraper:
    """Breadth-first crawler that collects ``Webpage`` objects reachable from a seed link."""

    @staticmethod
    def scrape_from_link(link, max_items=20):
        """Crawl outward from *link* in BFS order, collecting fetched pages.

        Args:
            link: Seed URL to start crawling from.
            max_items: Stop once this many pages have been collected (default 20).

        Returns:
            list: Successfully fetched ``Webpage`` objects, in BFS order.
            Links whose fetch/parse raises are skipped silently (best-effort crawl).
        """
        link_queue = Queue()
        link_queue.put(link)
        webpages = []
        # A set gives guaranteed O(1) membership tests for already-seen links
        # (the original used a dict with dummy True values for the same purpose).
        passed_links = {link}
        while not link_queue.empty() and len(webpages) < max_items:
            current = link_queue.get()
            try:
                # Fetch and parse; invalid/unreachable pages are skipped.
                page = Webpage(current)
                connects = page.get_connects()
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate instead of being swallowed mid-crawl.
                continue
            webpages.append(page)
            for neighbor in connects:
                if neighbor not in passed_links:
                    passed_links.add(neighbor)
                    link_queue.put(neighbor)
        return webpages