# scraping_lab.py
# creating a class with the name Scraping
# defining an __init__ method that stores the first url and the keyword, plus a default limit
# defining a search method that launches the scraping process
# it returns the url (and prints the count) if the keyword is found in a page title
# otherwise it returns the string "Nope!"
#
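# Overview (summary of the steps below): starting from a random Wikipedia
# article, the script follows one random "/wiki/" link per page until a page
# title contains the keyword or the visit limit is reached.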
import requests
import re
import random


class Scraping:
    # Filter all the hrefs with this string
    WIKI_LINK = "/wiki/"
    # base url of wikipedia, used to build a new link
    WIKI_URL = 'https://en.wikipedia.org'

    def __init__(self, url, keyword):
        # constructor attributes
        self.url = url
        self.keyword = keyword
        # private attributes
        self.limit = 100
        self.count = 1
        # default value
        self.debug = False
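        # Note: the default limit of 100 keeps the recursion depth of
        # recursive_search well below Python's default recursion limit
        # (about 1000), so a full run should not raise RecursionError.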

    # method to set the debug attribute
    def set_debug(self, value):
        self.debug = value

    # method to set the limit attribute
    def set_limit(self, value):
        self.limit = value

    def search(self):
        if self.debug:
            print("Start searching")
        # Note: count=self.count
        return self.recursive_search(self.url, self.keyword, self.count, self.limit)

    def recursive_search(self, url, keyword, count, limit):
        # get the content of the url passed
        response = requests.get(url)
        # store the html text of the page in storage_content
        storage_content = response.text
        # find the h1 in the page and store its content in the variable title
        # Note: the pattern tolerates extra attributes on the h1 tag, and we
        # fall back to an empty title if no heading is found instead of
        # crashing on an empty match list
        title_matches = re.findall(r'<h1[^>]*id="firstHeading"[^>]*>(.+?)</h1>', storage_content)
        title = title_matches[0] if title_matches else ""
        # check if the keyword is in the title
        if keyword in title:
            print(f'{count}. This url contains the keyword in the title: {url}')
            return url
        else:
            # get all the hrefs of the page
            hrefs = re.findall(r'href=[\'"]?([^\'" >]+)', storage_content)
            # create an empty list
            wiki_list = []
            # loop through all of the href links
            for href_content in hrefs:
                # check if the current href starts with WIKI_LINK (/wiki/)
                if href_content.startswith(Scraping.WIKI_LINK):
                    # if yes, then add it to our new list: wiki_list
                    wiki_list.append(href_content)
            if self.debug:
                # print the number of /wiki/ hrefs on the current page
                print(f"Amount of href in the current page: {len(wiki_list)}")
            # build a new url from a random href of our list
            # (assumes the page contains at least one /wiki/ link;
            # random.choice raises IndexError on an empty list)
            next_link = Scraping.WIKI_URL + random.choice(wiki_list)
            # increase our counter and assign it to a new variable
            new_count = count + 1
            # check if our new counter is greater than our given limit
            # Note: strictly greater, so the limit itself is still visited
            # ex: with limit = 5 we still want to call the 5th url
            if new_count > limit:
                # yes, return and stop everything
                return "Nope!"
            else:
                if self.debug:
                    # print our new counter and the next link
                    print(f'{new_count}. {next_link}')
                # call our function again with the new link,
                # the new count and the limit
                # Note: this is the HEART of the recursive function
                return self.recursive_search(next_link, keyword, new_count, limit)
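

# example run: start from a random article and follow links until a title
# contains the keyword or the 10-page limit is reached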
word_to_look_for = "ing"
first_url = "https://en.wikipedia.org/wiki/Special:Random"
super_scraping_variable = Scraping(first_url, word_to_look_for)
super_scraping_variable.set_debug(True)
super_scraping_variable.set_limit(10)
search_response = super_scraping_variable.search()
print(f"Url = {search_response}")