# scrapeGithubUsingSelenium.py
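"""Scrape GitHub search results with Selenium and download raw file contents.

ScrapeGithubForLinks logs in, walks the search result pages for a given
language/topic, and saves the result links to a JSON file;
ScrapeGithubLinks then fetches each link's raw content into a second JSON file.
"""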
import time
import json
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

def removeDuplicates(listOfLists) -> list:
    """Flatten a list of lists and drop duplicates, preserving order.

    e.g. removeDuplicates([["a", "b"], ["b", "c"]]) -> ["a", "b", "c"]
    """
    newList = []
    for sublist in listOfLists:
        for item in sublist:
            if item not in newList:
                newList.append(item)
    return(newList)

class Type():
    # Values for the "type" query parameter of GitHub's search URL.
    repository = "Repository"
    code = "Code"

class ScrapeGithubForLinks():
    def __init__(self, githubEmail, githubPassword, startingPage=1, linksFile="links.json", language="Python", type=Type.repository):
        self.githubEmail = githubEmail
        self.githubPassword = githubPassword
        self.currentPage = startingPage
        self.language = language
        self.linksFile = linksFile
        self.type = type
        # Raw string avoids backslash-escape issues; Selenium 4 expects the
        # driver path to be wrapped in a Service object.
        self.driver = webdriver.Chrome(service=Service(r".\chromedriver.exe"))
        self.login()
    def nextPage(self):
        self.gotoPage(self.currentPage + 1)

    def previousPage(self):
        self.gotoPage(self.currentPage - 1)

    def gotoPage(self, page):
        # Navigation is lazy: getCurrentPageLinks builds the URL from currentPage.
        self.currentPage = page
    def login(self):
        # Note: this plain form login will not work on accounts with 2FA enabled.
        self.driver.get("https://github.com/login")
        self.driver.find_element(By.NAME, "login").send_keys(self.githubEmail)
        self.driver.find_element(By.NAME, "password").send_keys(self.githubPassword)
        self.driver.find_element(By.NAME, "commit").click()
    def getCurrentPageLinks(self, topic) -> list:
        url = "https://github.com/search?l=%s&p=%d&q=%s&type=%s" % (self.language, self.currentPage, topic, self.type)
        links = []
        self.driver.get(url)
        soup = bs(self.driver.page_source, "lxml")
        # Each search result title sits in a <div class="f4 text-normal">.
        linksFrame = soup.find_all("div", "f4 text-normal")
        for link in linksFrame:
            # hrefs are root-relative (e.g. "/user/repo"), so no extra slash is needed.
            links.append("https://github.com" + link.find("a")["href"])
        return(links)
    def getLinks(self, topic, endPage, startPage=1, save=True) -> list:
        """Collect result links from pages startPage..endPage-1, then optionally save them."""
        links = []
        try:
            self.currentPage = startPage
            for _ in range(startPage, endPage):
                pageLinks = self.getCurrentPageLinks(topic)
                if pageLinks == []:
                    # An empty result usually means GitHub throttled the search;
                    # wait, then retry the same page.
                    time.sleep(20)
                    continue
                links.append(pageLinks)
                self.nextPage()
        except Exception as e:
            print(e)
        finally:
            links = removeDuplicates(links)  # flatten the per-page lists and dedupe
            if save:
                with open(self.linksFile, "w") as f:
                    json.dump(links, f)
            return(links)
    def quit(self):
        self.driver.quit()

class ScrapeGithubLinks():
    def __init__(self, linksFile="links.json", dataFile="data.json"):
        self.linksFile = linksFile
        self.dataFile = dataFile
    def scrapeLinks(self, fromLinks=True, save=True):
        """Download the raw text of each scraped file link.

        fromLinks=True reads the links from linksFile; passing a list of
        links uses that list directly.
        """
        currentJsonIndex = 0
        jsonData = {}
        if fromLinks is True:  # "is True" so a passed-in list is not mistaken for the flag
            with open(self.linksFile, "r") as f:
                links = json.load(f)
        else:
            links = fromLinks
        for link in links:
            # Rewrite the web URL into its raw-content equivalent.
            link = link.replace("https://github.com/", "https://raw.githubusercontent.com/", 1).replace("blob/", "", 1)
            print(link)
            r = requests.get(link)
            if r.status_code != 200:
                print("error: HTTP %d for %s" % (r.status_code, link))
                continue
            text = bs(r.content, "lxml").text
            jsonData[currentJsonIndex] = text
            currentJsonIndex += 1
        if save:
            with open(self.dataFile, "w") as f:
                json.dump(jsonData, f)
        return(jsonData)
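
# A minimal usage sketch. Assumptions: placeholder credentials, chromedriver.exe
# in the working directory, and "machine+learning" as an example topic; the raw
# URL rewrite in scrapeLinks targets file (code) results, hence type=Type.code.
if __name__ == "__main__":
    scraper = ScrapeGithubForLinks("you@example.com", "yourPassword", type=Type.code)
    scraper.getLinks("machine+learning", endPage=5)  # writes links.json
    scraper.quit()
    downloader = ScrapeGithubLinks()
    downloader.scrapeLinks()  # reads links.json, writes data.json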