scrap.py
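"""scrap.py - Selenium scraper for the awardsu artwork archive at resource.lib.su.ac.th.

Two-step workflow, both steps sharing the `driver` created below:
  1. scarpUrl()  - pages through the search results for the keywords
     "จิตรกรรม" (painting) and "ภาพพิมพ์" (prints), collects the artwork
     detail-page links, removes duplicates, and writes them to url.txt.
  2. scrapInfo() - reads url.txt, visits each detail page, extracts the
     fields listed in `data_fields`, and saves the records to data.json.

Run scarpUrl() first to build url.txt, then scrapInfo() to produce data.json.
"""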
# selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
import chromedriver_autoinstaller
import undetected_chromedriver as uc
import json
# number of search-result pages to crawl for each keyword (จิตรกรรม / ภาพพิมพ์)
lenPage1 = 20
lenPage2 = 21
options = uc.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
options.add_argument('--no-sandbox')
# proxy http
# options.add_argument('--proxy-server=http://lastlnwhacker0gaCH:[email protected]:59100')
# fake user agent
options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' + '(KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36')
driver = uc.Chrome(options=options, executable_path=chromedriver_autoinstaller.install())
def scarpUrl():
    url = []
    # collect artwork links for the keyword "จิตรกรรม" (painting), pages 1..lenPage1
    for i in range(1, lenPage1 + 1):
        driver.get("http://www.resource.lib.su.ac.th/awardsu/web/type.php?option=&keyword=จิตรกรรม&page=" + str(i))
        # wait until at least one result link has rendered
        while True:
            try:
                driver.find_element(By.CSS_SELECTOR, ".info.Sriracha")
                break
            except Exception:
                time.sleep(0.5)
        for link in driver.find_elements(By.CSS_SELECTOR, ".info.Sriracha"):
            url.append(link.get_attribute('href'))
        print(len(url))
    # remove duplicate urls, keeping the original order
    url = list(dict.fromkeys(url))
    # collect artwork links for the keyword "ภาพพิมพ์" (prints), pages 1..lenPage2
    for i in range(1, lenPage2 + 1):
        driver.get("http://www.resource.lib.su.ac.th/awardsu/web/type.php?option=&keyword=ภาพพิมพ์&page=" + str(i))
        # wait until at least one result link has rendered
        while True:
            try:
                driver.find_element(By.CSS_SELECTOR, ".info.Sriracha")
                break
            except Exception:
                time.sleep(0.5)
        for link in driver.find_elements(By.CSS_SELECTOR, ".info.Sriracha"):
            url.append(link.get_attribute('href'))
        print(len(url))
    # remove duplicate urls, keeping the original order
    url = list(dict.fromkeys(url))
    # save the urls to file, one per line
    with open('url.txt', 'w', encoding='utf-8') as f:
        for item in url:
            f.write("%s\n" % item)
    # Close the WebDriver when done
    driver.quit()
def scrapInfo():
    # schema
    # {
    #     "Artwork_Name": "",
    #     "Artist_Name": "",
    #     "Artwork_Type": "",
    #     "Artwork_Size": "",
    #     "Artwork_Technique": "",
    #     "Exhibition_Name": "",
    #     "Award_Name": "",
    #     "License": "",
    #     "Concept": "",
    #     "Detail": "",
    #     "Image": "",
    #     "URL": ""
    # }
    # Read the URLs from the file
    with open('url.txt', 'r', encoding='utf-8') as f:
        urls = [line.strip() for line in f.readlines()]
    # Define a dictionary to map data fields to XPaths
    data_fields = {
        "Artwork_Name": '//mark[@class="title" and text()="ชื่อผลงาน"]/following-sibling::span',
        "Artist_Name": '//mark[@class="title" and text()="ชื่อศิลปิน"]/following-sibling::span',
        "Artwork_Type": '//mark[@class="title" and text()="ประเภท"]/following-sibling::span',
        "Artwork_Size": '//mark[@class="title" and text()="ขนาด"]/following-sibling::span',
        "Artwork_Technique": '//mark[@class="title" and text()="เทคนิค"]/following-sibling::span',
        "Exhibition_Name": '//mark[@class="title" and text()="นิทรรศการ"]/following-sibling::span',
        "Award_Name": '//mark[@class="title" and text()="รางวัลที่ได้รับ"]/following-sibling::span',
        "License": '//mark[@class="title" and text()="ผู้ครอบครอง"]/following-sibling::span',
        "Concept": "//*[@id='concept']/span",
        "Detail": "//*[@id='description']/span",
        "Image": "//img[@class='img-responsive']",
        "URL": ""
    }
    data = []
    # Scrape the data from each URL
    for url in urls:
        print("Scraping:", url)
        driver.get(url)
        # delete <p> tags that have no text so they do not pollute the extracted fields
        p_elements = driver.find_elements(By.XPATH, "//p[text()='']")
        for p_element in p_elements:
            driver.execute_script("arguments[0].remove();", p_element)
        scraped_data = {}
        # click any primary buttons on the page; ignore click failures
        try:
            for button in driver.find_elements(By.CSS_SELECTOR, ".btn.btn-primary"):
                button.click()
        except Exception:
            pass
        for field, xpath in data_fields.items():
            try:
                if field == "URL":
                    scraped_data[field] = url
                    continue
                element = driver.find_element(By.XPATH, xpath)
                if field == "Image":
                    scraped_data[field] = element.get_attribute('src')
                else:
                    scraped_data[field] = element.text
            except Exception:
                print("Error processing field:", field)
                scraped_data[field] = ""
        data.append(scraped_data)
    # Save the scraped data to a JSON file
    with open('data.json', 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, ensure_ascii=False, indent=4)
    # Close the WebDriver when done
    driver.quit()
# scarpUrl()
scrapInfo()