-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstart_a_business.py
103 lines (87 loc) · 3.19 KB
/
start_a_business.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from config import *
from urllib.parse import urlencode
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
from multiprocessing import Pool
import time
import random
from datetime import datetime
import pprint
# Random User-Agent so requests look like they come from different browsers.
from fake_useragent import UserAgent
ua = UserAgent()
# Shared request headers for every HTTP call made by this scraper.
# NOTE(review): ua.random is evaluated once at import time, so all
# requests in one run reuse the same User-Agent string.
headers = {'User-Agent': ua.random}
# Build the query string and fetch one page of search results.
def get_most_page(start):
    """Download one page of the project list from cy.ncss.org.cn.

    Args:
        start: value sent as ``pageIndex`` (the page number to fetch).

    Returns:
        The response body (str) on HTTP 200, otherwise ``None`` —
        including on connection failure.
    """
    data = {
        'name': "",
        'industryCode': "",
        'typeCode': "",
        'wasBindUniTechnology': "-9",
        'investStageCode': "",
        'provinceCode': '43',   # province filter — presumably Hunan (code 43); TODO confirm
        'pageIndex': start,
        'pageSize': '15',
    }
    url = 'http://cy.ncss.org.cn/search/projectlist?' + urlencode(data)
    # The original wrapped the dict construction in a second
    # try/except ConnectionError — dead code, since only the request
    # below can raise it. Only the request is guarded now.
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None
# Parse the project cards out of one search-results HTML page.
def parse_page_index(html):
    """Yield one dict per project card found in *html*.

    Args:
        html: page markup returned by ``get_most_page``; may be ``None``
            when the download failed, in which case nothing is yielded
            (the original crashed BeautifulSoup with a TypeError here).

    Yields:
        dict with the project's name, school, province, type, industry,
        introduction, absolute detail-page URL, and the scrape date.
    """
    if not html:
        return
    # Scrape date, formatted once per page instead of per card.
    save_date = '记录时间 ' + datetime.now().strftime('%Y-%m-%d')
    soup = BeautifulSoup(html, "lxml")
    for card in soup.select('.search-list-item'):
        # Guard each card individually: the original wrapped the whole
        # loop, so one malformed card silently dropped the rest of the page.
        try:
            tags = card.select('.project-list-item-tags-text')[0].select("span")
            detail_path = card.select('.project-list-info')[0].select("a")[0].get('href')
            yield {
                'project_name': card.select('.project-list-item-title')[0].text,
                'project_school': tags[0].text,
                'project_province': tags[1].text,
                'project_type': tags[2].text,
                'project_industry': card.select('.project-list-item-tags-img')[0].text,
                'project_Introduction': card.select('.project-list-item-desc')[0].text,
                'url': 'http://cy.ncss.org.cn/' + detail_path,
                'save_date': save_date,  # date this record was scraped
            }
        except IndexError as e:
            # Malformed card: report and continue with the next one.
            print(e)
def main(start):
    """Scrape one results page and upsert each project into the database.

    Args:
        start: page index forwarded to ``get_most_page``.
    """
    html = get_most_page(start)
    if html is None:
        # Download failed — nothing to parse or store for this page.
        return
    for it in parse_page_index(html):
        # `project` is presumably a MongoDB collection from `config` — verify.
        # NOTE(review): Collection.update() was removed in PyMongo 4;
        # migrate to update_one({'url': ...}, {'$set': it}, upsert=True)
        # once the installed driver version is confirmed.
        if project.update({'url': it['url']}, {'$set': it}, True):  # url acts as the unique key for the upsert
            pprint.pprint(it)
            print('')
            print('')
if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended monotonic timer for measuring elapsed time.
    start_time = time.perf_counter()
    pool = Pool(processes=4)
    # `end` (last page index) is presumably defined in config — verify.
    pages = list(range(0, end + 1))
    pool.map(main, pages)  # scrape pages in parallel across 4 worker processes
    pool.close()
    pool.join()
    end_time = time.perf_counter()
    print('\n', '程序结束', ' 共运行了 ', str(end_time - start_time), ' 秒')