diff --git a/proxypool/getter.py b/proxypool/getter.py
index cb99766..b9906c6 100644
--- a/proxypool/getter.py
+++ b/proxypool/getter.py
@@ -35,7 +35,7 @@ def crawl_ip181(self):
html = get_page(start_url)
ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
# \s* matches whitespace, letting the pattern span line breaks
- re_ip_adress = ip_adress.findall(html)
+ re_ip_adress = ip_adress.findall(str(html))
for adress, port in re_ip_adress:
result = adress + ':' + port
yield result.replace(' ', '')
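A note on the recurring `findall(str(html))` change: `get_page()` presumably returns `None` when a request fails, and `re.findall` raises a `TypeError` on a non-string argument. Wrapping the result in `str()` silences the error, at the cost of scanning the literal string `'None'`. A minimal sketch of both behaviours (the pattern is illustrative, not one of the crawlers' actual regexes):

```python
import re

# illustrative pattern, same shape as the crawlers' table-cell regexes
pattern = re.compile(r'<td>(.*?)</td>\s*<td>(.*?)</td>')

html = None  # simulate get_page() failing and returning None
# pattern.findall(html)  # raises TypeError: expected string or bytes-like object
print(pattern.findall(str(html)))  # [] -- str(None) is 'None', which matches nothing

# an explicit guard is a clearer alternative to str():
if html:
    print(pattern.findall(html))
```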
@@ -48,7 +48,7 @@ def crawl_kuaidaili(self):
ip_adress = re.compile(
'<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>'
)
- re_ip_adress = ip_adress.findall(html)
+ re_ip_adress = ip_adress.findall(str(html))
for adress, port in re_ip_adress:
result = adress + ':' + port
yield result.replace(' ', '')
@@ -61,7 +61,7 @@ def crawl_xicidaili(self):
ip_adress = re.compile(
'<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>'
)
# \s* matches whitespace, letting the pattern span line breaks
- re_ip_adress = ip_adress.findall(html)
+ re_ip_adress = ip_adress.findall(str(html))
for adress, port in re_ip_adress:
result = adress + ':' + port
yield result.replace(' ', '')
@@ -88,7 +88,7 @@ def crawl_data5u(self):
ip_adress = re.compile(
' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li>(.*?)</li></span>'
)
# \s* matches whitespace, letting the pattern span line breaks
- re_ip_adress = ip_adress.findall(html)
+ re_ip_adress = ip_adress.findall(str(html))
for adress, port in re_ip_adress:
result = adress + ':' + port
yield result.replace(' ', '')
@@ -99,7 +99,7 @@ def crawl_kxdaili(self):
html = get_page(start_url)
ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
# \s* matches whitespace, letting the pattern span line breaks
- re_ip_adress = ip_adress.findall(html)
+ re_ip_adress = ip_adress.findall(str(html))
for adress, port in re_ip_adress:
result = adress + ':' + port
yield result.replace(' ', '')
@@ -111,7 +111,7 @@ def crawl_premproxy(self):
html = get_page(start_url)
if html:
ip_adress = re.compile('<td data-label="IP:port ">(.*?)</td>')
- re_ip_adress = ip_adress.findall(html)
+ re_ip_adress = ip_adress.findall(str(html))
for adress_port in re_ip_adress:
yield adress_port.replace(' ', '')
@@ -123,10 +123,10 @@ def crawl_xroxy(self):
if html:
ip_adress1 = re.compile(
"title='View this Proxy details'>\s*(.*).*")
- re_ip_adress1 = ip_adress1.findall(html)
+ re_ip_adress1 = ip_adress1.findall(str(html))
ip_adress2 = re.compile(
"title='Select proxies with port number .*'>(.*)</a>")
- re_ip_adress2 = ip_adress2.findall(html)
+ re_ip_adress2 = ip_adress2.findall(str(html))
for adress, port in zip(re_ip_adress1, re_ip_adress2):
adress_port = adress + ':' + port
yield adress_port.replace(' ', '')
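Aside: `crawl_xroxy` is the only crawler that extracts addresses and ports with two separate regexes and pairs the result lists with `zip()`. A small illustration with made-up data shows the failure mode to watch for:

```python
addresses = ['1.2.3.4', '5.6.7.8', '9.9.9.9']
ports = ['8080', '3128']  # suppose one port failed to match

# zip() stops at the shorter list, so the third address is silently
# dropped instead of raising an error -- worth knowing when debugging
for adress, port in zip(addresses, ports):
    print((adress + ':' + port).replace(' ', ''))
```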
diff --git a/proxypool/schedule.py b/proxypool/schedule.py
index cf75c93..329b450 100644
--- a/proxypool/schedule.py
+++ b/proxypool/schedule.py
@@ -24,6 +24,7 @@ def set_raw_proxies(self, proxies):
self._raw_proxies = proxies
self._conn = RedisClient()
+ # use aiohttp to check proxies asynchronously
async def test_single_proxy(self, proxy):
"""
Test one proxy; if valid, put it into usable_proxies.
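For context on the new comment, here is a minimal sketch of what an aiohttp-based asynchronous check can look like; the test URL, timeout, and return convention are placeholders rather than the project's actual settings:

```python
import asyncio

import aiohttp

TEST_API = 'http://httpbin.org/get'  # placeholder test endpoint

async def check_proxy(proxy):
    # the request runs without blocking the event loop, so many
    # proxies can be tested concurrently via asyncio.gather/wait
    real_proxy = 'http://' + proxy
    try:
        async with aiohttp.ClientSession() as session:
            async with session.get(TEST_API, proxy=real_proxy,
                                   timeout=aiohttp.ClientTimeout(total=15)) as resp:
                return resp.status == 200
    except Exception:
        return False

if __name__ == '__main__':
    print(asyncio.run(check_proxy('127.0.0.1:8888')))
```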
@@ -102,8 +103,8 @@ def valid_proxy(cycle=VALID_CHECK_CYCLE):
"""
Get half of the proxies stored in Redis
"""
- conn = RedisClient()
- tester = ValidityTester()
+ conn = RedisClient() # Redis connection object
+ tester = ValidityTester() # tester that checks whether proxies are usable
while True:
print('Refreshing ip')
count = int(0.5 * conn.queue_len)
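The loop above re-tests half of the pool each cycle. A stand-in sketch of that bookkeeping, with a plain list in place of `RedisClient` (the real class pops the proxies from Redis):

```python
queue = ['1.2.3.4:8080', '5.6.7.8:3128', '9.9.9.9:80']  # stand-in for Redis

count = int(0.5 * len(queue))  # test half of the current pool per cycle
if count == 0:
    print('Waiting for adding')  # pool empty; sleep a cycle and retry
else:
    raw_proxies, queue = queue[:count], queue[count:]
    print('Testing', raw_proxies)
```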
@@ -132,6 +133,8 @@ def check_pool(lower_threshold=POOL_LOWER_THRESHOLD,
def run(self):
print('Ip processing running')
+ # two processes run here: check_pool fetches proxies from the web, filters them, and stores them in the database;
+ # valid_proxy pulls proxies back out of the database and re-tests them
valid_process = Process(target=Schedule.valid_proxy)
check_process = Process(target=Schedule.check_pool)
valid_process.start()
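To make the two-process layout in the new comments concrete, here is a self-contained sketch with stand-in workers; the real targets are `Schedule.valid_proxy` and `Schedule.check_pool`:

```python
from multiprocessing import Process

def valid_proxy():
    print('pulling proxies out of the database and re-testing them')

def check_pool():
    print('fetching proxies from the web, filtering, storing to the database')

if __name__ == '__main__':
    valid_process = Process(target=valid_proxy)
    check_process = Process(target=check_pool)
    valid_process.start()
    check_process.start()
    valid_process.join()
    check_process.join()
```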