diff --git a/Pipfile b/Pipfile
index 6acdbb47..3662a8a4 100644
--- a/Pipfile
+++ b/Pipfile
@@ -14,7 +14,7 @@ django = "==2.1.15"
jsonfield = "==2.0.2"
raven = "==6.9.0"
psycopg2-binary = "==2.8.6"
-scrapy-tw-rental-house = "==1.1.1"
+scrapy-tw-rental-house = "==1.1.2"

[requires]
python_version = "3"
diff --git a/Pipfile.lock b/Pipfile.lock
index d701ec58..d2094bf0 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "b550970a17bba890cd0aaf2996af64f185cb29f35eea0aab6837232a80ea72b9"
+ "sha256": "ff1dfcfbb8f590c849421d7b440a7ba02ee38c5715d1cbb22350bb276693706b"
},
"pipfile-spec": 6,
"requires": {
@@ -403,11 +403,11 @@
},
"scrapy-tw-rental-house": {
"hashes": [
- "sha256:2639ad2747b986b534b84025739d42aee856d0b14569c6d493dc5844b22ed6e5",
- "sha256:4f9e191299f5440fb9b84fa77359f3e2aff0c530c0c1a03ef453b51a4ff3d32f"
+ "sha256:02a4ddc1e1fb78deded0273236f4c681855a1bbb17ab5889883ca9bb100b6339",
+ "sha256:c5231be11f29280cab54ef93505d829e5d819f168c04061fc29ad9e8e210f43c"
],
"index": "pypi",
- "version": "==1.1.1"
+ "version": "==1.1.2"
},
"service-identity": {
"hashes": [
diff --git a/backend/rental/libs/export/field.py b/backend/rental/libs/export/field.py
index 90e75969..683bf66c 100644
--- a/backend/rental/libs/export/field.py
+++ b/backend/rental/libs/export/field.py
@@ -36,13 +36,13 @@ def to_human(self, val, use_tf=True):
if self.fn:
val = self.fn(val)

- if type(val) is datetime:
+ if isinstance(val, datetime):
val = timezone.localtime(val).strftime('%Y-%m-%d %H:%M:%S %Z')
- elif val is '' or val is None:
+ elif val == '' or val is None:
val = '-'
- elif val is True or val == 'true':
+ elif val == True or val == 'true':
val = 'T' if use_tf else 1
- elif val is False or val == 'false':
+ elif val == False or val == 'false':
val = 'F' if use_tf else 0

return val
@@ -51,13 +51,13 @@ def to_machine(self, val):
if self.fn:
val = self.fn(val)

- if type(val) is datetime:
+ if isinstance(val, datetime):
pass
- elif val is '' or val is None:
+ elif val == '' or val is None:
val = None
- elif val is True or val == 'true':
+ elif val == True or val == 'true':
val = True
- elif val is False or val == 'false':
+ elif val == False or val == 'false':
val = False

return val
diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py
deleted file mode 100644
index c5200978..00000000
--- a/crawler/crawler/items.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://doc.scrapy.org/en/latest/topics/items.html
-
-from scrapy import Field, Item
-
-
-class GenericHouseItem(Item):
- top_region = Field()
- sub_region = Field()
- deal_time = Field()
- deal_status = Field()
- n_day_deal = Field()
- vendor = Field()
- vendor_house_id = Field()
- vendor_house_url = Field()
- # price related
- monthly_price = Field()
- deposit_type = Field()
- n_month_deposit = Field()
- deposit = Field()
- is_require_management_fee = Field()
- monthly_management_fee = Field()
- has_parking = Field()
- is_require_parking_fee = Field()
- monthly_parking_fee = Field()
- per_ping_price = Field()
- # other basic info
- building_type = Field()
- property_type = Field()
- is_rooftop = Field()
- floor = Field()
- total_floor = Field()
- dist_to_highest_floor = Field()
- floor_ping = Field()
- n_living_room = Field()
- n_bed_room = Field()
- n_bath_room = Field()
- n_balcony = Field()
- apt_feature_code = Field()
- rough_address = Field()
- rough_coordinate = Field()
- # boolean map
- # eletricity: true, water: true, gas: true, internet: true, cable_tv: true
- additional_fee = Field()
- # school, park, dept_store, conv_store, traditional_mkt, night_mkt,
- # hospital, police_office
- living_functions = Field()
- # subway, bus, public_bike, train, hsr
- transportation = Field()
- has_tenant_restriction = Field()
- has_gender_restriction = Field()
- gender_restriction = Field()
- can_cook = Field()
- allow_pet = Field()
- has_perperty_registration = Field()
- # undermined for now
- facilities = Field()
- contact = Field()
- author = Field()
- agent_org = Field()
- imgs = Field()
-
-
-class RawHouseItem(Item):
- house_id = Field()
- vendor = Field()
- is_list = Field()
- raw = Field()
- dict = Field()
diff --git a/crawler/crawler/spiders/all_591_cities.py b/crawler/crawler/spiders/all_591_cities.py
deleted file mode 100644
index 46cb93aa..00000000
--- a/crawler/crawler/spiders/all_591_cities.py
+++ /dev/null
@@ -1,86 +0,0 @@
-all_591_cities = [
- {
- "city": "台北市",
- "id": "1"
- },
- {
- "city": "新北市",
- "id": "3"
- },
- {
- "city": "桃園市",
- "id": "6"
- },
- {
- "city": "新竹市",
- "id": "4"
- },
- {
- "city": "新竹縣",
- "id": "5"
- },
- {
- "city": "基隆市",
- "id": "2"
- },
- {
- "city": "宜蘭縣",
- "id": "21"
- },
- {
- "city": "台中市",
- "id": "8"
- },
- {
- "city": "彰化縣",
- "id": "10"
- },
- {
- "city": "苗栗縣",
- "id": "7"
- },
- {
- "city": "雲林縣",
- "id": "14"
- },
- {
- "city": "南投縣",
- "id": "11"
- },
- {
- "city": "高雄市",
- "id": "17"
- },
- {
- "city": "台南市",
- "id": "15"
- },
- {
- "city": "嘉義市",
- "id": "12"
- },
- {
- "city": "屏東縣",
- "id": "19"
- },
- {
- "city": "嘉義縣",
- "id": "13"
- },
- {
- "city": "花蓮縣",
- "id": "23"
- },
- {
- "city": "台東縣",
- "id": "22"
- },
- {
- "city": "金門縣",
- "id": "25"
- },
- {
- "city": "澎湖縣",
- "id": "24"
- }
-]
diff --git a/crawler/crawler/spiders/house_spider.py b/crawler/crawler/spiders/house_spider.py
deleted file mode 100644
index 21e57cec..00000000
--- a/crawler/crawler/spiders/house_spider.py
+++ /dev/null
@@ -1,290 +0,0 @@
-import scrapy
-import re
-import traceback
-import uuid
-from django.db import connection
-from scrapy.spidermiddlewares.httperror import HttpError
-from rental.models import HouseTS, Vendor
-from rental import models
-from crawlerrequest.models import RequestTS
-from crawlerrequest.enums import RequestType
-from rental.enums import UNKNOWN_ENUM
-
-# TODO: yield request
-
-class HouseSpider(scrapy.Spider):
- queue_length = 30
- n_live_spider = 0
-
- def __init__(
- self,
- vendor,
- is_list,
- request_generator,
- response_router=None,
- response_parser=None,
- **kwargs
- ):
- '''
- request_gerator:
- parameter: accept seed as variable
- return: dictionary of request parameter
-
- errback, meta.db_request, dont_filter, callback
- will be added beforehand
-
- response_parser:
- Standard spider parser, don't need to handle request error and
- exception.
- Will be set as default request callback
- '''
- super().__init__(**kwargs)
- y = models.current_year()
- m = models.current_month()
- d = models.current_day()
- h = models.current_stepped_hour()
-
- self.spider_id = str(uuid.uuid4())
-
- try:
- self.vendor = Vendor.objects.get(
- name = vendor
- )
- except Vendor.DoesNotExist:
- raise Exception('Vendor "{}" is not defined.'.format(vendor))
-
- if is_list:
- self.request_type = RequestType.LIST
- else:
- self.request_type = RequestType.DETAIL
-
- self.request_generator = request_generator
-
- if response_router:
- self.response_router = response_router
- elif response_parser:
- self.response_router = lambda x: response_parser
- else:
- raise Exception('No response router or parser given')
-
- self.ts = {
- 'y': y,
- 'm': m,
- 'd': d,
- 'h': h
- }
-
- def has_request(self):
- undone_requests = RequestTS.objects.filter(
- year = self.ts['y'],
- month = self.ts['m'],
- day = self.ts['d'],
- hour = self.ts['h'],
- # Ignore pending request since we will generate new one and rerun it anyway
- is_pending = False,
- vendor = self.vendor,
- request_type = self.request_type
- )[:1]
-
- return undone_requests.count() > 0
-
- def has_record(self):
- today_houses = HouseTS.objects.filter(
- year = self.ts['y'],
- month = self.ts['m'],
- day = self.ts['d'],
- hour = self.ts['h'],
- vendor = self.vendor
- )[:1]
-
- return today_houses.count() > 0
-
- def gen_persist_request(self, seed):
- RequestTS.objects.create(
- request_type=self.request_type,
- vendor=self.vendor,
- seed=seed
- )
-
- def next_request(self, request_generator=None):
- if self.n_live_spider >= self.queue_length:
- # At most self.queue_length in memory
- return None
-
- # #21, temp workaround to get next_request ASAP
- # this operation is still not atomic, different session may get the same request
- with connection.cursor() as cursor:
- sql = (
- 'update request_ts set owner = %s where id = ('
- 'select id from request_ts where year = %s and month = %s '
- 'and day = %s and hour = %s and vendor_id = %s and request_type = %s '
- 'and is_pending = %s and owner is null order by id limit 1)'
- )
- a = cursor.execute(sql, [
- self.spider_id,
- self.ts['y'],
- self.ts['m'],
- self.ts['d'],
- self.ts['h'],
- self.vendor.id,
- self.request_type.value,
- False
- ])
-
- next_row = RequestTS.objects.filter(
- year=self.ts['y'],
- month=self.ts['m'],
- day=self.ts['d'],
- hour=self.ts['h'],
- vendor=self.vendor,
- request_type=self.request_type,
- is_pending=False,
- owner=self.spider_id
- ).order_by('created')
-
- next_row = next_row.first()
-
- if next_row is None:
- return None
-
- next_row.is_pending = True
- next_row.save()
- self.n_live_spider += 1
-
- requestArgs = {
- 'dont_filter': True,
- 'errback': self.error_handler,
- 'callback': self.parser_wrapper,
- 'meta': {}
- }
-
- if not request_generator:
- request_generator = self.request_generator
-
- requestArgs = {
- **requestArgs,
- **request_generator(next_row.seed)
- }
-
- if 'db_request' not in requestArgs['meta']:
- requestArgs['meta']['db_request'] = next_row
-
- return scrapy.Request(**requestArgs)
-
- def parser_wrapper(self, response):
- db_request = response.meta['db_request']
- db_request.last_status = response.status
- db_request.save()
-
- seed = response.meta.get('seed', {})
-
- try:
- response_parser = self.response_router(seed)
- for item in response_parser(response):
- if item is True:
- db_request.delete()
- else:
- yield item
- except:
- self.logger.error(
- 'Parser error in {} when handle meta {}. [{}] - {:.128}'.format(
- self.name,
- seed,
- response.status,
- response.text
- )
- )
- traceback.print_exc()
-
- self.n_live_spider -= 1
- # quick fix for concurrency issue
- mercy = 10
- while True:
- next_request = self.next_request()
- if next_request:
- yield next_request
- elif mercy < 0:
- break
- else:
- mercy -= 1
-
- def error_handler(self, failure):
- self.n_live_spider -= 1
- if failure.check(HttpError):
- response = failure.value.response
- self.logger.error('[Live|{}] HttpError on {}[{}]'.format(
- self.n_live_spider, response.url, response.status))
-
- request = failure.value.response.request.meta['db_request']
- request.last_status = response.status
-
- if response.status == 599:
- request.is_pending = False
-
- request.save()
- else:
- self.logger.error(
- '[Live|{}] Error: {}'.format(self.n_live_spider, failure))
-
- def clean_number(self, number_string):
- if number_string is None or number_string == '':
- return None
-
- number_string = '{}'.format(number_string)
- pure_number = re.sub('[^\\d.-]', '', number_string)
- if pure_number == '':
- # it could be '' if no digit included
- return None
- elif pure_number.isdigit():
- return int(pure_number, base=10)
- else:
- return float(pure_number)
-
- def get_enum(self, EnumCls, house_id, value):
- try:
- enum = EnumCls[value]
- except KeyError:
- self.logger.error('Unknown property: {}/{} in house {}'.format(
- value,
- EnumCls.__name__,
- house_id
- ))
- enum = UNKNOWN_ENUM
-
- return enum
-
- def css_first(self, base, selector, default='', allow_empty=False, deep_text=False):
- # Check how to find if there's missing attribute
- css = self.css(base, selector, [default], deep_text=deep_text)
- if css:
- return css[0]
-
- if not allow_empty:
- self.logger.info(
- 'Fail to get css first from {}({})'.format(
- base,
- selector
- )
- )
-
- return ''
-
- def css(self, base, selector, default=None, deep_text=False):
- # Issue #30, we may get innerHTML like "some of target string"
- # deep_text=True retrieve text in the way different from ::text, which will also get all child text.
- if deep_text:
- ret = map(lambda dom: ''.join(dom.css('*::text').extract()), base.css(selector))
- else:
- ret = base.css(selector).extract()
-
- if not ret:
- ret = [] if default is None else default
-
- ret = self.clean_string(ret)
- return list(ret)
-
- def clean_string(self, strings):
- # remove empty and strip
- strings = filter(lambda str: str.replace(u'\xa0', '').strip(), strings)
- strings = map(lambda str: str.replace(u'\xa0', '').strip(), strings)
- return strings
diff --git a/crawler/go.sh b/crawler/go.sh
index e03b0dc5..57ad5692 100755
--- a/crawler/go.sh
+++ b/crawler/go.sh
@@ -3,23 +3,22 @@
now=`date +'%Y.%m.%d.%H%M'`
mkdir -p ../logs

-. ../bin/activate

echo '===== LIST ====='
-scrapy crawl list591 -L INFO
+pipenv run scrapy crawl list591 -L INFO
mv scrapy.log ../logs/$now.list.log

echo '===== DETAIL ====='
-scrapy crawl detail591 -L INFO
+pipenv run scrapy crawl detail591 -L INFO
mv scrapy.log ../logs/$now.detail.log

echo '===== STATEFUL UPDATE ====='
-python ../backend/manage.py syncstateful -ts
+pipenv run python ../backend/manage.py syncstateful -ts

echo '===== CHECK EXPORT ====='
-python ../backend/manage.py export -p
+pipenv run python ../backend/manage.py export -p

echo '===== GENERATE STATISTICS ====='
-python ../backend/manage.py statscheck
+pipenv run python ../backend/manage.py statscheck

echo '===== FINALIZE ====='
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
index cb1e40f0..7a22a600 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
@@ -99,7 +99,7 @@ def default_parse_detail(self, response):
'House {} not found by receiving status code {}'
.format(house_id, response.status)
)
- return None
+ return None

detail_dict = jsonResp['data']
detail_dict['house_id'] = house_id
diff --git a/scrapy-package/setup.py b/scrapy-package/setup.py
index 6e61cb50..130e5b36 100644
--- a/scrapy-package/setup.py
+++ b/scrapy-package/setup.py
@@ -5,7 +5,7 @@

setuptools.setup(
name="scrapy-tw-rental-house",
- version="1.1.1",
+ version="1.1.2",
author="ddio",
author_email="ddio@ddio.io",
description="Scrapy spider for TW Rental House",