diff --git a/Pipfile b/Pipfile
index 6acdbb47..3662a8a4 100644
--- a/Pipfile
+++ b/Pipfile
@@ -14,7 +14,7 @@ django = "==2.1.15"
 jsonfield = "==2.0.2"
 raven = "==6.9.0"
 psycopg2-binary = "==2.8.6"
-scrapy-tw-rental-house = "==1.1.1"
+scrapy-tw-rental-house = "==1.1.2"
 
 [requires]
 python_version = "3"
diff --git a/Pipfile.lock b/Pipfile.lock
index d701ec58..d2094bf0 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "b550970a17bba890cd0aaf2996af64f185cb29f35eea0aab6837232a80ea72b9"
+            "sha256": "ff1dfcfbb8f590c849421d7b440a7ba02ee38c5715d1cbb22350bb276693706b"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -403,11 +403,11 @@
         },
         "scrapy-tw-rental-house": {
             "hashes": [
-                "sha256:2639ad2747b986b534b84025739d42aee856d0b14569c6d493dc5844b22ed6e5",
-                "sha256:4f9e191299f5440fb9b84fa77359f3e2aff0c530c0c1a03ef453b51a4ff3d32f"
+                "sha256:02a4ddc1e1fb78deded0273236f4c681855a1bbb17ab5889883ca9bb100b6339",
+                "sha256:c5231be11f29280cab54ef93505d829e5d819f168c04061fc29ad9e8e210f43c"
             ],
             "index": "pypi",
-            "version": "==1.1.1"
+            "version": "==1.1.2"
         },
         "service-identity": {
             "hashes": [
diff --git a/backend/rental/libs/export/field.py b/backend/rental/libs/export/field.py
index 90e75969..683bf66c 100644
--- a/backend/rental/libs/export/field.py
+++ b/backend/rental/libs/export/field.py
@@ -36,13 +36,13 @@ def to_human(self, val, use_tf=True):
         if self.fn:
             val = self.fn(val)
 
-        if type(val) is datetime:
+        if isinstance(val, datetime):
             val = timezone.localtime(val).strftime('%Y-%m-%d %H:%M:%S %Z')
-        elif val is '' or val is None:
+        elif val == '' or val is None:
             val = '-'
-        elif val is True or val == 'true':
+        elif val == True or val == 'true':
             val = 'T' if use_tf else 1
-        elif val is False or val == 'false':
+        elif val == False or val == 'false':
             val = 'F' if use_tf else 0
 
         return val
@@ -51,13 +51,13 @@ def to_machine(self, val):
         if self.fn:
             val = self.fn(val)
 
-        if type(val) is datetime:
+        if isinstance(val, datetime):
             pass
-        elif val is '' or val is None:
+        elif val == '' or val is None:
             val = None
-        elif val is True or val == 'true':
+        elif val == True or val == 'true':
             val = True
-        elif val is False or val == 'false':
+        elif val == False or val == 'false':
             val = False
 
         return val
diff --git a/crawler/crawler/items.py b/crawler/crawler/items.py
deleted file mode 100644
index c5200978..00000000
--- a/crawler/crawler/items.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://doc.scrapy.org/en/latest/topics/items.html
-
-from scrapy import Field, Item
-
-
-class GenericHouseItem(Item):
-    top_region = Field()
-    sub_region = Field()
-    deal_time = Field()
-    deal_status = Field()
-    n_day_deal = Field()
-    vendor = Field()
-    vendor_house_id = Field()
-    vendor_house_url = Field()
-    # price related
-    monthly_price = Field()
-    deposit_type = Field()
-    n_month_deposit = Field()
-    deposit = Field()
-    is_require_management_fee = Field()
-    monthly_management_fee = Field()
-    has_parking = Field()
-    is_require_parking_fee = Field()
-    monthly_parking_fee = Field()
-    per_ping_price = Field()
-    # other basic info
-    building_type = Field()
-    property_type = Field()
-    is_rooftop = Field()
-    floor = Field()
-    total_floor = Field()
-    dist_to_highest_floor = Field()
-    floor_ping = Field()
-    n_living_room = Field()
-    n_bed_room = Field()
-    n_bath_room = Field()
-    n_balcony = Field()
-    apt_feature_code = Field()
-    rough_address = Field()
-    rough_coordinate = Field()
-    # boolean map
-    # eletricity: true, water: true, gas: true, internet: true, cable_tv: true
-    additional_fee = Field()
-    # school, park, dept_store, conv_store, traditional_mkt, night_mkt,
-    # hospital, police_office
-    living_functions = Field()
-    # subway, bus, public_bike, train, hsr
-    transportation = Field()
-    has_tenant_restriction = Field()
-    has_gender_restriction = Field()
-    gender_restriction = Field()
-    can_cook = Field()
-    allow_pet = Field()
-    has_perperty_registration = Field()
-    # undermined for now
-    facilities = Field()
-    contact = Field()
-    author = Field()
-    agent_org = Field()
-    imgs = Field()
-
-
-class RawHouseItem(Item):
-    house_id = Field()
-    vendor = Field()
-    is_list = Field()
-    raw = Field()
-    dict = Field()
diff --git a/crawler/crawler/spiders/all_591_cities.py b/crawler/crawler/spiders/all_591_cities.py
deleted file mode 100644
index 46cb93aa..00000000
--- a/crawler/crawler/spiders/all_591_cities.py
+++ /dev/null
@@ -1,86 +0,0 @@
-all_591_cities = [
-    {
-        "city": "台北市",
-        "id": "1"
-    },
-    {
-        "city": "新北市",
-        "id": "3"
-    },
-    {
-        "city": "桃園市",
-        "id": "6"
-    },
-    {
-        "city": "新竹市",
-        "id": "4"
-    },
-    {
-        "city": "新竹縣",
-        "id": "5"
-    },
-    {
-        "city": "基隆市",
-        "id": "2"
-    },
-    {
-        "city": "宜蘭縣",
-        "id": "21"
-    },
-    {
-        "city": "台中市",
-        "id": "8"
-    },
-    {
-        "city": "彰化縣",
-        "id": "10"
-    },
-    {
-        "city": "苗栗縣",
-        "id": "7"
-    },
-    {
-        "city": "雲林縣",
-        "id": "14"
-    },
-    {
-        "city": "南投縣",
-        "id": "11"
-    },
-    {
-        "city": "高雄市",
-        "id": "17"
-    },
-    {
-        "city": "台南市",
-        "id": "15"
-    },
-    {
-        "city": "嘉義市",
-        "id": "12"
-    },
-    {
-        "city": "屏東縣",
-        "id": "19"
-    },
-    {
-        "city": "嘉義縣",
-        "id": "13"
-    },
-    {
-        "city": "花蓮縣",
-        "id": "23"
-    },
-    {
-        "city": "台東縣",
-        "id": "22"
-    },
-    {
-        "city": "金門縣",
-        "id": "25"
-    },
-    {
-        "city": "澎湖縣",
-        "id": "24"
-    }
-]
diff --git a/crawler/crawler/spiders/house_spider.py b/crawler/crawler/spiders/house_spider.py
deleted file mode 100644
index 21e57cec..00000000
--- a/crawler/crawler/spiders/house_spider.py
+++ /dev/null
@@ -1,290 +0,0 @@
-import scrapy
-import re
-import traceback
-import uuid
-from django.db import connection
-from scrapy.spidermiddlewares.httperror import HttpError
-from rental.models import HouseTS, Vendor
-from rental import models
-from crawlerrequest.models import RequestTS
-from crawlerrequest.enums import RequestType
-from rental.enums import UNKNOWN_ENUM
-
-# TODO: yield request
-
-class HouseSpider(scrapy.Spider):
-    queue_length = 30
-    n_live_spider = 0
-
-    def __init__(
-        self,
-        vendor,
-        is_list,
-        request_generator,
-        response_router=None,
-        response_parser=None,
-        **kwargs
-    ):
-        '''
-        request_gerator:
-            parameter: accept seed as variable
-            return: dictionary of request parameter
-
-            errback, meta.db_request, dont_filter, callback
-            will be added beforehand
-
-        response_parser:
-            Standard spider parser, don't need to handle request error and
-            exception.
-            Will be set as default request callback
-        '''
-        super().__init__(**kwargs)
-        y = models.current_year()
-        m = models.current_month()
-        d = models.current_day()
-        h = models.current_stepped_hour()
-
-        self.spider_id = str(uuid.uuid4())
-
-        try:
-            self.vendor = Vendor.objects.get(
-                name = vendor
-            )
-        except Vendor.DoesNotExist:
-            raise Exception('Vendor "{}" is not defined.'.format(vendor))
-
-        if is_list:
-            self.request_type = RequestType.LIST
-        else:
-            self.request_type = RequestType.DETAIL
-
-        self.request_generator = request_generator
-
-        if response_router:
-            self.response_router = response_router
-        elif response_parser:
-            self.response_router = lambda x: response_parser
-        else:
-            raise Exception('No response router or parser given')
-
-        self.ts = {
-            'y': y,
-            'm': m,
-            'd': d,
-            'h': h
-        }
-
-    def has_request(self):
-        undone_requests = RequestTS.objects.filter(
-            year = self.ts['y'],
-            month = self.ts['m'],
-            day = self.ts['d'],
-            hour = self.ts['h'],
-            # Ignore pending request since we will generate new one and rerun it anyway
-            is_pending = False,
-            vendor = self.vendor,
-            request_type = self.request_type
-        )[:1]
-
-        return undone_requests.count() > 0
-
-    def has_record(self):
-        today_houses = HouseTS.objects.filter(
-            year = self.ts['y'],
-            month = self.ts['m'],
-            day = self.ts['d'],
-            hour = self.ts['h'],
-            vendor = self.vendor
-        )[:1]
-
-        return today_houses.count() > 0
-
-    def gen_persist_request(self, seed):
-        RequestTS.objects.create(
-            request_type=self.request_type,
-            vendor=self.vendor,
-            seed=seed
-        )
-
-    def next_request(self, request_generator=None):
-        if self.n_live_spider >= self.queue_length:
-            # At most self.queue_length in memory
-            return None
-
-        # #21, temp workaround to get next_request ASAP
-        # this operation is still not atomic, different session may get the same request
-        with connection.cursor() as cursor:
-            sql = (
-                'update request_ts set owner = %s where id = ('
-                'select id from request_ts where year = %s and month = %s '
-                'and day = %s and hour = %s and vendor_id = %s and request_type = %s '
-                'and is_pending = %s and owner is null order by id limit 1)'
-            )
-            a = cursor.execute(sql, [
-                self.spider_id,
-                self.ts['y'],
-                self.ts['m'],
-                self.ts['d'],
-                self.ts['h'],
-                self.vendor.id,
-                self.request_type.value,
-                False
-            ])
-
-        next_row = RequestTS.objects.filter(
-            year=self.ts['y'],
-            month=self.ts['m'],
-            day=self.ts['d'],
-            hour=self.ts['h'],
-            vendor=self.vendor,
-            request_type=self.request_type,
-            is_pending=False,
-            owner=self.spider_id
-        ).order_by('created')
-
-        next_row = next_row.first()
-
-        if next_row is None:
-            return None
-
-        next_row.is_pending = True
-        next_row.save()
-        self.n_live_spider += 1
-
-        requestArgs = {
-            'dont_filter': True,
-            'errback': self.error_handler,
-            'callback': self.parser_wrapper,
-            'meta': {}
-        }
-
-        if not request_generator:
-            request_generator = self.request_generator
-
-        requestArgs = {
-            **requestArgs,
-            **request_generator(next_row.seed)
-        }
-
-        if 'db_request' not in requestArgs['meta']:
-            requestArgs['meta']['db_request'] = next_row
-
-        return scrapy.Request(**requestArgs)
-
-    def parser_wrapper(self, response):
-        db_request = response.meta['db_request']
-        db_request.last_status = response.status
-        db_request.save()
-
-        seed = response.meta.get('seed', {})
-
-        try:
-            response_parser = self.response_router(seed)
-            for item in response_parser(response):
-                if item is True:
-                    db_request.delete()
-                else:
-                    yield item
-        except:
-            self.logger.error(
-                'Parser error in {} when handle meta {}. [{}] - {:.128}'.format(
-                    self.name,
-                    seed,
-                    response.status,
-                    response.text
-                )
-            )
-            traceback.print_exc()
-
-        self.n_live_spider -= 1
-        # quick fix for concurrency issue
-        mercy = 10
-        while True:
-            next_request = self.next_request()
-            if next_request:
-                yield next_request
-            elif mercy < 0:
-                break
-            else:
-                mercy -= 1
-
-    def error_handler(self, failure):
-        self.n_live_spider -= 1
-        if failure.check(HttpError):
-            response = failure.value.response
-            self.logger.error('[Live|{}] HttpError on {}[{}]'.format(
-                self.n_live_spider, response.url, response.status))
-
-            request = failure.value.response.request.meta['db_request']
-            request.last_status = response.status
-
-            if response.status == 599:
-                request.is_pending = False
-
-            request.save()
-        else:
-            self.logger.error(
-                '[Live|{}] Error: {}'.format(self.n_live_spider, failure))
-
-    def clean_number(self, number_string):
-        if number_string is None or number_string == '':
-            return None
-
-        number_string = '{}'.format(number_string)
-        pure_number = re.sub('[^\\d.-]', '', number_string)
-        if pure_number == '':
-            # it could be '' if no digit included
-            return None
-        elif pure_number.isdigit():
-            return int(pure_number, base=10)
-        else:
-            return float(pure_number)
-
-    def get_enum(self, EnumCls, house_id, value):
-        try:
-            enum = EnumCls[value]
-        except KeyError:
-            self.logger.error('Unknown property: {}/{} in house {}'.format(
-                value,
-                EnumCls.__name__,
-                house_id
-            ))
-            enum = UNKNOWN_ENUM
-
-        return enum
-
-    def css_first(self, base, selector, default='', allow_empty=False, deep_text=False):
-        # Check how to find if there's missing attribute
-        css = self.css(base, selector, [default], deep_text=deep_text)
-        if css:
-            return css[0]
-
-        if not allow_empty:
-            self.logger.info(
-                'Fail to get css first from {}({})'.format(
-                    base,
-                    selector
-                )
-            )
-
-        return ''
-
-    def css(self, base, selector, default=None, deep_text=False):
-        # Issue #30, we may get innerHTML like "some of target string"
-        # deep_text=True retrieve text in the way different from ::text, which will also get all child text.
-        if deep_text:
-            ret = map(lambda dom: ''.join(dom.css('*::text').extract()), base.css(selector))
-        else:
-            ret = base.css(selector).extract()
-
-        if not ret:
-            ret = [] if default is None else default
-
-        ret = self.clean_string(ret)
-        return list(ret)
-
-    def clean_string(self, strings):
-        # remove empty and strip
-        strings = filter(lambda str: str.replace(u'\xa0', '').strip(), strings)
-        strings = map(lambda str: str.replace(u'\xa0', '').strip(), strings)
-        return strings
diff --git a/crawler/go.sh b/crawler/go.sh
index e03b0dc5..57ad5692 100755
--- a/crawler/go.sh
+++ b/crawler/go.sh
@@ -3,23 +3,22 @@
 now=`date +'%Y.%m.%d.%H%M'`
 
 mkdir -p ../logs
-. ../bin/activate
 
 echo '===== LIST ====='
-scrapy crawl list591 -L INFO
+pipenv run scrapy crawl list591 -L INFO
 mv scrapy.log ../logs/$now.list.log
 
 echo '===== DETAIL ====='
-scrapy crawl detail591 -L INFO
+pipenv run scrapy crawl detail591 -L INFO
 mv scrapy.log ../logs/$now.detail.log
 
 echo '===== STATEFUL UPDATE ====='
-python ../backend/manage.py syncstateful -ts
+pipenv run python ../backend/manage.py syncstateful -ts
 
 echo '===== CHECK EXPORT ====='
-python ../backend/manage.py export -p
+pipenv run python ../backend/manage.py export -p
 
 echo '===== GENERATE STATISTICS ====='
-python ../backend/manage.py statscheck
+pipenv run python ../backend/manage.py statscheck
 
 echo '===== FINALIZE ====='
diff --git a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
index cb1e40f0..7a22a600 100644
--- a/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
+++ b/scrapy-package/scrapy_twrh/spiders/rental591/detail_mixin.py
@@ -99,7 +99,7 @@ def default_parse_detail(self, response):
                 'House {} not found by receiving status code {}'
                 .format(house_id, response.status)
             )
-        return None
+            return None
 
         detail_dict = jsonResp['data']
         detail_dict['house_id'] = house_id
diff --git a/scrapy-package/setup.py b/scrapy-package/setup.py
index 6e61cb50..130e5b36 100644
--- a/scrapy-package/setup.py
+++ b/scrapy-package/setup.py
@@ -5,7 +5,7 @@ setuptools.setup(
     name="scrapy-tw-rental-house",
-    version="1.1.1",
+    version="1.1.2",
     author="ddio",
     author_email="ddio@ddio.io",
     description="Scrapy spider for TW Rental House",