Skip to content

Commit

Permalink
Merge pull request #182 from g0v/adapt-to-str-obfuscation
Browse files Browse the repository at this point in the history
Adapt to str obfuscation
  • Loading branch information
ddio authored Sep 23, 2024
2 parents aa13683 + a971850 commit 79b2d25
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 66 deletions.
2 changes: 1 addition & 1 deletion scrapy-tw-rental-house/pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "scrapy-tw-rental-house"
version = "1.3.7"
version = "1.4.0"
description = "Scrapy spider for TW Rental House"
readme = "README.md"
authors = ["ddio <[email protected]>"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,25 +29,25 @@ def get_title(response):
.house-title title
'''
return {
'title': css(response, '.house-title h1::text')[0],
'deal_time': css(response, '.house-title .tag-deal::text'),
'breadcrumb': css(response, '.crumbs a.t5-link::text')
'title': css(response, '.house-title h1', self_text=True)[0],
'deal_time': css(response, '.house-title .tag-deal', self_text=True),
'breadcrumb': css(response, '.crumbs a.t5-link', self_text=True)
}

def get_house_pattern(response):
'''
.house-label 新上架、可開伙、有陽台
.house-pattern 物件類型、坪數、樓層/總樓層、建物類型
'''
tag_list = css(response, '.house-label span::text')
item_list = css(response, '.house-pattern span::text')
tag_list = css(response, '.house-label > span', self_text=True)
item_list = css(response, '.house-pattern > span', self_text=True)
items = {}
fields_def = ['property_type', 'floor_ping', 'floor', 'building_type']

if len(item_list) > 0 and '坪' in item_list[0]:
# if 整層住家 && 無房無廳無衛(??),坪數在第一個 🥹
fields_def = ['floor_ping', 'floor', 'building_type']
breadcrumb = css(response, '.crumbs a.t5-link::text')
breadcrumb = css(response, '.crumbs a.t5-link', self_text=True)
if breadcrumb and '整層住家' in breadcrumb:
items['property_type'] = '整層住家'

Expand All @@ -66,8 +66,8 @@ def get_house_price(response):
.house-price 租金、押金
押金 can be 押金*個月、押金面議,還可填其他(數值,不確定如何呈現)
'''
price = css(response, '.house-price .price strong::text')
deposit_str = css(response, '.house-price::text')
price = css(response, '.house-price .price strong', self_text=True)
deposit_str = css(response, '.house-price', self_text=True)

return {
'price': price[0],
Expand All @@ -78,7 +78,7 @@ def get_house_address(response):
'''
.address 約略經緯度、約略地址
'''
address_str = css(response, '.address .load-map::text')
address_str = css(response, '.address .load-map', self_text=True)

# lat lng is in NUXT init script
js_scripts = css(response, 'script::text')
Expand Down Expand Up @@ -109,14 +109,14 @@ def get_service(response):
services = {}
cate_list = response.css('.service .service-cate > div')
for cate in cate_list:
title = css(cate, 'p::text')[0]
content = css(cate, 'span::text')
title = css(cate, 'p', self_text=True)[0]
content = css(cate, 'span', self_text=True)
if content and title:
services[title] = content[0]

# .service .service-facility 提供設備
supported_facility = css(response, '.service .service-facility dl:not(.del) dd::text')
unsupported_facility = css(response, '.service .service-facility dl.del dd::text')
supported_facility = css(response, '.service .service-facility dl:not(.del) dd', self_text=True)
unsupported_facility = css(response, '.service .service-facility dl.del dd', self_text=True)
services['supported_facility'] = supported_facility
services['unsupported_facility'] = unsupported_facility
return services
Expand All @@ -125,7 +125,7 @@ def get_promotion(response):
'''
.preference-item 屋主直租、產權保障、etc..
'''
item_list = css(response, '.preference-item p:first-child::text')
item_list = css(response, '.preference-item p:first-child', self_text=True)
return {
'promotion': item_list
}
Expand All @@ -151,8 +151,8 @@ def get_misc_info(response):
*response.css('.house-detail .content.right .item')
]
for item in items:
title = css(item, '.label::text')[0]
content = css(item, '.value::text')
title = css(item, '.label', self_text=True)[0]
content = css(item, '.value', self_text=True)
if content and title:
misc[title] = content

Expand All @@ -166,9 +166,9 @@ def get_contact(response):
.contact-card .phone
'''
contact_card = response.css('.contact-card')
author_name = css(contact_card, '.name::text')
agent_org = css(contact_card, '.econ-name::text')
phone = css(contact_card, '.phone button span > span::text')
author_name = css(contact_card, '.name', self_text=True)
agent_org = css(contact_card, '.econ-name', self_text=True)
phone = css(contact_card, '.phone button span > span', self_text=True)

if author_name:
author_name = author_name[0]
Expand Down
35 changes: 30 additions & 5 deletions scrapy-tw-rental-house/scrapy_twrh/spiders/rental591/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,38 @@ def parse_price(number_string: str):

return ret

def css(base: Response, selector, default=None, deep_text=False):
def reorder_inline_flex_dom(base: Response, selector):
    '''
    Return one text value per element matched by `selector`, undoing the
    digit-shuffling obfuscation described in issue #181.

    Issue #181, we may get innerHTML like
    <span> <i style="order:2;font-style:normal;">5</i></span>
    where the visible order of the <i> children is decided by the CSS
    `order` property instead of their position in the document.

    For each matched element:
    - if it has a direct text node, the first one is used as is;
    - otherwise, if it contains `span[style*=display:inline-flex] > i`
      nodes, their texts are joined in ascending `order`;
    - otherwise the element contributes nothing to the result.

    :param base: object exposing `.css()`, e.g. a response or a selector
    :param selector: CSS selector of the elements to read
    :return: list of str, at most one entry per matched element
    '''
    ret = []
    for item in base.css(selector):
        # Only the first direct text node is considered, child text is ignored.
        plain_value = item.xpath('text()').get()
        if plain_value is not None:
            ret.append(plain_value)
            continue

        # child span may contain style="display:inline-flex;"
        i_list = item.css('span[style*=display\\:inline-flex] > i')
        if not i_list:
            continue

        # store each <i>'s order (from its inline style) and its ::text content
        shuffled_list = []
        for i in i_list:
            # An <i> without an explicit order falls back to 0, the CSS
            # initial value, instead of crashing on int(None). The lookbehind
            # keeps properties such as `border:1px` from being read as order.
            order = i.css('::attr(style)').re_first(
                r'(?<![\w-])order\s*:\s*(-?\d+)', default='0'
            )
            # An empty <i> has no text node; use '' so join() cannot fail.
            text = i.css('::text').get(default='')
            shuffled_list.append((int(order), text))

        # list.sort is stable, so nodes sharing an order keep document order
        shuffled_list.sort(key=lambda pair: pair[0])
        ret.append(''.join(text for _, text in shuffled_list))
    return ret

def css(base: Response, selector, default=None, deep_text=False, self_text=False):
'''retrieve text in clean way'''
if self_text:
ret = reorder_inline_flex_dom(base, selector)
elif deep_text:
# Issue #30, we may get innerHTML like "some of <kkkk></kkkk>target <qqq></qqq>string"
# deep_text=True retrieve text in the way different from ::text,
# which will also get all child text.
ret = map(lambda dom: ''.join(dom.css('*::text').getall()), base.css(selector))
else:
ret = base.css(selector).getall()
Expand Down
2 changes: 1 addition & 1 deletion scrapy-twrh-example/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

44 changes: 5 additions & 39 deletions twrh-dataset/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion twrh-dataset/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ readme = "README.md"
python = "^3.10"
# cffi = "==1.13.2"
django = "^5"
scrapy-tw-rental-house = "==1.3.7"
scrapy-tw-rental-house = "==1.4.0"
psycopg2-binary = "^2.9.9"
pylint-django = "^2.5.5"
sentry-sdk = "^1.39.1"
Expand Down

0 comments on commit 79b2d25

Please sign in to comment.