Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat/add-similar-objects #7

Open
DefJM opened this issue May 12, 2020 · 0 comments
Open

feat/add-similar-objects #7

DefJM opened this issue May 12, 2020 · 0 comments

Comments

@DefJM
Copy link

DefJM commented May 12, 2020

Hi @asmaier

I'd like to create a pull request to add "similar objects" in the parsing section of immoscout.py.
Can you give me the rights so I can do so? Currently I don't seem to have permission.

Thanks a lot!
Jan

# -*- coding: utf-8 -*-
import scrapy
import json
from immospider.items import ImmoscoutItem


class ImmoscoutSpider(scrapy.Spider):
    """Spider that turns immobilienscout24.de search-result pages into items.

    The first request is made to ``self.url``; this class does not set that
    attribute itself (presumably it is supplied as a spider argument, e.g.
    ``-a url=...`` -- verify against how the spider is launched).
    """

    name = "immoscout"
    allowed_domains = ["immobilienscout24.de"]
    # Example search URLs:
    # start_urls = ['https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin']
    # start_urls = ['https://www.immobilienscout24.de/Suche/S-2/Wohnung-Miete/Berlin/Berlin/Lichterfelde-Steglitz_Nikolassee-Zehlendorf_Dahlem-Zehlendorf_Zehlendorf-Zehlendorf/2,50-/60,00-/EURO--800,00/-/-/']

    # The immoscout search results are stored as json inside their javascript. This makes the parsing very easy.
    # I learned this trick from https://github.com/balzer82/immoscraper/blob/master/immoscraper.ipynb .
    script_xpath = './/script[contains(., "IS24.resultList")]'
    next_xpath = '//div[@id = "pager"]/div/a/@href'

    # (key in the real-estate JSON, item field) pairs that are copied verbatim
    # when -- and only when -- the key is present.
    _OPTIONAL_FIELDS = (
        ("builtInKitchen", "kitchen"),
        ("balcony", "balcony"),
        ("garden", "garden"),
        ("privateOffer", "private"),
        ("plotArea", "area"),
        ("cellar", "cellar"),
    )

    @staticmethod
    def _as_list(value):
        """Return *value* as a list: ``None`` -> ``[]``, a list unchanged, anything else wrapped."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    def start_requests(self):
        """Yield the single initial request for ``self.url``."""
        yield scrapy.Request(self.url)

    def parse(self, response):
        """Yield one item per search result (and per "similar object"), then follow the pager.

        :param response: response of a search-result page
        :return: generator of ``ImmoscoutItem`` objects and, if the pager
            contains links, one ``scrapy.Request`` for the last of them
        """
        print(response.url)

        # extract_first() would return None when no script matches; default
        # to "" so that the page simply yields no items and the pager below
        # is still evaluated.
        script = response.xpath(self.script_xpath).extract_first(default="")

        for raw_line in script.split('\n'):
            line = raw_line.strip()
            if not line.startswith('resultListModel'):
                continue

            # Drop the first 17 characters and the last one before decoding
            # (presumably the 'resultListModel: ' prefix and a trailing
            # comma -- verify against the live page source).
            immo_json = json.loads(line[17:-1])

            entries = immo_json["searchResponseModel"]["resultlist.resultlist"]["resultlistEntries"][0]
            # On result pages with just a single result "resultlistEntry" is
            # a dictionary rather than a list, so normalise it first.
            for result in self._as_list(entries.get("resultlistEntry")):
                yield self.parse_result(result, response)

                # check for and parse "similar objects" with additional matching results in json body
                if "similarObjects" in result:
                    similar = result["similarObjects"][0]["similarObject"]
                    # NOTE(review): assumed to degrade to a single dict the
                    # same way "resultlistEntry" does -- confirm.
                    for similar_object in self._as_list(similar):
                        yield self.parse_data_object(similar_object, response)

        next_page_list = response.xpath(self.next_xpath).extract()
        if next_page_list:
            # Only the last pager link is followed.
            next_page = next_page_list[-1]
            print("Scraping next page", next_page)
            if next_page:
                next_page = response.urljoin(next_page)
                yield scrapy.Request(next_page, callback=self.parse)

    def parse_result(self, result, response):
        """Build an item from one ``resultlistEntry`` of the search JSON.

        :param result: result entry; its ``"resultlist.realEstate"`` value
            holds the listing data
        :type result: dict
        :param response: response the entry came from (used to build the URL)
        :return: the populated item
        :rtype: ImmoscoutItem
        :raises KeyError: if ``"resultlist.realEstate"`` or a mandatory
            listing field is missing
        """
        return self.parse_data_object(result["resultlist.realEstate"], response)

    def parse_data_object(self, data, response):
        """Build an item from a real-estate data dictionary.

        Used for the ``"resultlist.realEstate"`` part of a regular result and
        for entries of ``"similarObject"``.

        NOTE(review): similar objects are assumed to have the same fields as
        ``"resultlist.realEstate"`` -- confirm against a live response.

        :param data: listing data
        :type data: dict
        :param response: response the data came from (used to build the URL)
        :return: the populated item
        :rtype: ImmoscoutItem
        :raises KeyError: if a mandatory field (id, title, address, city,
            postcode, quarter, price, livingSpace, numberOfRooms) is missing
        """
        item = ImmoscoutItem()

        item["immo_id"] = data["@id"]
        item["url"] = response.urljoin("/expose/" + str(data["@id"]))
        item["title"] = data["title"]

        address = data["address"]
        try:
            item["address"] = address["street"] + " " + address["houseNumber"]
        except (KeyError, TypeError):
            # Street or house number missing / not a string.
            item["address"] = None
        item["city"] = address["city"]
        item["zip_code"] = address["postcode"]
        item["district"] = address["quarter"]

        item["rent"] = data["price"]["value"]
        item["sqm"] = data["livingSpace"]
        item["rooms"] = data["numberOfRooms"]

        if "calculatedPrice" in data:
            item["extra_costs"] = (
                data["calculatedPrice"]["value"] - data["price"]["value"]
            )
        for source_key, item_field in self._OPTIONAL_FIELDS:
            if source_key in data:
                item[item_field] = data[source_key]

        try:
            contact = data["contactDetails"]
            item["contact_name"] = contact["firstname"] + " " + contact["lastname"]
        except (KeyError, TypeError):
            item["contact_name"] = None

        try:
            item["media_count"] = len(data["galleryAttachments"]["attachment"])
        except (KeyError, TypeError):
            item["media_count"] = 0

        try:
            item["lat"] = address["wgs84Coordinate"]["latitude"]
            item["lng"] = address["wgs84Coordinate"]["longitude"]
        except (KeyError, TypeError):
            # Reset both so a half-filled coordinate is never emitted.
            item["lat"] = None
            item["lng"] = None

        print(item)

        return item

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant