|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +# Zillow scraper functions, these are sourced at the top of zillow_runfile.py |
| 3 | + |
| 4 | +import re as re |
| 5 | +import time |
| 6 | +import zipcode |
| 7 | +from selenium import webdriver |
| 8 | +from selenium.webdriver.common.by import By |
| 9 | +from selenium.webdriver.support.ui import WebDriverWait |
| 10 | +from selenium.webdriver.support import expected_conditions as EC |
| 11 | +from selenium.common.exceptions import TimeoutException |
| 12 | +from selenium.common.exceptions import NoSuchElementException |
| 13 | + |
def zipcodes_list(st_items):
    """Expand a zipcode prefix (or list of prefixes) into matching zipcode strings.

    Parameters:
        st_items (str or list): a zipcode prefix such as "100", or a list of
            such prefixes, passed to zipcode.islike().

    Returns:
        list of str: the matching zipcodes, one per zipcode object found.

    Raises:
        ValueError: if st_items is neither a str nor a list.
    """
    # Use isinstance() rather than type() equality checks (PEP 8); this also
    # accepts str/list subclasses.
    if isinstance(st_items, str):
        zc_objects = zipcode.islike(st_items)
    elif isinstance(st_items, list):
        zc_objects = [zc for item in st_items
                      for zc in zipcode.islike(str(item))]
    else:
        raise ValueError("input 'st_items' must be of type str or list")
    # Each zipcode object's repr appears to be "<Zip NNNNN>"; the split below
    # extracts the token between the first space and the closing ">".
    # NOTE(review): assumes that repr format — confirm against the zipcode pkg.
    return [str(zc).split(" ", 1)[1].split(">")[0] for zc in zc_objects]
| 28 | + |
def init_driver(file_path):
    """Start a Chrome webdriver with a 10-second explicit-wait helper attached.

    Parameters:
        file_path (str): path to the chromedriver executable.

    Returns:
        the webdriver instance, with a `.wait` WebDriverWait bound to it.
    """
    # Starting maximized fixes https://github.com/ChrisMuir/Zillow/issues/1
    chrome_opts = webdriver.ChromeOptions()
    chrome_opts.add_argument("--start-maximized")

    drv = webdriver.Chrome(executable_path=file_path, chrome_options=chrome_opts)
    drv.wait = WebDriverWait(drv, 10)
    return drv
| 37 | + |
def navigate_to_website(driver, site):
    """Point the browser at the given URL."""
    driver.get(site)
| 40 | + |
def click_buy_button(driver):
    """Click the 'Buy' button in Zillow's navigation header.

    Raises:
        ValueError: if the button cannot be located or clicked within the
            driver's wait window.
    """
    try:
        buy_button = driver.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, "nav-header")))
        buy_button.click()
        # Give the resulting page time to load before the caller proceeds.
        time.sleep(10)
    except (TimeoutException, NoSuchElementException):
        raise ValueError("Clicking the 'Buy' button failed")
| 49 | + |
def enter_search_term(driver, search_term):
    """Type search_term into Zillow's search bar and submit the search.

    Returns:
        bool: True on success, False if the search bar or the search button
            could not be found in time.
    """
    try:
        search_box = driver.wait.until(
            EC.presence_of_element_located((By.ID, "citystatezip")))
        submit_btn = driver.wait.until(
            EC.element_to_be_clickable((By.CLASS_NAME, "zsg-icon-searchglass")))
        search_box.clear()
        time.sleep(3)
        search_box.send_keys(search_term)
        time.sleep(3)
        submit_btn.click()
        time.sleep(3)
    except (TimeoutException, NoSuchElementException):
        return False
    return True
| 65 | + |
def results_test(driver):
    """Report whether the page shows a no-results / invalid-zipcode indicator.

    Checks the "zoom out" message first, then the bad-zipcode "x" icon;
    returns the first indicator's visibility, or False if neither exists.
    """
    probes = (
        (driver.find_element_by_css_selector, '.zoom-out-message'),
        (driver.find_element_by_class_name, 'zsg-icon-x-thick'),
    )
    for finder, selector in probes:
        try:
            return finder(selector).is_displayed()
        except (NoSuchElementException, TimeoutException):
            continue
    return False
| 79 | + |
def get_html(driver):
    """Collect the raw page HTML for every page of search results.

    Repeatedly appends the current page's source, then clicks the
    "next page" link until it is no longer displayed, waiting out the
    "updating results" overlay between pages.

    Parameters:
        driver: a selenium webdriver with a `.wait` WebDriverWait attached
            (see init_driver), already showing the first results page.

    Returns:
        list of str: one page_source string per results page scraped.
    """
    output = []
    keep_going = True
    while keep_going:
        # Pull page HTML
        try:
            output.append(driver.page_source)
        except TimeoutException:
            pass
        try:
            # Check to see if a "next page" link exists
            keep_going = driver.find_element_by_class_name(
                'zsg-pagination-next').is_displayed()
        except NoSuchElementException:
            keep_going = False
        if keep_going:
            # Test to ensure the "updating results" image isn't displayed.
            # Will try up to 5 times before giving up, with a 5 second wait
            # between each try.
            tries = 5
            try:
                cover = driver.find_element_by_class_name(
                    'list-loading-message-cover').is_displayed()
            except (TimeoutException, NoSuchElementException):
                cover = False
            while cover and tries > 0:
                time.sleep(5)
                tries -= 1
                try:
                    cover = driver.find_element_by_class_name(
                        'list-loading-message-cover').is_displayed()
                except (TimeoutException, NoSuchElementException):
                    cover = False
            # If the "updating results" image is confirmed to be gone
            # (cover == False), click next page. Otherwise, give up on trying
            # to click thru to the next page of house results, and return the
            # results that have been scraped up to the current page.
            if cover == False:
                try:
                    driver.wait.until(EC.element_to_be_clickable(
                        (By.CLASS_NAME, 'zsg-pagination-next'))).click()
                    time.sleep(3)
                except TimeoutException:
                    keep_going = False
            else:
                keep_going = False
    return(output)
| 127 | + |
def get_listings(list_obj):
    """Split each page's raw HTML into one chunk per home listing.

    The token '" id="zpid_' marks the start of each listing card; the text
    preceding the first token (page header markup) is discarded.

    Parameters:
        list_obj (list of str): raw page_source strings (see get_html).

    Returns:
        list of str: one HTML fragment per listing, across all pages.
    """
    output = []
    for page_html in list_obj:
        output.extend(page_html.split('" id="zpid_')[1:])
    print(str(len(output)) + " home listings scraped\n***")
    return output
| 136 | + |
def get_street_address(soup_obj):
    """Extract the street address from a listing's soup; 'NA' when missing."""
    try:
        tag = soup_obj.find("span", {"itemprop" : "streetAddress"})
        street = tag.get_text().strip()
    except (ValueError, AttributeError):
        return "NA"
    # Empty strings and the literal "null" both mean "no data".
    return "NA" if not street or street == "null" else street
| 146 | + |
| 147 | + |
def get_city(soup_obj):
    """Extract the city name from a listing's soup; 'NA' when missing."""
    try:
        locality = soup_obj.find(
            "span", {"itemprop" : "addressLocality"}).get_text()
        city = locality.strip()
    except (ValueError, AttributeError):
        return "NA"
    # Empty strings and the literal "null" both mean "no data".
    return "NA" if not city or city == "null" else city
| 157 | + |
def get_state(soup_obj):
    """Extract the state abbreviation from a listing's soup; 'NA' when missing."""
    try:
        region = soup_obj.find(
            "span", {"itemprop" : "addressRegion"}).get_text()
        state = region.strip()
    except (ValueError, AttributeError):
        return "NA"
    # Empty strings and the literal "null" both mean "no data".
    return "NA" if not state or state == 'null' else state
| 167 | + |
def get_zipcode(soup_obj):
    """Extract the postal code from a listing's soup; 'NA' when missing."""
    # Local is named `postal` (not `zipcode`) so it does not shadow the
    # imported `zipcode` module.
    try:
        postal = soup_obj.find(
            "span", {"itemprop" : "postalCode"}).get_text().strip()
    except (ValueError, AttributeError):
        return "NA"
    # Empty strings and the literal "null" both mean "no data".
    return "NA" if not postal or postal == 'null' else postal
| 177 | + |
def get_price(soup_obj, list_obj):
    """Extract and normalize a listing's price into a plain digit string.

    Looks for the price in the soup's price span first; failing that, scans
    the card_info strings (list_obj) for a token containing '$', 'K', or 'k'.
    'K'/'k' suffixes are expanded to thousands and 'M'/'m' to millions
    (e.g. "$1.5M" -> "1500000").

    Parameters:
        soup_obj: BeautifulSoup object for one listing.
        list_obj (list of str): the listing's card_info fields (get_card_info).

    Returns:
        str: the price as bare digits, or "NA" when no usable price is found.
    """
    # Look for price within the BeautifulSoup object.
    try:
        price = soup_obj.find(
            "span", {"class" : "zsg-photo-card-price"}).get_text().strip()
    except (ValueError, AttributeError):
        # If that fails, look for price within list_obj (object "card_info").
        try:
            candidates = [n for n in list_obj
                          if any(["$" in n, "K" in n, "k" in n])]
            price = "NA"
            if len(candidates) > 0:
                # Keep only whitespace-separated tokens that contain a digit.
                # Guarding on the token list also fixes an uncaught IndexError
                # in the original when no token contained a digit.
                tokens = [n for n in candidates[0].split(" ")
                          if re.search("[0-9]", n) is not None]
                if len(tokens) > 0 and len(tokens[0]) > 0:
                    price = tokens[0]
        except (ValueError, AttributeError):
            price = "NA"
    if len(price) == 0 or price == "null":
        price = "NA"
    # BUG FIX: the original tested `price is not "NA"` — an identity
    # comparison against a string literal, which is implementation-dependent
    # (and a SyntaxWarning on modern CPython). Use inequality.
    if price != "NA":
        # Transformations to the price string.
        price = price.replace(",", "").replace("+", "").replace("$", "")
        if any(["K" in price, "k" in price]):
            price = price.lower().split("k")[0].strip()
            price = price + "000"
        if any(["M" in price, "m" in price]):
            price = price.lower().split("m")[0].strip()
            if "." not in price:
                price = price + "000000"
            else:
                # Expand e.g. "1.5" (million) to "1500000": pad with zeros
                # until the digit count is (integer digits + 6).
                pricelen = len(price.split('.')[0]) + 6
                price = price.replace('.', '')
                diff = pricelen - len(price)
                price = price + (diff * "0")
    if len(price) == 0:
        price = 'NA'
    return price
| 219 | + |
def get_card_info(soup_obj):
    """Return the listing's card-info fields as a list of strings.

    For most listings card_info holds the number of bedrooms, number of
    bathrooms, square footage, and sometimes the price, separated by " · ".
    Returns "NA" when the span is missing.
    """
    try:
        info_text = soup_obj.find(
            "span", {"class" : "zsg-photo-card-info"}).get_text()
        card = info_text.split(" · ")
    except (ValueError, AttributeError):
        card = "NA"
    if len(card) == 0 or card == 'null':
        card = "NA"
    return card
| 231 | + |
def get_sqft(list_obj):
    """Parse square footage out of the card-info strings.

    Returns:
        float: the square footage, or "NA" when absent, zero, or unparsable.
    """
    matches = [entry for entry in list_obj if "sqft" in entry]
    if not matches:
        return "NA"
    # Strip the "sqft" suffix plus thousands separators and "+" markers.
    raw = matches[0].split("sqft")[0].strip().replace(",", "").replace("+", "")
    try:
        sqft = float(raw)
    except (ValueError, IndexError):
        return "NA"
    return "NA" if sqft == 0 else sqft
| 244 | + |
def get_bedrooms(list_obj):
    """Parse the bedroom count out of the card-info strings.

    Returns:
        float: number of bedrooms, 0 for "Studio"/"studio", or "NA" when
            absent or unparsable.
    """
    matches = [entry for entry in list_obj
               if any(["bd" in entry, "tudio" in entry])]
    if not matches:
        return "NA"
    first = matches[0]
    if first in ("Studio", "studio"):
        return 0
    try:
        return float(first.split("bd")[0].strip())
    except (ValueError, IndexError):
        return 0 if first in ("Studio", "studio") else "NA"
| 261 | + |
def get_bathrooms(list_obj):
    """Parse the bathroom count out of the card-info strings.

    Returns:
        float: number of bathrooms, or "NA" when absent, zero, or unparsable.
    """
    matches = [entry for entry in list_obj if "ba" in entry]
    if not matches:
        return "NA"
    try:
        baths = float(matches[0].split("ba")[0].strip())
    except (ValueError, IndexError):
        return "NA"
    return "NA" if baths == 0 else baths
| 274 | + |
def get_days_on_market(soup_obj):
    """Extract the days-on-Zillow count from a listing's notification spans.

    Returns:
        int: the number of days, or "NA" when no "… on Zillow" notice exists
            or its leading token is not an integer.
    """
    try:
        notices = soup_obj.find_all(
            "span", {"class" : "zsg-photo-card-notification"})
        # "illow" matches "Zillow" regardless of capitalization of the Z.
        zillow_notices = [n for n in notices if "illow" in n.get_text()]
        if not zillow_notices:
            return "NA"
        # Notice text starts with the day count, e.g. "12 days on Zillow".
        return int(zillow_notices[0].get_text().strip().split(" ")[0])
    except (ValueError, AttributeError):
        return "NA"
| 288 | + |
def get_sale_type(soup_obj):
    """Extract the sale status (e.g. 'House For Sale'); 'NA' when missing."""
    try:
        status = soup_obj.find(
            "span", {"class" : "zsg-photo-card-status"}).get_text()
        saletype = status.strip()
    except (ValueError, AttributeError):
        return "NA"
    # Empty strings and the literal "null" both mean "no data".
    return "NA" if not saletype or saletype == 'null' else saletype
| 298 | + |
def get_url(soup_obj):
    """Return the listing's URL, or 'NA' when none can be built.

    Prefers an explicit 'homedetails' href from the listing's anchors;
    otherwise reconstructs the URL from the zpid found in a zpid-bearing
    href (skipping "save to favorites" links).

    Parameters:
        soup_obj: BeautifulSoup object for one listing.

    Returns:
        str: a zillow.com URL, or "NA".
    """
    # Try to find url in the BeautifulSoup object.
    href = [n["href"] for n in soup_obj.find_all("a", href = True)]
    url = [i for i in href if "homedetails" in i]
    if len(url) > 0:
        return "http://www.zillow.com/homes/for_sale/" + url[0]
    # If that fails, construct the url from the zpid of the listing,
    # excluding favorite/unfavorite links.
    url = [i for i in href if "zpid" in i and "avorite" not in i]
    if len(url) == 0:
        return "NA"
    # BUG FIX: extract the zpid from the matched zpid link (url[0]), not from
    # href[0] — the first anchor on the card, which may contain no zpid at
    # all. Also drop the always-true `zpid is not None` check (re.findall
    # returns a list, never None).
    zpid = re.findall(r"\d{8,10}", url[0])
    if len(zpid) > 0:
        return ('http://www.zillow.com/homes/for_sale/'
                + str(zpid[0])
                + '_zpid/any_days/globalrelevanceex_sort/29.759534,'
                + '-95.335321,29.675003,-95.502863_rect/12_zm/')
    return "NA"
| 320 | + |
def close_connection(driver):
    """Shut down the webdriver and its browser session."""
    driver.quit()
0 commit comments