
Commit f2f9e64

Author: shanhong cheng
Commit message: zillow
1 parent a077e25 commit f2f9e64

File tree

3 files changed: +551, −0 lines


Zillow/README.md

Lines changed: 75 additions & 0 deletions
Zillow Scraping with Python
===========================

WARNING: Use this code at your own risk; scraping is against Zillow's terms of service.
---------------------------------------------------------------------------------------
A basic tool for scraping current home listings from Zillow, written in Python
using Selenium. The code takes as input search terms that would normally be
entered on the Zillow home page. It creates 11 variables for each home listing,
saves them to a dataframe, and then writes the dataframe to a CSV file in your
working directory. Using zip codes as search terms seems to yield the best
results; the scraper works at a rate of about 75 zip codes per hour (compared
to the Zillow API limit of 1000 homes per 24 hours).

There are two files, `zillow_runfile.py` and `zillow_functions.py`. Save them
both to your working directory, open the runfile, and step through the code
line by line. The Zillow functions are sourced at the top of the runfile.
This tool uses a for loop to iterate over a list of input search terms, scrape
the listings of each, and append the results to a dataframe (a sketch of this
loop follows below). The function `zipcodes_list()` allows the user to compile
a large list of zip codes to use as search terms, using the package
[zipcode](https://pypi.python.org/pypi/zipcode). For example,
`st = zipcodes_list(['10', '11', '770'])` will yield every US zip code that
begins with '10', '11', or '770' as a single list. The object `st` can then be
passed to the scraper. The scraper seems to fly below the radar of Zillow's
anti-scraping rules.
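As a rough sketch of how the pieces fit together (this is an illustration, not
the actual contents of `zillow_runfile.py`; the driver path, HTML parser, and
output filename are placeholders), the run loop looks something like this:

```
import pandas as pd
from bs4 import BeautifulSoup
import zillow_functions as zl

# Build the list of search terms (here, every zip code starting with '770').
st = zl.zipcodes_list(['770'])

# Point this at your local web driver binary (see "Some things to keep in mind").
driver = zl.init_driver("/path/to/chromedriver")
zl.navigate_to_website(driver, "http://www.zillow.com/homes")
zl.click_buy_button(driver)

rows = []
for term in st:
    if not zl.enter_search_term(driver, term):
        continue                        # search failed; skip this zip code
    if zl.results_test(driver):
        continue                        # zero results for this zip code
    pages = zl.get_html(driver)         # raw HTML, one string per results page
    for listing in zl.get_listings(pages):
        soup = BeautifulSoup(listing, "html.parser")
        card = zl.get_card_info(soup)
        rows.append({"address": zl.get_street_address(soup),
                     "city": zl.get_city(soup),
                     "state": zl.get_state(soup),
                     "zip": zl.get_zipcode(soup),
                     "price": zl.get_price(soup, card),
                     "sqft": zl.get_sqft(card),
                     "bedrooms": zl.get_bedrooms(card),
                     "bathrooms": zl.get_bathrooms(card),
                     "days_on_zillow": zl.get_days_on_market(soup),
                     "sale_type": zl.get_sale_type(soup),
                     "url": zl.get_url(soup)})

zl.close_connection(driver)
pd.DataFrame(rows).to_csv("zillow_data.csv", index=False)
```
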
Some things to keep in mind:
----------------------------
- You will need to edit the input parameter of function `init_driver` within
  `zillow_runfile.py` to point to the local path of your web driver program
  (required by Selenium); see the example just below this list.
- The maximum return for each search term (i.e., each zip code) is 520 home
  listings.
- There tend to be a few NAs on every search; foreclosure properties seem more
  likely to return NAs, so the more foreclosures there are in a search, the
  more NAs there will be.
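For example (the path below is hypothetical; point it at wherever your driver
executable actually lives):

```
driver = init_driver("C:/Users/you/Downloads/chromedriver.exe")
```
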
Software Requirements/Info
--------------------------
- This code was written using [Python 3.5](https://www.python.org/downloads/).
- This code was written using [Selenium v3.0.2](http://www.seleniumhq.org/download/).
- The Selenium package requires a webdriver program. This code was written
  using [Chromedriver v2.25](https://sites.google.com/a/chromium.org/chromedriver/downloads).
Example of the output dataframe:
--------------------------------

```
df.head(n=6)
```

```
                address     city state    zip    price  sqft bedrooms  \
0         4251 Drake St  Houston    TX  77005   895000  3501        4
1  6534 Westchester Ave  Houston    TX  77005  1325000  2720        3
2     2635 Centenary St  Houston    TX  77005  1449000  4114        5
3     2336 Robinhood St  Houston    TX  77005  1295000  3652        3
4     3135 Bissonnet St  Houston    TX  77005   359000  1692        2
5         3824 Byron St  Houston    TX  77005  1100000  3161        3

  bathrooms days_on_zillow           sale_type  \
0       4.0              4      House For Sale
1       3.0              8      House For Sale
2       5.0             NA      House For Sale
3       4.0             33      House For Sale
4       2.5             37  Townhouse For Sale
5       4.0             43      House For Sale

                                                 url
0  http://www.zillow.com/homes/for_sale/27825953_...
1  http://www.zillow.com/homes/for_sale/27792196_...
2  http://www.zillow.com/homes/for_sale/27812119_...
3  http://www.zillow.com/homes/for_sale/27836800_...
4  http://www.zillow.com/homes/for_sale/27834549_...
5  http://www.zillow.com/homes/for_sale/27823372_...
```

Zillow/zillow_functions.py

Lines changed: 322 additions & 0 deletions
# -*- coding: utf-8 -*-
# Zillow scraper functions; these are sourced at the top of zillow_runfile.py.

import re
import time
import zipcode
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException

def zipcodes_list(st_items):
    # If st_items is a single zipcode string.
    if type(st_items) == str:
        zc_objects = zipcode.islike(st_items)
        output = [str(i).split(" ", 1)[1].split(">")[0]
                  for i in zc_objects]
    # If st_items is a list of zipcode strings.
    elif type(st_items) == list:
        zc_objects = [n for i in st_items for n in zipcode.islike(str(i))]
        output = [str(i).split(" ", 1)[1].split(">")[0]
                  for i in zc_objects]
    else:
        raise ValueError("input 'st_items' must be of type str or list")
    return(output)

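# Example (return values are illustrative):
#   zipcodes_list("770")          ->  ["77001", "77002", "77003", ...]
#   zipcodes_list(["10", "770"])  ->  every zip starting with "10" or "770"
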
def init_driver(file_path):
    # Starting maximized fixes https://github.com/ChrisMuir/Zillow/issues/1
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")

    driver = webdriver.Chrome(executable_path=file_path, chrome_options=options)
    driver.wait = WebDriverWait(driver, 10)
    return(driver)

def navigate_to_website(driver, site):
    driver.get(site)

def click_buy_button(driver):
    try:
        button = driver.wait.until(EC.element_to_be_clickable(
            (By.CLASS_NAME, "nav-header")))
        button.click()
        time.sleep(10)
    except (TimeoutException, NoSuchElementException):
        raise ValueError("Clicking the 'Buy' button failed")

def enter_search_term(driver, search_term):
    try:
        search_bar = driver.wait.until(EC.presence_of_element_located(
            (By.ID, "citystatezip")))
        button = driver.wait.until(EC.element_to_be_clickable(
            (By.CLASS_NAME, "zsg-icon-searchglass")))
        search_bar.clear()
        time.sleep(3)
        search_bar.send_keys(search_term)
        time.sleep(3)
        button.click()
        time.sleep(3)
        return(True)
    except (TimeoutException, NoSuchElementException):
        return(False)

def results_test(driver):
    # Check to see if there are any returned results.
    try:
        no_results = driver.find_element_by_css_selector(
            '.zoom-out-message').is_displayed()
    except (NoSuchElementException, TimeoutException):
        # Check to see if the zipcode is invalid or not.
        try:
            no_results = driver.find_element_by_class_name(
                'zsg-icon-x-thick').is_displayed()
        except (NoSuchElementException, TimeoutException):
            no_results = False
    return(no_results)

def get_html(driver):
    output = []
    keep_going = True
    while keep_going:
        # Pull page HTML.
        try:
            output.append(driver.page_source)
        except TimeoutException:
            pass
        try:
            # Check to see if a "next page" link exists.
            keep_going = driver.find_element_by_class_name(
                'zsg-pagination-next').is_displayed()
        except NoSuchElementException:
            keep_going = False
        if keep_going:
            # Test to ensure the "updating results" image isn't displayed.
            # Will try up to 5 times before giving up, with a 5 second wait
            # between each try.
            tries = 5
            try:
                cover = driver.find_element_by_class_name(
                    'list-loading-message-cover').is_displayed()
            except (TimeoutException, NoSuchElementException):
                cover = False
            while cover and tries > 0:
                time.sleep(5)
                tries -= 1
                try:
                    cover = driver.find_element_by_class_name(
                        'list-loading-message-cover').is_displayed()
                except (TimeoutException, NoSuchElementException):
                    cover = False
            # If the "updating results" image is confirmed to be gone
            # (cover == False), click next page. Otherwise, give up on trying
            # to click thru to the next page of house results, and return the
            # results that have been scraped up to the current page.
            if not cover:
                try:
                    driver.wait.until(EC.element_to_be_clickable(
                        (By.CLASS_NAME, 'zsg-pagination-next'))).click()
                    time.sleep(3)
                except TimeoutException:
                    keep_going = False
            else:
                keep_going = False
    return(output)

def get_listings(list_obj):
    # Split the raw HTML into segments, one for each listing.
    output = []
    for i in list_obj:
        htmlSplit = i.split('" id="zpid_')[1:]
        output += htmlSplit
    print(str(len(output)) + " home listings scraped\n***")
    return(output)

def get_street_address(soup_obj):
    try:
        street = soup_obj.find(
            "span", {"itemprop" : "streetAddress"}).get_text().strip()
    except (ValueError, AttributeError):
        street = "NA"
    if len(street) == 0 or street == "null":
        street = "NA"
    return(street)

def get_city(soup_obj):
    try:
        city = soup_obj.find(
            "span", {"itemprop" : "addressLocality"}).get_text().strip()
    except (ValueError, AttributeError):
        city = "NA"
    if len(city) == 0 or city == "null":
        city = "NA"
    return(city)

def get_state(soup_obj):
    try:
        state = soup_obj.find(
            "span", {"itemprop" : "addressRegion"}).get_text().strip()
    except (ValueError, AttributeError):
        state = "NA"
    if len(state) == 0 or state == 'null':
        state = "NA"
    return(state)

def get_zipcode(soup_obj):
    # Local name zip_code avoids shadowing the imported zipcode module.
    try:
        zip_code = soup_obj.find(
            "span", {"itemprop" : "postalCode"}).get_text().strip()
    except (ValueError, AttributeError):
        zip_code = "NA"
    if len(zip_code) == 0 or zip_code == 'null':
        zip_code = "NA"
    return(zip_code)

def get_price(soup_obj, list_obj):
    # Look for price within the BeautifulSoup object.
    try:
        price = soup_obj.find(
            "span", {"class" : "zsg-photo-card-price"}).get_text().strip()
    except (ValueError, AttributeError):
        # If that fails, look for price within list_obj (object "card_info").
        try:
            price = [n for n in list_obj
                     if any(["$" in n, "K" in n, "k" in n])]
            if len(price) > 0:
                price = price[0].split(" ")
                price = [n for n in price if re.search("[0-9]", n) is not None]
                if len(price[0]) > 0:
                    price = price[0]
                else:
                    price = "NA"
            else:
                price = "NA"
        except (ValueError, AttributeError):
            price = "NA"
    if len(price) == 0 or price == "null":
        price = "NA"
    if price != "NA":
        # Transformations to the price string.
        price = price.replace(",", "").replace("+", "").replace("$", "")
        if any(["K" in price, "k" in price]):
            price = price.lower().split("k")[0].strip()
            price = price + "000"
        if any(["M" in price, "m" in price]):
            price = price.lower().split("m")[0].strip()
            if "." not in price:
                price = price + "000000"
            else:
                pricelen = len(price.split('.')[0]) + 6
                price = price.replace('.', '')
                diff = pricelen - len(price)
                price = price + (diff * "0")
    if len(price) == 0:
        price = 'NA'
    return(price)

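# Worked examples of the price cleaning above (input strings are illustrative):
#   "$1,449,000" -> "1449000"
#   "$349K"      -> "349" + "000"             -> "349000"
#   "$1.45M"     -> "145", padded to 7 digits -> "1450000"
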
def get_card_info(soup_obj):
    # For most listings, card_info will contain info on number of bedrooms,
    # number of bathrooms, square footage, and sometimes price.
    try:
        card = soup_obj.find(
            "span", {"class" : "zsg-photo-card-info"}).get_text().split(" · ")
    except (ValueError, AttributeError):
        card = "NA"
    if len(card) == 0 or card == 'null':
        card = "NA"
    return(card)

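# For reference, the returned card_info list looks something like
# ["3 bds", "2 ba", "1,620 sqft"] (values illustrative); get_sqft,
# get_bedrooms, and get_bathrooms below each pick their field out of it.
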
def get_sqft(list_obj):
    sqft = [n for n in list_obj if "sqft" in n]
    if len(sqft) > 0:
        try:
            sqft = float(sqft[0].split("sqft")[0].strip().replace(",", "").replace("+", ""))
        except (ValueError, IndexError):
            sqft = "NA"
        if sqft == 0:
            sqft = "NA"
    else:
        sqft = "NA"
    return(sqft)

def get_bedrooms(list_obj):
    beds = [n for n in list_obj if any(["bd" in n, "tudio" in n])]
    if len(beds) > 0:
        if any([beds[0] == "Studio", beds[0] == "studio"]):
            beds = 0
            return(beds)
        try:
            beds = float(beds[0].split("bd")[0].strip())
        except (ValueError, IndexError):
            if any([beds[0] == "Studio", beds[0] == "studio"]):
                beds = 0
            else:
                beds = "NA"
    else:
        beds = "NA"
    return(beds)

def get_bathrooms(list_obj):
    baths = [n for n in list_obj if "ba" in n]
    if len(baths) > 0:
        try:
            baths = float(baths[0].split("ba")[0].strip())
        except (ValueError, IndexError):
            baths = "NA"
        if baths == 0:
            baths = "NA"
    else:
        baths = "NA"
    return(baths)

def get_days_on_market(soup_obj):
    try:
        dom = soup_obj.find_all(
            "span", {"class" : "zsg-photo-card-notification"})
        # Matching on "illow" catches "X days on Zillow" / "x days on zillow".
        dom = [n for n in dom if "illow" in n.get_text()]
        if len(dom) > 0:
            dom = dom[0].get_text().strip()
            dom = int(dom.split(" ")[0])
        else:
            dom = "NA"
    except (ValueError, AttributeError):
        dom = "NA"
    return(dom)

def get_sale_type(soup_obj):
    try:
        saletype = soup_obj.find(
            "span", {"class" : "zsg-photo-card-status"}).get_text().strip()
    except (ValueError, AttributeError):
        saletype = "NA"
    if len(saletype) == 0 or saletype == 'null':
        saletype = "NA"
    return(saletype)

def get_url(soup_obj):
    # Try to find url in the BeautifulSoup object.
    href = [n["href"] for n in soup_obj.find_all("a", href = True)]
    url = [i for i in href if "homedetails" in i]
    if len(url) > 0:
        url = "http://www.zillow.com/homes/for_sale/" + url[0]
    else:
        # If that fails, construct the url from the zpid of the listing.
        url = [i for i in href if "zpid" in i and "avorite" not in i]
        if len(url) > 0:
            # Pull the zpid out of the matched link.
            zpid = re.findall(r"\d{8,10}", url[0])
            if zpid is not None and len(zpid) > 0:
                url = 'http://www.zillow.com/homes/for_sale/' \
                      + str(zpid[0]) \
                      + '_zpid/any_days/globalrelevanceex_sort/29.759534,' \
                      + '-95.335321,29.675003,-95.502863_rect/12_zm/'
            else:
                url = "NA"
        else:
            url = "NA"
    return(url)

def close_connection(driver):
    driver.quit()
