MASTER first code pass implementation
Sulstice committed May 6, 2019
1 parent 041747f commit bd9195c
Showing 5 changed files with 262 additions and 1 deletion.
Empty file added __init__.py
Empty file added main/__init__.py
225 changes: 225 additions & 0 deletions main/start_scrape.py
@@ -0,0 +1,225 @@
# -*- coding: utf-8 -*-
#
# Start scraping data.
#
# ------------------------------------------------

# imports
# -------
import csv
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait

from time import sleep


# Declare Constants
# -----------------
JOBS_LIST = []
NUM_JOBS = []


class ElementNotFound(Exception):

    """
    Raised when an expected element cannot be located on the page.
    """

    __version_error_parser__ = 1.0
    __allow_update__ = False

    def __init__(self, message, errors):
        super().__init__(message)
        self.errors = errors


def start_browser(url):

    """
    Initialize the browser with Chrome; we can extend this to other browsers down the road.
    Arguments:
        url (String): the url for the host website to scrape data from.
    Returns:
        driver (Object): the chrome driver object we are initiating.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--incognito')
    options.add_argument('--disable-extensions')
    options.add_argument('--disable-plugins-discovery')

    # TODO: add this as an argument parser for other folks and their chromedriver executable.
    driver = webdriver.Chrome(executable_path="/Users/sulimansharif/Downloads/chromedriver")
    driver.wait = WebDriverWait(driver, 10)
    login(driver, "[email protected]", "swifter1")

    # Now that we are logged in, navigate to the page we were asked to scrape.
    driver.get(url)

    return driver
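

# A minimal sketch for the argument-parser TODO above, assuming a hypothetical
# '--driver-path' flag; not part of the original commit.
def parse_driver_path():
    import argparse
    parser = argparse.ArgumentParser(description='Glassdoor job scraper')
    parser.add_argument('--driver-path', default='chromedriver',
                        help='path to the chromedriver executable')
    args, _ = parser.parse_known_args()
    return args.driver_path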


def login(driver, username, password):

    """
    Log into the site we are trying to access.
    Arguments:
        driver (Object): the Chrome driver object; for now I am just using Chrome but we can extend this as an option.
        username (String): the username of the account we are logging into. TODO: CHANGE THIS IMMEDIATELY TO NOT BE HARDCODED
        password (String): the password of the account we are logging into. TODO: CHANGE THIS IMMEDIATELY TO NOT BE HARDCODED
    Exceptions:
        TimeoutException: if the username/password fields or login button fail to load.
    """
    driver.get("http://www.glassdoor.com/profile/login_input.htm")
    try:
        user_field = driver.wait.until(EC.presence_of_element_located(
            (By.ID, "userEmail")))
        pw_field = driver.find_element_by_id("userPassword")
        login_button = driver.find_element_by_class_name("gd-btn-1")
        user_field.send_keys(username)
        user_field.send_keys(Keys.TAB)
        sleep(1)
        pw_field.send_keys(password)
        sleep(1)
        login_button.click()
    except TimeoutException:
        print("TimeoutException! Username/password field or login button not found on glassdoor.com")

# Start search at home page then filter job listings
def start_search(browser, job, location):

    """
    Initialize the search and feed the html of each results page into the parse_html function.
    Arguments:
        browser (Object): the initialized chrome browser with the previously set driver.
        job (String): the job title we would like to search for.
        location (String): where in the world we want to search for the job data.
    Returns:
        jobs (Array): the list of jobs parsed across the result pages.
    Exceptions:
        ElementNotFound/NoSuchElementException: if an element isn't found by the selenium driver for whatever reason.
    """
    jobs = []
    try:
        # to make it think we are human we need sleep statements.
        sleep(3)
        keyword_elem = browser.find_element_by_class_name("keyword")
        sleep(3)
        keyword_elem.send_keys(job)
        sleep(3)
        # There is some spacing between the two search boxes in Glassdoor but this isn't necessary.
        keyword_elem.send_keys(Keys.TAB)
        keyword_elem.send_keys(Keys.TAB)
        location_elem = browser.find_element(by=By.ID, value='sc.location')
        location_elem.clear()

        if not location_elem.is_displayed():
            try:
                # find_element (singular) so that .clear() is called on an element, not a list.
                location_elem = browser.find_element(By.CLASS_NAME, 'loc')
                location_elem.clear()
            except NoSuchElementException:
                print('Element not found')

        sleep(3)
        location_elem.send_keys(location)
        search_elem = browser.find_element_by_class_name("gd-btn-mkt")
        search_elem.click()

        for page_num in range(30):
            job_num = len(NUM_JOBS)
            jobs = parse_html(browser.page_source, job_num)
            # named next_button so it does not shadow the page_num loop counter.
            next_button = browser.find_element(By.CLASS_NAME, 'next')
            next_button.click()
            sleep(3)

    except (ElementNotFound, NoSuchElementException):
        print('Element not found')

    return jobs
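

# A minimal sketch of swapping the fixed sleep(3) calls above for an explicit
# wait; WebDriverWait and expected_conditions are already imported up top.
def wait_for_clickable(browser, by, value, timeout=10):
    return WebDriverWait(browser, timeout).until(
        EC.element_to_be_clickable((by, value)))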


# Write Results to CSV file
# TODO: turn this into a file writer class that can support more than CSV; for now we need the data in any format.
def write_to_file(jobs_list, filename):
    """
    Arguments:
        jobs_list (Array): the list of the search results.
        filename (String): the name of the file the user defines.
    """
    with open(filename, 'a', newline='', errors='ignore') as csvfile:
        fieldnames = ['Job Number', 'Title', 'Company', 'Location', 'URL']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, extrasaction='ignore')
        writer.writeheader()
        for row in jobs_list:
            writer.writerow(row)
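

# A minimal sketch of the writer-class TODO above, supporting CSV and JSON
# behind one object; the JobWriter name is an assumption, not settled here.
class JobWriter:

    def __init__(self, fieldnames):
        self.fieldnames = fieldnames

    def write_csv(self, jobs, filename):
        with open(filename, 'w', newline='', errors='ignore') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=self.fieldnames,
                                    extrasaction='ignore')
            writer.writeheader()
            writer.writerows(jobs)

    def write_json(self, jobs, filename):
        import json
        with open(filename, 'w') as handle:
            json.dump(jobs, handle, indent=2)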


# Parse current page and add job data to JOBS_LIST dictionary
def parse_html(html, job_num):

    """
    Arguments:
        html (String): the page source of the current search results page.
        job_num (Integer): the running count of jobs parsed so far.
    Returns:
        JOBS_LIST (Array): the accumulated list of job dictionaries.
    """

    base_url = 'https://www.glassdoor.com'
    soup = BeautifulSoup(html, 'html.parser')
    job_titles = soup.find_all('div', attrs={'class': 'flexbox jobTitle'})
    for a in job_titles:
        try:
            next_listing = a.findNext('a', attrs={'class': 'jobLink'})
            title = next_listing.text
            list_url = next_listing['href']

            company_listing = a.findNext('div', attrs={'class': 'flexbox empLoc'})
            next_company = company_listing.findNext('div')
            company = next_company.text
            company = company.replace(u'\xa0', '').strip()
            company = company.replace(u'\n\n\n\n', '').strip()

            job_num += 1
            NUM_JOBS.append(job_num)

        except (AttributeError, TypeError, KeyError):
            print("Error: can't find job listing")
            continue

        # Split the company string on Glassdoor's en-dash to pull out 'Location'
        location = ''
        try:
            company, location = company.split('–')
        except ValueError:
            print("Can't split company string into company and location")

        # Add job data to JOBS_LIST
        job_info = {
            'Job Number': job_num,
            'Title': title,
            'Company': company,
            'Location': location,
            'URL': base_url + list_url
        }

        JOBS_LIST.append(job_info)

    return JOBS_LIST
37 changes: 37 additions & 0 deletions main/swifter.py
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
#
# Main module for initializing swifter
#
# ------------------------------------------------


# imports
# -------
from start_scrape import start_browser, start_search, write_to_file

job = 'Lab Technician'
location = 'Austin, TX'

if __name__ == '__main__':

    output_filename = 'jobs.csv'
    url = 'https://www.glassdoor.com/index.htm'

    # Start driver and return driver object named 'browser'
    # Params: URL
    browser = start_browser(url)

    # Start search and return job_list
    # Params: browser, job, location
    job_list = start_search(browser, job, location)

    # Write to csv file
    # Params: job_list, output_filename
    write_to_file(job_list, output_filename)
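
# To run (a sketch of the intended usage; assumes the chromedriver path and
# Glassdoor credentials hardcoded in start_scrape.py are valid):
#
#     cd main && python swifter.py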

1 change: 0 additions & 1 deletion setup.py
@@ -41,7 +41,6 @@
     description="Scrape Scrape Scrape",
     long_description=README,
     author="Suliman Sharif",
-    author_email="[email protected]",
     url="www.github",
     install_requires=REQUIREMENTS,
     zip_safe=False,
