feat: scraping details from companiesmarketcap
furyhawk committed Jan 2, 2023
1 parent ef246e7 commit 288c1b2
Showing 3 changed files with 373 additions and 0 deletions.
205 changes: 205 additions & 0 deletions scrape_companies_stock.py
@@ -0,0 +1,205 @@
from typing import Any, Literal
import logging
from math import ceil
import datetime

from dataclasses import dataclass

from bs4 import BeautifulSoup


from tqdm import tqdm

import webscraping_lib

from omegaconf import MISSING, OmegaConf, DictConfig
import hydra

from hydra.core.config_store import ConfigStore

from webscrape import get_url_page, write_csv, verify_results

logger: logging.Logger = logging.getLogger(__name__)


@dataclass
class Config:
web: webscraping_lib.CompaniesMarketCapConfig = MISSING
debug: bool = False


cs: ConfigStore = ConfigStore.instance()
cs.store(name="base_config", node=Config)

# webscraping_lib registers its configs
# in webscraping_lib/web
webscraping_lib.register_configs()


def float_or_na(value: Any) -> float | Literal[0]:
try:
return float(value)
except (ValueError, TypeError):
return 0
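# e.g. float_or_na("12.5") -> 12.5; float_or_na("N/A") and float_or_na(None) -> 0,
# so missing or malformed fields become 0 rather than raising.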


def get_stocks(num_stocks: int, start_num: int, cfg: DictConfig) -> list:
"""
This functions builds a list of most popular stock symbols.
Returns the list of N number of popular stocks
"""
# Get the number of pages to access based on the number of stocks that need to be processed. each page has 100 stocks
start_page: int = int((lambda x: 1 if x < 1 else ceil(x / 100))(start_num))
end_page: int = int(
(lambda x: 1 if x < 1 else ceil(x / 100))(start_num + num_stocks)
)
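    # Illustrative example: start_num=150, num_stocks=100
    #   start_page = ceil(150 / 100) = 2
    #   end_page   = ceil(250 / 100) = 3  -> pages 2 and 3 are fetched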

stocks_symbols: list = []
for page_number in range(start_page, end_page + 1):
stocks_url: str = (
str(cfg.web.companies_url)
+ str(cfg.web.page_param)
+ str(page_number)
+ "/"
)

logger.info(f"Web Page: {stocks_url}")
# Call the function 'get_url_page' and get parsed html document
stocks_symbols_tags = get_url_page(
url_link=stocks_url,
user_agent=cfg.web.user_agent,
parser=cfg.web.parser,
).find_all("div", {"class": "company-code"})

# Extract ticker symbol name from the tag 'div' in the document
for stocks_symbols_tag in stocks_symbols_tags:
stocks_symbols.append(stocks_symbols_tag.text.strip())

# Return the list with N stocks
return stocks_symbols[:num_stocks]


def get_name_n_symbol(companyName: str) -> tuple[str, str]:
"""
A Helper function to accept Name and returns company Name and ticker symbol
"""
cName: list[str] = companyName.split("(")
name: str = "(".join(cName[:-1]).strip()
ticker: str = cName[-1].strip(")")
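    # e.g. "Alphabet Inc. (GOOG)" -> name "Alphabet Inc.", ticker "GOOG";
    # joining on "(" preserves any parentheses inside the company name itself.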
return (name, ticker)


def get_ticker_details(ticker_symbol: str, cfg: DictConfig):
"""
This function accepts the ticker symbol,
gets the html parsed document, finds appropriate tags and its value(text)
massages the data and returns stocks details as a python Dictionary
"""
# time.sleep(random.uniform(0, 1))
# logger.info("Processing : ", ticker_symbol)
ticker_url: str = "https://finance.yahoo.com/quote/" + ticker_symbol

# get html parsed document.
stock_page_doc: BeautifulSoup | Literal[""] = get_url_page(
url_link=ticker_url,
user_agent=cfg.web.user_agent,
parser=cfg.web.parser,
)

    # get_url_page returns "" on failure, so skip tickers whose page could not be fetched
    if len(stock_page_doc) == 0:
        return ""

    # Use the find method of the BeautifulSoup object to get the tag values.
    # Use the helper get_name_n_symbol to extract the company name and ticker symbol from the h1 text.
company_text = stock_page_doc.find("h1")
if company_text is None:
return ""
cName, ticker = get_name_n_symbol(company_text.text)
MarketPrice = stock_page_doc.find(
"fin-streamer",
{"class": "Fw(b) Fz(36px) Mb(-4px) D(ib)", "data-field": "regularMarketPrice"},
).text.replace(",", "")
previousClosePrice = stock_page_doc.find(
"td", {"class": "Ta(end) Fw(600) Lh(14px)", "data-test": "PREV_CLOSE-value"}
).text.replace(",", "")
Volume = stock_page_doc.find(
"td", {"class": "Ta(end) Fw(600) Lh(14px)", "data-test": "TD_VOLUME-value"}
).text.replace(",", "")
pe_ratio = stock_page_doc.find(
"td", {"class": "Ta(end) Fw(600) Lh(14px)", "data-test": "PE_RATIO-value"}
).text.replace(",", "")
eps_ratio = stock_page_doc.find(
"td", {"class": "Ta(end) Fw(600) Lh(14px)", "data-test": "EPS_RATIO-value"}
).text.replace(",", "")
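    # NOTE: each .find(...) above assumes the tag exists; if Yahoo changes its
    # markup or class names, these chained .text accesses will raise AttributeError.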

    # Some listings (e.g. S&P indices) have no market cap; replace missing values with "0"
    MarketCap = (lambda x: x.text.replace(",", "") if x is not None else "0")(
stock_page_doc.find(
"td", {"class": "Ta(end) Fw(600) Lh(14px)", "data-test": "MARKET_CAP-value"}
)
)

ticker_dict = {
"Company": cName.replace(",", ""),
"Symbol": ticker,
"Marketprice": float_or_na(MarketPrice),
"previousClosePrice": float_or_na(previousClosePrice),
"changeInPrice": round(
float_or_na(MarketPrice) - float_or_na(previousClosePrice), 2
),
"pe_ratio": float_or_na(pe_ratio),
"eps_ratio": float_or_na(eps_ratio),
"Volume": int(Volume),
"MarketCap": MarketCap,
}

# Return Dictionary with stock details
return ticker_dict


def scrape_stocks_info(num_stocks: int, start_num: int, cfg: DictConfig) -> None:
"""
This function Accepts number of stocks to be processed and writes the stock information to a file
"""

# Gets List of popular stocks and passes them to the function 'get_ticker_details' one by one.
# This is return a list of dictionaries with stock details.
logger.info("Start processing Stock symbols...")
stocks_info: list = []
pbar = tqdm(get_stocks(num_stocks=num_stocks, start_num=start_num, cfg=cfg))
for ticker_name in pbar:
pbar.set_description(f"Processing {ticker_name}")
stocks_info.append(get_ticker_details(ticker_name, cfg))

logger.info("End processing Stock symbols...")

    # Pass the list of dictionaries to the 'write_csv' function, which writes it to the file.
    today: datetime.datetime = datetime.datetime.now()
file_name: str = (
str(start_num)
+ "_to_"
+ str(start_num + num_stocks - 1)
+ cfg.web.output_filename
+ today.strftime("%Y-%m-%d")
+ ".csv"
)
write_csv(stocks_info, file_name)

# Verify Results:
verify_results(file_name)


@hydra.main(
version_base=None,
config_path="conf",
config_name="config",
)
def app(cfg: DictConfig) -> None:
logger.info(OmegaConf.to_yaml(cfg))
scrape_stocks_info(
num_stocks=cfg.web.max_companies, start_num=cfg.web.start_from, cfg=cfg
)


if __name__ == "__main__":
app()
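Usage note: the @hydra.main decorator above expects a conf/config.yaml that selects a web config group (registered by webscraping_lib.register_configs()). A hypothetical invocation, assuming the group entry is named companiesmarketcap (the actual registered name is not shown in this diff):

python scrape_companies_stock.py web=companiesmarketcap web.max_companies=50 web.start_from=1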
163 changes: 163 additions & 0 deletions scrape_companiesmarketcap.py
@@ -0,0 +1,163 @@
from typing import Any, Literal
import logging
from math import ceil
import datetime

from dataclasses import dataclass

import requests
from requests import Response
from bs4 import BeautifulSoup
import pandas as pd

from tqdm import tqdm

import webscraping_lib

from omegaconf import MISSING, OmegaConf, DictConfig
import hydra

from hydra.core.config_store import ConfigStore


logger: logging.Logger = logging.getLogger(__name__)


@dataclass
class Config:
web: webscraping_lib.CompaniesMarketCapConfig = MISSING
debug: bool = False


cs: ConfigStore = ConfigStore.instance()
cs.store(name="base_config", node=Config)

# webscraping_lib registers its configs
# in webscraping_lib/web
webscraping_lib.register_configs()


def scrape_companiesmarketcap(
num_stocks: int, start_num: int, companies_by, cfg: DictConfig
) -> None:
# Initialise an empty DataFrame
columns: list[str] = [
"company",
"ticker",
f"{companies_by.by}",
"price",
"daily Change",
"country",
]
df: pd.DataFrame = pd.DataFrame(columns=columns)
    # Work out which pages to fetch for the requested range; each page lists 100 stocks.
start_page: int = int((lambda x: 1 if x < 1 else ceil(x / 100))(start_num))
end_page: int = int(
(lambda x: 1 if x < 1 else ceil(x / 100))(start_num + num_stocks - 1)
)
pbar: tqdm[int] = tqdm(range(start_page, end_page + 1))
for page_number in pbar:
# 01. Define the URL of the website
URL: str = (
str(cfg.web.companies_url)
+ str(companies_by.category)
+ str(cfg.web.page_param)
+ str(page_number)
+ "/"
)

        # 02. Make a GET request, retrying until it succeeds, and report the outcome
while True:
response: Response = requests.get(
URL, headers={"user-agent": cfg.web.user_agent}
)

if response.ok:
break

            logger.info(f"Page {page_number:02d} - request not accepted, retrying...")

        message = (
            "The request was successful."
            if response.ok
            else "The request was not successful."
        )
pbar.set_description(f"Page {page_number:02d} - {message}")

        # 03. Extract the raw HTML and create a Beautiful Soup object
html: bytes = response.content
soup: BeautifulSoup = BeautifulSoup(html, cfg.web.parser)

# 04. Retrieve data for all companies
table = soup.find("tbody").find_all("tr")

        # 05. Retrieve data for each feature individually
companies, tickers, by_value, prices, changes, countries = (
[],
[],
[],
[],
[],
[],
)

for i in range(len(table)):
companies.append(
table[i].find("div", {"class": "company-name"}).text.strip()
)
tickers.append(table[i].find("div", {"class": "company-code"}).text)
by_value.append(
table[i]
.find_all("td", {"class": "td-right"})[1]
.text.replace(",", "")
.strip()
)
prices.append(table[i].find_all("td", {"class": "td-right"})[2].text)
changes.append(table[i].find_all("span")[1].text)
countries.append(
table[i].find_all("span", {"class": "responsive-hidden"})[0].text
)
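            # NOTE: the positional indexes above ([1], [2]) rely on the site's current
            # column order and will silently pick the wrong column if the layout changes.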

# 06. Append to the existing DataFrame
dfCurrent: pd.DataFrame = pd.DataFrame(
{
"company": companies,
"ticker": tickers,
f"{companies_by.by}": by_value,
"price": prices,
"daily Change": changes,
"country": countries,
}
)
dfCurrent["company"] = dfCurrent["company"].str.strip("\r\n")
df = pd.concat([df, dfCurrent])
    today: datetime.datetime = datetime.datetime.now()
df.to_csv(
f"{str(start_num + num_stocks - 1)}"
f"_{companies_by.by}"
f"{cfg.web.output_filename}"
f"{today.strftime('%Y-%m-%d')}.csv",
index=False,
)
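    # Illustrative output name: "100_marketcap" + cfg.web.output_filename + "2023-01-02.csv"
    # (assuming start_num=1, num_stocks=100, by="marketcap").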


@hydra.main(
version_base=None,
config_path="conf",
config_name="config",
)
def main(cfg: DictConfig) -> None:
logger.info(OmegaConf.to_yaml(cfg))
for category in cfg.web.companies_by:
scrape_companiesmarketcap(
num_stocks=cfg.web.max_companies,
start_num=cfg.web.start_from,
companies_by=category,
cfg=cfg,
)


if __name__ == "__main__":
main()
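For reference, each entry in cfg.web.companies_by must expose a by field (used as the DataFrame column and in the output file name) and a category field (a URL segment appended to companies_url). A minimal sketch of the expected shape, written with OmegaConf so that attribute access (entry.by, entry.category) works as in the code above; the values are illustrative assumptions, not the committed config:

from omegaconf import OmegaConf

# Hypothetical companies_by entries; the real values live in the Hydra config.
companies_by = OmegaConf.create(
    [
        {"by": "marketcap", "category": ""},  # assumed: default ranking page
        {"by": "earnings", "category": "most-profitable-companies/"},  # assumed URL segment
    ]
)
assert companies_by[0].by == "marketcap"  # attribute access works on OmegaConf nodes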
5 changes: 5 additions & 0 deletions webscraping_lib.py
@@ -11,6 +11,11 @@ class WebConfig:
parser: str = "lxml"
companies_url: str = MISSING
ticker_url: str = MISSING
page_param: str = MISSING
companies_by: list = MISSING
# companies_by_profit: str = MISSING
# companies_by_revenue: str = MISSING
# companies_by_employees: str = MISSING
output_filename: str = MISSING


