Skip to content

Commit

Permalink
added configs for table detection models, logging for sec_edgar_inges…
Browse files Browse the repository at this point in the history
…tion.py (and renamed), and updated table_utils to handle configs
  • Loading branch information
snova-codym committed Sep 25, 2024
1 parent 8a24e53 commit 01af9c9
Show file tree
Hide file tree
Showing 6 changed files with 162 additions and 88 deletions.
8 changes: 7 additions & 1 deletion yoda/llava_data_prep/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ You have to set up your environment before you can run or customize the starter

## Install LaTex

FOr mac users, you will need a LaTex system dependency:
For mac users, you will need a LaTex system dependency:

```bash
brew install --cask mactex
Expand Down Expand Up @@ -109,6 +109,12 @@ Future work will potentially include:
- Run yoda/llava_data_prep/notebooks/edgar_ingestion.ipynb and ensure it completes. Check some of the outputs.
- Run yoda/llava_data_prep/scripts/generate_tables.py and ensure yoda/synthetic_data/tmp/images/test_table.jpg is created.
- Run:

```bash
python yoda/llava_data_prep/scripts/ingest_edgar_data.py --name <name>
```
For the above, please test with various configurations, which can be found in: [config.yaml](config.yaml)

```bash
python yoda/llava_data_prep/scripts/create_synthetic_tables.py --name test --num-its 4
```
Expand Down
11 changes: 10 additions & 1 deletion yoda/llava_data_prep/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,17 @@ prompts:
sec:
"company": "company"
"email": [email protected]
"tickers":
- AAPL
"form_types":
- "10-K"
"end_date": "2024-01-01"
"start_date": "2023-01-01"

table_generation:
table_options:
"model": "doclaynet" # "yolo" or "doclaynet"
"threshold": 0.75
"offset": 20
"do_reshape": True
"size":
- 336
Expand Down
58 changes: 0 additions & 58 deletions yoda/llava_data_prep/scripts/edgar_ingestion.py

This file was deleted.

72 changes: 72 additions & 0 deletions yoda/llava_data_prep/scripts/ingest_edgar_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
import os
import sys
import argparse
import logging
import yaml


# Resolve directories as absolute paths derived from this script's location,
# so they do not depend on the current working directory.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Two levels above the directory containing this script.
kit_dir = os.path.abspath(os.path.join(current_dir, "../.."))
# Parent directory of kit_dir.
repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))

# Extend the import search path with both directories; presumably this is what
# lets the `yoda.llava_data_prep...` imports below resolve when the file is run
# directly as a script -- TODO confirm.
sys.path.append(kit_dir)
sys.path.append(repo_dir)

from yoda.llava_data_prep.src.edgar_ingestion import SECTools # type: ignore
from yoda.llava_data_prep.src.table_utils import TableTools # type: ignore

# YAML configuration file; main() loads it and also passes its path to
# SECTools and TableTools.
CONFIG_PATH = os.path.join(kit_dir, 'llava_data_prep', 'config.yaml')
# Root directory under which main() creates one output folder per run name.
DATA_DIRECTORY = os.path.join(kit_dir, 'llava_data_prep', 'sec_data')
# Directory that receives one "<name>.log" file per run.
LOGS_PATH = os.path.join(DATA_DIRECTORY, "logs")

# Command-line interface for this script.
parser = argparse.ArgumentParser(description="Download EDGAR reports, detect tables, crop tables, and store.")

# --name is used by main() for both the output sub-directory and the log file name.
parser.add_argument("--name", type=str, help="Name of run", default="defaults")

def main() -> None:
    """Run the EDGAR ingestion pipeline for one named run.

    Steps, in order:
      1. Parse the command-line arguments (``--name``).
      2. Load the YAML configuration at ``CONFIG_PATH``.
      3. Attach a file handler writing to ``LOGS_PATH/<name>.log``.
      4. Download the filings and convert them to PDF via ``SECTools``.
      5. Convert the PDFs to images and crop tables via ``TableTools``.

    All outputs are written under ``DATA_DIRECTORY/<name>``.

    Raises:
        FileNotFoundError: If the configuration file does not exist.
        RuntimeError: If the configuration file is not valid YAML.
    """
    # Parse arguments exactly once and before any file I/O, so that
    # ``--help`` and argument errors do not depend on the config being present.
    args = parser.parse_args()

    try:
        with open(CONFIG_PATH, 'r') as file:
            configs = yaml.safe_load(file)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f'The YAML configuration file {CONFIG_PATH} was not found.'
        ) from err
    except yaml.YAMLError as err:
        raise RuntimeError(f'Error parsing YAML file: {err}') from err

    # exist_ok=True already makes this a no-op when the directory exists.
    os.makedirs(LOGS_PATH, exist_ok=True)

    # Configure the root logger so the module-level ``logging.info`` calls
    # below (and in the imported tools) end up in the per-run log file.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    file_handler = logging.FileHandler(os.path.join(LOGS_PATH, f"{args.name}.log"))
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    logging.info("Ingesting SEC EDGAR data with configs: %s", configs["sec"])

    output_dir = os.path.join(DATA_DIRECTORY, args.name)

    logging.info(
        "Running ingest_edgar_data.py with args: %s to output directory: %s",
        args,
        output_dir,
    )

    sec_tool = SECTools(config_path=CONFIG_PATH)

    sec_tool.download_filings(download_folder=output_dir)
    sec_tool.convert_txt_to_pdf(data_directory=output_dir)

    logging.info("Downloaded filings and converted to pdf.")
    logging.info("Converting pdfs to images.")

    table_tools = TableTools(config_path=CONFIG_PATH)

    table_tools.convert_pdf_to_images(data_directory=output_dir)
    table_tools.crop_tables(data_directory=output_dir)


if __name__ == "__main__":
    main()
24 changes: 9 additions & 15 deletions yoda/llava_data_prep/src/edgar_ingestion.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import yaml
import glob
from typing import List, Dict
from sec_edgar_downloader import Downloader
from typing import Dict
import pdfkit
from sec_edgar_downloader import Downloader


class SECTools:

def __init__(self,
config: dict) -> None:
config_path: str) -> None:

self.configs = self.load_config(config)
self.configs = self.load_config(config_path)


@staticmethod
Expand All @@ -33,26 +33,20 @@ def load_config(filename: str) -> dict:
except yaml.YAMLError as e:
raise RuntimeError(f'Error parsing YAML file: {e}')

def download_filings(self, tickers: List[str],
form_types: List[str],
after: str,
before: str,
download_folder: str) -> None:
def download_filings(self, download_folder: str) -> None:

dl = Downloader(company_name=self.configs["sec"]["company"],
email_address=self.configs["sec"]["email"],
download_folder=download_folder)

for ticker in tickers:
for form_type in form_types:

print(f"Downloading {form_type} filing for {ticker} from {after}, {before}")
for ticker in self.configs["sec"]["tickers"]:
for form_type in self.configs["sec"]["form_types"]:

dl.get(
form=form_type,
ticker_or_cik=ticker,
after=after,
before=before
after=self.configs["sec"]["start_date"],
before=self.configs["sec"]["end_date"]
)

def _read_txt_file(self, filename: str) -> str:
Expand Down
77 changes: 64 additions & 13 deletions yoda/llava_data_prep/src/table_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,12 +70,44 @@

class TableTools:

def __init__(self,
do_reshape: bool = False,
size: Tuple[int,int] = (336,336)) -> None:
def __init__(self, config_path: str) -> None:

self.llm_info, self.prompt_info, self.table_info = self.load_configs(config_path)

if self.table_info["model"] not in ["yolo", "doclaynet"]:
raise ValueError(f'Got {self.table_info["model"]}, must be of either ' +
'["yolo", "doclaynet"]')

self.model = self.table_info["model"]
self.threshold = self.table_info["threshold"]
self.offset = self.table_info["offset"]
self.do_reshape = self.table_info["do_reshape"]
self.size = self.table_info["size"]

self.do_reshape = do_reshape
self.size = size
def load_configs(self, config_path: str) -> Any:
    """
    Loads a yaml config file and returns its llm, prompt and table sections.

    Args:
        config_path: Path to the config yaml file.

    Returns:
        A tuple ``(llm_info, prompt_info, table_info)`` read from the
        top-level ``llm``, ``prompts`` and ``table_options`` keys.

    Raises:
        TypeError: If ``config_path`` is not a string.
        FileNotFoundError: If ``config_path`` does not exist.
        KeyError: If one of the expected top-level keys is missing.
    """

    # Validate with an explicit raise: an ``assert`` is removed when Python
    # runs with -O and would surface as AssertionError rather than TypeError.
    if not isinstance(config_path, str):
        raise TypeError(f"Must be type str, but got {type(config_path)}.")

    try:
        with open(config_path, "r") as yaml_file:
            config: dict = yaml.safe_load(yaml_file)
    except FileNotFoundError as err:
        logging.error(f"{config_path} not found.")
        raise FileNotFoundError(f"{config_path} does not exist.") from err

    llm_info, prompt_info, table_info = config["llm"], config["prompts"], config["table_options"]

    return llm_info, prompt_info, table_info

def convert_pdf_to_images(self, data_directory: str) -> None:
"""
Expand Down Expand Up @@ -117,7 +149,7 @@ def convert_pdf_to_images(self, data_directory: str) -> None:
output_path: str = f"{output_folder}/{img_name}"
image.save(output_path, 'JPEG')

def crop_tables(self,
def crop_tables_yolo(self,
data_directory: str,
conf: float = 0.25,
iou: float = 0.45,
Expand Down Expand Up @@ -284,6 +316,28 @@ def crop_tables_doclaynet(self, data_directory: str, threshold: float = 0.75, of

logging.info(f"Cropped tables saved to {data_directory} " +
"in subdirectories.")

# TODO: Handle differing options if both models are to be kept - experimental testing for now.
def crop_tables(self, data_directory: str) -> None:
    """
    Crop tables with the detection model selected in the config.

    The confidence threshold and crop offset are not arguments; they are
    read from ``self.threshold`` and ``self.offset``.

    Args:
        data_directory: directory of images
    """

    dispatch = {
        "yolo": self.crop_tables_yolo,
        "doclaynet": self.crop_tables_doclaynet
    }

    cropper = dispatch.get(self.model)
    if cropper is None:
        # Unknown model name: nothing is cropped.
        return

    # NOTE(review): the same keyword arguments are sent to either backend;
    # confirm crop_tables_yolo accepts ``threshold`` and ``offset``.
    cropper(data_directory=data_directory,
            threshold=self.threshold,
            offset=self.offset)

def replace_special_to_latex(self, text: str) -> str:

Expand Down Expand Up @@ -658,6 +712,7 @@ def _write_tex_file(self, tex_filepath: str,
with open(tex_filepath, 'w') as f:
f.write(latex_code)

# TODO: Decide to replace with detr model - yolo or doclaynet
def _crop_synth_table(self, image_path: str) -> None:

"""
Expand Down Expand Up @@ -851,16 +906,13 @@ class QAList(BaseModel):
class TableAugmentor:

def __init__(self, config_path: str) -> None:

assert isinstance(config_path, str), TypeError(f"Expected str, got {type(config_path)}.")

self.config_path = config_path
self.llm_info, self.prompt_info, self.table_info = self.load_configs(config_path)
self.init_llm()
self.init_table_modifying_chain()
self.init_table_qa_chain()
self.init_table_ocr_chain()
self.do_reshape = self.table_info["do_reshape"]
self.size = self.table_info["size"]

def load_configs(self, config_path: str) -> Any:
"""
Expand All @@ -883,7 +935,7 @@ def load_configs(self, config_path: str) -> Any:
logging.error(f"{config_path} not found.")
raise FileNotFoundError(f"{config_path} does not exist.")

llm_info, prompt_info, table_info = config["llm"], config["prompts"], config["table_generation"]
llm_info, prompt_info, table_info = config["llm"], config["prompts"], config["table_options"]

return llm_info, prompt_info, table_info

Expand Down Expand Up @@ -1055,8 +1107,7 @@ def create_training_data(self,
json.dump([], f)

# Instantiate table tools object for later use.
table_tools = TableTools(do_reshape=self.do_reshape,
size=self.size)
table_tools = TableTools(config_path=self.config_path)

# Generate synthetic data for specified number of
# samples.
Expand Down

0 comments on commit 01af9c9

Please sign in to comment.