added configs for table detection models, logging for sec_edgar_ingestion.py (and renamed), and updated table_utils to handle configs
lelzeiny · Sep 25, 2024 · 01af9c9 · 01af9c9
1 parent 8a24e53
commit 01af9c9
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 88 deletions.
diff --git a/yoda/llava_data_prep/README.md b/yoda/llava_data_prep/README.md
@@ -36,7 +36,7 @@ You have to set up your environment before you can run or customize the starter
 
 ## Install LaTex
 
-FOr mac users, you will need a LaTex system dependency:
+For mac users, you will need a LaTex system dependency:
 
 ```bash
 brew install --cask mactex
@@ -109,6 +109,12 @@ Future work will potentially include:
 - Run yoda/llava_data_prep/notebooks/edgar_ingestion.ipynb and ensure it completes.  Check some of the outputs.
 - Run yoda/llava_data_prep/scripts/generate_tables.py and ensure yoda/synthetic_data/tmp/images/test_table.jpg is created.
 - Run:
+
+```bash
+python yoda/llava_data_prep/scripts/ingest_edgar_data.py --name <name>
+```
+For the above, please test with various configurations, which can be found in: [config.yaml](config.yaml)
+
 ```bash
 python yoda/llava_data_prep/scripts/create_synthetic_tables.py --name test --num-its 4
 ```

diff --git a/yoda/llava_data_prep/config.yaml b/yoda/llava_data_prep/config.yaml
@@ -15,8 +15,17 @@ prompts:
 sec:
     "company": "company"
     "email": [email protected]
+    "tickers":
+        - AAPL
+    "form_types":
+        - "10-K"
+    "end_date": "2024-01-01"
+    "start_date": "2023-01-01"
 
-table_generation:
+table_options:
+    "model": "doclaynet" # "yolo" or "doclaynet"
+    "threshold": 0.75
+    "offset": 20
     "do_reshape": True
     "size":
         - 336

diff --git a/yoda/llava_data_prep/scripts/edgar_ingestion.py b/yoda/llava_data_prep/scripts/edgar_ingestion.py
diff --git a/yoda/llava_data_prep/scripts/ingest_edgar_data.py b/yoda/llava_data_prep/scripts/ingest_edgar_data.py
@@ -0,0 +1,72 @@
+import os
+import sys
+import argparse
+import logging
+import yaml
+
+
+current_dir = os.path.dirname(os.path.abspath(__file__))
+kit_dir = os.path.abspath(os.path.join(current_dir, "../.."))
+repo_dir = os.path.abspath(os.path.join(kit_dir, ".."))
+
+sys.path.append(kit_dir)
+sys.path.append(repo_dir)
+
+from yoda.llava_data_prep.src.edgar_ingestion import SECTools # type: ignore
+from yoda.llava_data_prep.src.table_utils import TableTools # type: ignore
+
+CONFIG_PATH = os.path.join(kit_dir, 'llava_data_prep', 'config.yaml')
+DATA_DIRECTORY = os.path.join(kit_dir, 'llava_data_prep', 'sec_data')
+LOGS_PATH = os.path.join(DATA_DIRECTORY, "logs")
+
+parser = argparse.ArgumentParser(description="Download EDGAR reports, detect tables, crop tables, and store.")
+
+parser.add_argument("--name", type=str, help="Name of run", default="defaults")
+
+def main() -> None:
+
+    try:
+        with open(CONFIG_PATH, 'r') as file:
+            configs =  yaml.safe_load(file)
+    except FileNotFoundError:
+        raise FileNotFoundError(f'The YAML configuration file {CONFIG_PATH} was not found.')
+    except yaml.YAMLError as e:
+        raise RuntimeError(f'Error parsing YAML file: {e}')
+
+    if not os.path.exists(LOGS_PATH):
+        os.makedirs(LOGS_PATH, exist_ok=True)
+
+    args = parser.parse_args()
+    logger = logging.getLogger()
+    logger.setLevel(logging.DEBUG)
+    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
+    file_handler = logging.FileHandler(os.path.join(LOGS_PATH, f"{args.name}.log"))
+    file_handler.setFormatter(formatter)
+    logger.addHandler(file_handler)
+
+    logging.info(f'Ingesting SEC EDGAR data with configs: ' +
+                 f'{configs["sec"]}')
+
+    args = parser.parse_args()
+
+    OUTPUT_DIR = os.path.join(DATA_DIRECTORY, args.name)
+
+    logging.info(f"Running ingest_edgar_data.py with args: " +
+                 f"{args} to output directory: " +
+                 f"{OUTPUT_DIR}")
+
+    sec_tool = SECTools(config_path=CONFIG_PATH)
+
+    sec_tool.download_filings(download_folder=OUTPUT_DIR)
+    sec_tool.convert_txt_to_pdf(data_directory=OUTPUT_DIR)
+
+    logging.info("Downloaded filings and converted to pdf.")
+    logging.info("Converting pdfs to images.")
+
+    table_tools = TableTools(config_path=CONFIG_PATH)
+
+    table_tools.convert_pdf_to_images(data_directory=OUTPUT_DIR)
+    table_tools.crop_tables(data_directory=OUTPUT_DIR)
+
+if __name__ == "__main__":
+    main()
diff --git a/yoda/llava_data_prep/src/edgar_ingestion.py b/yoda/llava_data_prep/src/edgar_ingestion.py
@@ -1,16 +1,16 @@
 import yaml
 import glob
-from typing import List, Dict
-from sec_edgar_downloader import Downloader
+from typing import Dict
 import pdfkit
+from sec_edgar_downloader import Downloader
 
 
 class SECTools:
 
     def __init__(self,
-                config: dict) -> None:
+                config_path: str) -> None:
 
-        self.configs = self.load_config(config)
+        self.configs = self.load_config(config_path)
 
 
     @staticmethod
@@ -33,26 +33,20 @@ def load_config(filename: str) -> dict:
         except yaml.YAMLError as e:
             raise RuntimeError(f'Error parsing YAML file: {e}')
 
-    def download_filings(self, tickers: List[str],
-                        form_types: List[str],
-                        after: str,
-                        before: str,
-                        download_folder: str) -> None:
+    def download_filings(self, download_folder: str) -> None:
 
         dl = Downloader(company_name=self.configs["sec"]["company"],
                         email_address=self.configs["sec"]["email"],
                         download_folder=download_folder)
 
-        for ticker in tickers:
-            for form_type in form_types:
-
-                print(f"Downloading {form_type} filing for {ticker} from {after}, {before}")
+        for ticker in self.configs["sec"]["tickers"]:
+            for form_type in self.configs["sec"]["form_types"]:
 
                 dl.get(
                     form=form_type,
                     ticker_or_cik=ticker,
-                    after=after,
-                    before=before
+                    after=self.configs["sec"]["start_date"],
+                    before=self.configs["sec"]["end_date"]
                 )
 
     def _read_txt_file(self, filename: str) -> str:

diff --git a/yoda/llava_data_prep/src/table_utils.py b/yoda/llava_data_prep/src/table_utils.py
@@ -70,12 +70,44 @@
 
 class TableTools:
 
-    def __init__(self, 
-                 do_reshape: bool = False, 
-                 size: Tuple[int,int] = (336,336)) -> None:
+    def __init__(self, config_path: str) -> None:
+
+        self.llm_info, self.prompt_info, self.table_info = self.load_configs(config_path)
+
+        if self.table_info["model"] not in ["yolo", "doclaynet"]:
+            raise ValueError(f'Got {self.table_info["model"]}, must be of either ' +
+                             '["yolo", "doclaynet"]')
+
+        self.model = self.table_info["model"]
+        self.threshold = self.table_info["threshold"]
+        self.offset = self.table_info["offset"]
+        self.do_reshape = self.table_info["do_reshape"]
+        self.size = self.table_info["size"]
 
-        self.do_reshape = do_reshape
-        self.size = size
+    def load_configs(self, config_path: str) -> Any:
+        """
+        Loads a yaml config file and returns the llm, prompt, and table info.
+
+        Args:
+            config_path: Path to the config yaml file.
+
+        Returns:
+            A tuple of dictionaries: (llm_info, prompt_info, table_info).
+        """
+
+        assert isinstance(config_path, str), \
+            TypeError(f"Must be type str, but got {type(config_path)}.")
+
+        try:
+            with open(config_path, "r") as yaml_file:
+                config: dict = yaml.safe_load(yaml_file)
+        except FileNotFoundError:
+            logging.error(f"{config_path} not found.")
+            raise FileNotFoundError(f"{config_path} does not exist.")
+
+        llm_info, prompt_info, table_info = config["llm"], config["prompts"], config["table_options"]
+
+        return llm_info, prompt_info, table_info
 
     def convert_pdf_to_images(self, data_directory: str) -> None:
             """
@@ -117,7 +149,7 @@ def convert_pdf_to_images(self, data_directory: str) -> None:
                     output_path: str = f"{output_folder}/{img_name}"
                     image.save(output_path, 'JPEG')
 
-    def crop_tables(self,
+    def crop_tables_yolo(self,
                     data_directory: str, 
                     conf: float = 0.25,
                     iou: float = 0.45,
@@ -284,6 +316,28 @@ def crop_tables_doclaynet(self, data_directory: str, threshold: float = 0.75, of
 
         logging.info(f"Cropped tables saved to {data_directory} " +
                      "in subdirectories.")
+
+    # TODO: Handle differing options if both models are to be kept - experimental testing for now.
+    def crop_tables(self, data_directory: str) -> None:
+
+        """
+        This method crops tables using the chosen model.
+
+        Args:
+            data_directory: directory of images
+            threshold: the float value for confidence thresholding
+            offset: How much to pad the table detection when cropping.
+        """
+
+        model_map = {
+            "yolo": self.crop_tables_yolo,
+            "doclaynet": self.crop_tables_doclaynet
+        }
+
+        if self.model in model_map:
+            model_map[self.model](data_directory=data_directory,
+                                  threshold=self.threshold,
+                                  offset=self.offset)
 
     def replace_special_to_latex(self, text: str) -> str:
 
@@ -658,6 +712,7 @@ def _write_tex_file(self, tex_filepath: str,
         with open(tex_filepath, 'w') as f:
             f.write(latex_code)
 
+    # TODO: Decide to replace with detr model - yolo or doclaynet
     def _crop_synth_table(self, image_path: str) -> None:
 
         """
@@ -851,16 +906,13 @@ class QAList(BaseModel):
 class TableAugmentor:
 
     def __init__(self, config_path: str) -> None:
-
-        assert isinstance(config_path, str), TypeError(f"Expected str, got {type(config_path)}.")
 
+        self.config_path = config_path
         self.llm_info, self.prompt_info, self.table_info = self.load_configs(config_path)
         self.init_llm()
         self.init_table_modifying_chain()
         self.init_table_qa_chain()
         self.init_table_ocr_chain()
-        self.do_reshape = self.table_info["do_reshape"]
-        self.size = self.table_info["size"]
 
     def load_configs(self, config_path: str) -> Any:
         """
@@ -883,7 +935,7 @@ def load_configs(self, config_path: str) -> Any:
             logging.error(f"{config_path} not found.")
             raise FileNotFoundError(f"{config_path} does not exist.")
 
-        llm_info, prompt_info, table_info = config["llm"], config["prompts"], config["table_generation"]
+        llm_info, prompt_info, table_info = config["llm"], config["prompts"], config["table_options"]
 
         return llm_info, prompt_info, table_info
 
@@ -1055,8 +1107,7 @@ def create_training_data(self,
             json.dump([], f)
 
         # Instantiate table tools object for later use.
-        table_tools = TableTools(do_reshape=self.do_reshape,
-                                 size=self.size)
+        table_tools = TableTools(config_path=self.config_path)
 
         # Generate synthetic data for specified number of
         # samples.