diff --git a/transforms/universal/html2parquet/Makefile b/transforms/universal/html2parquet/Makefile new file mode 100644 index 000000000..017eb23b4 --- /dev/null +++ b/transforms/universal/html2parquet/Makefile @@ -0,0 +1,66 @@ +REPOROOT=../../.. +# Use make help, to see the available rules +include $(REPOROOT)/.make.defaults + +setup:: + @# Help: Recursively make $@ all subdirs + $(MAKE) RULE=$@ .recurse + +clean:: + @# Help: Recursively make $@ all subdirs + $(MAKE) RULE=$@ .recurse + +build:: + @# Help: Recursively make $@ in subdirs + $(MAKE) RULE=$@ .recurse +venv:: + @# Help: Recursively make $@ in subdirs + $(MAKE) RULE=$@ .recurse + +image:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +set-versions: + @# Help: Recursively $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +publish:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +test-image:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +test:: + @# Help: Recursively make $@ in all subdirs + @$(MAKE) RULE=$@ .recurse + +test-src:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +kind-load-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +docker-load-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +docker-save-image:: + @# Help: Recursively make $@ in all subdirs + $(MAKE) RULE=$@ .recurse + +.PHONY: workflow-venv +workflow-venv: + +.PHONY: workflow-test +workflow-test: + +.PHONY: workflow-upload +workflow-upload: + +.PHONY: workflow-build +workflow-build: diff --git a/transforms/universal/html2parquet/python/Dockerfile b/transforms/universal/html2parquet/python/Dockerfile index 84ea63ffd..3d49cb79e 100644 --- a/transforms/universal/html2parquet/python/Dockerfile +++ b/transforms/universal/html2parquet/python/Dockerfile @@ -22,10 +22,10 @@ COPY --chown=dpk:root pyproject.toml pyproject.toml RUN pip install --no-cache-dir -e . 
# copy transform main() entry point to the image -COPY ./src/noop_transform_python.py . +COPY ./src/html2parquet_transform_python.py . # copy some of the samples in -COPY ./src/noop_local.py local/ +COPY ./src/html2parquet_local.py local/ # copy test COPY test/ test/ diff --git a/transforms/universal/html2parquet/python/Makefile b/transforms/universal/html2parquet/python/Makefile index 28139172f..0e552d5be 100644 --- a/transforms/universal/html2parquet/python/Makefile +++ b/transforms/universal/html2parquet/python/Makefile @@ -41,7 +41,11 @@ publish-dist:: .defaults.publish-dist test-image:: .transforms.python-test-image -run-cli-sample: .transforms.run-cli-python-sample +run-cli-sample: + $(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \ + RUN_ARGS=" \ + --data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \ + --data_files_to_use \"['.html','.zip']\" " .transforms.run-src-file run-local-sample: .transforms.run-local-sample diff --git a/transforms/universal/html2parquet/python/README.md b/transforms/universal/html2parquet/python/README.md index 7d0e3d607..3e7b0045e 100644 --- a/transforms/universal/html2parquet/python/README.md +++ b/transforms/universal/html2parquet/python/README.md @@ -19,3 +19,10 @@ The output format will contain the following colums } ``` ## Parameters +The transform can be initialized with the following parameters. + +| Parameter | Default | Description | +|------------|----------|--------------| +| `output_format` | `markdown` | The output type for the `contents` column. Valid types are `markdown` and `text`. | + +When invoking the CLI, the parameters must be set as `--html2parquet_<parameter_name>`, e.g. `--html2parquet_output_format='markdown'`. 
\ No newline at end of file diff --git a/transforms/universal/html2parquet/python/src/html2parquet_local.py b/transforms/universal/html2parquet/python/src/html2parquet_local.py index 9f7882d27..b68cc89a5 100644 --- a/transforms/universal/html2parquet/python/src/html2parquet_local.py +++ b/transforms/universal/html2parquet/python/src/html2parquet_local.py @@ -13,7 +13,7 @@ import os from data_processing.data_access import DataAccessLocal -from html2parquet_transform import HtmlToParquetTransform +from html2parquet_transform import Html2ParquetTransform # create parameters @@ -23,10 +23,10 @@ if __name__ == "__main__": # Here we show how to run outside of the runtime # Create and configure the transform. - transform = HtmlToParquetTransform(html2parquet_params) + transform = Html2ParquetTransform(html2parquet_params) # Use the local data access to read a parquet table. data_access = DataAccessLocal() - file_to_process = os.path.join(input_folder, "hmlt_test1.html") + file_to_process = os.path.join(input_folder, "test1.html") byte_array, _ = data_access.get_file(file_to_process) print(f"input file: {file_to_process}") # Transform the table diff --git a/transforms/universal/html2parquet/python/src/html2parquet_local_python.py b/transforms/universal/html2parquet/python/src/html2parquet_local_python.py index 13d5415be..d06383b6d 100644 --- a/transforms/universal/html2parquet/python/src/html2parquet_local_python.py +++ b/transforms/universal/html2parquet/python/src/html2parquet_local_python.py @@ -16,7 +16,7 @@ from data_processing.runtime.pure_python import PythonTransformLauncher from data_processing.utils import ParamsUtils -from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration +from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration # create parameters @@ -41,6 +41,6 @@ # Set the simulated command line args sys.argv = ParamsUtils.dict_to_req(d=params) # create launcher - launcher = 
PythonTransformLauncher(runtime_config=HtmlToParquetPythonTransformConfiguration()) + launcher = PythonTransformLauncher(runtime_config=Html2ParquetPythonTransformConfiguration()) # Launch the ray actor(s) to process the input launcher.launch() diff --git a/transforms/universal/html2parquet/python/src/html2parquet_transform.py b/transforms/universal/html2parquet/python/src/html2parquet_transform.py index 829a0b955..b2be84cc9 100644 --- a/transforms/universal/html2parquet/python/src/html2parquet_transform.py +++ b/transforms/universal/html2parquet/python/src/html2parquet_transform.py @@ -1,3 +1,4 @@ +import enum import time from argparse import ArgumentParser, Namespace from typing import Any @@ -20,15 +21,26 @@ from data_processing.utils import CLIArgumentProvider, get_logger, TransformUtils + class Html2ParquetTransform(AbstractBinaryTransform): def __init__(self, config: dict[str, Any]): - super().__init__(config) + super().__init__(config) + + self.output_format = config.get(html2parquet_output_format_key, html2parquet_output_format.MARKDOWN) + if not isinstance(self.output_format, html2parquet_output_format): + self.output_format = html2parquet_output_format(self.output_format) def _convert_html2parquet(self, member_filename:str, file_name:str, content_bytes: bytes) -> dict: title = member_filename if member_filename else TransformUtils.get_file_basename(file_name) # Use Trafilatura library - content_string = trafilatura.extract(content_bytes) + if self.output_format == html2parquet_output_format.MARKDOWN: + content_string = trafilatura.extract(content_bytes, output_format="markdown") + elif self.output_format == html2parquet_output_format.TEXT: + content_string = trafilatura.extract(content_bytes) + else: + raise RuntimeError(f"Unknown output_format {self.output_format}.") + if content_string is None: raise RuntimeError("Failed in converting.") @@ -55,7 +67,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl """ if 
TransformUtils.get_file_extension(file_name)[1] not in [".zip", ".html"]: error_message = f"Unsupported file type: {file_name}. Only ZIP and HTML files are supported." - self.logger.error(error_message) + logger.error(error_message) raise ValueError(error_message) # Raising an exception with the error message data = [] number_of_rows = 0 @@ -76,7 +88,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl data.append(row_data) number_of_rows += 1 except Exception as e: - self.logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping") + logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping") # Process single HTML documents @@ -92,16 +104,28 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl number_of_rows += 1 except Exception as e: - self.logger.warning(f"Exception {str(e)} processing file {file_name}, skipping") + logger.warning(f"Exception {str(e)} processing file {file_name}, skipping") table = pa.Table.from_pylist(data) - return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"number of rows": number_of_rows} + return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"nrows": number_of_rows} + logger = get_logger(__name__) short_name = "html2parquet" cli_prefix = f"{short_name}_" +html2parquet_output_format_key = f"output_format" + +class html2parquet_output_format(str, enum.Enum): + MARKDOWN = "markdown" + TEXT = "text" + + def __str__(self): + return str(self.value) + +html2parquet_output_format_default = html2parquet_output_format.MARKDOWN +html2parquet_output_format_cli_param = f"{cli_prefix}{html2parquet_output_format_key}" class Html2ParquetTransformConfiguration(TransformConfiguration): @@ -111,7 +135,16 @@ def __init__(self): transform_class=Html2ParquetTransform, ) def add_input_params(self, parser: ArgumentParser) -> None: - pass + parser.add_argument( + f"--{html2parquet_output_format_cli_param}", + 
type=html2parquet_output_format, + choices=list(html2parquet_output_format), + help="Output format for the contents column.", + default=html2parquet_output_format.MARKDOWN, + ) def apply_input_params(self, args: Namespace) -> bool: + captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False) + self.params = self.params | captured + logger.info(f"html2parquet parameters are : {self.params}") return True \ No newline at end of file diff --git a/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py b/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py index 59bf3dad1..826b9b5a0 100644 --- a/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py +++ b/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py @@ -5,13 +5,13 @@ PythonTransformRuntimeConfiguration, ) from data_processing.utils import get_logger -from html2parquet_transform import HtmlToParquetTransformConfiguration +from html2parquet_transform import Html2ParquetTransformConfiguration logger = get_logger(__name__) -class HtmlToParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration): +class Html2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration): """ Implements the PythonTransformConfiguration for HTML2PARQUET as required by the PythonTransformLauncher. 
""" @@ -20,9 +20,9 @@ def __init__(self): Initialization :param base_configuration - base configuration class """ - super().__init__(transform_config=HtmlToParquetTransformConfiguration()) + super().__init__(transform_config=Html2ParquetTransformConfiguration()) if __name__ == "__main__": - launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration()) + launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration()) logger.info("Launching html2parquet transform") launcher.launch() \ No newline at end of file diff --git a/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet b/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet index 6bf19d36a..962d910b5 100644 Binary files a/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet and b/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet differ diff --git a/transforms/universal/html2parquet/python/test-data/expected/metadata.json b/transforms/universal/html2parquet/python/test-data/expected/metadata.json index ebac7ae05..2afec1488 100644 --- a/transforms/universal/html2parquet/python/test-data/expected/metadata.json +++ b/transforms/universal/html2parquet/python/test-data/expected/metadata.json @@ -5,8 +5,8 @@ "job name": "html2parquet", "job type": "pure python", "job id": "job_id", - "start_time": "2024-08-13 16:36:02", - "end_time": "2024-08-13 16:36:02", + "start_time": "2024-08-29 16:51:41", + "end_time": "2024-08-29 16:51:41", "status": "success" }, "code": null, @@ -23,9 +23,9 @@ "source_files": 2, "source_size": 460391, "result_files": 2, - "result_size": 14487, - "processing_time": 0.0719749927520752, - "number of rows": 3 + "result_size": 13508, + "processing_time": 0.09080028533935547, + "nrows": 3 }, "source": { "name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/universal/html2parquet/python/test-data/input", diff --git 
a/transforms/universal/html2parquet/python/test-data/expected/test1.parquet b/transforms/universal/html2parquet/python/test-data/expected/test1.parquet index 7cd44e6aa..256682c76 100644 Binary files a/transforms/universal/html2parquet/python/test-data/expected/test1.parquet and b/transforms/universal/html2parquet/python/test-data/expected/test1.parquet differ diff --git a/transforms/universal/html2parquet/python/test/test_html2parquet.py b/transforms/universal/html2parquet/python/test/test_html2parquet.py index 1f689cacd..12b5b15b9 100644 --- a/transforms/universal/html2parquet/python/test/test_html2parquet.py +++ b/transforms/universal/html2parquet/python/test/test_html2parquet.py @@ -16,10 +16,10 @@ from data_processing.test_support import get_files_in_folder from data_processing.test_support.transform import AbstractBinaryTransformTest from data_processing.utils import TransformUtils -from html2parquet_transform import HtmlToParquetTransform +from html2parquet_transform import Html2ParquetTransform -class TestHtmlToParquetTransform(AbstractBinaryTransformTest): +class TestHtml2ParquetTransform(AbstractBinaryTransformTest): """ Extends the super-class to define the test data for the tests defined there. The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
@@ -33,7 +33,8 @@ def get_test_transform_fixtures(self) -> list[tuple]: input_dir = os.path.join(basedir, "input") input_files = get_files_in_folder(input_dir, ".html") input_files = [(name, binary) for name, binary in input_files.items()] - expected_metadata_list = [{"nrows": 1, "nsuccess": 1, "nfail": 0, "nskip": 0}, {}] + expected_metadata_list = [{"nrows": 1}, {}] + config = {} expected_files = [ @@ -47,7 +48,7 @@ def get_test_transform_fixtures(self) -> list[tuple]: ] return [ ( - HtmlToParquetTransform(config), + Html2ParquetTransform(config), input_files, expected_files, expected_metadata_list, diff --git a/transforms/universal/html2parquet/python/test/test_html2parquet_python.py b/transforms/universal/html2parquet/python/test/test_html2parquet_python.py index 2de01743d..722dbf80d 100644 --- a/transforms/universal/html2parquet/python/test/test_html2parquet_python.py +++ b/transforms/universal/html2parquet/python/test/test_html2parquet_python.py @@ -17,9 +17,9 @@ from data_processing.test_support.launch.transform_test import ( AbstractTransformLauncherTest, ) -from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration +from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration -class TestPythonHtmlToParquetTransform(AbstractTransformLauncherTest): +class TestPythonHtml2ParquetTransform(AbstractTransformLauncherTest): """ Extends the super-class to define the test data for the tests defined there. The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. 
@@ -30,16 +30,22 @@ def get_test_transform_fixtures(self) -> list[tuple]: basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) config = { "data_files_to_use": ast.literal_eval("['.html','.zip']"), + "html2parquet_output_format": "markdown", } + # this is added as a fixture to remove these columns from comparison + ignore_columns = ["date_acquired"] fixtures = [] - launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration()) + launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration()) fixtures.append( ( launcher, config, basedir + "/input", basedir + "/expected", + ignore_columns, + ) ) return fixtures +