diff --git a/transforms/universal/html2parquet/Makefile b/transforms/universal/html2parquet/Makefile
new file mode 100644
index 000000000..017eb23b4
--- /dev/null
+++ b/transforms/universal/html2parquet/Makefile
@@ -0,0 +1,66 @@
+REPOROOT=../../..
+# Run `make help` to see the available rules.
+include $(REPOROOT)/.make.defaults
+
+setup::
+ @# Help: Recursively make $@ all subdirs
+ $(MAKE) RULE=$@ .recurse
+
+clean::
+ @# Help: Recursively make $@ all subdirs
+ $(MAKE) RULE=$@ .recurse
+
+build::
+ @# Help: Recursively make $@ in subdirs
+ $(MAKE) RULE=$@ .recurse
+venv::
+ @# Help: Recursively make $@ in subdirs
+ $(MAKE) RULE=$@ .recurse
+
+image::
+ @# Help: Recursively make $@ in all subdirs
+ @$(MAKE) RULE=$@ .recurse
+
+set-versions:
+ @# Help: Recursively $@ in all subdirs
+ @$(MAKE) RULE=$@ .recurse
+
+publish::
+ @# Help: Recursively make $@ in all subdirs
+ @$(MAKE) RULE=$@ .recurse
+
+test-image::
+ @# Help: Recursively make $@ in all subdirs
+ @$(MAKE) RULE=$@ .recurse
+
+test::
+ @# Help: Recursively make $@ in all subdirs
+ @$(MAKE) RULE=$@ .recurse
+
+test-src::
+ @# Help: Recursively make $@ in all subdirs
+ $(MAKE) RULE=$@ .recurse
+
+kind-load-image::
+ @# Help: Recursively make $@ in all subdirs
+ $(MAKE) RULE=$@ .recurse
+
+docker-load-image::
+ @# Help: Recursively make $@ in all subdirs
+ $(MAKE) RULE=$@ .recurse
+
+docker-save-image::
+ @# Help: Recursively make $@ in all subdirs
+ $(MAKE) RULE=$@ .recurse
+
+.PHONY: workflow-venv
+workflow-venv:
+
+.PHONY: workflow-test
+workflow-test:
+
+.PHONY: workflow-upload
+workflow-upload:
+
+.PHONY: workflow-build
+workflow-build:
diff --git a/transforms/universal/html2parquet/python/Dockerfile b/transforms/universal/html2parquet/python/Dockerfile
index 84ea63ffd..3d49cb79e 100644
--- a/transforms/universal/html2parquet/python/Dockerfile
+++ b/transforms/universal/html2parquet/python/Dockerfile
@@ -22,10 +22,10 @@ COPY --chown=dpk:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .
# copy transform main() entry point to the image
-COPY ./src/noop_transform_python.py .
+COPY ./src/html2parquet_transform_python.py .
# copy some of the samples in
-COPY ./src/noop_local.py local/
+COPY ./src/html2parquet_local.py local/
# copy test
COPY test/ test/
diff --git a/transforms/universal/html2parquet/python/Makefile b/transforms/universal/html2parquet/python/Makefile
index 28139172f..0e552d5be 100644
--- a/transforms/universal/html2parquet/python/Makefile
+++ b/transforms/universal/html2parquet/python/Makefile
@@ -41,7 +41,11 @@ publish-dist:: .defaults.publish-dist
test-image:: .transforms.python-test-image
-run-cli-sample: .transforms.run-cli-python-sample
+run-cli-sample: # Run the transform CLI on ../test-data/input, writing results to ../output
+	$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \
+		RUN_ARGS=" \
+		--data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \
+		--data_files_to_use \"['.html','.zip']\"" .transforms.run-src-file
run-local-sample: .transforms.run-local-sample
diff --git a/transforms/universal/html2parquet/python/README.md b/transforms/universal/html2parquet/python/README.md
index 7d0e3d607..3e7b0045e 100644
--- a/transforms/universal/html2parquet/python/README.md
+++ b/transforms/universal/html2parquet/python/README.md
@@ -19,3 +19,10 @@ The output format will contain the following colums
}
```
## Parameters
+The transform can be initialized with the following parameters.
+
+| Parameter | Default | Description |
+|------------|----------|--------------|
+| `output_format` | `markdown` | The output type for the `contents` column. Valid types are `markdown` and `text`. |
+
+When invoking the CLI, each parameter name must be prefixed with `html2parquet_`, e.g. `--html2parquet_output_format='markdown'`.
\ No newline at end of file
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_local.py b/transforms/universal/html2parquet/python/src/html2parquet_local.py
index 9f7882d27..b68cc89a5 100644
--- a/transforms/universal/html2parquet/python/src/html2parquet_local.py
+++ b/transforms/universal/html2parquet/python/src/html2parquet_local.py
@@ -13,7 +13,7 @@
import os
from data_processing.data_access import DataAccessLocal
-from html2parquet_transform import HtmlToParquetTransform
+from html2parquet_transform import Html2ParquetTransform
# create parameters
@@ -23,10 +23,10 @@
if __name__ == "__main__":
# Here we show how to run outside of the runtime
# Create and configure the transform.
- transform = HtmlToParquetTransform(html2parquet_params)
+ transform = Html2ParquetTransform(html2parquet_params)
# Use the local data access to read a parquet table.
data_access = DataAccessLocal()
- file_to_process = os.path.join(input_folder, "hmlt_test1.html")
+ file_to_process = os.path.join(input_folder, "test1.html")
byte_array, _ = data_access.get_file(file_to_process)
print(f"input file: {file_to_process}")
# Transform the table
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_local_python.py b/transforms/universal/html2parquet/python/src/html2parquet_local_python.py
index 13d5415be..d06383b6d 100644
--- a/transforms/universal/html2parquet/python/src/html2parquet_local_python.py
+++ b/transforms/universal/html2parquet/python/src/html2parquet_local_python.py
@@ -16,7 +16,7 @@
from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
-from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration
+from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration
# create parameters
@@ -41,6 +41,6 @@
# Set the simulated command line args
sys.argv = ParamsUtils.dict_to_req(d=params)
# create launcher
- launcher = PythonTransformLauncher(runtime_config=HtmlToParquetPythonTransformConfiguration())
+ launcher = PythonTransformLauncher(runtime_config=Html2ParquetPythonTransformConfiguration())
# Launch the ray actor(s) to process the input
launcher.launch()
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_transform.py b/transforms/universal/html2parquet/python/src/html2parquet_transform.py
index 829a0b955..b2be84cc9 100644
--- a/transforms/universal/html2parquet/python/src/html2parquet_transform.py
+++ b/transforms/universal/html2parquet/python/src/html2parquet_transform.py
@@ -1,3 +1,4 @@
+import enum
import time
from argparse import ArgumentParser, Namespace
from typing import Any
@@ -20,15 +21,26 @@
from data_processing.utils import CLIArgumentProvider, get_logger, TransformUtils
+
class Html2ParquetTransform(AbstractBinaryTransform):
def __init__(self, config: dict[str, Any]):
- super().__init__(config)
+ super().__init__(config)
+
+ self.output_format = config.get(html2parquet_output_format_key, html2parquet_output_format.MARKDOWN)
+ if not isinstance(self.output_format, html2parquet_output_format):
+ self.output_format = html2parquet_output_format[self.output_format]
def _convert_html2parquet(self, member_filename:str, file_name:str, content_bytes: bytes) -> dict:
title = member_filename if member_filename else TransformUtils.get_file_basename(file_name)
# Use Trafilatura library
- content_string = trafilatura.extract(content_bytes)
+ if self.output_format == html2parquet_output_format.MARKDOWN:
+ content_string = trafilatura.extract(content_bytes, output_format="markdown")
+ elif self.output_format == html2parquet_output_format.TEXT:
+ content_string = trafilatura.extract(content_bytes)
+ else:
+ raise RuntimeError(f"Uknown output_format {self.output_format}.")
+
if content_string is None:
raise RuntimeError("Failed in converting.")
@@ -55,7 +67,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
"""
if TransformUtils.get_file_extension(file_name)[1] not in [".zip", ".html"]:
error_message = f"Unsupported file type: {file_name}. Only ZIP and HTML files are supported."
- self.logger.error(error_message)
+ logger.error(error_message)
raise ValueError(error_message) # Raising an exception with the error message
data = []
number_of_rows = 0
@@ -76,7 +88,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
data.append(row_data)
number_of_rows += 1
except Exception as e:
- self.logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")
+ logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")
# Process single HTML documents
@@ -92,16 +104,28 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
number_of_rows += 1
except Exception as e:
- self.logger.warning(f"Exception {str(e)} processing file {file_name}, skipping")
+ logger.warning(f"Exception {str(e)} processing file {file_name}, skipping")
table = pa.Table.from_pylist(data)
- return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"number of rows": number_of_rows}
+ return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"nrows": number_of_rows}
+
logger = get_logger(__name__)
short_name = "html2parquet"
cli_prefix = f"{short_name}_"
+html2parquet_output_format_key = f"output_format"
+
+class html2parquet_output_format(str, enum.Enum):
+ MARKDOWN = "markdown"
+ TEXT = "text"
+
+ def __str__(self):
+ return str(self.value)
+
+html2parquet_output_format_default = html2parquet_output_format.MARKDOWN
+html2parquet_output_format_cli_param = f"{cli_prefix}{html2parquet_output_format_key}"
class Html2ParquetTransformConfiguration(TransformConfiguration):
@@ -111,7 +135,16 @@ def __init__(self):
transform_class=Html2ParquetTransform,
)
def add_input_params(self, parser: ArgumentParser) -> None:
- pass
+ parser.add_argument(
+ f"--{html2parquet_output_format_cli_param}",
+ type=html2parquet_output_format,
+ choices=list(html2parquet_output_format),
+ help="Output format for the contents column.",
+ default=html2parquet_output_format.MARKDOWN,
+ )
def apply_input_params(self, args: Namespace) -> bool:
+ captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
+ self.params = self.params | captured
+ logger.info(f"html2parquet parameters are : {self.params}")
return True
\ No newline at end of file
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py b/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py
index 59bf3dad1..826b9b5a0 100644
--- a/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py
+++ b/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py
@@ -5,13 +5,13 @@
PythonTransformRuntimeConfiguration,
)
from data_processing.utils import get_logger
-from html2parquet_transform import HtmlToParquetTransformConfiguration
+from html2parquet_transform import Html2ParquetTransformConfiguration
logger = get_logger(__name__)
-class HtmlToParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+class Html2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
"""
Implements the PythonTransformConfiguration for HTML2PARQUET as required by the PythonTransformLauncher.
"""
@@ -20,9 +20,9 @@ def __init__(self):
Initialization
:param base_configuration - base configuration class
"""
- super().__init__(transform_config=HtmlToParquetTransformConfiguration())
+ super().__init__(transform_config=Html2ParquetTransformConfiguration())
if __name__ == "__main__":
- launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration())
+ launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
logger.info("Launching html2parquet transform")
launcher.launch()
\ No newline at end of file
diff --git a/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet b/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet
index 6bf19d36a..962d910b5 100644
Binary files a/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet and b/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet differ
diff --git a/transforms/universal/html2parquet/python/test-data/expected/metadata.json b/transforms/universal/html2parquet/python/test-data/expected/metadata.json
index ebac7ae05..2afec1488 100644
--- a/transforms/universal/html2parquet/python/test-data/expected/metadata.json
+++ b/transforms/universal/html2parquet/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
"job name": "html2parquet",
"job type": "pure python",
"job id": "job_id",
- "start_time": "2024-08-13 16:36:02",
- "end_time": "2024-08-13 16:36:02",
+ "start_time": "2024-08-29 16:51:41",
+ "end_time": "2024-08-29 16:51:41",
"status": "success"
},
"code": null,
@@ -23,9 +23,9 @@
"source_files": 2,
"source_size": 460391,
"result_files": 2,
- "result_size": 14487,
- "processing_time": 0.0719749927520752,
- "number of rows": 3
+ "result_size": 13508,
+ "processing_time": 0.09080028533935547,
+ "nrows": 3
},
"source": {
"name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/universal/html2parquet/python/test-data/input",
diff --git a/transforms/universal/html2parquet/python/test-data/expected/test1.parquet b/transforms/universal/html2parquet/python/test-data/expected/test1.parquet
index 7cd44e6aa..256682c76 100644
Binary files a/transforms/universal/html2parquet/python/test-data/expected/test1.parquet and b/transforms/universal/html2parquet/python/test-data/expected/test1.parquet differ
diff --git a/transforms/universal/html2parquet/python/test/test_html2parquet.py b/transforms/universal/html2parquet/python/test/test_html2parquet.py
index 1f689cacd..12b5b15b9 100644
--- a/transforms/universal/html2parquet/python/test/test_html2parquet.py
+++ b/transforms/universal/html2parquet/python/test/test_html2parquet.py
@@ -16,10 +16,10 @@
from data_processing.test_support import get_files_in_folder
from data_processing.test_support.transform import AbstractBinaryTransformTest
from data_processing.utils import TransformUtils
-from html2parquet_transform import HtmlToParquetTransform
+from html2parquet_transform import Html2ParquetTransform
-class TestHtmlToParquetTransform(AbstractBinaryTransformTest):
+class TestHtml2ParquetTransform(AbstractBinaryTransformTest):
"""
Extends the super-class to define the test data for the tests defined there.
The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
@@ -33,7 +33,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
input_dir = os.path.join(basedir, "input")
input_files = get_files_in_folder(input_dir, ".html")
input_files = [(name, binary) for name, binary in input_files.items()]
- expected_metadata_list = [{"nrows": 1, "nsuccess": 1, "nfail": 0, "nskip": 0}, {}]
+ expected_metadata_list = [{"nrows": 1}, {}]
+
config = {}
expected_files = [
@@ -47,7 +48,7 @@ def get_test_transform_fixtures(self) -> list[tuple]:
]
return [
(
- HtmlToParquetTransform(config),
+ Html2ParquetTransform(config),
input_files,
expected_files,
expected_metadata_list,
diff --git a/transforms/universal/html2parquet/python/test/test_html2parquet_python.py b/transforms/universal/html2parquet/python/test/test_html2parquet_python.py
index 2de01743d..722dbf80d 100644
--- a/transforms/universal/html2parquet/python/test/test_html2parquet_python.py
+++ b/transforms/universal/html2parquet/python/test/test_html2parquet_python.py
@@ -17,9 +17,9 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
-from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration
+from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration
-class TestPythonHtmlToParquetTransform(AbstractTransformLauncherTest):
+class TestPythonHtml2ParquetTransform(AbstractTransformLauncherTest):
"""
Extends the super-class to define the test data for the tests defined there.
The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
@@ -30,16 +30,23 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
config = {
"data_files_to_use": ast.literal_eval("['.html','.zip']"),
+ "html2parquet_output_format": "markdown",
}
+ # this is added as a fixture to remove these columns from comparison
+ ignore_columns = ["date_acquired", "document_id", "pdf_convert_time", "hash"]
+ ignore_columns = ["date_acquired"]
fixtures = []
- launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration())
+ launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
fixtures.append(
(
launcher,
config,
basedir + "/input",
basedir + "/expected",
+ ignore_columns,
+
)
)
return fixtures
+