Skip to content

Commit

Permalink
Merge pull request IBM#524 from IBM/html2parquet
Browse files Browse the repository at this point in the history
Html2parquet Makefile added
  • Loading branch information
touma-I authored Sep 12, 2024
2 parents 03cba30 + 39a83b4 commit dd96ca0
Show file tree
Hide file tree
Showing 13 changed files with 149 additions and 31 deletions.
66 changes: 66 additions & 0 deletions transforms/universal/html2parquet/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
REPOROOT=../../..
# Use make help, to see the available rules
include $(REPOROOT)/.make.defaults

# Every recursive rule below re-invokes make with RULE set to the rule's own
# name ($@) and `.recurse` as the goal. `.recurse` is not defined in this file;
# it is presumably provided by the included .make.defaults and applies RULE to
# each sub-directory -- confirm there.
# The `@# Help:` recipe lines are silenced shell comments; they look like the
# text surfaced by `make help` (TODO confirm against .make.defaults).
# Recipe lines must start with a hard TAB, otherwise make stops with
# "missing separator".

setup::
	@# Help: Recursively make $@ all subdirs
	$(MAKE) RULE=$@ .recurse

clean::
	@# Help: Recursively make $@ all subdirs
	$(MAKE) RULE=$@ .recurse

build::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse

venv::
	@# Help: Recursively make $@ in subdirs
	$(MAKE) RULE=$@ .recurse

image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

# NOTE(review): single-colon rule, unlike its double-colon siblings; left as-is
# because the flavour expected by the shared include is not visible here.
set-versions:
	@# Help: Recursively $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

publish::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-image::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test::
	@# Help: Recursively make $@ in all subdirs
	@$(MAKE) RULE=$@ .recurse

test-src::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

kind-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-load-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

docker-save-image::
	@# Help: Recursively make $@ in all subdirs
	$(MAKE) RULE=$@ .recurse

# The workflow-* targets are declared phony and have neither prerequisites nor
# a recipe in this file, so building them at this level does nothing.
.PHONY: workflow-venv
workflow-venv:

.PHONY: workflow-test
workflow-test:

.PHONY: workflow-upload
workflow-upload:

.PHONY: workflow-build
workflow-build:
4 changes: 2 additions & 2 deletions transforms/universal/html2parquet/python/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@ COPY --chown=dpk:root pyproject.toml pyproject.toml
RUN pip install --no-cache-dir -e .

# copy transform main() entry point to the image
COPY ./src/noop_transform_python.py .
COPY ./src/html2parquet_transform_python.py .

# copy some of the samples in
COPY ./src/noop_local.py local/
COPY ./src/html2parquet_local.py local/

# copy test
COPY test/ test/
Expand Down
6 changes: 5 additions & 1 deletion transforms/universal/html2parquet/python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,11 @@ publish-dist:: .defaults.publish-dist

test-image:: .transforms.python-test-image

run-cli-sample: .transforms.run-cli-python-sample
# Run the transform's CLI entry point against the bundled test data.
# RUN_FILE and RUN_ARGS are handed to a sub-make; RUN_ARGS must be one
# double-quoted shell word, so the inner quotes are escaped and the string is
# closed before the goal name.
# NOTE(review): the goal `.transforms.run-src-file` is not defined in this file;
# it is expected to come from the shared transform make includes -- confirm.
run-cli-sample:
	$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \
		RUN_ARGS=" \
		--data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \
		--data_files_to_use \"['.html','.zip']\" \
		" .transforms.run-src-file

run-local-sample: .transforms.run-local-sample

Expand Down
7 changes: 7 additions & 0 deletions transforms/universal/html2parquet/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,10 @@ The output format will contain the following colums
}
```
## Parameters
The transform can be initialized with the following parameters.

| Parameter | Default | Description |
|------------|----------|--------------|
| `output_format` | `markdown` | The output type for the `contents` column. Valid types are `markdown` and `text`. |

When invoking the CLI, the parameters must be set as `--html2parquet_<name>`, e.g. `--html2parquet_output_format='markdown'`.
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import os

from data_processing.data_access import DataAccessLocal
from html2parquet_transform import HtmlToParquetTransform
from html2parquet_transform import Html2ParquetTransform


# create parameters
Expand All @@ -23,10 +23,10 @@
if __name__ == "__main__":
# Here we show how to run outside of the runtime
# Create and configure the transform.
transform = HtmlToParquetTransform(html2parquet_params)
transform = Html2ParquetTransform(html2parquet_params)
# Use the local data access to read the input HTML file.
data_access = DataAccessLocal()
file_to_process = os.path.join(input_folder, "hmlt_test1.html")
file_to_process = os.path.join(input_folder, "test1.html")
byte_array, _ = data_access.get_file(file_to_process)
print(f"input file: {file_to_process}")
# Transform the table
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

from data_processing.runtime.pure_python import PythonTransformLauncher
from data_processing.utils import ParamsUtils
from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration
from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration


# create parameters
Expand All @@ -41,6 +41,6 @@
# Set the simulated command line args
sys.argv = ParamsUtils.dict_to_req(d=params)
# create launcher
launcher = PythonTransformLauncher(runtime_config=HtmlToParquetPythonTransformConfiguration())
launcher = PythonTransformLauncher(runtime_config=Html2ParquetPythonTransformConfiguration())
# Launch the pure-Python transform to process the input
launcher.launch()
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import enum
import time
from argparse import ArgumentParser, Namespace
from typing import Any
Expand All @@ -20,15 +21,26 @@
from data_processing.utils import CLIArgumentProvider, get_logger, TransformUtils



class Html2ParquetTransform(AbstractBinaryTransform):
def __init__(self, config: dict[str, Any]):
    """
    Initialize the transform and resolve the requested output format.

    :param config: transform configuration. The optional entry stored under
        ``html2parquet_output_format_key`` may be an ``html2parquet_output_format``
        member, a member value (e.g. ``"markdown"``) or a member name
        (e.g. ``"MARKDOWN"``). When absent, ``html2parquet_output_format_default``
        is used.
    :raises KeyError: if a supplied string matches neither a member value nor a member name.
    """
    super().__init__(config)

    requested = config.get(html2parquet_output_format_key, html2parquet_output_format_default)
    try:
        # Lookup by value accepts the documented spellings ("markdown", "text")
        # and returns enum members unchanged.
        self.output_format = html2parquet_output_format(requested)
    except ValueError:
        # Fall back to lookup by member name so existing callers that pass
        # "MARKDOWN" / "TEXT" keep working; unknown names still raise KeyError.
        self.output_format = html2parquet_output_format[requested]

def _convert_html2parquet(self, member_filename:str, file_name:str, content_bytes: bytes) -> dict:
title = member_filename if member_filename else TransformUtils.get_file_basename(file_name)

# Use Trafilatura library
content_string = trafilatura.extract(content_bytes)
if self.output_format == html2parquet_output_format.MARKDOWN:
content_string = trafilatura.extract(content_bytes, output_format="markdown")
elif self.output_format == html2parquet_output_format.TEXT:
content_string = trafilatura.extract(content_bytes)
else:
raise RuntimeError(f"Uknown output_format {self.output_format}.")


if content_string is None:
raise RuntimeError("Failed in converting.")
Expand All @@ -55,7 +67,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
"""
if TransformUtils.get_file_extension(file_name)[1] not in [".zip", ".html"]:
error_message = f"Unsupported file type: {file_name}. Only ZIP and HTML files are supported."
self.logger.error(error_message)
logger.error(error_message)
raise ValueError(error_message) # Raising an exception with the error message
data = []
number_of_rows = 0
Expand All @@ -76,7 +88,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
data.append(row_data)
number_of_rows += 1
except Exception as e:
self.logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")
logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")


# Process single HTML documents
Expand All @@ -92,16 +104,28 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
number_of_rows += 1

except Exception as e:
self.logger.warning(f"Exception {str(e)} processing file {file_name}, skipping")
logger.warning(f"Exception {str(e)} processing file {file_name}, skipping")


table = pa.Table.from_pylist(data)
return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"number of rows": number_of_rows}
return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"nrows": number_of_rows}


logger = get_logger(__name__)

short_name = "html2parquet"
cli_prefix = f"{short_name}_"
html2parquet_output_format_key = f"output_format"

class html2parquet_output_format(str, enum.Enum):
    """Formats available for the extracted ``contents`` text; each member is also a ``str``."""

    MARKDOWN = "markdown"
    TEXT = "text"

    def __str__(self):
        # Render as the bare value ("markdown") rather than the qualified member name.
        return f"{self.value}"

html2parquet_output_format_default = html2parquet_output_format.MARKDOWN
html2parquet_output_format_cli_param = f"{cli_prefix}{html2parquet_output_format_key}"


class Html2ParquetTransformConfiguration(TransformConfiguration):
Expand All @@ -111,7 +135,16 @@ def __init__(self):
transform_class=Html2ParquetTransform,
)
def add_input_params(self, parser: ArgumentParser) -> None:
    """
    Register this transform's command-line options on the supplied parser.

    :param parser: argument parser that receives the
        ``--<html2parquet_output_format_cli_param>`` option.
    """
    parser.add_argument(
        f"--{html2parquet_output_format_cli_param}",
        # argparse converts the raw string through the enum, i.e. by member value.
        type=html2parquet_output_format,
        choices=list(html2parquet_output_format),
        help="Output format for the contents column.",
        # Use the module-level default so the CLI and the constant cannot drift apart.
        default=html2parquet_output_format_default,
    )

def apply_input_params(self, args: Namespace) -> bool:
    """
    Merge the parsed command-line values into ``self.params``.

    :param args: parsed arguments, passed to ``CLIArgumentProvider.capture_parameters``
        together with ``cli_prefix`` (presumably to select this transform's
        options -- confirm against that helper).
    :return: always ``True``.
    """
    # Build a new dict; entries from the captured values win on key clashes.
    self.params = self.params | CLIArgumentProvider.capture_parameters(args, cli_prefix, False)
    logger.info(f"html2parquet parameters are : {self.params}")
    return True
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
PythonTransformRuntimeConfiguration,
)
from data_processing.utils import get_logger
from html2parquet_transform import HtmlToParquetTransformConfiguration
from html2parquet_transform import Html2ParquetTransformConfiguration


logger = get_logger(__name__)


class HtmlToParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
class Html2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
"""
Implements the PythonTransformConfiguration for HTML2PARQUET as required by the PythonTransformLauncher.
"""
Expand All @@ -20,9 +20,9 @@ def __init__(self):
Initialization
:param base_configuration - base configuration class
"""
super().__init__(transform_config=HtmlToParquetTransformConfiguration())
super().__init__(transform_config=Html2ParquetTransformConfiguration())

if __name__ == "__main__":
launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration())
launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
logger.info("Launching html2parquet transform")
launcher.launch()
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
"job name": "html2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-08-13 16:36:02",
"end_time": "2024-08-13 16:36:02",
"start_time": "2024-08-29 16:51:41",
"end_time": "2024-08-29 16:51:41",
"status": "success"
},
"code": null,
Expand All @@ -23,9 +23,9 @@
"source_files": 2,
"source_size": 460391,
"result_files": 2,
"result_size": 14487,
"processing_time": 0.0719749927520752,
"number of rows": 3
"result_size": 13508,
"processing_time": 0.09080028533935547,
"nrows": 3
},
"source": {
"name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/universal/html2parquet/python/test-data/input",
Expand Down
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,10 @@
from data_processing.test_support import get_files_in_folder
from data_processing.test_support.transform import AbstractBinaryTransformTest
from data_processing.utils import TransformUtils
from html2parquet_transform import HtmlToParquetTransform
from html2parquet_transform import Html2ParquetTransform


class TestHtmlToParquetTransform(AbstractBinaryTransformTest):
class TestHtml2ParquetTransform(AbstractBinaryTransformTest):
"""
Extends the super-class to define the test data for the tests defined there.
The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
Expand All @@ -33,7 +33,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
input_dir = os.path.join(basedir, "input")
input_files = get_files_in_folder(input_dir, ".html")
input_files = [(name, binary) for name, binary in input_files.items()]
expected_metadata_list = [{"nrows": 1, "nsuccess": 1, "nfail": 0, "nskip": 0}, {}]
expected_metadata_list = [{"nrows": 1}, {}]

config = {}

expected_files = [
Expand All @@ -47,7 +48,7 @@ def get_test_transform_fixtures(self) -> list[tuple]:
]
return [
(
HtmlToParquetTransform(config),
Html2ParquetTransform(config),
input_files,
expected_files,
expected_metadata_list,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@
from data_processing.test_support.launch.transform_test import (
AbstractTransformLauncherTest,
)
from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration
from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration

class TestPythonHtmlToParquetTransform(AbstractTransformLauncherTest):
class TestPythonHtml2ParquetTransform(AbstractTransformLauncherTest):
"""
Extends the super-class to define the test data for the tests defined there.
The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
Expand All @@ -30,16 +30,23 @@ def get_test_transform_fixtures(self) -> list[tuple]:
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
config = {
"data_files_to_use": ast.literal_eval("['.html','.zip']"),
"html2parquet_output_format": "markdown",
}
# this is added as a fixture to remove these columns from comparison
ignore_columns = ["date_acquired", "document_id", "pdf_convert_time", "hash"]
ignore_columns = ["date_acquired"]

fixtures = []
launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration())
launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
fixtures.append(
(
launcher,
config,
basedir + "/input",
basedir + "/expected",
ignore_columns,

)
)
return fixtures

0 comments on commit dd96ca0

Please sign in to comment.