Merge pull request IBM#524 from IBM/html2parquet

Html2parquet Makefile added
pankajskku · Sep 12, 2024 · dd96ca0 · dd96ca0
2 parents 03cba30 + 39a83b4
commit dd96ca0
Show file tree

Hide file tree

Showing 13 changed files with 149 additions and 31 deletions.
diff --git a/transforms/universal/html2parquet/Makefile b/transforms/universal/html2parquet/Makefile
@@ -0,0 +1,69 @@
+REPOROOT=../../..
+# Use make help, to see the available rules
+include $(REPOROOT)/.make.defaults
+
+# Each rule below passes its own name ($@) as RULE to the .recurse target,
+# which is presumably provided by the included .make.defaults (confirm there).
+
+setup::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+clean::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+build::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+venv::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+image::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+set-versions:
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+publish::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test-image::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test::
+	@# Help: Recursively make $@ in all subdirs
+	@$(MAKE) RULE=$@ .recurse
+
+test-src::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+kind-load-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+docker-load-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+docker-save-image::
+	@# Help: Recursively make $@ in all subdirs
+	$(MAKE) RULE=$@ .recurse
+
+# The workflow-* targets are intentionally empty no-ops at this level; they
+# only need to exist (presumably so repo-wide recursion does not fail - confirm).
+.PHONY: workflow-venv workflow-test workflow-upload workflow-build
+workflow-venv:
+
+workflow-test:
+
+workflow-upload:
+
+workflow-build:
diff --git a/transforms/universal/html2parquet/python/Dockerfile b/transforms/universal/html2parquet/python/Dockerfile
@@ -22,10 +22,10 @@ COPY --chown=dpk:root pyproject.toml pyproject.toml
 RUN pip install --no-cache-dir -e .
 
 # copy transform main() entry point to the image 
-COPY ./src/noop_transform_python.py .
+COPY ./src/html2parquet_transform_python.py .
 
 # copy some of the samples in
-COPY ./src/noop_local.py local/
+COPY ./src/html2parquet_local.py local/
 
 # copy test
 COPY test/ test/

diff --git a/transforms/universal/html2parquet/python/Makefile b/transforms/universal/html2parquet/python/Makefile
@@ -41,7 +41,12 @@ publish-dist:: .defaults.publish-dist
 
 test-image:: .transforms.python-test-image
 
-run-cli-sample: .transforms.run-cli-python-sample
+run-cli-sample:
+	@# Help: Run the python transform CLI on ../test-data/input (.html and .zip files), writing to ../output
+	$(MAKE) RUN_FILE=$(TRANSFORM_NAME)_transform_python.py \
+		RUN_ARGS=" \
+		--data_local_config \" { 'input_folder' : '../test-data/input', 'output_folder' : '../output' } \" \
+		--data_files_to_use \"['.html','.zip']\" " .transforms.run-src-file
 
 run-local-sample: .transforms.run-local-sample
 

diff --git a/transforms/universal/html2parquet/python/README.md b/transforms/universal/html2parquet/python/README.md
@@ -19,3 +19,10 @@ The output format will contain the following colums
 }
 ```
 ## Parameters
+The transform can be initialized with the following parameters.
+
+| Parameter | Default | Description |
+|------------|----------|--------------|
+| `output_format` | `markdown` | The output type for the `contents` column. Valid types are `markdown` and `text`. |
+
+When invoking the CLI, the parameters must be set as `--html2parquet_<name>`, e.g. `--html2parquet_output_format='markdown'`.
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_local.py b/transforms/universal/html2parquet/python/src/html2parquet_local.py
@@ -13,7 +13,7 @@
 import os
 
 from data_processing.data_access import DataAccessLocal
-from html2parquet_transform import HtmlToParquetTransform
+from html2parquet_transform import Html2ParquetTransform
 
 
 # create parameters
@@ -23,10 +23,10 @@
 if __name__ == "__main__":
  # Here we show how to run outside of the runtime
  # Create and configure the transform.
- transform = HtmlToParquetTransform(html2parquet_params)
+ transform = Html2ParquetTransform(html2parquet_params)
  # Use the local data access to read a parquet table.
  data_access = DataAccessLocal()
- file_to_process = os.path.join(input_folder, "hmlt_test1.html")
+ file_to_process = os.path.join(input_folder, "test1.html")
  byte_array, _ = data_access.get_file(file_to_process)
  print(f"input file: {file_to_process}")
  # Transform the table

diff --git a/transforms/universal/html2parquet/python/src/html2parquet_local_python.py b/transforms/universal/html2parquet/python/src/html2parquet_local_python.py
@@ -16,7 +16,7 @@
 
 from data_processing.runtime.pure_python import PythonTransformLauncher
 from data_processing.utils import ParamsUtils
-from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration
+from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration
 
 
 # create parameters
@@ -41,6 +41,6 @@
  # Set the simulated command line args
  sys.argv = ParamsUtils.dict_to_req(d=params)
  # create launcher
- launcher = PythonTransformLauncher(runtime_config=HtmlToParquetPythonTransformConfiguration())
+ launcher = PythonTransformLauncher(runtime_config=Html2ParquetPythonTransformConfiguration())
  # Launch the ray actor(s) to process the input
  launcher.launch()
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_transform.py b/transforms/universal/html2parquet/python/src/html2parquet_transform.py
@@ -1,3 +1,4 @@
+import enum
 import time
 from argparse import ArgumentParser, Namespace
 from typing import Any
@@ -20,15 +21,26 @@
 from data_processing.utils import CLIArgumentProvider, get_logger, TransformUtils
 
 
+
 class Html2ParquetTransform(AbstractBinaryTransform):
  def __init__(self, config: dict[str, Any]):
-     super().__init__(config) 
+     super().__init__(config)
+     # The format may be an enum member or a plain string such as "markdown"; normalize by VALUE (case-insensitive).
+     self.output_format = config.get(html2parquet_output_format_key, html2parquet_output_format_default)
+     if not isinstance(self.output_format, html2parquet_output_format):
+         self.output_format = html2parquet_output_format(str(self.output_format).lower())
 
  def _convert_html2parquet(self, member_filename:str, file_name:str, content_bytes: bytes) -> dict:
  title = member_filename if member_filename else TransformUtils.get_file_basename(file_name)
 
  # Use Trafilatura library
- content_string = trafilatura.extract(content_bytes)
+ if self.output_format == html2parquet_output_format.MARKDOWN:
+     content_string = trafilatura.extract(content_bytes, output_format="markdown")
+ elif self.output_format == html2parquet_output_format.TEXT:
+     content_string = trafilatura.extract(content_bytes)  # no output_format given: library default (plain text per trafilatura docs)
+ else:
+     raise RuntimeError(f"Unknown output_format {self.output_format}.")
+
 
  if content_string is None:
  raise RuntimeError("Failed in converting.")
@@ -55,7 +67,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
  """
  if TransformUtils.get_file_extension(file_name)[1] not in [".zip", ".html"]:
  error_message = f"Unsupported file type: {file_name}. Only ZIP and HTML files are supported."
- self.logger.error(error_message)
+ logger.error(error_message)
  raise ValueError(error_message) # Raising an exception with the error message
  data = []
  number_of_rows = 0
@@ -76,7 +88,7 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
  data.append(row_data)
  number_of_rows += 1
  except Exception as e:
- self.logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")
+ logger.warning(f"Exception {str(e)} processing file {member.filename}, skipping")
 
 
  # Process single HTML documents
@@ -92,16 +104,28 @@ def transform_binary(self, file_name: str, byte_array: bytes) -> tuple[list[tupl
  number_of_rows += 1
 
  except Exception as e:
- self.logger.warning(f"Exception {str(e)} processing file {file_name}, skipping")
+ logger.warning(f"Exception {str(e)} processing file {file_name}, skipping")
 
 
  table = pa.Table.from_pylist(data)
- return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"number of rows": number_of_rows}
+ return [(TransformUtils.convert_arrow_to_binary(table=table), ".parquet")], {"nrows": number_of_rows}
+
 
 logger = get_logger(__name__)
 
 short_name = "html2parquet"
 cli_prefix = f"{short_name}_"
+html2parquet_output_format_key = "output_format"  # config key, without the CLI prefix
+
+class html2parquet_output_format(str, enum.Enum):  # str mixin: members compare equal to their string values
+    MARKDOWN = "markdown"
+    TEXT = "text"
+
+    def __str__(self):  # str(member) yields the bare value, e.g. "markdown"
+        return str(self.value)
+
+html2parquet_output_format_default = html2parquet_output_format.MARKDOWN
+html2parquet_output_format_cli_param = f"{cli_prefix}{html2parquet_output_format_key}"
 
 
 class Html2ParquetTransformConfiguration(TransformConfiguration):
@@ -111,7 +135,16 @@ def __init__(self):
  transform_class=Html2ParquetTransform,
  )
  def add_input_params(self, parser: ArgumentParser) -> None:
-     pass 
+     parser.add_argument(
+         f"--{html2parquet_output_format_cli_param}",
+         type=html2parquet_output_format,  # argparse converts the raw string via the enum's value lookup
+         choices=list(html2parquet_output_format),
+         help="Output format for the contents column.",
+         default=html2parquet_output_format_default,  # single source of truth for the default format
+     )
 
  def apply_input_params(self, args: Namespace) -> bool:
+ captured = CLIArgumentProvider.capture_parameters(args, cli_prefix, False)  # NOTE(review): presumably collects the html2parquet_* args with the prefix stripped - confirm
+ self.params = self.params | captured  # assuming both are dicts, captured values win on key clashes
+ logger.info(f"html2parquet parameters are : {self.params}")
  return True
diff --git a/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py b/transforms/universal/html2parquet/python/src/html2parquet_transform_python.py
@@ -5,13 +5,13 @@
  PythonTransformRuntimeConfiguration,
 )
 from data_processing.utils import get_logger
-from html2parquet_transform import HtmlToParquetTransformConfiguration
+from html2parquet_transform import Html2ParquetTransformConfiguration
 
 
 logger = get_logger(__name__)
 
 
-class HtmlToParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
+class Html2ParquetPythonTransformConfiguration(PythonTransformRuntimeConfiguration):
  """
  Implements the PythonTransformConfiguration for HTML2PARQUET as required by the PythonTransformLauncher.
  """
@@ -20,9 +20,9 @@ def __init__(self):
  Initialization
  :param base_configuration - base configuration class
  """
- super().__init__(transform_config=HtmlToParquetTransformConfiguration())
+ super().__init__(transform_config=Html2ParquetTransformConfiguration())
 
 if __name__ == "__main__":
- launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration())
+ launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
  logger.info("Launching html2parquet transform")
  launcher.launch()
diff --git a/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet b/transforms/universal/html2parquet/python/test-data/expected/html_zip.parquet
diff --git a/transforms/universal/html2parquet/python/test-data/expected/metadata.json b/transforms/universal/html2parquet/python/test-data/expected/metadata.json
@@ -5,8 +5,8 @@
  "job name": "html2parquet",
  "job type": "pure python",
  "job id": "job_id",
- "start_time": "2024-08-13 16:36:02",
- "end_time": "2024-08-13 16:36:02",
+ "start_time": "2024-08-29 16:51:41",
+ "end_time": "2024-08-29 16:51:41",
  "status": "success"
  },
  "code": null,
@@ -23,9 +23,9 @@
  "source_files": 2,
  "source_size": 460391,
  "result_files": 2,
- "result_size": 14487,
- "processing_time": 0.0719749927520752,
- "number of rows": 3
+ "result_size": 13508,
+ "processing_time": 0.09080028533935547,
+ "nrows": 3
  },
  "source": {
  "name": "/Users/sungeunan/Desktop/temp/data-prep-kit/transforms/universal/html2parquet/python/test-data/input",

diff --git a/transforms/universal/html2parquet/python/test-data/expected/test1.parquet b/transforms/universal/html2parquet/python/test-data/expected/test1.parquet
diff --git a/transforms/universal/html2parquet/python/test/test_html2parquet.py b/transforms/universal/html2parquet/python/test/test_html2parquet.py
@@ -16,10 +16,10 @@
 from data_processing.test_support import get_files_in_folder
 from data_processing.test_support.transform import AbstractBinaryTransformTest
 from data_processing.utils import TransformUtils
-from html2parquet_transform import HtmlToParquetTransform
+from html2parquet_transform import Html2ParquetTransform
 
 
-class TestHtmlToParquetTransform(AbstractBinaryTransformTest):
+class TestHtml2ParquetTransform(AbstractBinaryTransformTest):
  """
  Extends the super-class to define the test data for the tests defined there.
  The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
@@ -33,7 +33,8 @@ def get_test_transform_fixtures(self) -> list[tuple]:
  input_dir = os.path.join(basedir, "input")
  input_files = get_files_in_folder(input_dir, ".html")
  input_files = [(name, binary) for name, binary in input_files.items()]
- expected_metadata_list = [{"nrows": 1, "nsuccess": 1, "nfail": 0, "nskip": 0}, {}]
+ expected_metadata_list = [{"nrows": 1}, {}]
+
  config = {}
 
  expected_files = [
@@ -47,7 +48,7 @@ def get_test_transform_fixtures(self) -> list[tuple]:
  ]
  return [
  (
- HtmlToParquetTransform(config),
+ Html2ParquetTransform(config),
  input_files,
  expected_files,
  expected_metadata_list,

diff --git a/transforms/universal/html2parquet/python/test/test_html2parquet_python.py b/transforms/universal/html2parquet/python/test/test_html2parquet_python.py
@@ -17,9 +17,9 @@
 from data_processing.test_support.launch.transform_test import (
  AbstractTransformLauncherTest,
 )
-from html2parquet_transform_python import HtmlToParquetPythonTransformConfiguration
+from html2parquet_transform_python import Html2ParquetPythonTransformConfiguration
 
-class TestPythonHtmlToParquetTransform(AbstractTransformLauncherTest):
+class TestPythonHtml2ParquetTransform(AbstractTransformLauncherTest):
  """
  Extends the super-class to define the test data for the tests defined there.
  The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
@@ -30,16 +30,21 @@ def get_test_transform_fixtures(self) -> list[tuple]:
  basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
  config = {
      "data_files_to_use": ast.literal_eval("['.html','.zip']"),
+     "html2parquet_output_format": "markdown",
  }
+ # Columns passed with the fixture so they are left out of the expected-vs-actual comparison.
+ ignore_columns = ["date_acquired"]
 
  fixtures = []
- launcher = PythonTransformLauncher(HtmlToParquetPythonTransformConfiguration())
+ launcher = PythonTransformLauncher(Html2ParquetPythonTransformConfiguration())
  fixtures.append(
      (
          launcher,
          config,
          basedir + "/input",
          basedir + "/expected",
+         ignore_columns,
      )
  )
  return fixtures
+