Add more tests for pdf2parquet_do_ocr and pdf2parquet_do_table_structure

Signed-off-by: Michele Dolfi <[email protected]>
IBM · Jul 25, 2024 · 2fdbf83 · 2fdbf83
1 parent 340a0cc
commit 2fdbf83
Show file tree

Hide file tree

Showing 7 changed files with 154 additions and 0 deletions.
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/2206.01062.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/2206.01062.parquet
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/archive1.parquet
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/metadata.json
@@ -0,0 +1,47 @@
+{
+ "pipeline": "pipeline_id",
+ "job details": {
+ "job category": "preprocessing",
+ "job name": "pdf2parquet",
+ "job type": "pure python",
+ "job id": "job_id",
+ "start_time": "2024-07-24 10:10:06",
+ "end_time": "2024-07-24 10:10:41",
+ "status": "success"
+ },
+ "code": {
+ "github": "github",
+ "commit_hash": "12345",
+ "path": "path"
+ },
+ "job_input_params": {
+ "artifacts_path": null,
+ "contents_type": "text/markdown",
+ "do_table_structure": false,
+ "do_ocr": false,
+ "checkpointing": false,
+ "max_files": -1,
+ "random_samples": -1,
+ "files_to_use": [
+ ".pdf",
+ ".zip"
+ ]
+ },
+ "job_output_stats": {
+ "source_files": 2,
+ "source_size": 8925664,
+ "result_files": 2,
+ "result_size": 81129,
+ "processing_time": 23.67374563217163,
+ "nrows": 3,
+ "nsuccess": 3
+ },
+ "source": {
+ "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
+ "type": "path"
+ },
+ "target": {
+ "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/output",
+ "type": "path"
+ }
+}
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/2206.01062.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/2206.01062.parquet
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/archive1.parquet
diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/metadata.json
@@ -0,0 +1,47 @@
+{
+ "pipeline": "pipeline_id",
+ "job details": {
+ "job category": "preprocessing",
+ "job name": "pdf2parquet",
+ "job type": "pure python",
+ "job id": "job_id",
+ "start_time": "2024-07-24 09:53:24",
+ "end_time": "2024-07-24 10:06:13",
+ "status": "success"
+ },
+ "code": {
+ "github": "github",
+ "commit_hash": "12345",
+ "path": "path"
+ },
+ "job_input_params": {
+ "artifacts_path": null,
+ "contents_type": "text/markdown",
+ "do_table_structure": true,
+ "do_ocr": true,
+ "checkpointing": false,
+ "max_files": -1,
+ "random_samples": -1,
+ "files_to_use": [
+ ".pdf",
+ ".zip"
+ ]
+ },
+ "job_output_stats": {
+ "source_files": 2,
+ "source_size": 8925664,
+ "result_files": 2,
+ "result_size": 87208,
+ "processing_time": 669.7063181400299,
+ "nrows": 3,
+ "nsuccess": 3
+ },
+ "source": {
+ "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
+ "type": "path"
+ },
+ "target": {
+ "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/output",
+ "type": "path"
+ }
+}
diff --git a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py
@@ -75,3 +75,63 @@ def get_test_transform_fixtures(self) -> list[tuple]:
  )
  )
  return fixtures
+
+class TestPythonPdf2ParquetWithOcrTransform(AbstractTransformLauncherTest):
+ """
+ Extends the super-class to define the test data for the tests defined there.
+ The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+ """
+
+ def get_test_transform_fixtures(self) -> list[tuple]:
+ basedir = "../test-data"
+ basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
+ config = {
+ "data_files_to_use": ast.literal_eval("['.pdf','.zip']"),
+ "pdf2parquet_contents_type": "text/markdown",
+ "pdf2parquet_do_ocr": True,
+ "pdf2parquet_do_table_structure": True,
+ }
+
+ fixtures = []
+ launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())
+ fixtures.append(
+ (
+ launcher,
+ config,
+ basedir + "/input",
+ basedir + "/expected_md_ocr",
+ # this is added as a fixture to remove these columns from comparison
+ ["date_acquired", "document_id", "pdf_convert_time"],
+ )
+ )
+ return fixtures
+
+class TestPythonPdf2ParquetNoTableTransform(AbstractTransformLauncherTest):
+ """
+ Extends the super-class to define the test data for the tests defined there.
+ The name of this class MUST begin with the word Test so that pytest recognizes it as a test class.
+ """
+
+ def get_test_transform_fixtures(self) -> list[tuple]:
+ basedir = "../test-data"
+ basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir))
+ config = {
+ "data_files_to_use": ast.literal_eval("['.pdf','.zip']"),
+ "pdf2parquet_contents_type": "text/markdown",
+ "pdf2parquet_do_ocr": False,
+ "pdf2parquet_do_table_structure": False,
+ }
+
+ fixtures = []
+ launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration())
+ fixtures.append(
+ (
+ launcher,
+ config,
+ basedir + "/input",
+ basedir + "/expected_md_no_table",
+ # this is added as a fixture to remove these columns from comparison
+ ["date_acquired", "document_id", "pdf_convert_time"],
+ )
+ )
+ return fixtures