diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/2206.01062.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/2206.01062.parquet new file mode 100644 index 000000000..cc14dbf7e Binary files /dev/null and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/2206.01062.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/archive1.parquet new file mode 100644 index 000000000..9c3c39df2 Binary files /dev/null and b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/metadata.json new file mode 100644 index 000000000..148d660a1 --- /dev/null +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_no_table/metadata.json @@ -0,0 +1,47 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "pdf2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-07-24 10:10:06", + "end_time": "2024-07-24 10:10:41", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "artifacts_path": null, + "contents_type": "text/markdown", + "do_table_structure": false, + "do_ocr": false, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".pdf", + ".zip" + ] + }, + "job_output_stats": { + "source_files": 2, + "source_size": 8925664, + "result_files": 2, + "result_size": 81129, + "processing_time": 23.67374563217163, + "nrows": 3, + "nsuccess": 3 + }, + "source": { + "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/2206.01062.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/2206.01062.parquet new file mode 100644 index 000000000..3f8e7f632 Binary files /dev/null and b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/2206.01062.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/archive1.parquet b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/archive1.parquet new file mode 100644 index 000000000..dc5ab9cc4 Binary files /dev/null and b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/archive1.parquet differ diff --git a/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/metadata.json b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/metadata.json new file mode 100644 index 000000000..17f83a2c7 --- /dev/null +++ b/transforms/language/pdf2parquet/python/test-data/expected_md_ocr/metadata.json @@ -0,0 +1,47 @@ +{ + "pipeline": "pipeline_id", + "job details": { + "job category": "preprocessing", + "job name": "pdf2parquet", + "job type": "pure python", + "job id": "job_id", + "start_time": "2024-07-24 09:53:24", + "end_time": "2024-07-24 10:06:13", + "status": "success" + }, + "code": { + "github": "github", + "commit_hash": "12345", + "path": "path" + }, + "job_input_params": { + "artifacts_path": null, + "contents_type": "text/markdown", + "do_table_structure": true, + "do_ocr": true, + "checkpointing": false, + "max_files": -1, + "random_samples": -1, + "files_to_use": [ + ".pdf", + ".zip" + ] + }, + "job_output_stats": { + "source_files": 2, + "source_size": 8925664, + "result_files": 2, + "result_size": 87208, + "processing_time": 669.7063181400299, + "nrows": 3, + "nsuccess": 3 + }, + "source": { + "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input", + "type": "path" + }, + "target": { + "name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/output", + "type": "path" + } +} \ No newline at end of file diff --git a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py index 050a3e3a0..cf95362b8 100644 --- a/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py +++ b/transforms/language/pdf2parquet/python/test/test_pdf2parquet_python.py @@ -75,3 +75,63 @@ def get_test_transform_fixtures(self) -> list[tuple]: ) ) return fixtures + +class TestPythonPdf2ParquetWithOcrTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../test-data" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + config = { + "data_files_to_use": ast.literal_eval("['.pdf','.zip']"), + "pdf2parquet_contents_type": "text/markdown", + "pdf2parquet_do_ocr": True, + "pdf2parquet_do_table_structure": True, + } + + fixtures = [] + launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration()) + fixtures.append( + ( + launcher, + config, + basedir + "/input", + basedir + "/expected_md_ocr", + # this is added as a fixture to remove these columns from comparison + ["date_acquired", "document_id", "pdf_convert_time"], + ) + ) + return fixtures + +class TestPythonPdf2ParquetNoTableTransform(AbstractTransformLauncherTest): + """ + Extends the super-class to define the test data for the tests defined there. + The name of this class MUST begin with the word Test so that pytest recognizes it as a test class. + """ + + def get_test_transform_fixtures(self) -> list[tuple]: + basedir = "../test-data" + basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), basedir)) + config = { + "data_files_to_use": ast.literal_eval("['.pdf','.zip']"), + "pdf2parquet_contents_type": "text/markdown", + "pdf2parquet_do_ocr": False, + "pdf2parquet_do_table_structure": False, + } + + fixtures = [] + launcher = PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration()) + fixtures.append( + ( + launcher, + config, + basedir + "/input", + basedir + "/expected_md_no_table", + # this is added as a fixture to remove these columns from comparison + ["date_acquired", "document_id", "pdf_convert_time"], + ) + ) + return fixtures