Skip to content

Commit

Permalink
add more tests for do_ocr and pdf2parquet_do_table_structure
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Jul 25, 2024
1 parent 340a0cc commit 2fdbf83
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 0 deletions.
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-24 10:10:06",
"end_time": "2024-07-24 10:10:41",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"artifacts_path": null,
"contents_type": "text/markdown",
"do_table_structure": false,
"do_ocr": false,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".pdf",
".zip"
]
},
"job_output_stats": {
"source_files": 2,
"source_size": 8925664,
"result_files": 2,
"result_size": 81129,
"processing_time": 23.67374563217163,
"nrows": 3,
"nsuccess": 3
},
"source": {
"name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
"type": "path"
},
"target": {
"name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/output",
"type": "path"
}
}
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
{
"pipeline": "pipeline_id",
"job details": {
"job category": "preprocessing",
"job name": "pdf2parquet",
"job type": "pure python",
"job id": "job_id",
"start_time": "2024-07-24 09:53:24",
"end_time": "2024-07-24 10:06:13",
"status": "success"
},
"code": {
"github": "github",
"commit_hash": "12345",
"path": "path"
},
"job_input_params": {
"artifacts_path": null,
"contents_type": "text/markdown",
"do_table_structure": true,
"do_ocr": true,
"checkpointing": false,
"max_files": -1,
"random_samples": -1,
"files_to_use": [
".pdf",
".zip"
]
},
"job_output_stats": {
"source_files": 2,
"source_size": 8925664,
"result_files": 2,
"result_size": 87208,
"processing_time": 669.7063181400299,
"nrows": 3,
"nsuccess": 3
},
"source": {
"name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/test-data/input",
"type": "path"
},
"target": {
"name": "/data/docling-dev/data-prep-kit/transforms/language/pdf2parquet/python/output",
"type": "path"
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,63 @@ def get_test_transform_fixtures(self) -> list[tuple]:
)
)
return fixtures

class TestPythonPdf2ParquetWithOcrTransform(AbstractTransformLauncherTest):
    """
    Launcher test for pdf2parquet with OCR and table-structure recognition both switched on.

    Provides the test data for the tests defined in the super-class.
    The class name MUST start with the word Test so that pytest recognizes it as a test class.
    """

    def get_test_transform_fixtures(self) -> list[tuple]:
        """
        Build the single fixture for the OCR + table-structure run.

        Returns a one-element list whose tuple holds, in order: the launcher,
        the launcher configuration, the input folder, the folder with the
        expected output, and the columns to leave out of the comparison.
        """
        test_data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
        params = {
            "data_files_to_use": ast.literal_eval("['.pdf','.zip']"),
            "pdf2parquet_contents_type": "text/markdown",
            "pdf2parquet_do_ocr": True,
            "pdf2parquet_do_table_structure": True,
        }
        # passed along in the fixture so that these columns are removed from the comparison
        ignored_columns = ["date_acquired", "document_id", "pdf_convert_time"]

        return [
            (
                PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration()),
                params,
                f"{test_data_dir}/input",
                f"{test_data_dir}/expected_md_ocr",
                ignored_columns,
            )
        ]

class TestPythonPdf2ParquetNoTableTransform(AbstractTransformLauncherTest):
    """
    Launcher test for pdf2parquet with OCR and table-structure recognition both switched off.

    Provides the test data for the tests defined in the super-class.
    The class name MUST start with the word Test so that pytest recognizes it as a test class.
    """

    def get_test_transform_fixtures(self) -> list[tuple]:
        """
        Build the single fixture for the run without OCR and without table structure.

        Returns a one-element list whose tuple holds, in order: the launcher,
        the launcher configuration, the input folder, the folder with the
        expected output, and the columns to leave out of the comparison.
        """
        test_data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../test-data"))
        params = {
            "data_files_to_use": ast.literal_eval("['.pdf','.zip']"),
            "pdf2parquet_contents_type": "text/markdown",
            "pdf2parquet_do_ocr": False,
            "pdf2parquet_do_table_structure": False,
        }
        # passed along in the fixture so that these columns are removed from the comparison
        ignored_columns = ["date_acquired", "document_id", "pdf_convert_time"]

        return [
            (
                PythonTransformLauncher(Pdf2ParquetPythonTransformConfiguration()),
                params,
                f"{test_data_dir}/input",
                f"{test_data_dir}/expected_md_no_table",
                ignored_columns,
            )
        ]

0 comments on commit 2fdbf83

Please sign in to comment.