Save table prediction in cells format (Unstructured-IO#2892)

This pull request allows the table transformer to return predictions in a raw cell representation. This will later be used to save predictions in a cells format for simpler metrics calculation. This PR must be merged after Unstructured-IO/unstructured-inference#335.
jiangquan8 · Apr 25, 2024 · df1f7bc · df1f7bc
1 parent 3843af6
commit df1f7bc
Show file tree

Hide file tree

Showing 14 changed files with 764 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.13.4-dev2
+## 0.13.4-dev3
 
 ### Enhancements
 * **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
@@ -8,6 +8,7 @@
 * **Enable remote chunking via unstructured-ingest** Chunking using unstructured-ingest was
   previously limited to local chunking using the strategies `basic` and `by_title`. Remote chunking
   options via the API are now accessible.
+* **Save table in cells format** `UnstructuredTableTransformerModel` can now return the predicted table in cells format.
 
 ### Features
 

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -9,10 +9,6 @@ anyio==3.7.1
     #   -c ././deps/constraints.txt
     #   httpx
     #   jupyter-server
-appnope==0.1.4
-    # via
-    #   ipykernel
-    #   ipython
 argon2-cffi==23.1.0
     # via jupyter-server
 argon2-cffi-bindings==21.2.0

diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -9,7 +9,7 @@ pillow_heif
 pypdf
 # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
 # when unstructured library is.
-unstructured-inference==0.7.27
+unstructured-inference==0.7.28
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12

diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -37,6 +37,7 @@ filelock==3.13.4
     #   huggingface-hub
     #   torch
     #   transformers
+    #   triton
 flatbuffers==24.3.25
     # via onnxruntime
 fonttools==4.51.0
@@ -114,6 +115,37 @@ numpy==1.26.4
     #   scipy
     #   torchvision
     #   transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.19.3
+    # via torch
+nvidia-nvjitlink-cu12==12.4.127
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 omegaconf==2.3.0
     # via effdet
 onnx==1.16.0
@@ -275,6 +307,8 @@ tqdm==4.66.2
     #   transformers
 transformers==4.40.0
     # via unstructured-inference
+triton==2.2.0
+    # via torch
 typing-extensions==4.11.0
     # via
     #   -c ./base.txt
@@ -284,7 +318,7 @@ typing-extensions==4.11.0
     #   torch
 tzdata==2024.1
     # via pandas
-unstructured-inference==0.7.27
+unstructured-inference==0.7.28
     # via -r ./extra-pdf-image.in
 unstructured-pytesseract==0.3.12
     # via

diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt
@@ -22,6 +22,7 @@ filelock==3.13.4
     #   huggingface-hub
     #   torch
     #   transformers
+    #   triton
 fsspec==2024.3.1
     # via
     #   huggingface-hub
@@ -54,6 +55,37 @@ numpy==1.26.4
     # via
     #   -c ./base.txt
     #   transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.19.3
+    # via torch
+nvidia-nvjitlink-cu12==12.4.127
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 packaging==23.2
     # via
     #   -c ././deps/constraints.txt
@@ -100,6 +132,8 @@ tqdm==4.66.2
     #   transformers
 transformers==4.40.0
     # via -r ./huggingface.in
+triton==2.2.0
+    # via torch
 typing-extensions==4.11.0
     # via
     #   -c ./base.txt

diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
@@ -32,6 +32,7 @@ filelock==3.13.4
     #   huggingface-hub
     #   torch
     #   transformers
+    #   triton
 frozenlist==1.4.1
     # via
     #   aiohttp
@@ -98,6 +99,37 @@ numpy==1.26.4
     #   scipy
     #   sentence-transformers
     #   transformers
+nvidia-cublas-cu12==12.1.3.1
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.1.105
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.1.105
+    # via torch
+nvidia-cuda-runtime-cu12==12.1.105
+    # via torch
+nvidia-cudnn-cu12==8.9.2.26
+    # via torch
+nvidia-cufft-cu12==11.0.2.54
+    # via torch
+nvidia-curand-cu12==10.3.2.106
+    # via torch
+nvidia-cusolver-cu12==11.4.5.107
+    # via torch
+nvidia-cusparse-cu12==12.1.0.106
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-nccl-cu12==2.19.3
+    # via torch
+nvidia-nvjitlink-cu12==12.4.127
+    # via
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+nvidia-nvtx-cu12==12.1.105
+    # via torch
 orjson==3.10.1
     # via langsmith
 packaging==23.2
@@ -168,6 +200,8 @@ tqdm==4.66.2
     #   transformers
 transformers==4.40.0
     # via sentence-transformers
+triton==2.2.0
+    # via torch
 typing-extensions==4.11.0
     # via
     #   -c ./ingest/../base.txt

diff --git a/test_unstructured/metrics/test_table_formats.py b/test_unstructured/metrics/test_table_formats.py
@@ -0,0 +1,33 @@
+import pytest
+
+from unstructured.metrics.table.table_formats import SimpleTableCell
+
+
+@pytest.mark.parametrize(
+    ("row_nums", "column_nums", "x", "y", "w", "h"),
+    [
+        ([3, 2, 1], [6, 7], 6, 1, 2, 3),
+        ([2], [6, 7], 6, 2, 2, 1),
+        ([1, 2, 3], [20], 20, 1, 1, 3),
+        ([5], [5], 5, 5, 1, 1),
+    ],
+)
+def test_simple_table_cell_parsing_from_table_transformer_when_expected_input(
+    row_nums, column_nums, x, y, w, h
+):
+    table_transformer_cell = {"row_nums": row_nums, "column_nums": column_nums, "cell text": "text"}
+    transformed_cell = SimpleTableCell.from_table_transformer_cell(table_transformer_cell)
+    expected_cell = SimpleTableCell(x=x, y=y, w=w, h=h, content="text")
+    assert expected_cell == transformed_cell
+
+
+def test_simple_table_cell_parsing_from_table_transformer_when_missing_row_nums():
+    cell = {"row_nums": [], "column_nums": [1], "cell text": "text"}
+    with pytest.raises(ValueError, match='has missing values under "row_nums" key'):
+        SimpleTableCell.from_table_transformer_cell(cell)
+
+
+def test_simple_table_cell_parsing_from_table_transformer_when_missing_column_nums():
+    cell = {"row_nums": [1], "column_nums": [], "cell text": "text"}
+    with pytest.raises(ValueError, match='has missing values under "column_nums" key'):
+        SimpleTableCell.from_table_transformer_cell(cell)