Skip to content

Commit

Permalink
Save table prediction in cells format (Unstructured-IO#2892)
Browse files Browse the repository at this point in the history
This pull request makes it possible to return predictions in a raw cell
representation from the table transformer. This will later be used to save
predictions in a cells format for simpler metrics calculation.

This PR has to be merged after
Unstructured-IO/unstructured-inference#335.
  • Loading branch information
plutasnyy authored Apr 25, 2024
1 parent 3843af6 commit df1f7bc
Show file tree
Hide file tree
Showing 14 changed files with 764 additions and 14 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.4-dev2
## 0.13.4-dev3

### Enhancements
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning
Expand All @@ -8,6 +8,7 @@
* **Enable remote chunking via unstructured-ingest** Chunking using unstructured-ingest was
previously limited to local chunking using the strategies `basic` and `by_title`. Remote chunking
options via the API are now accessible.
* **Save table in cells format** `UnstructuredTableTransformerModel` is now able to return the predicted table in cells format.

### Features

Expand Down
4 changes: 0 additions & 4 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,6 @@ anyio==3.7.1
# -c ././deps/constraints.txt
# httpx
# jupyter-server
appnope==0.1.4
# via
# ipykernel
# ipython
argon2-cffi==23.1.0
# via jupyter-server
argon2-cffi-bindings==21.2.0
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pillow_heif
pypdf
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.27
unstructured-inference==0.7.28
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
Expand Down
36 changes: 35 additions & 1 deletion requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ filelock==3.13.4
# huggingface-hub
# torch
# transformers
# triton
flatbuffers==24.3.25
# via onnxruntime
fonttools==4.51.0
Expand Down Expand Up @@ -114,6 +115,37 @@ numpy==1.26.4
# scipy
# torchvision
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.1.105
# via torch
nvidia-cuda-nvrtc-cu12==12.1.105
# via torch
nvidia-cuda-runtime-cu12==12.1.105
# via torch
nvidia-cudnn-cu12==8.9.2.26
# via torch
nvidia-cufft-cu12==11.0.2.54
# via torch
nvidia-curand-cu12==10.3.2.106
# via torch
nvidia-cusolver-cu12==11.4.5.107
# via torch
nvidia-cusparse-cu12==12.1.0.106
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.19.3
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105
# via torch
omegaconf==2.3.0
# via effdet
onnx==1.16.0
Expand Down Expand Up @@ -275,6 +307,8 @@ tqdm==4.66.2
# transformers
transformers==4.40.0
# via unstructured-inference
triton==2.2.0
# via torch
typing-extensions==4.11.0
# via
# -c ./base.txt
Expand All @@ -284,7 +318,7 @@ typing-extensions==4.11.0
# torch
tzdata==2024.1
# via pandas
unstructured-inference==0.7.27
unstructured-inference==0.7.28
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
Expand Down
34 changes: 34 additions & 0 deletions requirements/huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ filelock==3.13.4
# huggingface-hub
# torch
# transformers
# triton
fsspec==2024.3.1
# via
# huggingface-hub
Expand Down Expand Up @@ -54,6 +55,37 @@ numpy==1.26.4
# via
# -c ./base.txt
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.1.105
# via torch
nvidia-cuda-nvrtc-cu12==12.1.105
# via torch
nvidia-cuda-runtime-cu12==12.1.105
# via torch
nvidia-cudnn-cu12==8.9.2.26
# via torch
nvidia-cufft-cu12==11.0.2.54
# via torch
nvidia-curand-cu12==10.3.2.106
# via torch
nvidia-cusolver-cu12==11.4.5.107
# via torch
nvidia-cusparse-cu12==12.1.0.106
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.19.3
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105
# via torch
packaging==23.2
# via
# -c ././deps/constraints.txt
Expand Down Expand Up @@ -100,6 +132,8 @@ tqdm==4.66.2
# transformers
transformers==4.40.0
# via -r ./huggingface.in
triton==2.2.0
# via torch
typing-extensions==4.11.0
# via
# -c ./base.txt
Expand Down
34 changes: 34 additions & 0 deletions requirements/ingest/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ filelock==3.13.4
# huggingface-hub
# torch
# transformers
# triton
frozenlist==1.4.1
# via
# aiohttp
Expand Down Expand Up @@ -98,6 +99,37 @@ numpy==1.26.4
# scipy
# sentence-transformers
# transformers
nvidia-cublas-cu12==12.1.3.1
# via
# nvidia-cudnn-cu12
# nvidia-cusolver-cu12
# torch
nvidia-cuda-cupti-cu12==12.1.105
# via torch
nvidia-cuda-nvrtc-cu12==12.1.105
# via torch
nvidia-cuda-runtime-cu12==12.1.105
# via torch
nvidia-cudnn-cu12==8.9.2.26
# via torch
nvidia-cufft-cu12==11.0.2.54
# via torch
nvidia-curand-cu12==10.3.2.106
# via torch
nvidia-cusolver-cu12==11.4.5.107
# via torch
nvidia-cusparse-cu12==12.1.0.106
# via
# nvidia-cusolver-cu12
# torch
nvidia-nccl-cu12==2.19.3
# via torch
nvidia-nvjitlink-cu12==12.4.127
# via
# nvidia-cusolver-cu12
# nvidia-cusparse-cu12
nvidia-nvtx-cu12==12.1.105
# via torch
orjson==3.10.1
# via langsmith
packaging==23.2
Expand Down Expand Up @@ -168,6 +200,8 @@ tqdm==4.66.2
# transformers
transformers==4.40.0
# via sentence-transformers
triton==2.2.0
# via torch
typing-extensions==4.11.0
# via
# -c ./ingest/../base.txt
Expand Down
33 changes: 33 additions & 0 deletions test_unstructured/metrics/test_table_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
import pytest

from unstructured.metrics.table.table_formats import SimpleTableCell


@pytest.mark.parametrize(
    ("row_nums", "column_nums", "x", "y", "w", "h"),
    [
        # Row indices listed in descending order; several rows and columns.
        ([3, 2, 1], [6, 7], 6, 1, 2, 3),
        # Single row, two columns.
        ([2], [6, 7], 6, 2, 2, 1),
        # Three rows, single column.
        ([1, 2, 3], [20], 20, 1, 1, 3),
        # Single row and single column.
        ([5], [5], 5, 5, 1, 1),
    ],
)
def test_simple_table_cell_parsing_from_table_transformer_when_expected_input(
    row_nums, column_nums, x, y, w, h
):
    """Check that a well-formed table-transformer cell converts to the expected cell.

    Each parametrized case supplies the ``row_nums`` / ``column_nums`` lists of
    the input cell together with the ``x``, ``y``, ``w`` and ``h`` values the
    resulting ``SimpleTableCell`` is expected to carry.
    """
    raw_cell = {"row_nums": row_nums, "column_nums": column_nums, "cell text": "text"}
    parsed_cell = SimpleTableCell.from_table_transformer_cell(raw_cell)
    reference_cell = SimpleTableCell(x=x, y=y, w=w, h=h, content="text")
    assert reference_cell == parsed_cell


def test_simple_table_cell_parsing_from_table_transformer_when_missing_row_nums():
    """A cell whose ``row_nums`` list is empty must be rejected with ``ValueError``."""
    cell_without_rows = {"row_nums": [], "column_nums": [1], "cell text": "text"}
    expected_failure = pytest.raises(
        ValueError, match='has missing values under "row_nums" key'
    )
    with expected_failure:
        SimpleTableCell.from_table_transformer_cell(cell_without_rows)


def test_simple_table_cell_parsing_from_table_transformer_when_missing_column_nums():
    """A cell whose ``column_nums`` list is empty must be rejected with ``ValueError``."""
    cell_without_columns = {"row_nums": [1], "column_nums": [], "cell text": "text"}
    expected_failure = pytest.raises(
        ValueError, match='has missing values under "column_nums" key'
    )
    with expected_failure:
        SimpleTableCell.from_table_transformer_cell(cell_without_columns)
Loading

0 comments on commit df1f7bc

Please sign in to comment.