Skip to content

Commit 0bd48a4

Browse files
Parametrize page splitting logic with concurrency level, introduce constants for min and max pages per split (Unstructured-IO#86)
- [x] the default for `split_pdf_page` is `False`, according to [this comment](Unstructured-IO#86 (review)) - [x] number of processes used for handling each batch of PDF pages can be configured (via form data, env variable is no longer being used) - [x] introduced constants for controlling split size: `MIN_PAGES_PER_SPLIT=2` and `MAX_PAGES_PER_SPLIT=20` - [x] the page-splitting mechanism evenly divides pages among workers (processes). - [x] basic tests for the logic mentioned above - [x] regenerated speakeasy client ## How to verify that this PR works ### Unit & Integration Tests `make install && make test` ### Manually ```bash make install pip install --editable . python -m timeit --repeat 10 --verbose "$(cat test-client.py)" ``` Where `test-client.py` has the following contents: ```python import os import sys import unstructured_client from unstructured_client import UnstructuredClient print(unstructured_client.__file__) from unstructured_client.models import shared from unstructured_client.models.errors import SDKError s = UnstructuredClient(api_key_auth=os.environ["UNS_API_KEY"], server_url="http://localhost:8000") filename = "_sample_docs/layout-parser-paper.pdf" with open(filename, "rb") as f: files = shared.Files( content=f.read(), file_name=filename, ) req = shared.PartitionParameters( files=files, strategy="fast", languages=["eng"], split_pdf_page=True, split_pdf_concurrency_level=1, ) resp = s.general.partition(req) ids = [e.element_id for e in resp.elements] print(ids) ```
1 parent 6840aa7 commit 0bd48a4

23 files changed

+1045
-779
lines changed

.github/workflows/ci.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,10 @@ jobs:
5353
python-version: ${{ matrix.python-version }}
5454
- name: Install dependencies
5555
run: make install-test
56-
- name: Run unit tests
56+
- name: Run all tests
5757
run: |
5858
pip install .
59-
pytest _test_unstructured_client/test_integration*.py
59+
make test
6060
env:
6161
UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}
6262

.speakeasy/gen.lock

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
lockVersion: 2.0.0
22
id: 8b5fa338-9106-4734-abf0-e30d67044a90
33
management:
4-
docChecksum: 8c3ba3d80aca4eb8e7b1d6bdcb158ef1
4+
docChecksum: 3956d4428b6275f96d6859db352f202f
55
docVersion: 1.0.30
66
speakeasyVersion: 1.293.1
77
generationVersion: 2.333.3
8-
releaseVersion: 0.25.3
9-
configChecksum: 598c71935a2b9e23cc8460a9a80ce5bf
8+
releaseVersion: 0.26.4
9+
configChecksum: 6d27d7b73b27a82dfd0bbd6198beabcf
1010
repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
1111
repoSubDirectory: .
1212
installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git

Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates
99

1010
.PHONY: install-test
1111
install-test:
12-
pip install pytest requests_mock pypdf deepdiff
12+
pip install pytest requests_mock pypdf deepdiff requests-toolbelt
1313

1414
.PHONY: install-dev
1515
install-dev:

README.md

+52-16
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
<p>Python SDK for the Unstructured API</p>
1414
</h2>
1515

16-
This is a Python client for the [Unstructured API](https://unstructured-io.github.io/unstructured/api.html).
16+
This is a Python client for the [Unstructured API](https://unstructured-io.github.io/unstructured/api.html).
1717

1818
<div align="center">
1919

@@ -33,7 +33,7 @@ pip install unstructured-client
3333
<!-- End SDK Installation [installation] -->
3434

3535
## Usage
36-
Only the `files` parameter is required.
36+
Only the `files` parameter is required.
3737

3838
```python
3939
from unstructured_client import UnstructuredClient
@@ -64,17 +64,17 @@ try:
6464
except SDKError as e:
6565
print(e)
6666
```
67-
67+
6868
Result:
6969

7070
```
7171
{
72-
'type': 'UncategorizedText',
73-
'element_id': 'fc550084fda1e008e07a0356894f5816',
72+
'type': 'UncategorizedText',
73+
'element_id': 'fc550084fda1e008e07a0356894f5816',
7474
'metadata': {
75-
'filename': 'layout-parser-paper-fast.pdf',
76-
'filetype': 'application/pdf',
77-
'languages': ['eng'],
75+
'filename': 'layout-parser-paper-fast.pdf',
76+
'filetype': 'application/pdf',
77+
'languages': ['eng'],
7878
'page_number': 1
7979
}
8080
}
@@ -106,18 +106,24 @@ See the [general partition](/docs/models/shared/partitionparameters.md) page for
106106

107107
#### Splitting PDF by pages
108108

109-
In order to speed up processing of long PDF files, set `split_pdf_page=True`. It will cause the PDF
110-
to be split page-by-page at client side, before sending to API, and combining individual responses
111-
as single result. This will work only for PDF files, so don't set it for other filetypes.
109+
In order to speed up processing of long PDF files, `split_pdf_page` can be set to `True` (defaults to `False`). It will cause the PDF to be split on the client side before being sent to the API, and the individual responses to be combined into a single result. This parameter affects only PDF files; there is no need to disable it for other filetypes.
112110

113111
Warning: this feature causes the `parent_id` metadata generation in elements to be disabled, as that
114112
requires having context of multiple pages.
115113

116-
The amount of threads that will be used for sending individual pdf pages, is controlled by
117-
`UNSTRUCTURED_CLIENT_SPLIT_CALL_THREADS` env var. By default it equals to 5.
118-
It can't be more than 15, to avoid too high resource usage and costs.
114+
The number of workers utilized for splitting PDFs is dictated by the `split_pdf_concurrency_level` parameter, with a default of 5 and a maximum of 15 to keep resource usage and costs in check. The splitting process leverages the `ProcessPoolExecutor` to manage concurrency effectively.
115+
The size of each batch of pages (ranging from 2 to 20) is internally determined based on the concurrency level and the total number of pages in the document.
119116

120-
<!-- No SDK Example Usage -->
117+
Example:
118+
```python
119+
req = shared.PartitionParameters(
120+
files=files,
121+
strategy="fast",
122+
languages=["eng"],
123+
split_pdf_page=True,
124+
split_pdf_concurrency_level=8
125+
)
126+
```
121127
<!-- No SDK Available Operations -->
122128
<!-- No Pagination -->
123129
<!-- No Error Handling -->
@@ -142,6 +148,36 @@ s = unstructured_client.UnstructuredClient(client=http_client)
142148
<!-- No Retries -->
143149
<!-- No Authentication -->
144150

151+
<!-- Start SDK Example Usage [usage] -->
152+
## SDK Example Usage
153+
154+
### Example
155+
156+
```python
157+
import unstructured_client
158+
from unstructured_client.models import operations, shared
159+
160+
s = unstructured_client.UnstructuredClient(
161+
api_key_auth="YOUR_API_KEY",
162+
)
163+
164+
res = s.general.partition(request=operations.PartitionRequest(
165+
partition_parameters=shared.PartitionParameters(
166+
files=shared.Files(
167+
content='0x2cC94b2FEF'.encode(),
168+
file_name='um.shtml',
169+
),
170+
strategy=shared.Strategy.HI_RES,
171+
),
172+
))
173+
174+
if res.elements is not None:
175+
# handle response
176+
pass
177+
178+
```
179+
<!-- End SDK Example Usage [usage] -->
180+
145181
<!-- Placeholder for Future Speakeasy SDK Sections -->
146182

147183
### Maturity
@@ -176,7 +212,7 @@ While we value open-source contributions to this SDK, this library is generated
176212

177213
There are two important files used by `make client-generate`:
178214
1. `openapi.json` which is actually not stored here, [but fetched from unstructured-api](https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/main/openapi.json), represents the API that is supported on backend.
179-
2. `overlay_client.yaml` is a handcrafted diff that when applied over above, produces `openapi_client.json`
215+
2. `overlay_client.yaml` is a handcrafted diff that, when applied over the above, produces `openapi_client.json`
180216
which is used to generate SDK.
181217

182218
Once PR with changes is merged, Github CI will autogenerate the Speakeasy client in a new PR, using

_sample_docs/csv-with-long-lines.csv

+11
Large diffs are not rendered by default.

_sample_docs/emoji.xlsx

4.59 KB
Binary file not shown.

_sample_docs/ideas-page.html

+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
2+
<html><script type="text/javascript">
3+
<!--
4+
(new Image).src="https://store.yahoo.net/cgi-bin/refsd?e=http://paulgraham.com/getideas.html&h=paulgraham.com&v=1.0&dr=" + escape(document.referrer);
5+
-->
6+
</script>
7+
<head><title>How to Get New Ideas</title><!-- <META NAME="ROBOTS" CONTENT="NOODP"> -->
8+
<link rel="shortcut icon" href="http://ycombinator.com/arc/arc.png">
9+
</head><body bgcolor=ffffff background="https://sep.yimg.com/ca/I/paulgraham_2271_0" text=000000 link=000099 vlink=464646><table border=0 cellspacing=0 cellpadding=0><tr valign=top><td><map name=c04963d10de5f><area shape=rect coords="0,0,67,21" href="index.html"><area shape=rect coords="0,21,67,42" href="articles.html"><area shape=rect coords="0,42,67,63" href="http://www.amazon.com/gp/product/0596006624"><area shape=rect coords="0,63,67,84" href="books.html"><area shape=rect coords="0,84,67,105" href="http://ycombinator.com"><area shape=rect coords="0,105,67,126" href="arc.html"><area shape=rect coords="0,126,67,147" href="bel.html"><area shape=rect coords="0,147,67,168" href="lisp.html"><area shape=rect coords="0,168,67,189" href="antispam.html"><area shape=rect coords="0,189,67,210" href="kedrosky.html"><area shape=rect coords="0,210,67,231" href="faq.html"><area shape=rect coords="0,231,67,252" href="raq.html"><area shape=rect coords="0,252,67,273" href="quo.html"><area shape=rect coords="0,273,67,294" href="rss.html"><area shape=rect coords="0,294,67,315" href="bio.html"><area shape=rect coords="0,315,67,336" href="https://twitter.com/paulg"><area shape=rect coords="0,336,67,357" href="https://mas.to/@paulg"></map><img src="https://s.yimg.com/aah/paulgraham/img-20.gif" width=69 height=357 usemap=#c04963d10de5f border=0 hspace=0 vspace=0 ismap></td><td><img src="https://sep.yimg.com/ca/Img/trans_1x1.gif" height=1 width=26 border=0></td><td><a href="index.html"><img src="https://sep.yimg.com/ca/I/paulgraham_2271_3232" width=410 height=45 border=0 hspace=0 vspace=0></a><br><br><table border=0 cellspacing=0 cellpadding=0 width=435><tr valign=top><td width=435><img src="https://s.yimg.com/aah/paulgraham/how-to-get-new-ideas-1.gif" width=176 height=18 border=0 hspace=0 vspace=0 alt="How to Get New Ideas"><br><br><font size=2 face="verdana">January 2023<br><br><i>(<a href="https://twitter.com/stef/status/1617222428727586816"><u>Someone</u></a> fed my essays into GPT 
to make something that could answer
10+
questions based on them, then asked it where good ideas come from. The
11+
answer was ok, but not what I would have said. This is what I would have said.)</i><br><br>The way to get new ideas is to notice anomalies: what seems strange,
12+
or missing, or broken? You can see anomalies in everyday life (much
13+
of standup comedy is based on this), but the best place to look for
14+
them is at the frontiers of knowledge.<br><br>Knowledge grows fractally.
15+
From a distance its edges look smooth, but when you learn enough
16+
to get close to one, you'll notice it's full of gaps. These gaps
17+
will seem obvious; it will seem inexplicable that no one has tried
18+
x or wondered about y. In the best case, exploring such gaps yields
19+
whole new fractal buds.<br><br></font></td></tr></table><table border=0 cellspacing=0 cellpadding=0 width=435><tr><td><font size=2 face="verdana"><br><br><hr></font></td></tr></table></td></tr></table></body>
20+
<script type="text/javascript">
21+
csell_env = 'bf1';
22+
var storeCheckoutDomain = 'order.store.yahoo.net';
23+
</script>
24+
<script type="text/javascript">
25+
// Begin Yahoo Store Generated Code
26+
</script> <script type="text/javascript" src="https://s.turbifycdn.com/lq/ult/ylc_1.9.js" ></script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/lib/smbiz/store/csell/beacon-a9518fc6e4.js" >
27+
</script>
28+
<script type="text/javascript">
29+
// Begin Yahoo Store Generated Code
30+
csell_page_data = {}; csell_page_rec_data = []; ts='TOK_STORE_ID';
31+
</script>
32+
<script type="text/javascript">
33+
// Begin Yahoo Store Generated Code
34+
function csell_GLOBAL_INIT_TAG() { var csell_token_map = {}; csell_token_map['TOK_ITEM_ID_LIST'] = 'getideas'; csell_token_map['TOK_BEACON_TYPE'] = 'prod'; csell_token_map['TOK_RAND_KEY'] = 't'; csell_token_map['TOK_SPACEID'] = '2022276099'; csell_token_map['TOK_IS_ORDERABLE'] = '2'; csell_token_map['TOK_STORE_ID'] = 'paulgraham'; csell_token_map['TOK_URL'] = ''; csell_token_map['TOK_ORDER_HOST'] = 'order.store.yahoo.net'; c = csell_page_data; var x = (typeof storeCheckoutDomain == 'string')?storeCheckoutDomain:'order.store.yahoo.net'; var t = csell_token_map; c['s'] = t['TOK_SPACEID']; c['url'] = t['TOK_URL']; c['si'] = t[ts]; c['ii'] = t['TOK_ITEM_ID_LIST']; c['bt'] = t['TOK_BEACON_TYPE']; c['rnd'] = t['TOK_RAND_KEY']; c['io'] = t['TOK_IS_ORDERABLE']; YStore.addItemUrl = 'http%s://'+x+'/'+t[ts]+'/ymix/MetaController.html?eventName.addEvent&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_itemId=%s&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_quantity=1&ysco_key_cs_item=1&sectionId=ysco.cart&ysco_key_store_id='+t[ts]; }
35+
</script>
36+
<script type="text/javascript">
37+
// Begin Yahoo Store Generated Code
38+
function csell_REC_VIEW_TAG() { var env = (typeof csell_env == 'string')?csell_env:'prod'; var p = csell_page_data; var a = '/sid='+p['si']+'/io='+p['io']+'/ii='+p['ii']+'/bt='+p['bt']+'-view'+'/en='+env; var r=Math.random(); YStore.CrossSellBeacon.renderBeaconWithRecData(p['url']+'/p/s='+p['s']+'/'+p['rnd']+'='+r+a); }
39+
</script>
40+
<script type="text/javascript">
41+
// Begin Yahoo Store Generated Code
42+
var csell_token_map = {}; csell_token_map['TOK_PAGE'] = 'p'; csell_token_map['TOK_WS_URL'] = 'https://paulgraham.csell.store.yahoo.net/cs/recommend?itemids=getideas&location=p'; csell_token_map['TOK_SHOW_CS_RECS'] = 'false'; csell_token_map['TOK_CURR_SYM'] = '$'; var t = csell_token_map; csell_GLOBAL_INIT_TAG(); YStore.page = t['TOK_PAGE']; YStore.currencySymbol = t['TOK_CURR_SYM']; YStore.crossSellUrl = t['TOK_WS_URL']; YStore.showCSRecs = t['TOK_SHOW_CS_RECS']; </script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/store/secure/recs-1.3.2.2.js" ></script> <script type="text/javascript" >
43+
</script>
44+
</html>

_test_unstructured_client/test__decorators.py renamed to _test_unstructured_client/integration/test_decorators.py

+19-24
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,41 @@
11
import os
2+
23
import pytest
34
import requests
45
from deepdiff import DeepDiff
5-
66
from unstructured_client import UnstructuredClient
77
from unstructured_client.models import shared
88
from unstructured_client.models.errors import HTTPValidationError
99

1010
FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
1111

1212

13-
@pytest.mark.parametrize("call_threads", [1, 2, 5])
13+
@pytest.mark.parametrize("concurrency_level", [1, 2, 5])
1414
@pytest.mark.parametrize(
15-
"filename, expected_ok",
15+
("filename", "expected_ok", "strategy"),
1616
[
17-
("_sample_docs/list-item-example-1.pdf", True), # 1 page
18-
("_sample_docs/layout-parser-paper-fast.pdf", True), # 2 pages
19-
("_sample_docs/layout-parser-paper.pdf", True), # 16 pages
20-
("_sample_docs/fake.doc", True),
21-
(
22-
"_sample_docs/fake.doc",
23-
False,
24-
), # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
17+
("_sample_docs/list-item-example-1.pdf", True, "fast"), # 1 page
18+
("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"), # 2 pages
19+
# NOTE(mike): using "fast" strategy fails on this file for unknown reasons
20+
("_sample_docs/layout-parser-paper.pdf", True, "hi_res"), # 16 pages
21+
("_sample_docs/fake.doc", False, "fast"),
22+
("_sample_docs/emoji.xlsx", True, "fast"),
23+
("_sample_docs/csv-with-long-lines.csv", False, "fast"),
24+
("_sample_docs/ideas-page.html", False, "fast"),
2525
],
2626
)
2727
def test_integration_split_pdf_has_same_output_as_non_split(
28-
call_threads: int, filename: str, expected_ok: bool, caplog
28+
concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog
2929
):
3030
"""
3131
Tests that output that we get from the split-by-page pdf is the same as from non-split.
3232
3333
Requires unstructured-api running in bg. See Makefile for how to run it.
34-
Doesn't check for raw_response as there's no clear patter for how it changes with the number of pages / call_threads.
34+
Doesn't check for raw_response as there's no clear pattern for how it changes with the number of pages / concurrency_level.
3535
"""
3636
try:
3737
response = requests.get("http://localhost:8000/general/docs")
38-
assert (
39-
response.status_code == 200
40-
), "The unstructured-api is not running on localhost:8000"
38+
assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
4139
except requests.exceptions.ConnectionError:
4240
assert False, "The unstructured-api is not running on localhost:8000"
4341

@@ -50,22 +48,22 @@ def test_integration_split_pdf_has_same_output_as_non_split(
5048
)
5149

5250
if not expected_ok:
51+
# Append .pdf to the filename to fool the first line of filetype detection and simulate a decoding error
5352
files.file_name += ".pdf"
5453

5554
req = shared.PartitionParameters(
5655
files=files,
57-
strategy="fast",
56+
strategy=strategy,
5857
languages=["eng"],
5958
split_pdf_page=True,
59+
split_pdf_concurrency_level=concurrency_level,
6060
)
6161

62-
os.environ["UNSTRUCTURED_CLIENT_SPLIT_CALL_THREADS"] = str(call_threads)
63-
6462
try:
6563
resp_split = client.general.partition(req)
6664
except (HTTPValidationError, AttributeError) as exc:
6765
if not expected_ok:
68-
assert "error arose when splitting by pages" in caplog.text
66+
assert "The file does not appear to be a valid PDF." in caplog.text
6967
assert "File does not appear to be a valid PDF" in str(exc)
7068
return
7169
else:
@@ -78,7 +76,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
7876
assert resp_split.content_type == resp_single.content_type
7977
assert resp_split.status_code == resp_single.status_code
8078

81-
# Difference in the parent_id is expected, because parent_ids are assigned when element crosses page boundary
8279
diff = DeepDiff(
8380
t1=resp_split.elements,
8481
t2=resp_single.elements,
@@ -95,9 +92,7 @@ def test_integration_split_pdf_for_file_with_no_name():
9592
"""
9693
try:
9794
response = requests.get("http://localhost:8000/general/docs")
98-
assert (
99-
response.status_code == 200
100-
), "The unstructured-api is not running on localhost:8000"
95+
assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
10196
except requests.exceptions.ConnectionError:
10297
assert False, "The unstructured-api is not running on localhost:8000"
10398

_test_unstructured_client/test_integration_freemium.py renamed to _test_unstructured_client/integration/test_integration_freemium.py

+2-4
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
import os
22
from pathlib import Path
3-
import requests
43

54
import pytest
6-
5+
import requests
76
from unstructured_client import UnstructuredClient
87
from unstructured_client.models import shared
98

@@ -16,7 +15,7 @@ def client() -> UnstructuredClient:
1615

1716
@pytest.fixture(scope="module")
1817
def doc_path() -> Path:
19-
return Path(__file__).resolve().parent.parent / "_sample_docs"
18+
return Path(__file__).resolve().parents[2] / "_sample_docs"
2019

2120

2221
@pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"])
@@ -37,4 +36,3 @@ def test_partition_strategies(strategy, client, doc_path):
3736
response = client.general.partition(req)
3837
assert response.status_code == 200
3938
assert len(response.elements)
40-

0 commit comments

Comments
 (0)