EmbeddedLLM
diff --git a/‎.github/workflows/ci.yaml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/ci.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/‎.speakeasy/gen.lock
Lines changed: 3 additions & 3 deletions b/‎.speakeasy/gen.lock
Lines changed: 3 additions & 3 deletions
diff --git a/‎Makefile
Lines changed: 1 addition & 1 deletion b/‎Makefile
Lines changed: 1 addition & 1 deletion
diff --git a/‎README.md
Lines changed: 52 additions & 16 deletions b/‎README.md
Lines changed: 52 additions & 16 deletions
diff --git a/‎_sample_docs/csv-with-long-lines.csv
Lines changed: 11 additions & 0 deletions b/‎_sample_docs/csv-with-long-lines.csv
Lines changed: 11 additions & 0 deletions
diff --git a/‎_sample_docs/emoji.xlsx
4.59 KB b/‎_sample_docs/emoji.xlsx
4.59 KB
diff --git a/‎_sample_docs/ideas-page.html
Lines changed: 44 additions & 0 deletions b/‎_sample_docs/ideas-page.html
Lines changed: 44 additions & 0 deletions
diff --git a/‎_test_unstructured_client/test__decorators.py renamed to ‎_test_unstructured_client/integration/test_decorators.py
Lines changed: 19 additions & 24 deletions b/‎_test_unstructured_client/test__decorators.py renamed to ‎_test_unstructured_client/integration/test_decorators.py
Lines changed: 19 additions & 24 deletions
diff --git a/‎_test_unstructured_client/test_integration_freemium.py renamed to ‎_test_unstructured_client/integration/test_integration_freemium.py
Lines changed: 2 additions & 4 deletions b/‎_test_unstructured_client/test_integration_freemium.py renamed to ‎_test_unstructured_client/integration/test_integration_freemium.py
Lines changed: 2 additions & 4 deletions
@@ -53,10 +53,10 @@ jobs:
         python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: make install-test
-    - name: Run unit tests
+    - name: Run all tests
       run: |
         pip install  .
-        pytest _test_unstructured_client/test_integration*.py
+        make test
       env:
         UNSTRUCTURED_API_KEY: ${{ secrets.UNSTRUCTURED_API_KEY }}
 
@@ -1,12 +1,12 @@
 lockVersion: 2.0.0
 id: 8b5fa338-9106-4734-abf0-e30d67044a90
 management:
-  docChecksum: 8c3ba3d80aca4eb8e7b1d6bdcb158ef1
+  docChecksum: 3956d4428b6275f96d6859db352f202f
   docVersion: 1.0.30
   speakeasyVersion: 1.293.1
   generationVersion: 2.333.3
-  releaseVersion: 0.25.3
-  configChecksum: 598c71935a2b9e23cc8460a9a80ce5bf
+  releaseVersion: 0.26.4
+  configChecksum: 6d27d7b73b27a82dfd0bbd6198beabcf
   repoURL: https://github.com/Unstructured-IO/unstructured-python-client.git
   repoSubDirectory: .
   installationURL: https://github.com/Unstructured-IO/unstructured-python-client.git
 
@@ -9,7 +9,7 @@ DOCKER_IMAGE ?= downloads.unstructured.io/unstructured-io/unstructured-api:lates
 
 .PHONY: install-test
 install-test:
-	pip install pytest requests_mock pypdf deepdiff
+	pip install pytest requests_mock pypdf deepdiff requests-toolbelt
 
 .PHONY: install-dev
 install-dev:
 
@@ -13,7 +13,7 @@
   <p>Python SDK for the Unstructured API</p>
 </h2>
 
-This is a Python client for the [Unstructured API](https://unstructured-io.github.io/unstructured/api.html). 
+This is a Python client for the [Unstructured API](https://unstructured-io.github.io/unstructured/api.html).
 
 <div align="center">
 
@@ -33,7 +33,7 @@ pip install unstructured-client
 <!-- End SDK Installation [installation] -->
 
 ## Usage
-Only the `files` parameter is required. 
+Only the `files` parameter is required.
 
 ```python
 from unstructured_client import UnstructuredClient
@@ -64,17 +64,17 @@ try:
 except SDKError as e:
     print(e)
 ```
-    
+
 Result:
 
 ```
 {
-'type': 'UncategorizedText', 
-'element_id': 'fc550084fda1e008e07a0356894f5816', 
+'type': 'UncategorizedText',
+'element_id': 'fc550084fda1e008e07a0356894f5816',
 'metadata': {
-  'filename': 'layout-parser-paper-fast.pdf', 
-  'filetype': 'application/pdf', 
-  'languages': ['eng'], 
+  'filename': 'layout-parser-paper-fast.pdf',
+  'filetype': 'application/pdf',
+  'languages': ['eng'],
   'page_number': 1
   }
 }
@@ -106,18 +106,24 @@ See the [general partition](/docs/models/shared/partitionparameters.md) page for
 
 #### Splitting PDF by pages
 
-In order to speed up processing of long PDF files, set `split_pdf_page=True`. It will cause the PDF
-to be split page-by-page at client side, before sending to API, and combining individual responses
-as single result. This will work only for PDF files, so don't set it for other filetypes.
+To speed up processing of long PDF files, `split_pdf_page` can be set to `True` (defaults to `False`). This causes the PDF to be split on the client side before being sent to the API, and the individual responses to be combined into a single result. This parameter affects only PDF files, so there is no need to disable it for other filetypes.
 
 Warning: this feature causes the `parent_id` metadata generation in elements to be disabled, as that
 requires having context of multiple pages.
 
-The amount of threads that will be used for sending individual pdf pages, is controlled by
-`UNSTRUCTURED_CLIENT_SPLIT_CALL_THREADS` env var. By default it equals to 5. 
-It can't be more than 15, to avoid too high resource usage and costs.
+The number of workers used for splitting PDFs is controlled by the `split_pdf_concurrency_level` parameter, which defaults to 5 and is capped at 15 to keep resource usage and costs in check. The splitting process uses the `ProcessPoolExecutor` to manage concurrency.
+The size of each batch of pages (ranging from 2 to 20) is determined internally based on the concurrency level and the total number of pages in the document.
 
-<!-- No SDK Example Usage -->
+Example:
+```python
+req = shared.PartitionParameters(
+    files=files,
+    strategy="fast",
+    languages=["eng"],
+    split_pdf_page=True,
+    split_pdf_concurrency_level=8
+)
+```
 <!-- No SDK Available Operations -->
 <!-- No Pagination -->
 <!-- No Error Handling -->
@@ -142,6 +148,36 @@ s = unstructured_client.UnstructuredClient(client=http_client)
 <!-- No Retries -->
 <!-- No Authentication -->
 
+<!-- Start SDK Example Usage [usage] -->
+## SDK Example Usage
+
+### Example
+
+```python
+import unstructured_client
+from unstructured_client.models import operations, shared
+
+s = unstructured_client.UnstructuredClient(
+    api_key_auth="YOUR_API_KEY",
+)
+
+res = s.general.partition(request=operations.PartitionRequest(
+    partition_parameters=shared.PartitionParameters(
+        files=shared.Files(
+            content='0x2cC94b2FEF'.encode(),
+            file_name='um.shtml',
+        ),
+        strategy=shared.Strategy.HI_RES,
+    ),
+))
+
+if res.elements is not None:
+    # handle response
+    pass
+
+```
+<!-- End SDK Example Usage [usage] -->
+
 <!-- Placeholder for Future Speakeasy SDK Sections -->
 
 ### Maturity
@@ -176,7 +212,7 @@ While we value open-source contributions to this SDK, this library is generated
 
 There are two important files used by `make client-generate`:
 1. `openapi.json` which is actually not stored here, [but fetched from unstructured-api](https://raw.githubusercontent.com/Unstructured-IO/unstructured-api/main/openapi.json), represents the API that is supported on backend.
-2. `overlay_client.yaml` is a handcrafted diff that when applied over above, produces `openapi_client.json` 
+2. `overlay_client.yaml` is a handcrafted diff that, when applied over the above, produces `openapi_client.json`
    which is used to generate SDK.
 
 Once PR with changes is merged, Github CI will autogenerate the Speakeasy client in a new PR, using
 
@@ -0,0 +1,44 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
+<html><script type="text/javascript"> 
+ <!-- 
+ (new Image).src="https://store.yahoo.net/cgi-bin/refsd?e=http://paulgraham.com/getideas.html&h=paulgraham.com&v=1.0&dr=" + escape(document.referrer); 
+ --> 
+ </script>
+<head><title>How to Get New Ideas</title><!-- <META NAME="ROBOTS" CONTENT="NOODP"> -->
+<link rel="shortcut icon" href="http://ycombinator.com/arc/arc.png">
+</head><body bgcolor=ffffff background="https://sep.yimg.com/ca/I/paulgraham_2271_0" text=000000 link=000099 vlink=464646><table border=0 cellspacing=0 cellpadding=0><tr valign=top><td><map name=c04963d10de5f><area shape=rect coords="0,0,67,21" href="index.html"><area shape=rect coords="0,21,67,42" href="articles.html"><area shape=rect coords="0,42,67,63" href="http://www.amazon.com/gp/product/0596006624"><area shape=rect coords="0,63,67,84" href="books.html"><area shape=rect coords="0,84,67,105" href="http://ycombinator.com"><area shape=rect coords="0,105,67,126" href="arc.html"><area shape=rect coords="0,126,67,147" href="bel.html"><area shape=rect coords="0,147,67,168" href="lisp.html"><area shape=rect coords="0,168,67,189" href="antispam.html"><area shape=rect coords="0,189,67,210" href="kedrosky.html"><area shape=rect coords="0,210,67,231" href="faq.html"><area shape=rect coords="0,231,67,252" href="raq.html"><area shape=rect coords="0,252,67,273" href="quo.html"><area shape=rect coords="0,273,67,294" href="rss.html"><area shape=rect coords="0,294,67,315" href="bio.html"><area shape=rect coords="0,315,67,336" href="https://twitter.com/paulg"><area shape=rect coords="0,336,67,357" href="https://mas.to/@paulg"></map><img src="https://s.yimg.com/aah/paulgraham/img-20.gif" width=69 height=357 usemap=#c04963d10de5f border=0 hspace=0 vspace=0 ismap></td><td><img src="https://sep.yimg.com/ca/Img/trans_1x1.gif" height=1 width=26 border=0></td><td><a href="index.html"><img src="https://sep.yimg.com/ca/I/paulgraham_2271_3232" width=410 height=45 border=0 hspace=0 vspace=0></a><br><br><table border=0 cellspacing=0 cellpadding=0 width=435><tr valign=top><td width=435><img src="https://s.yimg.com/aah/paulgraham/how-to-get-new-ideas-1.gif" width=176 height=18 border=0 hspace=0 vspace=0 alt="How to Get New Ideas"><br><br><font size=2 face="verdana">January 2023<br><br><i>(<a href="https://twitter.com/stef/status/1617222428727586816"><u>Someone</u></a> fed my essays into GPT 
to make something that could answer
+questions based on them, then asked it where good ideas come from.  The
+answer was ok, but not what I would have said. This is what I would have said.)</i><br><br>The way to get new ideas is to notice anomalies: what seems strange,
+or missing, or broken? You can see anomalies in everyday life (much
+of standup comedy is based on this), but the best place to look for
+them is at the frontiers of knowledge.<br><br>Knowledge grows fractally.
+From a distance its edges look smooth, but when you learn enough
+to get close to one, you'll notice it's full of gaps. These gaps
+will seem obvious; it will seem inexplicable that no one has tried
+x or wondered about y. In the best case, exploring such gaps yields
+whole new fractal buds.<br><br></font></td></tr></table><table border=0 cellspacing=0 cellpadding=0 width=435><tr><td><font size=2 face="verdana"><br><br><hr></font></td></tr></table></td></tr></table></body>
+<script type="text/javascript">
+csell_env = 'bf1';
+ var storeCheckoutDomain = 'order.store.yahoo.net';
+</script>
+<script type="text/javascript">
+// Begin Yahoo Store Generated Code
+ </script> <script type="text/javascript" src="https://s.turbifycdn.com/lq/ult/ylc_1.9.js" ></script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/lib/smbiz/store/csell/beacon-a9518fc6e4.js" >
+</script>
+<script type="text/javascript">
+// Begin Yahoo Store Generated Code
+ csell_page_data = {}; csell_page_rec_data = []; ts='TOK_STORE_ID';
+</script>
+<script type="text/javascript">
+// Begin Yahoo Store Generated Code
+function csell_GLOBAL_INIT_TAG() { var csell_token_map = {}; csell_token_map['TOK_ITEM_ID_LIST'] = 'getideas'; csell_token_map['TOK_BEACON_TYPE'] = 'prod'; csell_token_map['TOK_RAND_KEY'] = 't'; csell_token_map['TOK_SPACEID'] = '2022276099'; csell_token_map['TOK_IS_ORDERABLE'] = '2'; csell_token_map['TOK_STORE_ID'] = 'paulgraham'; csell_token_map['TOK_URL'] = ''; csell_token_map['TOK_ORDER_HOST'] = 'order.store.yahoo.net';  c = csell_page_data; var x = (typeof storeCheckoutDomain == 'string')?storeCheckoutDomain:'order.store.yahoo.net'; var t = csell_token_map; c['s'] = t['TOK_SPACEID']; c['url'] = t['TOK_URL']; c['si'] = t[ts]; c['ii'] = t['TOK_ITEM_ID_LIST']; c['bt'] = t['TOK_BEACON_TYPE']; c['rnd'] = t['TOK_RAND_KEY']; c['io'] = t['TOK_IS_ORDERABLE']; YStore.addItemUrl = 'http%s://'+x+'/'+t[ts]+'/ymix/MetaController.html?eventName.addEvent&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_itemId=%s&cartDS.shoppingcart_ROW0_m_orderItemVector_ROW0_m_quantity=1&ysco_key_cs_item=1&sectionId=ysco.cart&ysco_key_store_id='+t[ts]; } 
+</script>
+<script type="text/javascript">
+// Begin Yahoo Store Generated Code
+function csell_REC_VIEW_TAG() {  var env = (typeof csell_env == 'string')?csell_env:'prod'; var p = csell_page_data; var a = '/sid='+p['si']+'/io='+p['io']+'/ii='+p['ii']+'/bt='+p['bt']+'-view'+'/en='+env; var r=Math.random(); YStore.CrossSellBeacon.renderBeaconWithRecData(p['url']+'/p/s='+p['s']+'/'+p['rnd']+'='+r+a); } 
+</script>
+<script type="text/javascript">
+// Begin Yahoo Store Generated Code
+var csell_token_map = {}; csell_token_map['TOK_PAGE'] = 'p'; csell_token_map['TOK_WS_URL'] = 'https://paulgraham.csell.store.yahoo.net/cs/recommend?itemids=getideas&location=p'; csell_token_map['TOK_SHOW_CS_RECS'] = 'false'; csell_token_map['TOK_CURR_SYM'] = '$';  var t = csell_token_map; csell_GLOBAL_INIT_TAG(); YStore.page = t['TOK_PAGE']; YStore.currencySymbol = t['TOK_CURR_SYM']; YStore.crossSellUrl = t['TOK_WS_URL']; YStore.showCSRecs = t['TOK_SHOW_CS_RECS']; </script> <script type="text/javascript" src="https://s.turbifycdn.com/ae/store/secure/recs-1.3.2.2.js" ></script> <script type="text/javascript" >
+</script>
+</html>
@@ -1,43 +1,41 @@
 import os
+
 import pytest
 import requests
 from deepdiff import DeepDiff
-
 from unstructured_client import UnstructuredClient
 from unstructured_client.models import shared
 from unstructured_client.models.errors import HTTPValidationError
 
 FAKE_KEY = "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"
 
 
-@pytest.mark.parametrize("call_threads", [1, 2, 5])
+@pytest.mark.parametrize("concurrency_level", [1, 2, 5])
 @pytest.mark.parametrize(
-    "filename, expected_ok",
+    ("filename", "expected_ok", "strategy"),
     [
-        ("_sample_docs/list-item-example-1.pdf", True),  # 1 page
-        ("_sample_docs/layout-parser-paper-fast.pdf", True),  # 2 pages
-        ("_sample_docs/layout-parser-paper.pdf", True),  # 16 pages
-        ("_sample_docs/fake.doc", True),
-        (
-            "_sample_docs/fake.doc",
-            False,
-        ),  # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
+        ("_sample_docs/list-item-example-1.pdf", True, "fast"),  # 1 page
+        ("_sample_docs/layout-parser-paper-fast.pdf", True, "fast"),  # 2 pages
+        # NOTE(mike): using "fast" strategy fails on this file for unknown reasons
+        ("_sample_docs/layout-parser-paper.pdf", True, "hi_res"),  # 16 pages
+        ("_sample_docs/fake.doc", False, "fast"),
+        ("_sample_docs/emoji.xlsx", True, "fast"),
+        ("_sample_docs/csv-with-long-lines.csv", False, "fast"),
+        ("_sample_docs/ideas-page.html", False, "fast"),
     ],
 )
 def test_integration_split_pdf_has_same_output_as_non_split(
-    call_threads: int, filename: str, expected_ok: bool, caplog
+    concurrency_level: int, filename: str, expected_ok: bool, strategy: str, caplog
 ):
     """
     Tests that output that we get from the split-by-page pdf is the same as from non-split.
 
     Requires unstructured-api running in bg. See Makefile for how to run it.
-    Doesn't check for raw_response as there's no clear patter for how it changes with the number of pages / call_threads.
+    Doesn't check for raw_response as there's no clear pattern for how it changes with the number of pages / concurrency_level.
     """
     try:
         response = requests.get("http://localhost:8000/general/docs")
-        assert (
-            response.status_code == 200
-        ), "The unstructured-api is not running on localhost:8000"
+        assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
     except requests.exceptions.ConnectionError:
         assert False, "The unstructured-api is not running on localhost:8000"
 
@@ -50,22 +48,22 @@ def test_integration_split_pdf_has_same_output_as_non_split(
         )
 
     if not expected_ok:
+        # This will append .pdf to filename to fool first line of filetype detection, to simulate decoding error
         files.file_name += ".pdf"
 
     req = shared.PartitionParameters(
         files=files,
-        strategy="fast",
+        strategy=strategy,
         languages=["eng"],
         split_pdf_page=True,
+        split_pdf_concurrency_level=concurrency_level,
     )
 
-    os.environ["UNSTRUCTURED_CLIENT_SPLIT_CALL_THREADS"] = str(call_threads)
-
     try:
         resp_split = client.general.partition(req)
     except (HTTPValidationError, AttributeError) as exc:
         if not expected_ok:
-            assert "error arose when splitting by pages" in caplog.text
+            assert "The file does not appear to be a valid PDF." in caplog.text
             assert "File does not appear to be a valid PDF" in str(exc)
             return
         else:
@@ -78,7 +76,6 @@ def test_integration_split_pdf_has_same_output_as_non_split(
     assert resp_split.content_type == resp_single.content_type
     assert resp_split.status_code == resp_single.status_code
 
-    # Difference in the parent_id is expected, because parent_ids are assigned when element crosses page boundary
     diff = DeepDiff(
         t1=resp_split.elements,
         t2=resp_single.elements,
@@ -95,9 +92,7 @@ def test_integration_split_pdf_for_file_with_no_name():
     """
     try:
         response = requests.get("http://localhost:8000/general/docs")
-        assert (
-            response.status_code == 200
-        ), "The unstructured-api is not running on localhost:8000"
+        assert response.status_code == 200, "The unstructured-api is not running on localhost:8000"
     except requests.exceptions.ConnectionError:
         assert False, "The unstructured-api is not running on localhost:8000"
 
 
@@ -1,9 +1,8 @@
 import os
 from pathlib import Path
-import requests
 
 import pytest
-
+import requests
 from unstructured_client import UnstructuredClient
 from unstructured_client.models import shared
 
@@ -16,7 +15,7 @@ def client() -> UnstructuredClient:
 
 @pytest.fixture(scope="module")
 def doc_path() -> Path:
-    return Path(__file__).resolve().parent.parent / "_sample_docs"
+    return Path(__file__).resolve().parents[2] / "_sample_docs"
 
 
 @pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"])
@@ -37,4 +36,3 @@ def test_partition_strategies(strategy, client, doc_path):
     response = client.general.partition(req)
     assert response.status_code == 200
     assert len(response.elements)
-