aryn-ai · karanataryn · Aug 2, 2025 · Aug 1, 2025 · Aug 1, 2025 · Aug 1, 2025
diff --git a/api-reference/endpoint/docparse/openapi.json b/api-reference/endpoint/docparse/openapi.json
@@ -638,23 +638,27 @@
                 "default": false,
                 "description": "A boolean value indicating whether to crop images detected in the document and return them in the specified format converted to base64 within the binary_representation of returned image elements.\n"
               },
-              "extract_image_format": {
-                "type": "string",
-                "enum": [
-                  "ppm",
-                  "png",
-                  "jpeg"
-                ],
-                "title": "Extract Image Format",
-                "default": "ppm",
-                "description": "The format to use for extracted images. Defaults to ppm.\n"
-              },
-              "extract_table_structure": {
-                "type": "boolean",
-                "title": "Extract Table Structure",
-                "default": true,
-                "description": "Use `table_mode` instead. A boolean value indicating whether to extract table structure from the document. This means detecting cells of a table broken into rows and columns.\n",
-                "deprecated": true
+              "image_extraction_options": {
+                "type": "object",
+                "properties": {
+                  "associate_captions": {
+                    "type": "boolean",
+                    "title": "Associate Captions",
+                    "default": false,
+                    "description": "A boolean value indicating whether to associate captions with the images.\n"
+                  },
+                  "extract_image_format": {
+                    "type": "string",
+                    "enum": [
+                      "ppm",
+                      "png",
+                      "jpeg"
+                    ],
+                    "title": "Extract Image Format",
+                    "default": "ppm",
+                    "description": "The format to use for extracted images. Defaults to ppm.\n"
+                  }
+                }
               },
               "table_extraction_options": {
                 "type": "object",
@@ -720,12 +724,6 @@
                 "title": "Table Mode",
                 "description": "The mode to use for table structure extraction. Defaults to `none`, which will not extract table structure. Note that the `vision` mode is only available for PAYG users.\n"
               },
-              "use_ocr": {
-                "type": "boolean",
-                "title": "Use OCR",
-                "deprecated": true,
-                "description": "Use `text_mode` instead. A boolean value indicating whether to use OCR or not on the document.\n"
-              },
               "text_extraction_options": {
                 "type": "object",
                 "title": "Text Extraction Options",
@@ -960,6 +958,32 @@
                 },
                 "title": "Markdown Options",
                 "description": "A dictionary of options to specify what to include in the markdown output.\n"
+              },
+
+              "extract_table_structure": {
+                "type": "boolean",
+                "title": "Extract Table Structure",
+                "default": true,
+                "description": "Use `table_mode` instead. A boolean value indicating whether to extract table structure from the document. This means detecting cells of a table broken into rows and columns.\n",
+                "deprecated": true
+              },
+              "use_ocr": {
+                "type": "boolean",
+                "title": "Use OCR",
+                "deprecated": true,
+                "description": "Use `text_mode` instead. A boolean value indicating whether to use OCR or not on the document.\n"
+              },
+              "extract_image_format": {
+                "deprecated": true,
+                "type": "string",
+                "enum": [
+                  "ppm",
+                  "png",
+                  "jpeg"
+                ],
+                "title": "Extract Image Format",
+                "default": "ppm",
+                "description": "Use `image_extraction_options.extract_image_format` instead. The format to use for extracted images. Defaults to ppm.\n"
               }
             }
           }

diff --git a/docparse/processing_options.mdx b/docparse/processing_options.mdx
@@ -156,9 +156,14 @@ A `boolean` that when `True` the selected chunker will attempt to merge chunks a
 ### extract_images
 A boolean that determines whether to extract images from the document. The format is determined by the value of `extract_image_format`. `Default: False`.
 
-### extract_image_format
-A string indicating what in what format extracted images should be returned. Must be one of `ppm`, `png`, or `jpeg`. In all cases, the result will be base64 encoded before being returned. `Default: "ppm"`. 
+### image_extraction_options
+A dictionary of options for specifying image extraction behavior.
 
+#### image_extraction_options.associate_captions
+A boolean that specifies whether to associate captions with the images. `Default: False`.
+
+#### image_extraction_options.extract_image_format
+A string indicating the format in which extracted images should be returned. Must be one of `ppm`, `png`, or `jpeg`. In all cases, the result will be base64 encoded before being returned. `Default: "ppm"`. Passing `extract_image_format` as a top-level option, outside of `image_extraction_options`, is deprecated.
 
 ## Advanced
 

diff --git a/docparse/tutorials/images.mdx b/docparse/tutorials/images.mdx
@@ -112,10 +112,56 @@ image = Image.frombytes(image_mode,  (image_width, image_height), base64.b64deco
 #display the image
 image 
 ```
+
 ## Output Image
 
 Here's the output image after extraction. As you can see, the image has been successfully extracted from the PDF with clarity.
 
 <Frame>
   <img src="/images/board.png" />
-</Frame>
+</Frame>
+
+
+## Captions
+
+If you want to associate captions with the images, set `associate_captions` to `True` inside the `image_extraction_options` parameter.
+
+```python
+partitioned_file = partition_file(file, aryn_api_key, extract_images=True, image_extraction_options={'associate_captions': True})
+```
+
+Without `associate_captions`, the image and its caption are returned as separate elements, as shown below.
+
+<Frame>
+  <img src="/images/caption_output.png" />
+</Frame>
+
+After enabling the `associate_captions` parameter, you'll notice that the image is now associated with a caption, as seen below.
+
+<Frame>
+  <img src="/images/image_with_caption.png" />
+</Frame>
+
+The caption is also returned in the `caption` field within the `properties` of the image or table element.
+
+```json image_output.json
+{"type": "Image",
+   "bbox": [0.5138835951861214,
+    0.09537044178355823,
+    0.8832426183363971,
+    0.2254865195534446],
+   "properties": {"score": 0.8971611857414246,
+    "image_size": null,
+    "image_mode": null,
+    "image_format": null,
+    "caption": {"type": "Caption",
+     "bbox": [0.5880948414522059,
+      0.21432253750887784,
+      0.8085399672564338,
+      0.2254865195534446],
+     "properties": {"score": 0.9146057963371277,
+      "_element_index": 10,
+      "font_size": 9.251028571428524},
+     "text_representation": "Figure 1: Example of an RL system."},
+    "page_number": 2}}
+```
diff --git a/images/caption_output.png b/images/caption_output.png
diff --git a/images/image_with_caption.png b/images/image_with_caption.png
diff --git a/snippets/partition_options.mdx b/snippets/partition_options.mdx
@@ -36,7 +36,9 @@ set the threshold manually, we recommend starting with a value of `0.32`.
         - `pixels>50->table_transformer; chars<30->deformable_detr;chars>35->table_transformer;pixels>2->deformable_detr;table_transformer;comment` => if the biggest dimension is more than 50 pixels use table transformer. Else if the total number of chars in the table is less than 30 use deformable_detr. Else if there are mode than 35 chars use table transformer. Else if there are more than 2 pixels in the biggest dimension use deformable detr. Otherwise use table transformer. comment is not processed.
 
 - `extract_images`: A boolean that determines whether to extract images from the document. The format is determined by the value of `extract_image_format`. Default: `False`.
-- `extract_image_format`: A string indicating what in what format extracted images should be returned. Must be one of `ppm`, `png`, or `jpeg`. In all cases, the result will be base64 encoded before being returned. Default: `ppm`. 
+- `image_extraction_options`: A map with string keys specifying options for image extraction.
+    - `associate_captions`: A boolean that specifies whether to associate captions with the images. Default is `False`.
+    - `extract_image_format`: A string indicating the format in which extracted images should be returned. Must be one of `ppm`, `png`, or `jpeg`. In all cases, the result will be base64 encoded before being returned. Default: `ppm`.
 - `summarize_images`: (PAYG Only) A boolean that, when `True`, generates a summary of the images in the document and returns it as the `text_representation`. When `False`, images are not summarized. Default is `False`.
 - `selected_pages`: A list specifying individual pages (1-indexed) and page ranges from the document to partition.
 Single pages are specified as integers and ranges are specified as lists with two integer entries in ascending order. A
@@ -92,6 +94,7 @@ an api key. If aryn_api_key is set it will override this. The default ArynConfig
 and then in the file ~/.aryn/config.yaml. Default is None (aryn-sdk will look in the aryn_api_key parameter, in your
 environment variables, and then in ~/.aryn/config.yaml).
 - `aryn_api_key`: An Aryn API key, provided as a string. You can get one for free at [aryn.ai/get-started](https://www.aryn.ai/get-started). Default is `None` (If not provided, the sdk will check for it in the environment variable `ARYN_API_KEY` or will look in aryn_config as specified above).
+- `extract_image_format` (deprecated as a top-level option): Use `image_extraction_options.extract_image_format` instead. A string indicating the format in which extracted images should be returned. Must be one of `ppm`, `png`, or `jpeg`. In all cases, the result will be base64 encoded before being returned. Default: `ppm`.
 - `use_ocr` (deprecated): Use `text_mode` instead. A boolean value that, when set to `True`, causes DocParse to extract text using an OCR model. This is
 useful when the text is not directly extractable from the PDF, such as when the text is part of an image or when the
 text is rotated. When set to `False`, DocParse extracts embedded text from the input document. Default is `False`.