Expanding Unstructured loader to take server file inputs, more file types, and API parameters (langflow-ai#4738)

* adding ability for APIRequest to retry and save to a file
* [autofix.ci] apply automated fixes
* adding ability for APIRequest to retry and save to a file
* [autofix.ci] apply automated fixes
* initial refactor of FileComponent to handle Data input
* shifting potentially common logic into BaseFileComponent
* improving readability and fixing problems
* [autofix.ci] apply automated fixes
* addressing linting
* [autofix.ci] apply automated fixes
* linting part 2
* [autofix.ci] apply automated fixes
* linting part 3
* preserve input fields on data objects
* [autofix.ci] apply automated fixes
* ensuring processed data is linked to correct file data object
* [autofix.ci] apply automated fixes
* addressing linting
* [autofix.ci] apply automated fixes
* refactor Unstructured to BaseFileComponent
* [autofix.ci] apply automated fixes
* linting
* refactor to new BaseData
* adding chunking strategy selector
* [autofix.ci] apply automated fixes
* fixing edge case
* allowing specific failure of missing file without forcing silent_errors
* [autofix.ci] apply automated fixes
* Fix mypy issues
* Update unstructured.py

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Hare <[email protected]>
Styleebender · Nov 26, 2024 · 97fe69c · 97fe69c
1 parent 8e55a0e
commit 97fe69c
Show file tree

Hide file tree

Showing 2 changed files with 112 additions and 26 deletions.
diff --git a/src/backend/base/langflow/base/data/base_file.py b/src/backend/base/langflow/base/data/base_file.py
@@ -297,6 +297,7 @@ def _validate_and_resolve_paths(self) -> list[BaseFile]:
 
         def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
             resolved_path = Path(self.resolve_path(str(path)))
+
             if not resolved_path.exists():
                 msg = f"File or directory not found: {path}"
                 self.log(msg)

diff --git a/src/backend/base/langflow/components/unstructured/unstructured.py b/src/backend/base/langflow/components/unstructured/unstructured.py
@@ -1,54 +1,139 @@
 from langchain_unstructured import UnstructuredLoader
 
-from langflow.custom import Component
-from langflow.inputs import FileInput, SecretStrInput
+from langflow.base.data import BaseFileComponent
+from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput
 from langflow.schema import Data
-from langflow.template import Output
 
 
-class UnstructuredComponent(Component):
-    display_name = "Unstructured"
-    description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT"
-    documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/"
+class UnstructuredComponent(BaseFileComponent):
+    """File component that partitions documents through the Unstructured.io API.
+
+    Extends the shared ``BaseFileComponent`` inputs and outputs with an API key,
+    an optional API URL, a chunking strategy, and free-form loader arguments.
+    """
+
+    display_name = "Unstructured API"
+    description = (
+        "Uses Unstructured.io API to extract clean text from raw source documents. "
+        "Supports a wide range of file types."
+    )
+    documentation = (
+        "https://python.langchain.com/api_reference/unstructured/document_loaders/"
+        "langchain_unstructured.document_loaders.UnstructuredLoader.html"
+    )
     trace_type = "tool"
     icon = "Unstructured"
     name = "Unstructured"
 
+    # https://docs.unstructured.io/api-reference/api-services/overview#supported-file-types
+    VALID_EXTENSIONS = [
+        "bmp",
+        "csv",
+        "doc",
+        "docx",
+        "eml",
+        "epub",
+        "heic",
+        "html",
+        "jpeg",
+        "jpg",
+        "md",
+        "msg",
+        "odt",
+        "org",
+        "p7s",
+        "pdf",
+        "png",
+        "ppt",
+        "pptx",
+        "rst",
+        "rtf",
+        "tiff",
+        "txt",
+        "tsv",
+        "xls",
+        "xlsx",
+        "xml",
+    ]
+
     inputs = [
-        FileInput(
-            name="file",
-            display_name="File",
-            required=True,
-            info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT",
-            file_types=["pdf", "docx", "txt"],  # TODO: Support all unstructured file types
-        ),
+        *BaseFileComponent._base_inputs,
         SecretStrInput(
             name="api_key",
             display_name="Unstructured.io Serverless API Key",
             required=True,
             info="Unstructured API Key. Create at: https://app.unstructured.io/",
         ),
+        MessageTextInput(
+            name="api_url",
+            display_name="Unstructured.io API URL",
+            required=False,
+            info="Unstructured API URL.",
+        ),
+        DropdownInput(
+            name="chunking_strategy",
+            display_name="Chunking Strategy",
+            info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking",
+            options=["", "basic", "by_title", "by_page", "by_similarity"],
+            real_time_refresh=False,
+            value="",
+        ),
+        NestedDictInput(
+            name="unstructured_args",
+            display_name="Additional Arguments",
+            required=False,
+            info=(
+                "Optional dictionary of additional arguments to the Loader. "
+                "See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information."
+            ),
+        ),
     ]
 
     outputs = [
-        Output(name="data", display_name="Data", method="load_documents"),
+        *BaseFileComponent._base_outputs,
     ]
 
-    def build_unstructured(self) -> UnstructuredLoader:
-        file_paths = [self.file]
+    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
+        """Partition the files through the Unstructured API and roll the results up.
+
+        Args:
+            file_list: Files to process; entries without a path are not sent to the loader.
+
+        Returns:
+            ``file_list`` unchanged when no entry has a path; otherwise the result of
+            ``rollup_data`` applied to ``file_list`` and the loaded documents.
+        """
+        file_paths = [str(file.path) for file in file_list if file.path]
+
+        if not file_paths:
+            self.log("No files to process.")
+            return file_list
+
+        # https://docs.unstructured.io/api-reference/api-services/api-parameters
+        # Copy the user-supplied mapping so the API key and other loader-only keys
+        # added below are never written back into the component's input value.
+        args = dict(self.unstructured_args or {})
+
+        if self.chunking_strategy:
+            args["chunking_strategy"] = self.chunking_strategy
+
+        args["api_key"] = self.api_key
+        args["partition_via_api"] = True
+        if self.api_url:
+            args["url"] = self.api_url
 
-        return UnstructuredLoader(
+        loader = UnstructuredLoader(
             file_paths,
-            api_key=self.api_key,
-            partition_via_api=True,
+            **args,
         )
 
-    def load_documents(self) -> list[Data]:
-        unstructured = self.build_unstructured()
+        documents = loader.load()
 
-        documents = unstructured.load()
-        data = [Data.from_document(doc) for doc in documents]  # Using the from_document method of Data
+        processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents]
 
-        self.status = data
+        # Rename the `source` field to `self.SERVER_FILE_PATH_FIELDNAME`, to avoid conflicts with the `source` field
+        for data in processed_data:
+            if data and "source" in data.data:
+                data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source")
 
-        return data
+        return self.rollup_data(file_list, processed_data)