Skip to content

Commit

Permalink
Expanding Unstructured loader to take server file inputs, more file types, and API parameters (langflow-ai#4738)
Browse files Browse the repository at this point in the history

* adding ability for APIRequest to retry and save to a file

* [autofix.ci] apply automated fixes

* adding ability for APIRequest to retry and save to a file

* [autofix.ci] apply automated fixes

* initial refactor of FileComponent to handle Data input

* shifting potentially common logic into BaseFileComponent

* improving readability and fixing problems

* [autofix.ci] apply automated fixes

* addressing linting

* [autofix.ci] apply automated fixes

* linting part 2

* [autofix.ci] apply automated fixes

* linting part 3

* preserve input fields on data objects

* [autofix.ci] apply automated fixes

* ensuring processed data is linked to correct file data object

* [autofix.ci] apply automated fixes

* addressing linting

* [autofix.ci] apply automated fixes

* refactor Unstructured to BaseFileComponent

* [autofix.ci] apply automated fixes

* linting

* refactor to new BaseData

* adding chunking strategy selector

* [autofix.ci] apply automated fixes

* fixing edge case

* allowing specific failure of missing file without forcing silent_errors

* [autofix.ci] apply automated fixes

* Fix mypy issues

* Update unstructured.py

---------

Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>
Co-authored-by: Eric Hare <[email protected]>
  • Loading branch information
3 people authored Nov 26, 2024
1 parent 8e55a0e commit 97fe69c
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 26 deletions.
1 change: 1 addition & 0 deletions src/backend/base/langflow/base/data/base_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ def _validate_and_resolve_paths(self) -> list[BaseFile]:

def add_file(data: Data, path: str | Path, *, delete_after_processing: bool):
resolved_path = Path(self.resolve_path(str(path)))

if not resolved_path.exists():
msg = f"File or directory not found: {path}"
self.log(msg)
Expand Down
120 changes: 94 additions & 26 deletions src/backend/base/langflow/components/unstructured/unstructured.py
Original file line number Diff line number Diff line change
@@ -1,54 +1,122 @@
from langchain_unstructured import UnstructuredLoader

from langflow.custom import Component
from langflow.inputs import FileInput, SecretStrInput
from langflow.base.data import BaseFileComponent
from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput
from langflow.schema import Data
from langflow.template import Output


class UnstructuredComponent(BaseFileComponent):
    """File component that parses documents through the Unstructured.io API.

    The component extends ``BaseFileComponent`` with the inputs needed to call
    the Unstructured API (key, optional URL, chunking strategy, free-form
    extra arguments) and implements ``process_files`` on top of
    ``langchain_unstructured.UnstructuredLoader``.
    """

    display_name = "Unstructured API"
    description = (
        "Uses Unstructured.io API to extract clean text from raw source documents. "
        "Supports a wide range of file types."
    )
    documentation = (
        "https://python.langchain.com/api_reference/unstructured/document_loaders/"
        "langchain_unstructured.document_loaders.UnstructuredLoader.html"
    )
    trace_type = "tool"
    icon = "Unstructured"
    name = "Unstructured"

    # https://docs.unstructured.io/api-reference/api-services/overview#supported-file-types
    # Each extension is listed exactly once. A second "png" entry used to sit
    # between "jpeg" and "md".
    # NOTE(review): that position suggests "jpg" may have been intended --
    # confirm against the supported-file-types page before adding it.
    VALID_EXTENSIONS = [
        "bmp",
        "csv",
        "doc",
        "docx",
        "eml",
        "epub",
        "heic",
        "html",
        "jpeg",
        "md",
        "msg",
        "odt",
        "org",
        "p7s",
        "pdf",
        "png",
        "ppt",
        "pptx",
        "rst",
        "rtf",
        "tiff",
        "txt",
        "tsv",
        "xls",
        "xlsx",
        "xml",
    ]

    inputs = [
        # File/path inputs shared by every BaseFileComponent subclass.
        *BaseFileComponent._base_inputs,
        SecretStrInput(
            name="api_key",
            display_name="Unstructured.io Serverless API Key",
            required=True,
            info="Unstructured API Key. Create at: https://app.unstructured.io/",
        ),
        MessageTextInput(
            name="api_url",
            display_name="Unstructured.io API URL",
            required=False,
            info="Unstructured API URL.",
        ),
        DropdownInput(
            name="chunking_strategy",
            display_name="Chunking Strategy",
            info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking",
            # The empty option means "do not send a chunking_strategy".
            options=["", "basic", "by_title", "by_page", "by_similarity"],
            real_time_refresh=False,
            value="",
        ),
        NestedDictInput(
            name="unstructured_args",
            display_name="Additional Arguments",
            required=False,
            info=(
                "Optional dictionary of additional arguments to the Loader. "
                "See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information."
            ),
        ),
    ]

    outputs = [
        *BaseFileComponent._base_outputs,
    ]

    def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]:
        """Partition the given files via the Unstructured API.

        Args:
            file_list: Files to process. Entries without a ``path`` are not
                sent to the loader.

        Returns:
            ``file_list`` unchanged when no entry has a path; otherwise the
            result of ``self.rollup_data(file_list, processed_data)``.
        """
        file_paths = [str(file.path) for file in file_list if file.path]

        if not file_paths:
            self.log("No files to process.")
            return file_list

        # https://docs.unstructured.io/api-reference/api-services/api-parameters
        # Work on a copy: the keys added below (including the API key) must not
        # be written back into the user-supplied "Additional Arguments" value.
        args = dict(self.unstructured_args or {})

        if self.chunking_strategy:
            args["chunking_strategy"] = self.chunking_strategy

        # Set after the user arguments so these always win over user-supplied
        # values for the same keys.
        args["api_key"] = self.api_key
        args["partition_via_api"] = True
        if self.api_url:
            args["url"] = self.api_url

        # api_key / partition_via_api travel inside **args only; passing them
        # as explicit keywords as well would raise a duplicate-keyword TypeError.
        loader = UnstructuredLoader(
            file_paths,
            **args,
        )

        documents = loader.load()

        processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents]

        # Rename the `source` field to `self.SERVER_FILE_PATH_FIELDNAME`, to avoid conflicts with the `source` field
        for data in processed_data:
            if data and "source" in data.data:
                data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source")

        return self.rollup_data(file_list, processed_data)

0 comments on commit 97fe69c

Please sign in to comment.