forked from langflow-ai/langflow
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Expanding Unstructured loader to take server file inputs, more file types, and API parameters (langflow-ai#4738)
* adding ability for APIRequest to retry and save to a file * [autofix.ci] apply automated fixes * adding ability for APIRequest to retry and save to a file * [autofix.ci] apply automated fixes * initial refactor of FileComponent to handle Data input * shifting potentially common logic into BaseFileComponent * improving readability and fixing problems * [autofix.ci] apply automated fixes * addressing linting * [autofix.ci] apply automated fixes * linting part 2 * [autofix.ci] apply automated fixes * linting part 3 * preserve input fields on data objects * [autofix.ci] apply automated fixes * ensuring processed data is linked to correct file data object * [autofix.ci] apply automated fixes * addressing linting * [autofix.ci] apply automated fixes * refactor Unstructured to BaseFileComponent * [autofix.ci] apply automated fixes * linting * refactor to new BaseData * adding chunking strategy selector * [autofix.ci] apply automated fixes * fixing edge case * allowing specific failure of missing file without forcing silent_errors * [autofix.ci] apply automated fixes * Fix mypy issues * Update unstructured.py --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com> Co-authored-by: Eric Hare <[email protected]>
- Loading branch information
1 parent
8e55a0e
commit 97fe69c
Showing 2 changed files with 95 additions and 26 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
120 changes: 94 additions & 26 deletions
120
src/backend/base/langflow/components/unstructured/unstructured.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,54 +1,122 @@ | ||
from langchain_unstructured import UnstructuredLoader | ||
|
||
from langflow.custom import Component | ||
from langflow.inputs import FileInput, SecretStrInput | ||
from langflow.base.data import BaseFileComponent | ||
from langflow.inputs import DropdownInput, MessageTextInput, NestedDictInput, SecretStrInput | ||
from langflow.schema import Data | ||
from langflow.template import Output | ||
|
||
|
||
class UnstructuredComponent(Component): | ||
display_name = "Unstructured" | ||
description = "Uses Unstructured.io to extract clean text from raw source documents. Supports: PDF, DOCX, TXT" | ||
documentation = "https://python.langchain.com/v0.2/docs/integrations/providers/unstructured/" | ||
class UnstructuredComponent(BaseFileComponent): | ||
display_name = "Unstructured API" | ||
description = ( | ||
"Uses Unstructured.io API to extract clean text from raw source documents. " | ||
"Supports a wide range of file types." | ||
) | ||
documentation = ( | ||
"https://python.langchain.com/api_reference/unstructured/document_loaders/" | ||
"langchain_unstructured.document_loaders.UnstructuredLoader.html" | ||
) | ||
trace_type = "tool" | ||
icon = "Unstructured" | ||
name = "Unstructured" | ||
|
||
# https://docs.unstructured.io/api-reference/api-services/overview#supported-file-types | ||
VALID_EXTENSIONS = [ | ||
"bmp", | ||
"csv", | ||
"doc", | ||
"docx", | ||
"eml", | ||
"epub", | ||
"heic", | ||
"html", | ||
"jpeg", | ||
"png", | ||
"md", | ||
"msg", | ||
"odt", | ||
"org", | ||
"p7s", | ||
"pdf", | ||
"png", | ||
"ppt", | ||
"pptx", | ||
"rst", | ||
"rtf", | ||
"tiff", | ||
"txt", | ||
"tsv", | ||
"xls", | ||
"xlsx", | ||
"xml", | ||
] | ||
|
||
inputs = [ | ||
FileInput( | ||
name="file", | ||
display_name="File", | ||
required=True, | ||
info="The path to the file with which you want to use Unstructured to parse. Supports: PDF, DOCX, TXT", | ||
file_types=["pdf", "docx", "txt"], # TODO: Support all unstructured file types | ||
), | ||
*BaseFileComponent._base_inputs, | ||
SecretStrInput( | ||
name="api_key", | ||
display_name="Unstructured.io Serverless API Key", | ||
required=True, | ||
info="Unstructured API Key. Create at: https://app.unstructured.io/", | ||
), | ||
MessageTextInput( | ||
name="api_url", | ||
display_name="Unstructured.io API URL", | ||
required=False, | ||
info="Unstructured API URL.", | ||
), | ||
DropdownInput( | ||
name="chunking_strategy", | ||
display_name="Chunking Strategy", | ||
info="Chunking strategy to use, see https://docs.unstructured.io/api-reference/api-services/chunking", | ||
options=["", "basic", "by_title", "by_page", "by_similarity"], | ||
real_time_refresh=False, | ||
value="", | ||
), | ||
NestedDictInput( | ||
name="unstructured_args", | ||
display_name="Additional Arguments", | ||
required=False, | ||
info=( | ||
"Optional dictionary of additional arguments to the Loader. " | ||
"See https://docs.unstructured.io/api-reference/api-services/api-parameters for more information." | ||
), | ||
), | ||
] | ||
|
||
outputs = [ | ||
Output(name="data", display_name="Data", method="load_documents"), | ||
*BaseFileComponent._base_outputs, | ||
] | ||
|
||
def build_unstructured(self) -> UnstructuredLoader: | ||
file_paths = [self.file] | ||
def process_files(self, file_list: list[BaseFileComponent.BaseFile]) -> list[BaseFileComponent.BaseFile]: | ||
file_paths = [str(file.path) for file in file_list if file.path] | ||
|
||
if not file_paths: | ||
self.log("No files to process.") | ||
return file_list | ||
|
||
# https://docs.unstructured.io/api-reference/api-services/api-parameters | ||
args = self.unstructured_args or {} | ||
|
||
if self.chunking_strategy: | ||
args["chunking_strategy"] = self.chunking_strategy | ||
|
||
args["api_key"] = self.api_key | ||
args["partition_via_api"] = True | ||
if self.api_url: | ||
args["url"] = self.api_url | ||
|
||
return UnstructuredLoader( | ||
loader = UnstructuredLoader( | ||
file_paths, | ||
api_key=self.api_key, | ||
partition_via_api=True, | ||
**args, | ||
) | ||
|
||
def load_documents(self) -> list[Data]: | ||
unstructured = self.build_unstructured() | ||
documents = loader.load() | ||
|
||
documents = unstructured.load() | ||
data = [Data.from_document(doc) for doc in documents] # Using the from_document method of Data | ||
processed_data: list[Data | None] = [Data.from_document(doc) if doc else None for doc in documents] | ||
|
||
self.status = data | ||
# Rename the `source` field to `self.SERVER_FILE_PATH_FIELDNAME`, to avoid conflicts with the `source` field | ||
for data in processed_data: | ||
if data and "source" in data.data: | ||
data.data[self.SERVER_FILE_PATH_FIELDNAME] = data.data.pop("source") | ||
|
||
return data | ||
return self.rollup_data(file_list, processed_data) |