
Commit

refactor(agent/file_operations): Refactor file opening/reading and parsing

- Update the signature of `FileWorkspace.open_file` and fix implementations in every workspace backend
- Replace `open()` with `workspace.open_file` in the `read_file` command to use the workspace's file opening functionality (see the sketch after this list)
- Fix the parametrization of the `test_text_file_parsers` test to correctly test text file parsers
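
For illustration, a minimal sketch of the reworked `read_file` flow described above — the wrapper name `read_file_sketch` is hypothetical, and `agent` stands for any agent with a configured workspace:

import logging

from autogpt.commands.file_operations_utils import decode_textual_file

logger = logging.getLogger(__name__)

def read_file_sketch(filename, agent) -> str:
    # The command no longer calls open() itself: it asks the workspace
    # backend for a binary stream and decodes it by file extension.
    file = agent.workspace.open_file(filename, binary=True)
    return decode_textual_file(file, logger)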
Pwuts committed Dec 12, 2023
1 parent 198a0ec commit d95e3b5
Showing 8 changed files with 124 additions and 110 deletions.
9 changes: 4 additions & 5 deletions autogpts/autogpt/autogpt/commands/file_operations.py
@@ -16,7 +16,7 @@
from autogpt.memory.vector import MemoryItemFactory, VectorMemory

from .decorators import sanitize_path_arg
from .file_operations_utils import read_textual_file
from .file_operations_utils import decode_textual_file

COMMAND_CATEGORY = "file_operations"
COMMAND_CATEGORY_TITLE = "File Operations"
@@ -140,8 +140,7 @@ def log_operation(
)
},
)
@sanitize_path_arg("filename")
def read_file(filename: Path, agent: Agent) -> str:
def read_file(filename: str | Path, agent: Agent) -> str:
"""Read a file and return the contents
Args:
@@ -150,8 +149,8 @@ def read_file(filename: Path, agent: Agent) -> str:
Returns:
str: The contents of the file
"""
content = read_textual_file(filename, logger)
# TODO: content = agent.workspace.read_file(filename)
file = agent.workspace.open_file(filename, binary=True)
content = decode_textual_file(file, logger)

# # TODO: invalidate/update memory when file is edited
# file_memory = MemoryItem.from_text_file(content, str(filename), agent.config)
105 changes: 42 additions & 63 deletions autogpts/autogpt/autogpt/commands/file_operations_utils.py
@@ -1,11 +1,11 @@
import json
import logging
import os
from pathlib import Path
from abc import ABC, abstractmethod
from typing import BinaryIO

import charset_normalizer
import docx
import markdown
import pypdf
import yaml
from bs4 import BeautifulSoup
@@ -14,23 +14,24 @@
logger = logging.getLogger(__name__)


class ParserStrategy:
def read(self, file_path: Path) -> str:
raise NotImplementedError
class ParserStrategy(ABC):
@abstractmethod
def read(self, file: BinaryIO) -> str:
...


# Basic text file reading
class TXTParser(ParserStrategy):
def read(self, file_path: Path) -> str:
charset_match = charset_normalizer.from_path(file_path).best()
logger.debug(f"Reading '{file_path}' with encoding '{charset_match.encoding}'")
def read(self, file: BinaryIO) -> str:
charset_match = charset_normalizer.from_bytes(file.read()).best()
logger.debug(f"Reading '{file.name}' with encoding '{charset_match.encoding}'")
return str(charset_match)


# Reading text from binary file using pdf parser
class PDFParser(ParserStrategy):
def read(self, file_path: Path) -> str:
parser = pypdf.PdfReader(file_path)
def read(self, file: BinaryIO) -> str:
parser = pypdf.PdfReader(file)
text = ""
for page_idx in range(len(parser.pages)):
text += parser.pages[page_idx].extract_text()
@@ -39,8 +40,8 @@ def read(self, file_path: Path) -> str:

# Reading text from binary file using docs parser
class DOCXParser(ParserStrategy):
def read(self, file_path: Path) -> str:
doc_file = docx.Document(file_path)
def read(self, file: BinaryIO) -> str:
doc_file = docx.Document(file)
text = ""
for para in doc_file.paragraphs:
text += para.text
@@ -49,50 +50,37 @@ def read(self, file_path: Path) -> str:

# Reading as dictionary and returning string format
class JSONParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
data = json.load(f)
text = str(data)
def read(self, file: BinaryIO) -> str:
data = json.load(file)
text = str(data)
return text


class XMLParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
soup = BeautifulSoup(f, "xml")
text = soup.get_text()
def read(self, file: BinaryIO) -> str:
soup = BeautifulSoup(file, "xml")
text = soup.get_text()
return text


# Reading as dictionary and returning string format
class YAMLParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
data = yaml.load(f, Loader=yaml.FullLoader)
text = str(data)
def read(self, file: BinaryIO) -> str:
data = yaml.load(file, Loader=yaml.FullLoader)
text = str(data)
return text


class HTMLParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
soup = BeautifulSoup(f, "html.parser")
text = soup.get_text()
return text


class MarkdownParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
html = markdown.markdown(f.read())
text = "".join(BeautifulSoup(html, "html.parser").findAll(string=True))
def read(self, file: BinaryIO) -> str:
soup = BeautifulSoup(file, "html.parser")
text = soup.get_text()
return text


class LaTeXParser(ParserStrategy):
def read(self, file_path: Path) -> str:
with open(file_path, "r") as f:
latex = f.read()
def read(self, file: BinaryIO) -> str:
latex = file.read().decode()
text = LatexNodes2Text().latex_to_text(latex)
return text

@@ -106,13 +94,15 @@ def set_parser(self, parser: ParserStrategy) -> None:
self.logger.debug(f"Setting Context Parser to {parser}")
self.parser = parser

def read_file(self, file_path) -> str:
self.logger.debug(f"Reading file {file_path} with parser {self.parser}")
return self.parser.read(file_path)
def decode_file(self, file: BinaryIO) -> str:
self.logger.debug(f"Reading file {file.name} with parser {self.parser}")
return self.parser.read(file)


extension_to_parser = {
".txt": TXTParser(),
".md": TXTParser(),
".markdown": TXTParser(),
".csv": TXTParser(),
".pdf": PDFParser(),
".docx": DOCXParser(),
@@ -123,47 +113,36 @@ def read_file(self, file_path) -> str:
".html": HTMLParser(),
".htm": HTMLParser(),
".xhtml": HTMLParser(),
".md": MarkdownParser(),
".markdown": MarkdownParser(),
".tex": LaTeXParser(),
}


def is_file_binary_fn(file_path: Path):
def is_file_binary_fn(file: BinaryIO):
"""Given a file path load all its content and checks if the null bytes is present
Args:
file_path (_type_): _description_
file (_type_): _description_
Returns:
bool: is_binary
"""
with open(file_path, "rb") as f:
file_data = f.read()
file_data = file.read()
file.seek(0)
if b"\x00" in file_data:
return True
return False


def read_textual_file(file_path: Path, logger: logging.Logger) -> str:
if not file_path.is_absolute():
raise ValueError("File path must be absolute")

if not file_path.is_file():
if not file_path.exists():
raise FileNotFoundError(
f"read_file {file_path} failed: no such file or directory"
)
else:
raise ValueError(f"read_file failed: {file_path} is not a file")
def decode_textual_file(file: BinaryIO, logger: logging.Logger) -> str:
if not file.readable():
raise ValueError(f"read_file failed: {file.name} is not a file")

is_binary = is_file_binary_fn(file_path)
file_extension = os.path.splitext(file_path)[1].lower()
file_extension = os.path.splitext(file.name)[1].lower()
parser = extension_to_parser.get(file_extension)
if not parser:
if is_binary:
if is_file_binary_fn(file):
raise ValueError(f"Unsupported binary file format: {file_extension}")
# fallback to txt file parser (to support script and code files loading)
parser = TXTParser()
file_context = FileContext(parser, logger)
return file_context.read_file(file_path)
return file_context.decode_file(file)
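
A hedged usage sketch of the reworked helper: `decode_textual_file` now takes an open binary stream instead of a path and dispatches on the stream's `.name` extension (the sample filename below is made up):

import logging
from pathlib import Path

from autogpt.commands.file_operations_utils import decode_textual_file

logger = logging.getLogger(__name__)

# A regular binary file handle exposes .name, which the extension lookup needs.
with Path("report.pdf").open("rb") as f:   # hypothetical sample file
    text = decode_textual_file(f, logger)  # PDFParser is picked via the .pdf extension
    print(text[:200])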
21 changes: 18 additions & 3 deletions autogpts/autogpt/autogpt/file_workspace/base.py
@@ -5,8 +5,9 @@

import logging
from abc import ABC, abstractmethod
from io import IOBase, TextIOBase
from pathlib import Path
from typing import Any, Callable, Literal, Optional, overload
from typing import IO, Any, BinaryIO, Callable, Literal, Optional, TextIO, overload

from autogpt.core.configuration.schema import SystemConfiguration

@@ -47,9 +48,23 @@ def initialize(self) -> None:
doesn't exist yet. E.g. a folder on disk, or an S3 Bucket.
"""

@overload
@abstractmethod
def open_file(
self, path: str | Path, binary: Literal[False] = False
) -> TextIO | TextIOBase:
"""Returns a readable text file-like object representing the file."""

@overload
@abstractmethod
def open_file(
self, path: str | Path, binary: Literal[True] = True
) -> BinaryIO | IOBase:
"""Returns a readable binary file-like object representing the file."""

@abstractmethod
def open_file(self, path: str | Path, mode: str = "r"):
"""Open a file in the workspace."""
def open_file(self, path: str | Path, binary: bool = False) -> IO | IOBase:
"""Returns a readable file-like object representing the file."""

@overload
@abstractmethod
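
The `Literal[...]` overloads added above are what let type checkers infer a precise stream type from the `binary` flag. A standalone sketch of the same pattern (demo class, not the repo's `FileWorkspace`):

from io import BytesIO, StringIO
from typing import IO, BinaryIO, Literal, TextIO, overload

class DemoWorkspace:
    @overload
    def open_file(self, path: str, binary: Literal[False] = False) -> TextIO: ...
    @overload
    def open_file(self, path: str, binary: Literal[True] = True) -> BinaryIO: ...

    def open_file(self, path: str, binary: bool = False) -> IO:
        # Runtime behaviour is the same either way; only the declared
        # return type differs between the overloads.
        return BytesIO(b"data") if binary else StringIO("data")

ws = DemoWorkspace()
text_stream = ws.open_file("a.txt")               # checkers see TextIO
byte_stream = ws.open_file("a.bin", binary=True)  # checkers see BinaryIO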
33 changes: 18 additions & 15 deletions autogpts/autogpt/autogpt/file_workspace/gcs.py
@@ -6,6 +6,7 @@

import inspect
import logging
from io import IOBase
from pathlib import Path

from google.cloud import storage
@@ -40,7 +41,7 @@ def root(self) -> Path:
return self._root

@property
def restrict_to_root(self):
def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root."""
return True

@@ -50,26 +51,28 @@ def initialize(self) -> None:
def get_path(self, relative_path: str | Path) -> Path:
return super().get_path(relative_path).relative_to("/")

def open_file(self, path: str | Path, mode: str = "r"):
"""Open a file in the workspace."""
def _get_blob(self, path: str | Path) -> storage.Blob:
path = self.get_path(path)
blob = self._bucket.blob(str(path))
return blob
return self._bucket.blob(str(path))

def open_file(self, path: str | Path, binary: bool = False) -> IOBase:
"""Open a file in the workspace."""
blob = self._get_blob(path)
blob.reload() # pin revision number to prevent version mixing while reading
return blob.open("rb" if binary else "r")

def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
blob = self.open_file(path, "r")
file_content = (
blob.download_as_text() if not binary else blob.download_as_bytes()
)
return file_content
return self.open_file(path, binary).read()

async def write_file(self, path: str | Path, content: str | bytes):
async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
blob = self.open_file(path, "w")
blob.upload_from_string(content) if isinstance(
content, str
) else blob.upload_from_file(content)
blob = self._get_blob(path)

if isinstance(content, str):
blob.upload_from_string(content)
else:
blob.upload_from_file(content)

if self.on_write_file:
path = Path(path)
20 changes: 12 additions & 8 deletions autogpts/autogpt/autogpt/file_workspace/local.py
@@ -6,6 +6,7 @@
import inspect
import logging
from pathlib import Path
from typing import IO

from .base import FileWorkspace, FileWorkspaceConfiguration

@@ -26,26 +27,29 @@ def root(self) -> Path:
return self._root

@property
def restrict_to_root(self):
def restrict_to_root(self) -> bool:
"""Whether to restrict generated paths to the root."""
return self._restrict_to_root

def initialize(self) -> None:
self.root.mkdir(exist_ok=True, parents=True)

def open_file(self, path: str | Path, mode: str = "r"):
def open_file(self, path: str | Path, binary: bool = False) -> IO:
"""Open a file in the workspace."""
return self._open_file(path, "rb" if binary else "r")

def _open_file(self, path: str | Path, mode: str = "r") -> IO:
full_path = self.get_path(path)
return open(full_path, mode)
return open(full_path, mode) # type: ignore

def read_file(self, path: str | Path, binary: bool = False):
def read_file(self, path: str | Path, binary: bool = False) -> str | bytes:
"""Read a file in the workspace."""
with self.open_file(path, "rb" if binary else "r") as file:
with self._open_file(path, "rb" if binary else "r") as file:
return file.read()

async def write_file(self, path: str | Path, content: str | bytes):
async def write_file(self, path: str | Path, content: str | bytes) -> None:
"""Write to a file in the workspace."""
with self.open_file(path, "wb" if type(content) is bytes else "w") as file:
with self._open_file(path, "wb" if type(content) is bytes else "w") as file:
file.write(content)

if self.on_write_file:
Expand All @@ -61,7 +65,7 @@ def list(self, path: str | Path = ".") -> list[Path]:
path = self.get_path(path)
return [file.relative_to(path) for file in path.rglob("*") if file.is_file()]

def delete_file(self, path: str | Path):
def delete_file(self, path: str | Path) -> None:
"""Delete a file in the workspace."""
full_path = self.get_path(path)
full_path.unlink()
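
A hedged end-to-end sketch of the local backend after this change — the `FileWorkspaceConfiguration(root=...)` construction is an assumption based on the imports above, not something shown in this diff, and the async `write_file` is driven with `asyncio.run`:

import asyncio
from pathlib import Path

from autogpt.file_workspace.base import FileWorkspaceConfiguration
from autogpt.file_workspace.local import LocalFileWorkspace

# Assumed constructor shape: a config object carrying the workspace root.
workspace = LocalFileWorkspace(FileWorkspaceConfiguration(root=Path("/tmp/demo_ws")))
workspace.initialize()                                    # creates the root directory

asyncio.run(workspace.write_file("notes.txt", "hello"))   # write_file is async
print(workspace.read_file("notes.txt"))                   # -> "hello"
print(workspace.open_file("notes.txt", binary=True).read())  # -> b"hello"
workspace.delete_file("notes.txt")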
