Commit

add pdfreader and tokenchunker
thomashacker committed Nov 15, 2023
1 parent 586783e commit 262e4d7
Showing 9 changed files with 345 additions and 88 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,16 @@

All notable changes to this project will be documented in this file.

## [0.3.1] - 15.11.2023

### Added
- PDFReader powered by PyPDF2
- TokenChunker powered by tiktoken

### Fixed
- Added missing dependencies
- Fixed restart bug

## [0.3.0] - 12.09.2023

### Added
4 changes: 3 additions & 1 deletion goldenverba/components/chunking/manager.py
@@ -1,6 +1,7 @@
 import tiktoken
 
 from goldenverba.components.chunking.wordchunker import WordChunker
+from goldenverba.components.chunking.tiktokenchunker import TokenChunker
 from goldenverba.components.chunking.sentencechunker import SentenceChunker
 from goldenverba.components.chunking.interface import Chunker
 from goldenverba.components.reader.document import Document
@@ -11,10 +12,11 @@
 class ChunkerManager:
     def __init__(self):
         self.chunker: dict[str, Chunker] = {
+            "TokenChunker": TokenChunker(),
             "WordChunker": WordChunker(),
             "SentenceChunker": SentenceChunker(),
         }
-        self.selected_chunker: Chunker = self.chunker["WordChunker"]
+        self.selected_chunker: Chunker = self.chunker["TokenChunker"]
 
     def chunk(
         self, documents: list[Document], units: int, overlap: int
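For context, a minimal usage sketch of how the updated ChunkerManager might be driven (not part of this commit). It assumes Document accepts just the text, type, and name keyword arguments seen elsewhere in this diff, and relies on the chunkers appending chunks to each Document in place, as the new TokenChunker below does.

    # Hypothetical usage sketch; the Document arguments and their defaults are assumptions.
    from goldenverba.components.chunking.manager import ChunkerManager
    from goldenverba.components.reader.document import Document

    manager = ChunkerManager()
    print(type(manager.selected_chunker).__name__)  # TokenChunker is now the default

    docs = [Document(text="Some longer text to split into chunks. " * 200, type="Documentation", name="demo")]
    manager.chunk(docs, units=250, overlap=50)
    print(len(docs[0].chunks))  # chunks are appended to each Document in place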
85 changes: 85 additions & 0 deletions goldenverba/components/chunking/tiktokenchunker.py
@@ -0,0 +1,85 @@
from wasabi import msg
from tqdm import tqdm

try:
import tiktoken
except:
pass

from goldenverba.components.chunking.interface import Chunker
from goldenverba.components.chunking.chunk import Chunk
from goldenverba.components.reader.document import Document


class TokenChunker(Chunker):
"""
TokenChunker for Verba built with tiktoken
"""

def __init__(self):
super().__init__()
self.name = "TokenChunker"
self.requires_library = ["tiktoken"]
self.default_units = 250
self.default_overlap = 50
self.description = "Chunk documents by tokens powered by tiktoken. You can specify how many tokens should overlap between chunks to improve retrieval."
self.encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def chunk(
self, documents: list[Document], units: int, overlap: int
) -> list[Document]:
"""Chunk verba documents into chunks based on units and overlap
@parameter: documents : list[Document] - List of Verba documents
@parameter: units : int - How many units per chunk (words, sentences, etc.)
@parameter: overlap : int - How much overlap between the chunks
        @returns list[Document] - List of documents that contain the chunks
"""
for document in tqdm(
documents, total=len(documents), desc="Chunking documents"
):
# Skip if document already contains chunks
if len(document.chunks) > 0:
continue

encoded_tokens = self.encoding.encode(document.text, disallowed_special=())

if units > len(encoded_tokens) or units < 1:
                doc_chunk = Chunk(
                    text=document.text,
                    doc_name=document.name,
                    doc_type=document.type,
                    chunk_id=0,
                )
                document.chunks.append(doc_chunk)
                continue

            if overlap >= units:
                msg.warn(
                    f"Overlap value is greater than unit (Units {units}/ Overlap {overlap})"
                )
                continue

i = 0
split_id_counter = 0
while i < len(encoded_tokens):
# Overlap
start_i = i
end_i = min(i + units, len(encoded_tokens))

chunk_tokens = encoded_tokens[start_i:end_i]
chunk_text = self.encoding.decode(chunk_tokens)

doc_chunk = Chunk(
text=chunk_text,
doc_name=document.name,
doc_type=document.type,
chunk_id=split_id_counter,
)
document.chunks.append(doc_chunk)
split_id_counter += 1

# Exit loop if this was the last possible chunk
if end_i == len(encoded_tokens):
break

i += units - overlap # Step forward, considering overlap

return documents
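Below is a standalone sketch of the sliding token window that TokenChunker implements, using the same tiktoken calls (not part of the commit; the sample text and parameters are arbitrary).

    # Overlapping token windows, mirroring the loop in TokenChunker.chunk.
    import tiktoken

    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    tokens = encoding.encode("Verba splits documents into overlapping token windows. " * 50)

    units, overlap = 250, 50
    chunks, i = [], 0
    while i < len(tokens):
        end = min(i + units, len(tokens))
        chunks.append(encoding.decode(tokens[i:end]))  # decode each window back to text
        if end == len(tokens):
            break
        i += units - overlap  # step forward, keeping `overlap` tokens shared between chunks

    print(f"{len(tokens)} tokens -> {len(chunks)} chunks")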
4 changes: 3 additions & 1 deletion goldenverba/components/reader/manager.py
@@ -2,6 +2,7 @@
 
 from goldenverba.components.reader.simplereader import SimpleReader
 from goldenverba.components.reader.githubreader import GithubReader
+from goldenverba.components.reader.unstructuredpdf import UnstructuredPDF
 from goldenverba.components.reader.pdfreader import PDFReader
 from goldenverba.components.reader.interface import Reader
 from goldenverba.components.reader.document import Document
@@ -11,8 +12,9 @@ class ReaderManager:
     def __init__(self):
         self.readers: dict[str, Reader] = {
             "SimpleReader": SimpleReader(),
-            "GithubReader": GithubReader(),
             "PDFReader": PDFReader(),
+            "GithubReader": GithubReader(),
+            "UnstructuredPDF": UnstructuredPDF(),
         }
         self.selected_reader: Reader = self.readers["SimpleReader"]
 
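A small sketch of how one of the newly registered readers might be selected (not part of the commit); it only uses the readers dict and the selected_reader attribute shown above.

    # Hypothetical selection of the PyPDF2-based reader.
    from goldenverba.components.reader.manager import ReaderManager

    manager = ReaderManager()
    print(list(manager.readers))  # the registered reader names
    manager.selected_reader = manager.readers["PDFReader"]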
102 changes: 18 additions & 84 deletions goldenverba/components/reader/pdfreader.py
@@ -1,7 +1,6 @@
 import glob
 import base64
 import os
-import requests
 
 from wasabi import msg
 from pathlib import Path
@@ -10,6 +9,11 @@
 from goldenverba.components.reader.interface import Reader, InputForm
 from goldenverba.components.reader.document import Document
 
+try:
+    from PyPDF2 import PdfReader
+except Exception as e:
+    msg.warn("PyPDF2 not installed, your base installation might be corrupted.")
+
 
 class PDFReader(Reader):
     """
@@ -19,9 +23,9 @@ class PDFReader(Reader):
     def __init__(self):
         super().__init__()
         self.file_types = [".pdf"]
-        self.requires_env = ["UNSTRUCTURED_API_KEY"]
+        self.requires_library = ["PyPDF2"]
         self.name = "PDFReader"
-        self.description = "Reads PDF files powered by unstructured.io"
+        self.description = "Reads PDF files using the PyPDF2 library"
         self.input_form = InputForm.UPLOAD.value
 
     def load(
@@ -60,7 +64,12 @@ def load(
         if len(bytes) > 0:
             if len(bytes) == len(fileNames):
                 for byte, fileName in zip(bytes, fileNames):
-                    documents += self.load_bytes(byte, fileName, document_type)
+                    decoded_bytes = base64.b64decode(byte)
+                    with open(f"{fileName}", "wb") as file:
+                        file.write(decoded_bytes)
+
+                    documents += self.load_file(f"{fileName}", document_type)
+                    os.remove(f"{fileName}")
 
         # If content exist
         if len(contents) > 0:
Expand All @@ -78,96 +87,21 @@ def load(
msg.good(f"Loaded {len(documents)} documents")
return documents

def load_bytes(self, bytes_string, fileName, document_type: str) -> list[Document]:
"""Loads a pdf bytes file
@param bytes_string : str - PDF File bytes coming from the frontend
@param fileName : str - Filename
@param document_type : str - Document Type
@returns list[Document] - Lists of documents
"""
documents = []

url = "https://api.unstructured.io/general/v0/general"

headers = {
"accept": "application/json",
"unstructured-api-key": os.environ.get("UNSTRUCTURED_API_KEY", ""),
}

data = {
"strategy": "auto",
}

decoded_bytes = base64.b64decode(bytes_string)
with open("reconstructed.pdf", "wb") as file:
file.write(decoded_bytes)

file_data = {"files": open("reconstructed.pdf", "rb")}

response = requests.post(url, headers=headers, data=data, files=file_data)

json_response = response.json()

full_content = ""

for chunk in json_response:
if "text" in chunk:
text = chunk["text"]
full_content += text + " "

document = Document(
text=full_content,
type=document_type,
name=str(fileName),
link=str(fileName),
timestamp=str(datetime.now().strftime("%Y-%m-%d %H:%M:%S")),
reader=self.name,
)
documents.append(document)
msg.good(f"Loaded {str(fileName)}")
os.remove("reconstructed.pdf")
return documents

def load_file(self, file_path: Path, document_type: str) -> list[Document]:
"""Loads .pdf file
@param file_path : Path - Path to file
@param document_type : str - Document Type
@returns list[Document] - Lists of documents
"""
documents = []
full_text = ""
reader = PdfReader(file_path)

if file_path.suffix not in self.file_types:
msg.warn(f"{file_path.suffix} not supported")
return []

url = "https://api.unstructured.io/general/v0/general"

headers = {
"accept": "application/json",
"unstructured-api-key": os.environ.get("UNSTRUCTURED_API_KEY", ""),
}

data = {
"strategy": "auto",
}

file_data = {"files": open(file_path, "rb")}

response = requests.post(url, headers=headers, data=data, files=file_data)

file_data["files"].close()

json_response = response.json()

full_content = ""

for chunk in json_response:
if "text" in chunk:
text = chunk["text"]
full_content += text + " "
for page in reader.pages:
full_text += page.extract_text() + "\n\n"

document = Document(
text=full_content,
text=full_text,
type=document_type,
name=str(file_path),
link=str(file_path),
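For reference, a standalone sketch of the new upload path (not part of the commit): the base64 payload sent by the frontend is decoded to a temporary file, the text is extracted page by page with PyPDF2, and the file is removed again. The file names and the sample payload here are assumptions.

    # Hypothetical end-to-end sketch of the PyPDF2-based path.
    import base64
    import os

    from PyPDF2 import PdfReader


    def pdf_bytes_to_text(b64_payload: str, file_name: str = "uploaded.pdf") -> str:
        """Decode a base64-encoded PDF, extract its text per page, then clean up."""
        with open(file_name, "wb") as file:
            file.write(base64.b64decode(b64_payload))

        full_text = ""
        for page in PdfReader(file_name).pages:
            full_text += page.extract_text() + "\n\n"

        os.remove(file_name)
        return full_text


    # Simulate what the frontend would send by encoding an existing PDF ("example.pdf" is assumed to exist).
    with open("example.pdf", "rb") as pdf:
        payload = base64.b64encode(pdf.read()).decode()
    print(pdf_bytes_to_text(payload)[:200])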