Skip to content

Commit

Permalink
Use RecursiveCharacterTextSplitter to avoid "llama_tokenize: too many tokens" error during ingestion
Browse files Browse the repository at this point in the history
  • Loading branch information
imartinez committed May 8, 2023
1 parent 75a1141 commit 026b9f8
Showing 1 changed file with 2 additions and 2 deletions.
4 changes: 2 additions & 2 deletions ingest.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import LlamaCppEmbeddings
from sys import argv
Expand All @@ -8,7 +8,7 @@ def main():
# Load document and split in chunks
loader = TextLoader(argv[1], encoding="utf8")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
texts = text_splitter.split_documents(documents)
# Create embeddings
llama = LlamaCppEmbeddings(model_path="./models/ggml-model-q4_0.bin")
Expand Down

0 comments on commit 026b9f8

Please sign in to comment.