encod_langchain.py
from dotenv import load_dotenv
import os
import re
import faiss
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from PyPDF2 import PdfReader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.docstore.in_memory import InMemoryDocstore
from datetime import datetime
from tqdm import tqdm
# Load environment variables from the .env file in the working directory
load_dotenv()
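# The .env file is expected to provide the credentials that OpenAIEmbeddings
# reads from the environment, i.e. a line like: OPENAI_API_KEY=sk-...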
pdf_dir = 'data/PDFs'
embeddings = OpenAIEmbeddings()
# Initialize the FAISS vector store with an exact (flat) L2 index sized to
# the OpenAI embedding dimension (1536 for text-embedding-ada-002)
embedding_dim = 1536
faiss_store = FAISS(
    embedding_function=embeddings.embed_query,  # embeds a single query string
    index=faiss.IndexFlatL2(embedding_dim),     # exact nearest-neighbour search
    docstore=InMemoryDocstore({}),              # in-memory id -> document mapping
    index_to_docstore_id={},                    # populated as texts are added
)
# Serialize writes to the shared store: the FAISS index is not thread-safe
store_lock = Lock()
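# For a larger corpus, an approximate "IVF1024,Flat" index could be used
# instead. As a sketch: it would be built via the FAISS index factory, and,
# unlike the flat index, it must be trained before vectors can be added
# (training_vectors is a hypothetical numpy array of sample embeddings):
#
#     index = faiss.index_factory(embedding_dim, "IVF1024,Flat")
#     index.train(training_vectors)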
splitter = CharacterTextSplitter(chunk_size=10000)
def process_pdf(file_path):
    try:
        with open(file_path, 'rb') as f:
            pdf_reader = PdfReader(f)
            # extract_text() can return None for pages with no text layer
            text = ''.join(page.extract_text() or '' for page in pdf_reader.pages)
        chunks = splitter.split_text(text)
        metadata = extract_metadata(file_path)
        # add_texts embeds every chunk and stores it alongside its metadata
        with store_lock:
            faiss_store.add_texts(chunks, metadatas=[metadata] * len(chunks))
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
def extract_metadata(file_path):
    """Parse meeting metadata out of a structured PDF filename."""
    filename = os.path.basename(file_path)
    pattern = r'(\d{4}-\d{2}-\d{2}) (.*?) - (.*?)-(\d+)\.pdf'
    match = re.match(pattern, filename)
    if match:
        return {
            'meeting_date': match.group(1),
            'meeting_type': match.group(2),
            'document_type': match.group(3),
            'document_id': match.group(4),
        }
    return {}
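# For example, with a hypothetical filename following the pattern above:
#
#     extract_metadata('data/PDFs/2023-05-16 City Council - Minutes-0042.pdf')
#     # -> {'meeting_date': '2023-05-16', 'meeting_type': 'City Council',
#     #     'document_type': 'Minutes', 'document_id': '0042'}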
# Fan the PDFs out across a small thread pool: disk reads and text
# extraction overlap across threads, while index writes stay serialized
# behind the lock
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = [executor.submit(process_pdf, os.path.join(pdf_dir, filename))
               for filename in os.listdir(pdf_dir)
               if filename.endswith('.pdf')]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing PDFs"):
        future.result()  # process_pdf traps its own errors; this surfaces anything else
current_date = datetime.now().strftime("%Y%m%d")
# save_local writes index.faiss and index.pkl into the target folder
index_dirname = f"council_meetings_faiss_index_{current_date}"
save_path = os.path.join('data', 'faiss_indexes')
full_path = os.path.join(save_path, index_dirname)
os.makedirs(save_path, exist_ok=True)
faiss_store.save_local(full_path)
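# A minimal sanity check, as a sketch: reload the saved index with the same
# embeddings and run a similarity search (the query string is illustrative)
loaded_store = FAISS.load_local(full_path, embeddings)
results = loaded_store.similarity_search("council budget resolution", k=3)
for doc in results:
    print(doc.metadata.get('meeting_date'), doc.page_content[:100])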