# index_model.py (forked from Kav-K/GPTDiscord)
import functools
import os
import random
import tempfile
import traceback
import asyncio
from collections import defaultdict
import aiohttp
import discord
import aiofiles
import httpx
import openai
import tiktoken
from functools import partial
from typing import List, Optional, Union, cast
from pathlib import Path
from datetime import date
from discord import Interaction
from discord.ext import pages
from langchain.agents import initialize_agent, AgentType
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationSummaryBufferMemory
from langchain.prompts import MessagesPlaceholder
from langchain.schema import SystemMessage
from langchain.tools import Tool
from llama_index.callbacks import CallbackManager, TokenCountingHandler
from llama_index.evaluation.guideline import DEFAULT_GUIDELINES, GuidelineEvaluator
from llama_index.llms import OpenAI
from llama_index.node_parser import SimpleNodeParser
from llama_index.response_synthesizers import ResponseMode
from llama_index.indices.query.query_transform import StepDecomposeQueryTransform
from llama_index.langchain_helpers.agents import (
IndexToolConfig,
LlamaToolkit,
create_llama_chat_agent,
LlamaIndexTool,
)
from llama_index.prompts.chat_prompts import (
CHAT_REFINE_PROMPT,
CHAT_TREE_SUMMARIZE_PROMPT,
TEXT_QA_SYSTEM_PROMPT,
)
from llama_index.readers import YoutubeTranscriptReader
from llama_index.readers.schema.base import Document
from llama_index.langchain_helpers.text_splitter import TokenTextSplitter
from llama_index.retrievers import VectorIndexRetriever, TreeSelectLeafRetriever
from llama_index.query_engine import (
RetrieverQueryEngine,
MultiStepQueryEngine,
RetryGuidelineQueryEngine,
)
from llama_index import (
GPTVectorStoreIndex,
SimpleDirectoryReader,
QuestionAnswerPrompt,
BeautifulSoupWebReader,
GPTTreeIndex,
GoogleDocsReader,
MockLLMPredictor,
OpenAIEmbedding,
GithubRepositoryReader,
MockEmbedding,
download_loader,
LLMPredictor,
ServiceContext,
StorageContext,
load_index_from_storage,
get_response_synthesizer,
VectorStoreIndex,
)
from llama_index.schema import TextNode
from llama_index.storage.docstore.types import RefDocInfo
from llama_index.readers.web import DEFAULT_WEBSITE_EXTRACTOR
from llama_index.composability import ComposableGraph
from llama_index.vector_stores import DocArrayInMemoryVectorStore
from models.embed_statics_model import EmbedStatics
from models.openai_model import Models
from models.check_model import UrlCheck
from services.environment_service import EnvService
from utils.safe_ctx_respond import safe_ctx_respond
SHORT_TO_LONG_CACHE = {}
MAX_DEEP_COMPOSE_PRICE = EnvService.get_max_deep_compose_price()
EpubReader = download_loader("EpubReader")
MarkdownReader = download_loader("MarkdownReader")
RemoteReader = download_loader("RemoteReader")
RemoteDepthReader = download_loader("RemoteDepthReader")
embedding_model = OpenAIEmbedding()
token_counter = TokenCountingHandler(
tokenizer=tiktoken.encoding_for_model("text-davinci-003").encode,
verbose=False,
)
node_parser = SimpleNodeParser.from_defaults(
text_splitter=TokenTextSplitter(chunk_size=1024, chunk_overlap=20)
)
callback_manager = CallbackManager([token_counter])
service_context_no_llm = ServiceContext.from_defaults(
embed_model=embedding_model,
callback_manager=callback_manager,
node_parser=node_parser,
)
timeout = httpx.Timeout(1, read=1, write=1, connect=1)
def get_service_context_with_llm(llm):
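    """Build a ServiceContext that reuses the module-level embed model, callback manager, and node parser, with the given LLM swapped in."""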
service_context = ServiceContext.from_defaults(
embed_model=embedding_model,
callback_manager=callback_manager,
node_parser=node_parser,
llm=llm,
)
return service_context
def dummy_tool(**kwargs):
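    """No-op placeholder tool; returns an instruction telling the model to ignore it."""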
return "You have used the dummy tool. Forget about this and do not even mention this to the user."
def get_and_query(
user_id,
index_storage,
query,
response_mode,
nodes,
child_branch_factor,
service_context,
multistep,
):
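    """Query the user's stored index: tree indexes get a TreeSelectLeafRetriever, vector indexes a VectorIndexRetriever, and the query runs through a multi-step engine when multistep is set."""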
    index: Union[GPTVectorStoreIndex, GPTTreeIndex] = index_storage[
        user_id
    ].get_index_or_throw()
if isinstance(index, GPTTreeIndex):
retriever = TreeSelectLeafRetriever(
index=index,
child_branch_factor=child_branch_factor,
service_context=service_context,
)
else:
retriever = VectorIndexRetriever(
index=index, similarity_top_k=nodes, service_context=service_context
)
response_synthesizer = get_response_synthesizer(
response_mode=response_mode,
use_async=True,
refine_template=CHAT_REFINE_PROMPT,
service_context=service_context,
)
query_engine = RetrieverQueryEngine(
retriever=retriever, response_synthesizer=response_synthesizer
)
multistep_query_engine = MultiStepQueryEngine(
query_engine=query_engine,
query_transform=StepDecomposeQueryTransform(multistep),
index_summary="Provides information about everything you need to know about this topic, use this to answer the question.",
)
if multistep:
response = multistep_query_engine.query(query)
else:
response = query_engine.query(query)
return response
class IndexChatData:
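    """State for one data-connected chat session: the LLM, agent chain, memory, thread id, tools, agent kwargs, and LLM predictor."""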
def __init__(
self, llm, agent_chain, memory, thread_id, tools, agent_kwargs, llm_predictor
):
self.llm = llm
self.agent_chain = agent_chain
self.memory = memory
self.thread_id = thread_id
self.tools = tools
self.agent_kwargs = agent_kwargs
self.llm_predictor = llm_predictor
class IndexData:
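    """Tracks a single user's indexes: all individual indexes plus the current queryable one."""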
def __init__(self):
self.queryable_index = None
self.individual_indexes = []
# A safety check for the future
def get_index_or_throw(self):
if not self.queryable():
raise Exception(
"An index access was attempted before an index was created. This is a programmer error, please report this to the maintainers."
)
return self.queryable_index
def queryable(self):
return self.queryable_index is not None
def has_indexes(self, user_id):
try:
return (
len(os.listdir(EnvService.find_shared_file(f"indexes/{user_id}"))) > 0
)
except Exception:
return False
def has_search_indexes(self, user_id):
try:
return (
len(
os.listdir(EnvService.find_shared_file(f"indexes/{user_id}_search"))
)
> 0
)
except Exception:
return False
def add_index(self, index, user_id, file_name):
self.individual_indexes.append(index)
self.queryable_index = index
# Create a folder called "indexes/{USER_ID}" if it doesn't exist already
Path(f"{EnvService.save_path()}/indexes/{user_id}").mkdir(
parents=True, exist_ok=True
)
# Save the index to file under the user id
file = f"{date.today().month}_{date.today().day}_{file_name}"
        # If the file name is longer than 93 characters, truncate it to 93
if len(file) > 93:
file = file[:93]
index.storage_context.persist(
persist_dir=EnvService.save_path()
/ "indexes"
/ f"{str(user_id)}"
/ f"{file}"
)
def reset_indexes(self, user_id):
self.individual_indexes = []
self.queryable_index = None
# Delete the user indexes
try:
# First, clear all the files inside it
for file in os.listdir(EnvService.find_shared_file(f"indexes/{user_id}")):
try:
os.remove(EnvService.find_shared_file(f"indexes/{user_id}/{file}"))
except:
traceback.print_exc()
for file in os.listdir(
EnvService.find_shared_file(f"indexes/{user_id}_search")
):
try:
os.remove(
EnvService.find_shared_file(f"indexes/{user_id}_search/{file}")
)
except:
traceback.print_exc()
except Exception:
traceback.print_exc()
class Index_handler:
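    """Handles all index operations: building indexes from files, links, Google Docs, and Discord channels, persisting them per user, and serving index-backed chats."""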
embedding_model = OpenAIEmbedding()
token_counter = TokenCountingHandler(
tokenizer=tiktoken.encoding_for_model("text-davinci-003").encode,
verbose=False,
)
node_parser = SimpleNodeParser.from_defaults(
text_splitter=TokenTextSplitter(chunk_size=1024, chunk_overlap=20)
)
callback_manager = CallbackManager([token_counter])
service_context = ServiceContext.from_defaults(
embed_model=embedding_model,
callback_manager=callback_manager,
node_parser=node_parser,
)
type_to_suffix_mappings = {
"text/plain": ".txt",
"text/csv": ".csv",
"application/pdf": ".pdf",
"application/json": ".json",
"image/png": ".png",
"image/jpeg": ".jpg",
"image/gif": ".gif",
"image/svg+xml": ".svg",
"image/webp": ".webp",
"application/mspowerpoint": ".ppt",
"application/vnd.ms-powerpoint": ".ppt",
"application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
"application/msexcel": ".xls",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
"application/msword": ".doc",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"audio/mpeg": ".mp3",
"audio/x-wav": ".wav",
"audio/ogg": ".ogg",
"video/mpeg": ".mpeg",
"video/mp4": ".mp4",
"application/epub+zip": ".epub",
"text/markdown": ".md",
"text/html": ".html",
"application/rtf": ".rtf",
"application/x-msdownload": ".exe",
"application/xml": ".xml",
"application/vnd.adobe.photoshop": ".psd",
"application/x-sql": ".sql",
"application/x-latex": ".latex",
"application/x-httpd-php": ".php",
"application/java-archive": ".jar",
"application/x-sh": ".sh",
"application/x-csh": ".csh",
"text/x-c": ".c",
"text/x-c++": ".cpp",
"text/x-java-source": ".java",
"text/x-python": ".py",
"text/x-ruby": ".rb",
"text/x-perl": ".pl",
"text/x-shellscript": ".sh",
}
    # For when the content type doesn't get picked up by Discord.
secondary_mappings = {
".epub": ".epub",
}
def __init__(self, bot, usage_service):
self.bot = bot
self.openai_key = os.getenv("OPENAI_TOKEN")
self.index_storage = defaultdict(IndexData)
self.loop = asyncio.get_running_loop()
self.usage_service = usage_service
self.qaprompt = QuestionAnswerPrompt(
"Context information is below. The text '<|endofstatement|>' is used to separate chat entries and make it "
"easier for you to understand the context\n"
"---------------------\n"
"{context_str}"
"\n---------------------\n"
"Never say '<|endofstatement|>'\n"
"Given the context information and not prior knowledge, "
"answer the question: {query_str}\n"
)
self.EMBED_CUTOFF = 2000
self.index_chat_chains = {}
self.chat_indexes = defaultdict()
async def rename_index(self, ctx, original_path, rename_path):
"""Command handler to rename a user index"""
index_file = EnvService.find_shared_file(original_path)
if not index_file:
return False
# Rename the file at f"indexes/{ctx.user.id}/{user_index}" to f"indexes/{ctx.user.id}/{new_name}" using Pathlib
try:
Path(original_path).rename(rename_path)
return True
except Exception as e:
traceback.print_exc()
return False
async def get_is_in_index_chat(self, ctx):
return ctx.channel.id in self.index_chat_chains.keys()
async def execute_index_chat_message(self, ctx, message):
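        """Run a chat message through the thread's agent chain; 'stop', 'end', 'quit', or 'exit' closes and archives the thread."""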
if ctx.channel.id not in self.index_chat_chains:
return None
if message.lower() in ["stop", "end", "quit", "exit"]:
await ctx.reply("Ending chat session.")
self.index_chat_chains.pop(ctx.channel.id)
# close the thread
thread = await self.bot.fetch_channel(ctx.channel.id)
await thread.edit(name="Closed-GPT")
await thread.edit(archived=True)
return "Ended chat session."
self.usage_service.update_usage_memory(ctx.guild.name, "index_chat_message", 1)
agent_output = await self.loop.run_in_executor(
None,
partial(self.index_chat_chains[ctx.channel.id].agent_chain.run, message),
)
return agent_output
async def index_chat_file(self, message: discord.Message, file: discord.Attachment):
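        """Index an uploaded attachment, summarize it, wrap the index as a new agent tool, and rebuild the session's agent chain with that tool added."""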
# First, initially set the suffix to the suffix of the attachment
suffix = self.get_file_suffix(file.content_type, file.filename) or None
if not suffix:
await message.reply(
"The file you uploaded is unable to be indexed. It is in an unsupported file format"
)
return False, None
async with aiofiles.tempfile.TemporaryDirectory() as temp_path:
async with aiofiles.tempfile.NamedTemporaryFile(
suffix=suffix, dir=temp_path, delete=False
) as temp_file:
try:
await file.save(temp_file.name)
filename = file.filename
                    # If the filename is longer than 100 characters, truncate it to the first 100 and re-append the last four characters (the extension)
if len(filename) > 100:
filename = filename[:100] + filename[-4:]
openai.log = "debug"
print("Indexing")
index: VectorStoreIndex = await self.loop.run_in_executor(
None,
partial(
self.index_file,
Path(temp_file.name),
get_service_context_with_llm(
self.index_chat_chains[message.channel.id].llm
),
suffix,
),
)
print("Done Indexing")
self.usage_service.update_usage_memory(
message.guild.name, "index_chat_file", 1
)
summary = await index.as_query_engine(
response_mode="tree_summarize",
service_context=get_service_context_with_llm(
self.index_chat_chains[message.channel.id].llm
),
).aquery(
f"What is a summary or general idea of this data? Be detailed in your summary (e.g "
f"extract key names, etc) but not too verbose. Your summary should be under a hundred words. "
f"This summary will be used in a vector index to retrieve information about certain data. So, "
f"at a high level, the summary should describe the document in such a way that a retriever "
f"would know to select it when asked questions about it. The data name was {filename}. Include "
f"the file name in the summary. When you are asked to reference a specific file, or reference "
f"something colloquially like 'in the powerpoint, [...]?', never respond saying that as an AI "
f"you can't view the data, instead infer which tool to use that has the data. Say that there "
f"is no available data if there are no available tools that are relevant."
)
engine = self.get_query_engine(
index, self.index_chat_chains[message.channel.id].llm
)
# Get rid of all special characters in the filename
filename = "".join(
[c for c in filename if c.isalpha() or c.isdigit()]
).rstrip()
tool_config = IndexToolConfig(
query_engine=engine,
name=f"{filename}-index",
description=f"Use this tool if the query seems related to this summary: {summary}",
tool_kwargs={
"return_direct": False,
},
max_iterations=5,
)
tool = LlamaIndexTool.from_tool_config(tool_config)
tools = self.index_chat_chains[message.channel.id].tools
tools.append(tool)
agent_chain = initialize_agent(
tools=tools,
llm=self.index_chat_chains[message.channel.id].llm,
agent=AgentType.OPENAI_FUNCTIONS,
verbose=True,
agent_kwargs=self.index_chat_chains[
message.channel.id
].agent_kwargs,
memory=self.index_chat_chains[message.channel.id].memory,
handle_parsing_errors="Check your output and make sure it conforms!",
)
index_chat_data = IndexChatData(
self.index_chat_chains[message.channel.id].llm,
agent_chain,
self.index_chat_chains[message.channel.id].memory,
message.channel.id,
tools,
self.index_chat_chains[message.channel.id].agent_kwargs,
self.index_chat_chains[message.channel.id].llm_predictor,
)
self.index_chat_chains[message.channel.id] = index_chat_data
return True, summary
except Exception as e:
await message.reply(
"There was an error indexing your file: " + str(e)
)
traceback.print_exc()
return False, None
async def start_index_chat(self, ctx, model, temperature, top_p):
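        """Start a data-connected conversation thread: build the LLM, summary-buffer memory, and OPENAI_FUNCTIONS agent, then register the session under the thread id."""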
preparation_message = await ctx.channel.send(
embed=EmbedStatics.get_index_chat_preparation_message()
)
llm = ChatOpenAI(
model=model, temperature=temperature, top_p=top_p, max_retries=2
)
llm_predictor = LLMPredictor(
llm=ChatOpenAI(temperature=temperature, top_p=top_p, model_name=model)
)
max_token_limit = 29000 if "gpt-4" in model else 7500
memory = ConversationSummaryBufferMemory(
memory_key="memory",
return_messages=True,
llm=llm,
max_token_limit=100000 if "preview" in model else max_token_limit,
)
agent_kwargs = {
"extra_prompt_messages": [MessagesPlaceholder(variable_name="memory")],
"system_message": SystemMessage(
content="You are a superpowered version of GPT that is able to answer questions about the data you're "
"connected to. Each different tool you have represents a different dataset to interact with. "
"If you are asked to perform a task that spreads across multiple datasets, use multiple tools "
"for the same prompt. When the user types links in chat, you will have already been connected "
"to the data at the link by the time you respond. When using tools, the input should be "
"clearly created based on the request of the user. For example, if a user uploads an invoice "
"and asks how many usage hours of X was present in the invoice, a good query is 'X hours'. "
"Avoid using single word queries unless the request is very simple. You can query multiple times to break down complex requests and retrieve more information. When calling functions, no special characters are allowed in the function name, keep that in mind."
),
}
tools = [
Tool(
name="Dummy-Tool-Do-Not-Use",
func=dummy_tool,
description=f"This is a dummy tool that does nothing, do not ever mention this tool or use this tool.",
)
]
print(f"{tools}{llm}{AgentType.OPENAI_FUNCTIONS}{True}{agent_kwargs}{memory}")
agent_chain = initialize_agent(
tools=tools,
llm=llm,
agent=AgentType.OPENAI_FUNCTIONS,
verbose=True,
agent_kwargs=agent_kwargs,
memory=memory,
handle_parsing_errors="Check your output and make sure it conforms!",
)
embed_title = f"{ctx.user.name}'s data-connected conversation with GPT"
message_embed = discord.Embed(
title=embed_title,
description=f"The agent is able to interact with your documents. Simply drag your documents into discord or give the agent a link from where to download the documents.\nModel: {model}",
color=0x00995B,
)
message_embed.set_thumbnail(url="https://i.imgur.com/7V6apMT.png")
message_embed.set_footer(
text="Data Chat", icon_url="https://i.imgur.com/7V6apMT.png"
)
message_thread = await ctx.send(embed=message_embed)
thread = await message_thread.create_thread(
name=ctx.user.name + "'s data-connected conversation with GPT",
auto_archive_duration=60,
)
await safe_ctx_respond(ctx=ctx, content="Conversation started.")
try:
await preparation_message.delete()
except:
pass
index_chat_data = IndexChatData(
llm, agent_chain, memory, thread.id, tools, agent_kwargs, llm_predictor
)
self.index_chat_chains[thread.id] = index_chat_data
async def paginate_embed(self, response_text):
"""Given a response text make embed pages and return a list of the pages."""
response_text = [
response_text[i : i + self.EMBED_CUTOFF]
for i in range(0, len(response_text), self.EMBED_CUTOFF)
]
pages = []
first = False
# Send each chunk as a message
for count, chunk in enumerate(response_text, start=1):
if not first:
page = discord.Embed(
title=f"Index Query Results",
description=chunk,
)
first = True
else:
page = discord.Embed(
title=f"Page {count}",
description=chunk,
)
pages.append(page)
return pages
def index_file(
self, file_path, service_context, suffix=None
) -> GPTVectorStoreIndex:
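        """Index a single file, choosing the reader by suffix (.md, .epub, or the generic SimpleDirectoryReader) and building a vector index."""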
if suffix and suffix == ".md":
loader = MarkdownReader()
document = loader.load_data(file_path)
elif suffix and suffix == ".epub":
epub_loader = EpubReader()
document = epub_loader.load_data(file_path)
else:
document = SimpleDirectoryReader(input_files=[file_path]).load_data()
index = GPTVectorStoreIndex.from_documents(
document, service_context=service_context, use_async=True
)
return index
def index_gdoc(self, doc_id, service_context) -> GPTVectorStoreIndex:
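        """Build a vector index from the Google Doc with the given id."""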
document = GoogleDocsReader().load_data(doc_id)
index = GPTVectorStoreIndex.from_documents(
document, service_context=service_context, use_async=True
)
return index
def index_youtube_transcript(self, link, service_context):
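        """Build a vector index from a YouTube video's transcript, expanding youtu.be short links to full watch URLs first."""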
try:
def convert_shortlink_to_full_link(short_link):
# Check if the link is a shortened YouTube link
if "youtu.be" in short_link:
# Extract the video ID from the link
video_id = short_link.split("/")[-1].split("?")[0]
# Construct the full YouTube desktop link
desktop_link = f"https://www.youtube.com/watch?v={video_id}"
return desktop_link
else:
return short_link
documents = YoutubeTranscriptReader().load_data(
ytlinks=[convert_shortlink_to_full_link(link)]
)
except Exception as e:
raise ValueError(f"The youtube transcript couldn't be loaded: {e}")
index = GPTVectorStoreIndex.from_documents(
documents,
service_context=service_context,
use_async=True,
)
return index
def index_github_repository(self, link, service_context):
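        """Index the contents of a GitHub repository, trying the 'main' branch and falling back to 'master'."""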
# Extract the "owner" and the "repo" name from the github link.
owner = link.split("/")[3]
repo = link.split("/")[4]
try:
documents = GithubRepositoryReader(owner=owner, repo=repo).load_data(
branch="main"
)
except KeyError:
documents = GithubRepositoryReader(owner=owner, repo=repo).load_data(
branch="master"
)
index = GPTVectorStoreIndex.from_documents(
documents,
service_context=service_context,
use_async=True,
)
return index
    def index_load_file(self, file_path) -> Union[GPTVectorStoreIndex, ComposableGraph]:
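        """Load a previously persisted index from the given storage directory."""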
storage_context = StorageContext.from_defaults(persist_dir=file_path)
index = load_index_from_storage(storage_context)
return index
def index_discord(self, document, service_context) -> GPTVectorStoreIndex:
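        """Build a vector index from already-fetched Discord channel documents."""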
index = GPTVectorStoreIndex.from_documents(
document,
service_context=service_context,
use_async=True,
)
return index
async def index_pdf(self, url) -> list[Document]:
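        """Download the PDF at the given URL to a temporary file and parse it into documents."""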
# Download the PDF at the url and save it to a tempfile
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
if response.status == 200:
data = await response.read()
f = tempfile.NamedTemporaryFile(suffix=".pdf", delete=False)
f.write(data)
f.close()
                else:
                    raise ValueError("An error occurred while downloading the PDF.")
        # Load the downloaded PDF from the temporary file path
        documents = SimpleDirectoryReader(input_files=[f.name]).load_data()
        # Delete the temporary file now that the documents have been loaded
        os.remove(f.name)
        return documents
async def index_webpage(self, url, service_context) -> GPTVectorStoreIndex:
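        """Index a webpage: verify the URL is reachable, route PDFs through index_pdf, and scrape everything else with BeautifulSoupWebReader."""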
# First try to connect to the URL to see if we can even reach it.
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, timeout=5) as response:
# Add another entry to links from all_links if the link is not already in it to compensate for the failed request
if response.status not in [200, 203, 202, 204]:
raise ValueError(
"Invalid URL or could not connect to the provided URL."
)
else:
# Detect if the link is a PDF, if it is, we load it differently
                        if response.headers.get("Content-Type", "").startswith("application/pdf"):
documents = await self.index_pdf(url)
index = await self.loop.run_in_executor(
None,
functools.partial(
GPTVectorStoreIndex.from_documents,
documents=documents,
service_context=service_context,
use_async=True,
),
)
return index
except:
traceback.print_exc()
raise ValueError("Could not load webpage")
documents = BeautifulSoupWebReader(
website_extractor=DEFAULT_WEBSITE_EXTRACTOR
).load_data(urls=[url])
# index = GPTVectorStoreIndex(documents, embed_model=embed_model, use_async=True)
index = await self.loop.run_in_executor(
None,
functools.partial(
GPTVectorStoreIndex.from_documents,
documents=documents,
service_context=service_context,
use_async=True,
),
)
return index
def reset_indexes(self, user_id):
self.index_storage[user_id].reset_indexes(user_id)
def get_file_suffix(self, content_type, filename):
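        """Map a MIME content type (or the filename, as a fallback) to a file suffix, or return None if unsupported."""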
print("The content type is " + content_type)
if content_type:
# Apply the suffix mappings to the file
for key, value in self.type_to_suffix_mappings.items():
if key in content_type:
return value
else:
for key, value in self.secondary_mappings.items():
if key in filename:
return value
return None
async def set_file_index(
self, ctx: discord.ApplicationContext, file: discord.Attachment, user_api_key
):
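        """Command handler to index an uploaded attachment for the invoking user, record embedding usage, and persist the index."""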
if not user_api_key:
os.environ["OPENAI_API_KEY"] = self.openai_key
else:
os.environ["OPENAI_API_KEY"] = user_api_key
openai.api_key = os.environ["OPENAI_API_KEY"]
try:
# First, initially set the suffix to the suffix of the attachment
suffix = self.get_file_suffix(file.content_type, file.filename) or None
if not suffix:
await ctx.respond(
embed=EmbedStatics.get_index_set_failure_embed("Unsupported file")
)
return
# Send indexing message
response = await ctx.respond(
embed=EmbedStatics.build_index_progress_embed()
)
async with aiofiles.tempfile.TemporaryDirectory() as temp_path:
async with aiofiles.tempfile.NamedTemporaryFile(
suffix=suffix, dir=temp_path, delete=False
) as temp_file:
await file.save(temp_file.name)
index = await self.loop.run_in_executor(
None,
partial(
self.index_file,
Path(temp_file.name),
service_context_no_llm,
suffix,
),
)
await self.usage_service.update_usage(
token_counter.total_embedding_token_count, "embedding"
)
try:
price = await self.usage_service.get_price(
token_counter.total_embedding_token_count, "embedding"
)
except:
traceback.print_exc()
price = "Unknown"
file_name = file.filename
self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name)
await response.edit(
embed=EmbedStatics.get_index_set_success_embed(str(price))
)
except Exception as e:
await ctx.channel.send(
embed=EmbedStatics.get_index_set_failure_embed(str(e))
)
traceback.print_exc()
async def set_link_index_recurse(
self, ctx: discord.ApplicationContext, link: str, depth, user_api_key
):
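        """Command handler to recursively crawl a link to the given depth with RemoteDepthReader and index the result."""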
if not user_api_key:
os.environ["OPENAI_API_KEY"] = self.openai_key
else:
os.environ["OPENAI_API_KEY"] = user_api_key
openai.api_key = os.environ["OPENAI_API_KEY"]
response = await ctx.respond(embed=EmbedStatics.build_index_progress_embed())
try:
# Pre-emptively connect and get the content-type of the response
try:
async with aiohttp.ClientSession() as session:
async with session.get(link, timeout=2) as _response:
print(_response.status)
if _response.status == 200:
content_type = _response.headers.get("content-type")
else:
await response.edit(
embed=EmbedStatics.get_index_set_failure_embed(
"Invalid URL or could not connect to the provided URL."
)
)
return
except Exception as e:
traceback.print_exc()
await response.edit(
embed=EmbedStatics.get_index_set_failure_embed(
"Invalid URL or could not connect to the provided URL. "
+ str(e)
)
)
return
# Check if the link contains youtube in it
loader = RemoteDepthReader(depth=depth)
documents = await self.loop.run_in_executor(
None, partial(loader.load_data, [link])
)
index = await self.loop.run_in_executor(
None,
functools.partial(
                    GPTVectorStoreIndex.from_documents,
documents=documents,
service_context=service_context_no_llm,
use_async=True,
),
)
await self.usage_service.update_usage(
token_counter.total_embedding_token_count, "embedding"
)
try:
price = await self.usage_service.get_price(
token_counter.total_embedding_token_count, "embedding"
)
except:
traceback.print_exc()
price = "Unknown"
# Make the url look nice, remove https, useless stuff, random characters
file_name = (
link.replace("https://", "")
.replace("http://", "")
.replace("www.", "")
.replace("/", "_")
.replace("?", "_")
.replace("&", "_")
.replace("=", "_")
.replace("-", "_")
.replace(".", "_")
)
self.index_storage[ctx.user.id].add_index(index, ctx.user.id, file_name)
except ValueError as e:
await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e)))
traceback.print_exc()
return
except Exception as e:
await response.edit(embed=EmbedStatics.get_index_set_failure_embed(str(e)))
traceback.print_exc()
return
        await response.edit(embed=EmbedStatics.get_index_set_success_embed(str(price)))
def get_query_engine(self, index, llm):
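        """Build a RetrieverQueryEngine over the index with top-6 similarity retrieval and a compact-accumulate response synthesizer."""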
retriever = VectorIndexRetriever(
index=index,
similarity_top_k=6,
service_context=get_service_context_with_llm(llm),
)
response_synthesizer = get_response_synthesizer(
response_mode=ResponseMode.COMPACT_ACCUMULATE,
use_async=True,
refine_template=TEXT_QA_SYSTEM_PROMPT,
service_context=get_service_context_with_llm(llm),
verbose=True,
)
engine = RetrieverQueryEngine(
retriever=retriever, response_synthesizer=response_synthesizer
)
return engine
async def index_link(self, link, summarize=False, index_chat_ctx=None):
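        """Index a link by type (YouTube transcript, GitHub repository, or generic webpage); in a chat session, also summarize it and register it as an agent tool."""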
try:
if await UrlCheck.check_youtube_link(link):
print("Indexing youtube transcript")
index = await self.loop.run_in_executor(
None,
partial(
self.index_youtube_transcript, link, service_context_no_llm
),
)
print("Indexed youtube transcript")
elif "github" in link:
index = await self.loop.run_in_executor(
None,
partial(self.index_github_repository, link, service_context_no_llm),
)
else:
index = await self.index_webpage(link, service_context_no_llm)
except Exception as e:
if index_chat_ctx:
await index_chat_ctx.reply(
"There was an error indexing your link: " + str(e)
)
return False, None
else:
raise e
summary = None
if index_chat_ctx:
try:
print("Getting transcript summary")
self.usage_service.update_usage_memory(
index_chat_ctx.guild.name, "index_chat_link", 1
)
summary = await index.as_query_engine(
response_mode="tree_summarize",
service_context=get_service_context_with_llm(
self.index_chat_chains[index_chat_ctx.channel.id].llm
),
).aquery(
"What is a summary or general idea of this document? Be detailed in your summary but not too verbose. Your summary should be under 50 words. This summary will be used in a vector index to retrieve information about certain data. So, at a high level, the summary should describe the document in such a way that a retriever would know to select it when asked questions about it. The link was {link}. Include the an easy identifier derived from the link at the end of the summary."
)
print("Got transcript summary")
engine = self.get_query_engine(
index, self.index_chat_chains[index_chat_ctx.channel.id].llm