feat: finish file parser
madawei2699 committed Mar 21, 2023
1 parent 466820d commit 8083f14
Showing 7 changed files with 834 additions and 22 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -56,18 +56,18 @@ For now it is in development, but you can try it out by joining this [channel](http
 - Invest News
   - [x] Xueqiu daily hot topics
   - [x] Jisilu daily hot topics
-- Support file reading and analysis 💥 🚩
+- Support file reading and analysis 💥
   - Considering the expensive billing, it needs a Slack userID whitelist to restrict access to this feature
   - Need to cache the parsed file documents to save extraction cost
-  - [ ] EPUB
-  - [ ] DOCX
-  - [ ] TEXT
-  - [ ] PDF
+  - [x] EPUB
+  - [ ] DOCX
+  - [x] TEXT
+  - [x] PDF
     - Use [Google Vision](https://cloud.google.com/vision/docs/pdf) to handle the PDF reading
   - [ ] Image
     - may use GPT-4
 - [ ] Support voice reading with self-hosting [whisper](https://github.com/aarnphm/whispercpp)
-  - (whisper -> chatGPT -> azure text2speech) to play language speaking practices 💥
+  - (whisper -> chatGPT -> azure text2speech) to play language speaking practices 💥 🚩
 - [ ] Integrated with Azure OpenAI Service
 - [ ] User access limit
   - Limit the number of requests to the bot per user per day to save cost
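
The "User access limit" item above is still unchecked in this commit. As a hedged sketch (not code from this repo), a per-user daily quota could be as small as an in-memory counter keyed by user and date; every name below is illustrative:

import datetime
from collections import defaultdict

DAILY_LIMIT = 20  # assumed quota, not from the repo

# Maps (user_id, date) -> number of requests served today.
request_counts = defaultdict(int)

def under_daily_limit(user_id: str) -> bool:
    # Count this request and report whether the user is still under quota.
    key = (user_id, datetime.date.today())
    if request_counts[key] >= DAILY_LIMIT:
        return False
    request_counts[key] += 1
    return True

A real deployment would likely persist these counts, since an in-memory dict resets on restart, but the shape of the check would stay the same.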
1 change: 1 addition & 0 deletions app/data/vip_whitelist.txt
@@ -0,0 +1 @@
+U02SZH43AL8
36 changes: 33 additions & 3 deletions app/gpt.py
@@ -3,7 +3,7 @@
 import logging
 import hashlib
 import openai
-from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader
+from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader, SimpleDirectoryReader
 from llama_index.prompts.prompts import QuestionAnswerPrompt
 from llama_index.readers.schema.base import Document
 from langchain.chat_models import ChatOpenAI
@@ -16,10 +16,14 @@
 llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo"))

 index_cache_web_dir = '/tmp/myGPTReader/cache_web/'
+index_cache_file_dir = '/data/myGPTReader/file/'

 if not os.path.exists(index_cache_web_dir):
     os.makedirs(index_cache_web_dir)

+if not os.path.exists(index_cache_file_dir):
+    os.makedirs(index_cache_file_dir)
+
 def get_unique_md5(urls):
     urls_str = ''.join(sorted(urls))
     hashed_str = hashlib.md5(urls_str.encode('utf-8')).hexdigest()
@@ -66,12 +70,19 @@ def get_index_from_web_cache(name):
     if not os.path.exists(index_cache_web_dir + name):
         return None
     index = GPTSimpleVectorIndex.load_from_disk(index_cache_web_dir + name)
-    logging.info(f"=====> Get index from cache: {index_cache_web_dir + name}")
+    logging.info(f"=====> Get index from web cache: {index_cache_web_dir + name}")
     return index

+def get_index_from_file_cache(name):
+    if not os.path.exists(index_cache_file_dir + name):
+        return None
+    index = GPTSimpleVectorIndex.load_from_disk(index_cache_file_dir + name)
+    logging.info(f"=====> Get index from file cache: {index_cache_file_dir + name}")
+    return index

 def get_answer_from_llama_web(messages, urls):
     dialog_messages = format_dialog_messages(messages)
-    logging.info('=====> Use llama with chatGPT to answer!')
+    logging.info('=====> Use llama web with chatGPT to answer!')
     logging.info(dialog_messages)
     combained_urls = get_urls(urls)
     logging.info(combained_urls)
@@ -85,3 +96,22 @@ def get_answer_from_llama_web(messages, urls):
     logging.info(f"=====> Save index to disk path: {index_cache_web_dir + index_file_name}")
     index.save_to_disk(index_cache_web_dir + index_file_name)
     return index.query(dialog_messages, llm_predictor=llm_predictor, text_qa_template=QUESTION_ANSWER_PROMPT)

+def get_index_name_from_file(file: str):
+    file_md5_with_extension = file.replace(index_cache_file_dir, '')
+    file_md5 = file_md5_with_extension.split('.')[0]
+    return file_md5 + '.json'
+
+def get_answer_from_llama_file(messages, file):
+    dialog_messages = format_dialog_messages(messages)
+    logging.info('=====> Use llama file with chatGPT to answer!')
+    logging.info(dialog_messages)
+    index_name = get_index_name_from_file(file)
+    index = get_index_from_file_cache(index_name)
+    if index is None:
+        logging.info('=====> Build index from file!')
+        documents = SimpleDirectoryReader(input_files=[file]).load_data()
+        index = GPTSimpleVectorIndex(documents)
+        logging.info(f"=====> Save index to disk path: {index_cache_file_dir + index_name}")
+        index.save_to_disk(index_cache_file_dir + index_name)
+    return index.query(dialog_messages, llm_predictor=llm_predictor, text_qa_template=QUESTION_ANSWER_PROMPT)
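
Taken together, the helpers above cache one vector index per unique file: uploads are stored as <md5>.<extension> under index_cache_file_dir, and get_index_name_from_file maps that path back to a <md5>.json index file. A minimal sketch of that naming contract, assuming the module is importable as app.gpt:

from app.gpt import get_index_name_from_file, index_cache_file_dir

# The md5 here is just an example value; any hex digest works the same way.
file_path = index_cache_file_dir + 'd41d8cd98f00b204e9800998ecf8427e.pdf'
assert get_index_name_from_file(file_path) == 'd41d8cd98f00b204e9800998ecf8427e.json'

Because the name is derived from the file's content hash, re-uploading the same document in any thread hits the cached index instead of re-embedding it.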
73 changes: 61 additions & 12 deletions app/server.py
@@ -1,14 +1,16 @@
 import re
+import os
 import requests
 from urllib.parse import urlparse
 from flask import Flask, request
 from flask_apscheduler import APScheduler
 from slack_bolt import App
 from slack_bolt.adapter.flask import SlackRequestHandler
 import concurrent.futures
 from app.daily_hot_news import *
-from app.gpt import get_answer_from_chatGPT, get_answer_from_llama_web
+from app.gpt import get_answer_from_chatGPT, get_answer_from_llama_file, get_answer_from_llama_web, index_cache_file_dir
 from app.slash_command import register_slack_slash_commands
+from app.util import md5

 class Config:
     SCHEDULER_API_ENABLED = True
@@ -75,17 +77,20 @@ def insert_space(text):
 thread_message_history = {}
 MAX_THREAD_MESSAGE_HISTORY = 10

-def update_thread_history(thread_ts, message_str, urls=None):
+def update_thread_history(thread_ts, message_str=None, urls=None, file=None):
     if urls is not None:
         thread_message_history[thread_ts]['context_urls'].update(urls)
-    if thread_ts in thread_message_history:
-        dialog_texts = thread_message_history[thread_ts]['dialog_texts']
-        dialog_texts.append(message_str)
-        if len(dialog_texts) > MAX_THREAD_MESSAGE_HISTORY:
-            dialog_texts = dialog_texts[-MAX_THREAD_MESSAGE_HISTORY:]
-        thread_message_history[thread_ts]['dialog_texts'] = dialog_texts
-    else:
-        thread_message_history[thread_ts]['dialog_texts'] = [message_str]
+    if message_str is not None:
+        if thread_ts in thread_message_history:
+            dialog_texts = thread_message_history[thread_ts]['dialog_texts']
+            dialog_texts.append(message_str)
+            if len(dialog_texts) > MAX_THREAD_MESSAGE_HISTORY:
+                dialog_texts = dialog_texts[-MAX_THREAD_MESSAGE_HISTORY:]
+            thread_message_history[thread_ts]['dialog_texts'] = dialog_texts
+        else:
+            thread_message_history[thread_ts]['dialog_texts'] = [message_str]
+    if file is not None:
+        thread_message_history[thread_ts]['file'] = file

 def extract_urls_from_event(event):
     urls = set()
@@ -97,26 +102,70 @@ def extract_urls_from_event(event):
             urls.add(url)
     return list(urls)

+whitelist_file = "app/data/vip_whitelist.txt"
+
+filetype_extension_allowed = ['epub', 'pdf', 'txt', 'docx', 'md']
+
+def is_authorized(user_id: str) -> bool:
+    with open(whitelist_file, "r") as f:
+        return user_id in f.read().splitlines()

 @slack_app.event("app_mention")
 def handle_mentions(event, say, logger):
     logger.info(event)

     user = event["user"]
     thread_ts = event["ts"]

+    file_md5_name = None
+
+    if event.get('files'):
+        if not is_authorized(event['user']):
+            say(f'<@{user}>, this feature is only available to whitelisted users; please contact the admin to enable it.', thread_ts=thread_ts)
+            return
+        file = event['files'][0]  # only support one file per thread
+        logger.info('=====> Received file:')
+        logger.info(file)
+        filetype = file["filetype"]
+        if filetype not in filetype_extension_allowed:
+            say(f'<@{user}>, this file type is not supported; please upload a file with one of these extensions: [{", ".join(filetype_extension_allowed)}]', thread_ts=thread_ts)
+            return
+        url_private = file["url_private"]
+        temp_file_path = index_cache_file_dir + user
+        if not os.path.exists(temp_file_path):
+            os.makedirs(temp_file_path)
+        temp_file_filename = temp_file_path + '/' + file["name"]
+        with open(temp_file_filename, "wb") as f:
+            response = requests.get(url_private, headers={"Authorization": "Bearer " + slack_app.client.token})
+            f.write(response.content)
+        logger.info(f'=====> Downloaded file to {temp_file_filename}')
+        temp_file_md5 = md5(temp_file_filename)
+        file_md5_name = index_cache_file_dir + temp_file_md5 + '.' + filetype
+        if not os.path.exists(file_md5_name):
+            logger.info(f'=====> Rename file to {file_md5_name}')
+            os.rename(temp_file_filename, file_md5_name)

     parent_thread_ts = event["thread_ts"] if "thread_ts" in event else thread_ts
     if parent_thread_ts not in thread_message_history:
-        thread_message_history[parent_thread_ts] = { 'dialog_texts': [], 'context_urls': set()}
+        thread_message_history[parent_thread_ts] = { 'dialog_texts': [], 'context_urls': set(), 'file': None}

if "text" in event:
update_thread_history(parent_thread_ts, 'User: %s' % insert_space(event["text"].replace('<@U04TCNR9MNF>', '')), extract_urls_from_event(event))

if file_md5_name is not None:
update_thread_history(parent_thread_ts, None, None, file_md5_name)

urls = thread_message_history[parent_thread_ts]['context_urls']
file = thread_message_history[parent_thread_ts]['file']

     logger.info('=====> Current thread conversation messages are:')
     logger.info(thread_message_history[parent_thread_ts])

     # TODO: https://github.com/jerryjliu/llama_index/issues/778
     # if it can get the context_str, then put this prompt into the thread_message_history to provide more context to chatGPT
-    if len(urls) > 0: # if this conversation has urls, use llama with all urls in this thread
+    if file is not None:
+        future = executor.submit(get_answer_from_llama_file, thread_message_history[parent_thread_ts]['dialog_texts'], file)
+    elif len(urls) > 0: # if this conversation has urls, use llama with all urls in this thread
         future = executor.submit(get_answer_from_llama_web, thread_message_history[parent_thread_ts]['dialog_texts'], list(urls))
     else:
         future = executor.submit(get_answer_from_chatGPT, thread_message_history[parent_thread_ts]['dialog_texts'])
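
The answer path now has a simple priority: a file attached in the thread wins over URLs, which win over plain chatGPT. A self-contained, hypothetical replay of the per-thread bookkeeping the handler performs (the real code uses the module-level thread_message_history above):

# Hypothetical walkthrough of the thread-state updates; values are examples.
thread_message_history = {}

ts = '1679400000.000100'  # example Slack thread timestamp
thread_message_history[ts] = {'dialog_texts': [], 'context_urls': set(), 'file': None}

# A text mention appends to dialog_texts (capped at MAX_THREAD_MESSAGE_HISTORY = 10).
thread_message_history[ts]['dialog_texts'].append('User: summarize this file')

# A file upload records the md5-named path, so every later reply in this
# thread is routed to get_answer_from_llama_file instead of plain chat.
thread_message_history[ts]['file'] = '/data/myGPTReader/file/abc123.pdf'

Note that this state is in-memory only, so thread context, including the file binding, is lost when the server restarts.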
8 changes: 8 additions & 0 deletions app/util.py
@@ -0,0 +1,8 @@
+import hashlib
+
+def md5(file_path):
+    hash_md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
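
Reading in 4096-byte chunks keeps memory flat for large uploads while producing the same digest as hashing the whole file at once. A quick sanity check, assuming the module is importable as app.util:

import hashlib
from app.util import md5

path = 'README.md'  # any local file
with open(path, 'rb') as f:
    assert md5(path) == hashlib.md5(f.read()).hexdigest()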