feat: finish file parser
madawei2699 committed Mar 21, 2023
1 parent 466820d commit 8083f14
Showing 7 changed files with 834 additions and 22 deletions.
12 changes: 6 additions & 6 deletions README.md
@@ -56,18 +56,18 @@ For now it is in development, but you can try it out by joining this [channel](http
 - Invest News
   - [x] Xueqiu daily hot topics
   - [x] Jisilu daily hot topics
-- Support file reading and analysis 💥 🚩
+- Support file reading and analysis 💥
   - Considering the expensive billing, it needs a Slack userID whitelist to restrict access to this feature
   - Need to cache the parsed file documents to save extraction cost
-  - [ ] EPUB
-  - [ ] DOCX
-  - [ ] TEXT
-  - [ ] PDF
+  - [x] EPUB
+  - [ ] DOCX
+  - [x] TEXT
+  - [x] PDF
     - Use [Google Vision](https://cloud.google.com/vision/docs/pdf) to handle the PDF reading
   - [ ] Image
     - may use GPT-4
 - [ ] Support voice reading with self-hosting [whisper](https://github.com/aarnphm/whispercpp)
-  - (whisper -> chatGPT -> azure text2speech) to play language speaking practices 💥
+  - (whisper -> chatGPT -> azure text2speech) to play language speaking practices 💥 🚩
 - [ ] Integrated with Azure OpenAI Service
 - [ ] User access limit
   - Limit the number of requests to the bot per user per day to save cost
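
The "User access limit" item above is still unchecked in this commit. As a hedged sketch (not code from this repo), a per-user daily quota could be as small as an in-memory counter keyed by user and date; every name below is illustrative:

import datetime
from collections import defaultdict

DAILY_LIMIT = 20  # assumed quota, not from the repo

# Maps (user_id, date) -> number of requests served today.
request_counts = defaultdict(int)

def under_daily_limit(user_id: str) -> bool:
    # Count this request and report whether the user is still under quota.
    key = (user_id, datetime.date.today())
    if request_counts[key] >= DAILY_LIMIT:
        return False
    request_counts[key] += 1
    return True

A real deployment would likely persist these counts, since an in-memory dict resets on restart, but the shape of the check would stay the same.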
1 change: 1 addition & 0 deletions app/data/vip_whitelist.txt
@@ -0,0 +1 @@
+U02SZH43AL8
36 changes: 33 additions & 3 deletions app/gpt.py
@@ -3,7 +3,7 @@
 import logging
 import hashlib
 import openai
-from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader
+from llama_index import GPTSimpleVectorIndex, LLMPredictor, RssReader, SimpleDirectoryReader
 from llama_index.prompts.prompts import QuestionAnswerPrompt
 from llama_index.readers.schema.base import Document
 from langchain.chat_models import ChatOpenAI
@@ -16,10 +16,14 @@
 llm_predictor = LLMPredictor(llm=ChatOpenAI(temperature=0.2, model_name="gpt-3.5-turbo"))

 index_cache_web_dir = '/tmp/myGPTReader/cache_web/'
+index_cache_file_dir = '/data/myGPTReader/file/'

 if not os.path.exists(index_cache_web_dir):
     os.makedirs(index_cache_web_dir)

+if not os.path.exists(index_cache_file_dir):
+    os.makedirs(index_cache_file_dir)
+
 def get_unique_md5(urls):
     urls_str = ''.join(sorted(urls))
     hashed_str = hashlib.md5(urls_str.encode('utf-8')).hexdigest()
@@ -66,12 +70,19 @@ def get_index_from_web_cache(name):
     if not os.path.exists(index_cache_web_dir + name):
         return None
     index = GPTSimpleVectorIndex.load_from_disk(index_cache_web_dir + name)
-    logging.info(f"=====> Get index from cache: {index_cache_web_dir + name}")
+    logging.info(f"=====> Get index from web cache: {index_cache_web_dir + name}")
     return index

+def get_index_from_file_cache(name):
+    if not os.path.exists(index_cache_file_dir + name):
+        return None
+    index = GPTSimpleVectorIndex.load_from_disk(index_cache_file_dir + name)
+    logging.info(f"=====> Get index from file cache: {index_cache_file_dir + name}")
+    return index

 def get_answer_from_llama_web(messages, urls):
     dialog_messages = format_dialog_messages(messages)
-    logging.info('=====> Use llama with chatGPT to answer!')
+    logging.info('=====> Use llama web with chatGPT to answer!')
     logging.info(dialog_messages)
     combained_urls = get_urls(urls)
     logging.info(combained_urls)
@@ -85,3 +96,22 @@ def get_answer_from_llama_web(messages, urls):
     logging.info(f"=====> Save index to disk path: {index_cache_web_dir + index_file_name}")
     index.save_to_disk(index_cache_web_dir + index_file_name)
     return index.query(dialog_messages, llm_predictor=llm_predictor, text_qa_template=QUESTION_ANSWER_PROMPT)

+def get_index_name_from_file(file: str):
+    file_md5_with_extension = file.replace(index_cache_file_dir, '')
+    file_md5 = file_md5_with_extension.split('.')[0]
+    return file_md5 + '.json'
+
+def get_answer_from_llama_file(messages, file):
+    dialog_messages = format_dialog_messages(messages)
+    logging.info('=====> Use llama file with chatGPT to answer!')
+    logging.info(dialog_messages)
+    index_name = get_index_name_from_file(file)
+    index = get_index_from_file_cache(index_name)
+    if index is None:
+        logging.info('=====> Build index from file!')
+        documents = SimpleDirectoryReader(input_files=[file]).load_data()
+        index = GPTSimpleVectorIndex(documents)
+        logging.info(f"=====> Save index to disk path: {index_cache_file_dir + index_name}")
+        index.save_to_disk(index_cache_file_dir + index_name)
+    return index.query(dialog_messages, llm_predictor=llm_predictor, text_qa_template=QUESTION_ANSWER_PROMPT)
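
Taken together, the helpers above cache one vector index per unique file: uploads are stored as <md5>.<extension> under index_cache_file_dir, and get_index_name_from_file maps that path back to a <md5>.json index file. A minimal sketch of that naming contract, assuming the module is importable as app.gpt:

from app.gpt import get_index_name_from_file, index_cache_file_dir

# The md5 here is just an example value; any hex digest works the same way.
file_path = index_cache_file_dir + 'd41d8cd98f00b204e9800998ecf8427e.pdf'
assert get_index_name_from_file(file_path) == 'd41d8cd98f00b204e9800998ecf8427e.json'

Because the name is derived from the file's content hash, re-uploading the same document in any thread hits the cached index instead of re-embedding it.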
73 changes: 61 additions & 12 deletions app/server.py
@@ -1,14 +1,16 @@
 import re
+import os
 import requests
 from urllib.parse import urlparse
 from flask import Flask, request
 from flask_apscheduler import APScheduler
 from slack_bolt import App
 from slack_bolt.adapter.flask import SlackRequestHandler
 import concurrent.futures
 from app.daily_hot_news import *
-from app.gpt import get_answer_from_chatGPT, get_answer_from_llama_web
+from app.gpt import get_answer_from_chatGPT, get_answer_from_llama_file, get_answer_from_llama_web, index_cache_file_dir
 from app.slash_command import register_slack_slash_commands
+from app.util import md5

 class Config:
     SCHEDULER_API_ENABLED = True
@@ -75,17 +77,20 @@ def insert_space(text):
 thread_message_history = {}
 MAX_THREAD_MESSAGE_HISTORY = 10

-def update_thread_history(thread_ts, message_str, urls=None):
+def update_thread_history(thread_ts, message_str=None, urls=None, file=None):
     if urls is not None:
         thread_message_history[thread_ts]['context_urls'].update(urls)
-    if thread_ts in thread_message_history:
-        dialog_texts = thread_message_history[thread_ts]['dialog_texts']
-        dialog_texts.append(message_str)
-        if len(dialog_texts) > MAX_THREAD_MESSAGE_HISTORY:
-            dialog_texts = dialog_texts[-MAX_THREAD_MESSAGE_HISTORY:]
-        thread_message_history[thread_ts]['dialog_texts'] = dialog_texts
-    else:
-        thread_message_history[thread_ts]['dialog_texts'] = [message_str]
+    if message_str is not None:
+        if thread_ts in thread_message_history:
+            dialog_texts = thread_message_history[thread_ts]['dialog_texts']
+            dialog_texts.append(message_str)
+            if len(dialog_texts) > MAX_THREAD_MESSAGE_HISTORY:
+                dialog_texts = dialog_texts[-MAX_THREAD_MESSAGE_HISTORY:]
+            thread_message_history[thread_ts]['dialog_texts'] = dialog_texts
+        else:
+            thread_message_history[thread_ts]['dialog_texts'] = [message_str]
+    if file is not None:
+        thread_message_history[thread_ts]['file'] = file

 def extract_urls_from_event(event):
     urls = set()
@@ -97,26 +102,70 @@ def extract_urls_from_event(event):
             urls.add(url)
     return list(urls)

+whitelist_file = "app/data/vip_whitelist.txt"
+
+filetype_extension_allowed = ['epub', 'pdf', 'txt', 'docx', 'md']
+
+def is_authorized(user_id: str) -> bool:
+    with open(whitelist_file, "r") as f:
+        return user_id in f.read().splitlines()

 @slack_app.event("app_mention")
 def handle_mentions(event, say, logger):
     logger.info(event)

     user = event["user"]
     thread_ts = event["ts"]

+    file_md5_name = None
+
+    if event.get('files'):
+        if not is_authorized(event['user']):
+            say(f'<@{user}>, this feature is only available to whitelisted users; please contact the admin to enable it.', thread_ts=thread_ts)
+            return
+        file = event['files'][0]  # only support one file per thread
+        logger.info('=====> Received file:')
+        logger.info(file)
+        filetype = file["filetype"]
+        if filetype not in filetype_extension_allowed:
+            say(f'<@{user}>, this file type is not supported; please upload a file with one of these extensions: [{", ".join(filetype_extension_allowed)}]', thread_ts=thread_ts)
+            return
+        url_private = file["url_private"]
+        temp_file_path = index_cache_file_dir + user
+        if not os.path.exists(temp_file_path):
+            os.makedirs(temp_file_path)
+        temp_file_filename = temp_file_path + '/' + file["name"]
+        with open(temp_file_filename, "wb") as f:
+            response = requests.get(url_private, headers={"Authorization": "Bearer " + slack_app.client.token})
+            f.write(response.content)
+        logger.info(f'=====> Downloaded file to {temp_file_filename}')
+        temp_file_md5 = md5(temp_file_filename)
+        file_md5_name = index_cache_file_dir + temp_file_md5 + '.' + filetype
+        if not os.path.exists(file_md5_name):
+            logger.info(f'=====> Rename file to {file_md5_name}')
+            os.rename(temp_file_filename, file_md5_name)

     parent_thread_ts = event["thread_ts"] if "thread_ts" in event else thread_ts
     if parent_thread_ts not in thread_message_history:
-        thread_message_history[parent_thread_ts] = { 'dialog_texts': [], 'context_urls': set()}
+        thread_message_history[parent_thread_ts] = { 'dialog_texts': [], 'context_urls': set(), 'file': None}

if "text" in event:
update_thread_history(parent_thread_ts, 'User: %s' % insert_space(event["text"].replace('<@U04TCNR9MNF>', '')), extract_urls_from_event(event))

if file_md5_name is not None:
update_thread_history(parent_thread_ts, None, None, file_md5_name)

urls = thread_message_history[parent_thread_ts]['context_urls']
file = thread_message_history[parent_thread_ts]['file']

     logger.info('=====> Current thread conversation messages are:')
     logger.info(thread_message_history[parent_thread_ts])

     # TODO: https://github.com/jerryjliu/llama_index/issues/778
     # if it can get the context_str, then put this prompt into the thread_message_history to provide more context to chatGPT
-    if len(urls) > 0: # if this conversation has urls, use llama with all urls in this thread
+    if file is not None:
+        future = executor.submit(get_answer_from_llama_file, thread_message_history[parent_thread_ts]['dialog_texts'], file)
+    elif len(urls) > 0: # if this conversation has urls, use llama with all urls in this thread
         future = executor.submit(get_answer_from_llama_web, thread_message_history[parent_thread_ts]['dialog_texts'], list(urls))
     else:
         future = executor.submit(get_answer_from_chatGPT, thread_message_history[parent_thread_ts]['dialog_texts'])
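
The answer path now has a simple priority: a file attached in the thread wins over URLs, which win over plain chatGPT. A self-contained, hypothetical replay of the per-thread bookkeeping the handler performs (the real code uses the module-level thread_message_history above):

# Hypothetical walkthrough of the thread-state updates; values are examples.
thread_message_history = {}

ts = '1679400000.000100'  # example Slack thread timestamp
thread_message_history[ts] = {'dialog_texts': [], 'context_urls': set(), 'file': None}

# A text mention appends to dialog_texts (capped at MAX_THREAD_MESSAGE_HISTORY = 10).
thread_message_history[ts]['dialog_texts'].append('User: summarize this file')

# A file upload records the md5-named path, so every later reply in this
# thread is routed to get_answer_from_llama_file instead of plain chat.
thread_message_history[ts]['file'] = '/data/myGPTReader/file/abc123.pdf'

Note that this state is in-memory only, so thread context, including the file binding, is lost when the server restarts.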
8 changes: 8 additions & 0 deletions app/util.py
@@ -0,0 +1,8 @@
+import hashlib
+
+def md5(file_path):
+    hash_md5 = hashlib.md5()
+    with open(file_path, "rb") as f:
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
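
Reading in 4096-byte chunks keeps memory flat for large uploads while producing the same digest as hashing the whole file at once. A quick sanity check, assuming the module is importable as app.util:

import hashlib
from app.util import md5

path = 'README.md'  # any local file
with open(path, 'rb') as f:
    assert md5(path) == hashlib.md5(f.read()).hexdigest()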