fixed microsoft#50; reduced local deployments

MatijaB · Apr 6, 2023 · bc66e5a · bc66e5a
1 parent 766e44b
commit bc66e5a
Show file tree

Hide file tree

Showing 7 changed files with 91 additions and 94 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@
 # for server
 server/models/*
 !server/models/download.sh
+!server/models/download.ps1
 server/logs/
 server/models_dev
 server/public/*

diff --git a/README.md b/README.md
@@ -30,12 +30,14 @@ We introduce a collaborative system that consists of **an LLM as the controller*
 ### Default
 
 + Ubuntu 16.04 LTS
-+ NVIDIA GeForce RTX 3090 * 1
++ VRAM >= 12GB
 + RAM > 12GB (minimal), 16GB (standard), 42GB (full)
++ Disk > 78G (with 42G for `damo-vilab/text-to-video-ms-1.7b`)
 
 ### Minimum
 
 + Ubuntu 16.04 LTS
++ Nothing else
 
 The configuration `lite.yaml` does not require any expert models to be downloaded and deployed locally. However, it means that Jarvis is restricted to models running stably on HuggingFace Inference Endpoints.
 
@@ -49,8 +51,6 @@ First replace `openai.key` and `huggingface.token` in `server/config.yaml` with
 
 ### For Server:
 
-
-
 ```bash
 # setup env
 cd server
@@ -104,6 +104,9 @@ cd web
 npm install
 npm run dev
 ```
+
+Here's a tip: you can switch to ChatGPT by `double-clicking` the settings icon!
+
 Note that in order to display the video properly in HTML, you need to compile `ffmpeg` manually with H.264 support:
 
 ```bash

diff --git a/server/awesome_chat.py b/server/awesome_chat.py
@@ -79,9 +79,12 @@
  raise ValueError("Incrorrect OpenAI key. Please check your config.yaml file.")
  OPENAI_KEY = config["openai"]["key"]
  endpoint = f"https://api.openai.com/v1/{api_name}"
- HEADER = {
- "Authorization": f"Bearer {OPENAI_KEY}"
- }
+ if OPENAI_KEY.startswith("sk-"):
+ HEADER = {
+ "Authorization": f"Bearer {OPENAI_KEY}"
+ }
+ else:
+ HEADER = None
 else:
  endpoint = f"{config['local']['endpoint']}/v1/{api_name}"
  HEADER = None
@@ -163,10 +166,11 @@ def send_request(data):
  openaikey = data.pop("openaikey")
  if use_completion:
  data = convert_chat_to_completion(data)
- if "openaikey" in data:
+ if openaikey and openaikey.startswith("sk-"):
  HEADER = {
- "Authorization": f"Bearer {data['openaikey']}"
+ "Authorization": f"Bearer {openaikey}"
  }
+
  response = requests.post(endpoint, json=data, headers=HEADER, proxies=PROXY)
  logger.debug(response.text.strip())
  if use_completion:
@@ -772,7 +776,7 @@ def run_task(input, command, results, openaikey = None):
  choose = {"id": best_model_id, "reason": reason}
  messages = [{
  "role": "user",
- "content": f"[ {input} ] contains a task in JSON format {command}, 'task' indicates the task type and 'args' indicates the arguments required for the task. Don't explain the task to me, just help me do it and give me the result. The result can must be in text form."
+ "content": f"[ {input} ] contains a task in JSON format {command}, 'task' indicates the task type and 'args' indicates the arguments required for the task. Don't explain the task to me, just help me do it and give me the result. The result must be in text form without any urls."
  }]
  response = chitchat(messages, openaikey)
  results[id] = collect_result(command, choose, {"response": response})

diff --git a/server/models/download.ps1 b/server/models/download.ps1
@@ -9,16 +9,11 @@ $models = @(
  "lllyasviel/sd-controlnet-scribble",
  "lllyasviel/sd-controlnet-seg",
  "runwayml/stable-diffusion-v1-5",
- "Salesforce/blip-image-captioning-large",
  "damo-vilab/text-to-video-ms-1.7b",
  "microsoft/speecht5_asr",
- "facebook/maskformer-swin-large-ade",
- "microsoft/biogpt",
- "facebook/esm2_t12_35M_UR50D",
  "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k",
  "espnet/kan-bayashi_ljspeech_vits",
  "facebook/detr-resnet-101",
- "microsoft/speecht5_tts",
  "microsoft/speecht5_hifigan",
  "microsoft/speecht5_vc",
  "openai/whisper-base",

diff --git a/server/models/download.sh b/server/models/download.sh
@@ -1,63 +1,57 @@
-models="
-nlpconnect/vit-gpt2-image-captioning
-lllyasviel/ControlNet
-lllyasviel/sd-controlnet-canny
-lllyasviel/sd-controlnet-depth
-lllyasviel/sd-controlnet-hed
-lllyasviel/sd-controlnet-mlsd
-lllyasviel/sd-controlnet-openpose
-lllyasviel/sd-controlnet-scribble
-lllyasviel/sd-controlnet-seg
-runwayml/stable-diffusion-v1-5
-Salesforce/blip-image-captioning-large
-damo-vilab/text-to-video-ms-1.7b
-microsoft/speecht5_asr
-facebook/maskformer-swin-large-ade
-microsoft/biogpt
-facebook/esm2_t12_35M_UR50D
-JorisCos/DCCRNet_Libri1Mix_enhsingle_16k
-espnet/kan-bayashi_ljspeech_vits
-facebook/detr-resnet-101
-microsoft/speecht5_tts
-microsoft/speecht5_hifigan
-microsoft/speecht5_vc
-openai/whisper-base
-Intel/dpt-large
-facebook/detr-resnet-50-panoptic
-facebook/detr-resnet-50
-google/owlvit-base-patch32
-impira/layoutlm-document-qa
-ydshieh/vit-gpt2-coco-en
-dandelin/vilt-b32-finetuned-vqa
-lambdalabs/sd-image-variations-diffusers
-facebook/maskformer-swin-base-coco
-Intel/dpt-hybrid-midas
-"
+#!/bin/bash
 
-# CURRENT_DIR=$(cd `dirname $0`; pwd)
-CURRENT_DIR=$(pwd)
-for model in $models;
-do
- echo "----- Downloading from https://huggingface.co/"$model" -----"
- if [ -d "$model" ]; then
- # cd $model && git reset --hard && git pull && git lfs pull
- cd $model && git pull && git lfs pull
- cd $CURRENT_DIR
- else
- # git clone 包含了lfs
- git clone https://huggingface.co/$model $model
- fi
-done
+# Set models and datasets to download
+models=(
+ "nlpconnect/vit-gpt2-image-captioning"
+ "lllyasviel/ControlNet"
+ "lllyasviel/sd-controlnet-canny"
+ "lllyasviel/sd-controlnet-depth"
+ "lllyasviel/sd-controlnet-hed"
+ "lllyasviel/sd-controlnet-mlsd"
+ "lllyasviel/sd-controlnet-openpose"
+ "lllyasviel/sd-controlnet-scribble"
+ "lllyasviel/sd-controlnet-seg"
+ "runwayml/stable-diffusion-v1-5"
+ "damo-vilab/text-to-video-ms-1.7b"
+ "microsoft/speecht5_asr"
+ "JorisCos/DCCRNet_Libri1Mix_enhsingle_16k"
+ "espnet/kan-bayashi_ljspeech_vits"
+ "facebook/detr-resnet-101"
+ "microsoft/speecht5_hifigan"
+ "microsoft/speecht5_vc"
+ "openai/whisper-base"
+ "Intel/dpt-large"
+ "facebook/detr-resnet-50-panoptic"
+ "facebook/detr-resnet-50"
+ "google/owlvit-base-patch32"
+ "impira/layoutlm-document-qa"
+ "ydshieh/vit-gpt2-coco-en"
+ "dandelin/vilt-b32-finetuned-vqa"
+ "lambdalabs/sd-image-variations-diffusers"
+ "facebook/maskformer-swin-base-coco"
+ "Intel/dpt-hybrid-midas"
+)
+datasets=("Matthijs/cmu-arctic-xvectors")
 
-datasets="Matthijs/cmu-arctic-xvectors"
+# Set the current directory
+CURRENT_DIR=$(pwd)
 
-for dataset in $datasets;
- do
- echo "----- Downloading from https://huggingface.co/datasets/"$dataset" -----"
- if [ -d "$dataset" ]; then
- cd $dataset && git pull && git lfs pull
- cd $CURRENT_DIR
- else
- git clone https://huggingface.co/datasets/$dataset $dataset
- fi
+# Download models
+for model in "${models[@]}"; do
+ echo "----- Downloading from https://huggingface.co/${model} -----"
+ if [ -d "${model}" ]; then
+ (cd "${model}" && git pull && git lfs pull)
+ else
+ git clone --recurse-submodules "https://huggingface.co/${model}" "${model}"
+ fi
 done
+
+# Download datasets
+for dataset in "${datasets[@]}"; do
+ echo "----- Downloading from https://huggingface.co/datasets/${dataset} -----"
+ if [ -d "${dataset}" ]; then
+ (cd "${dataset}" && git pull && git lfs pull)
+ else
+ git clone --recurse-submodules "https://huggingface.co/datasets/${dataset}" "${dataset}"
+ fi
+done
diff --git a/server/models_server.py b/server/models_server.py
@@ -73,8 +73,8 @@
 start = time.time()
 
 local_fold = "models"
-if args.config.endswith(".dev"):
- local_fold = "models_dev"
+# if args.config.endswith(".dev"):
+#  local_fold = "models_dev"
 
 
 def load_pipes(local_deployment):
@@ -89,20 +89,20 @@ def load_pipes(local_deployment):
  "tokenizer": AutoTokenizer.from_pretrained(f"{local_fold}/nlpconnect/vit-gpt2-image-captioning"),
  "device": "cuda:0"
  },
- "Salesforce/blip-image-captioning-large": {
- "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
- "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
- "device": "cuda:0"
- },
+ # "Salesforce/blip-image-captioning-large": {
+ #  "model": BlipForConditionalGeneration.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
+ #  "processor": BlipProcessor.from_pretrained(f"{local_fold}/Salesforce/blip-image-captioning-large"),
+ #  "device": "cuda:0"
+ # },
  "damo-vilab/text-to-video-ms-1.7b": {
  "model": DiffusionPipeline.from_pretrained(f"{local_fold}/damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16"),
  "device": "cuda:0"
  },
- "facebook/maskformer-swin-large-ade": {
- "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
- "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
- "device": "cuda:0"
- },
+ # "facebook/maskformer-swin-large-ade": {
+ #  "model": MaskFormerForInstanceSegmentation.from_pretrained(f"{local_fold}/facebook/maskformer-swin-large-ade"),
+ #  "feature_extractor" : AutoFeatureExtractor.from_pretrained("facebook/maskformer-swin-large-ade"),
+ #  "device": "cuda:0"
+ # },
  # "microsoft/trocr-base-printed": {
  # "processor": TrOCRProcessor.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
  # "model": VisionEncoderDecoderModel.from_pretrained(f"{local_fold}/microsoft/trocr-base-printed"),
@@ -137,13 +137,13 @@ def load_pipes(local_deployment):
  "model": DiffusionPipeline.from_pretrained(f"{local_fold}/runwayml/stable-diffusion-v1-5"),
  "device": "cuda:0"
  },
- "microsoft/speecht5_tts":{
- "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
- "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
- "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
- "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
- "device": "cuda:0"
- },
+ # "microsoft/speecht5_tts":{
+ #  "processor": SpeechT5Processor.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
+ #  "model": SpeechT5ForTextToSpeech.from_pretrained(f"{local_fold}/microsoft/speecht5_tts"),
+ #  "vocoder": SpeechT5HifiGan.from_pretrained(f"{local_fold}/microsoft/speecht5_hifigan"),
+ #  "embeddings_dataset": load_dataset(f"{local_fold}/Matthijs/cmu-arctic-xvectors", split="validation"),
+ #  "device": "cuda:0"
+ # },
  # "speechbrain/mtl-mimic-voicebank": {
  # "model": WaveformEnhancement.from_hparams(source="speechbrain/mtl-mimic-voicebank", savedir="models/mtl-mimic-voicebank"),
  # "device": "cuda:0"
@@ -458,7 +458,7 @@ def models(model_id):
  generated_text = pipes[model_id]["tokenizer"].batch_decode(generated_ids, skip_special_tokens=True)[0]
  result = {"generated text": generated_text}
  # image to text: OCR
- if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten" :
+ if model_id == "microsoft/trocr-base-printed" or model_id == "microsoft/trocr-base-handwritten":
  image = load_image(request.get_json()["img_url"]).convert("RGB")
  pixel_values = pipes[model_id]["processor"](image, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(pipes[model_id]["device"])

diff --git a/server/run_gradio_demo.py b/server/run_gradio_demo.py
@@ -78,7 +78,7 @@ def bot(messages):
  # response = requests.post("http://localhost:8004/hugginggpt", json={"messages": all_messages, "openaikey": OPENAI_KEY})
  # message = response.json()["message"]
  # print(message)
- message = chat_huggingface(all_messages, OPENAI_KEY)
+ message = chat_huggingface(all_messages, OPENAI_KEY)["message"]
  image_urls, audio_urls, video_urls = extract_medias(message)
  add_message(message, "assistant")
  messages[-1][1] = message