Commit
fix bugs (modelscope#1242)
Jintao-Huang authored Jun 27, 2024
1 parent d8682d0 commit 8728264
Showing 9 changed files with 59 additions and 66 deletions.
8 changes: 4 additions & 4 deletions docs/source/LLM/支持的模型和数据集.md
@@ -162,7 +162,7 @@
|yi-1_5-6b-chat|[01ai/Yi-1.5-6B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-6B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)|
|yi-1_5-9b|[01ai/Yi-1.5-9B](https://modelscope.cn/models/01ai/Yi-1.5-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-9B](https://huggingface.co/01-ai/Yi-1.5-9B)|
|yi-1_5-9b-chat|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)|
-|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)|
+|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)|
|yi-1_5-34b|[01ai/Yi-1.5-34B](https://modelscope.cn/models/01ai/Yi-1.5-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-34B](https://huggingface.co/01-ai/Yi-1.5-34B)|
|yi-1_5-34b-chat|[01ai/Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)|
|yi-1_5-34b-chat-16k|[01ai/Yi-1.5-34B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-34B-Chat-16K)|
@@ -311,7 +311,7 @@
| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID |
| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- |
|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
-|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
+|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen-vl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
@@ -330,8 +330,8 @@
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
|mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)|
|mini-internvl-chat-4b-v1_5|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5/summary)|qkv_proj|internvl-phi3|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)|
-|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
-|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
+|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
+|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
|paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)|
|paligemma-3b-pt-448|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)|
|paligemma-3b-pt-896|[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)|
8 changes: 4 additions & 4 deletions docs/source_en/LLM/Supported-models-datasets.md
@@ -162,7 +162,7 @@ The table below introduces all models supported by SWIFT:
|yi-1_5-6b-chat|[01ai/Yi-1.5-6B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-6B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-6B-Chat](https://huggingface.co/01-ai/Yi-1.5-6B-Chat)|
|yi-1_5-9b|[01ai/Yi-1.5-9B](https://modelscope.cn/models/01ai/Yi-1.5-9B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-9B](https://huggingface.co/01-ai/Yi-1.5-9B)|
|yi-1_5-9b-chat|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat](https://huggingface.co/01-ai/Yi-1.5-9B-Chat)|
-|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)|
+|yi-1_5-9b-chat-16k|[01ai/Yi-1.5-9B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-9B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-9B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-9B-Chat-16K)|
|yi-1_5-34b|[01ai/Yi-1.5-34B](https://modelscope.cn/models/01ai/Yi-1.5-34B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[01-ai/Yi-1.5-34B](https://huggingface.co/01-ai/Yi-1.5-34B)|
|yi-1_5-34b-chat|[01ai/Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat](https://huggingface.co/01-ai/Yi-1.5-34B-Chat)|
|yi-1_5-34b-chat-16k|[01ai/Yi-1.5-34B-Chat-16K](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat-16K/summary)|q_proj, k_proj, v_proj|yi1_5|✔|✔||-|[01-ai/Yi-1.5-34B-Chat-16K](https://huggingface.co/01-ai/Yi-1.5-34B-Chat-16K)|
@@ -311,7 +311,7 @@ The table below introduces all models supported by SWIFT:
| Model Type | Model ID | Default Lora Target Modules | Default Template | Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID |
| --------- | -------- | --------------------------- | ---------------- | ------------------ | ------------ | -------- | ---- | ----------- |
|qwen-vl|[qwen/Qwen-VL](https://modelscope.cn/models/qwen/Qwen-VL/summary)|c_attn|default-generation|✔|✘||vision|[Qwen/Qwen-VL](https://huggingface.co/Qwen/Qwen-VL)|
-|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwenvl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
+|qwen-vl-chat|[qwen/Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary)|c_attn|qwen-vl|✔|✘||vision|[Qwen/Qwen-VL-Chat](https://huggingface.co/Qwen/Qwen-VL-Chat)|
|qwen-vl-chat-int4|[qwen/Qwen-VL-Chat-Int4](https://modelscope.cn/models/qwen/Qwen-VL-Chat-Int4/summary)|c_attn|qwen|✔|✘|auto_gptq>=0.5|vision|[Qwen/Qwen-VL-Chat-Int4](https://huggingface.co/Qwen/Qwen-VL-Chat-Int4)|
|qwen-audio|[qwen/Qwen-Audio](https://modelscope.cn/models/qwen/Qwen-Audio/summary)|c_attn|qwen-audio-generation|✔|✘||audio|[Qwen/Qwen-Audio](https://huggingface.co/Qwen/Qwen-Audio)|
|qwen-audio-chat|[qwen/Qwen-Audio-Chat](https://modelscope.cn/models/qwen/Qwen-Audio-Chat/summary)|c_attn|qwen-audio|✔|✘||audio|[Qwen/Qwen-Audio-Chat](https://huggingface.co/Qwen/Qwen-Audio-Chat)|
@@ -330,8 +330,8 @@ The table below introduces all models supported by SWIFT:
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
|mini-internvl-chat-2b-v1_5|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-2B-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-2B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-2B-V1-5)|
|mini-internvl-chat-4b-v1_5|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://modelscope.cn/models/OpenGVLab/Mini-InternVL-Chat-4B-V1-5/summary)|qkv_proj|internvl-phi3|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/Mini-InternVL-Chat-4B-V1-5](https://huggingface.co/OpenGVLab/Mini-InternVL-Chat-4B-V1-5)|
-|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
-|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
+|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
+|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘||vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
|paligemma-3b-pt-224|[AI-ModelScope/paligemma-3b-pt-224](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-224/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-224](https://huggingface.co/google/paligemma-3b-pt-224)|
|paligemma-3b-pt-448|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)|
|paligemma-3b-pt-896|[AI-ModelScope/paligemma-3b-pt-896](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-896/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-896](https://huggingface.co/google/paligemma-3b-pt-896)|
1 change: 1 addition & 0 deletions requirements/llm.txt
@@ -1,3 +1,4 @@
+attrdict
 charset_normalizer
 cpm_kernels
 fastapi
8 changes: 5 additions & 3 deletions swift/llm/eval.py
@@ -6,7 +6,11 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 import json
+from llmuses.config import TaskConfig
+from llmuses.constants import DEFAULT_ROOT_CACHE_DIR
 from llmuses.models.custom import CustomModel
+from llmuses.run import run_task
+from llmuses.summarizer import Summarizer
 from modelscope import GenerationConfig
 from tqdm import tqdm
 
@@ -130,9 +134,6 @@ def predict(self, prompts: List[str], **kwargs) -> List[Dict[str, Any]]:
 
 
 def llm_eval(args: EvalArguments) -> List[Dict[str, Any]]:
-    from llmuses.run import run_task
-    from llmuses.config import TaskConfig
-    from llmuses.summarizer import Summarizer
     logger.info(f'args: {args}')
     seed_everything(args.seed)
     model_name = args.model_type
@@ -150,6 +151,7 @@ def llm_eval(args: EvalArguments) -> List[Dict[str, Any]]:
 
     task_configs = TaskConfig.load(custom_model=eval_model, tasks=args.eval_dataset + custom_names)
    for task_config in task_configs:
+        task_config.dataset_dir = DEFAULT_ROOT_CACHE_DIR
         task_config.use_cache = args.eval_use_cache
         if args.eval_limit is not None:
             task_config.limit = args.eval_limit
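Reviewer note: hoisting the llmuses imports to module level makes a missing llmuses install fail at import time rather than midway through `llm_eval`, and the new `dataset_dir` assignment pins every task's benchmark cache to llmuses' default root. A minimal sketch of the resulting flow — `eval_model` stands in for the `CustomModel` subclass built in this file, the task name is illustrative, and the `run_task`/`Summarizer` call signatures are assumptions based on how this file uses llmuses:

```python
from llmuses.config import TaskConfig
from llmuses.constants import DEFAULT_ROOT_CACHE_DIR
from llmuses.run import run_task
from llmuses.summarizer import Summarizer

task_configs = TaskConfig.load(custom_model=eval_model, tasks=['arc'])
for task_config in task_configs:
    # Pin the benchmark-data cache so every task reads from the same root.
    task_config.dataset_dir = DEFAULT_ROOT_CACHE_DIR
    run_task(task_cfg=task_config)
    report = Summarizer.get_report_from_cfg(task_cfg=task_config)
```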
2 changes: 1 addition & 1 deletion swift/llm/infer.py
@@ -369,7 +369,7 @@ def llm_infer(args: InferArguments) -> Dict[str, List[Dict[str, Any]]]:
         if system is None and template.use_default_system:
             system = template.default_system
         if args.infer_backend == 'vllm':
-            request_list = [{'query': query, 'history': history, 'system': system}]
+            request_list = [{'query': query, 'history': history, 'system': system, **infer_kwargs}]
             if args.stream:
                 gen = inference_stream_vllm(llm_engine, template, request_list, lora_request=lora_request)
                 print_idx = 0
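Reviewer note: the one-line infer.py change fixes per-request options being dropped on the vLLM path — anything callers accumulated in `infer_kwargs` now reaches the backend, matching the PyTorch path. A self-contained illustration of the dict merge (the `stop` key is hypothetical):

```python
query, history, system = 'Hello', [], None
infer_kwargs = {'stop': ['Observation:']}  # hypothetical per-request extra

old_request = {'query': query, 'history': history, 'system': system}
new_request = {'query': query, 'history': history, 'system': system, **infer_kwargs}

assert 'stop' not in old_request                 # the extra used to be lost
assert new_request['stop'] == ['Observation:']   # now it is forwarded
```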
14 changes: 8 additions & 6 deletions swift/llm/sft.py
@@ -190,10 +190,6 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     logger.info(f'train_dataset: {train_dataset}')
     logger.info(f'val_dataset: {val_dataset}')
     template_kwargs = {}
-    template_info = TEMPLATE_MAPPING[args.template_type]
-    use_model = template_info.get('use_model', False)
-    if use_model:
-        template_kwargs['model'] = model
     template_kwargs['use_loss_scale'] = args.use_loss_scale
     if args.loss_scale_config_path is not None:
         cwd = os.getcwd()
@@ -204,8 +200,14 @@ def llm_sft(args: SftArguments) -> Dict[str, Union[str, Any]]:
     template_kwargs['tools_prompt'] = args.tools_prompt
     if args.sequence_parallel_size and args.sequence_parallel_size > 1:
         template_kwargs['sequence_parallel_size'] = args.sequence_parallel_size
-    template: Template = get_template(args.template_type, tokenizer, args.system, args.max_length,
-                                      args.truncation_strategy, **template_kwargs)
+    template: Template = get_template(
+        args.template_type,
+        tokenizer,
+        args.system,
+        args.max_length,
+        args.truncation_strategy,
+        model=model,
+        **template_kwargs)
     args.system = template.default_system
     logger.info(f'system: {args.system}')
     logger.info(f'args.lazy_tokenize: {args.lazy_tokenize}')
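Reviewer note: the sft.py caller no longer inspects `TEMPLATE_MAPPING` for a `use_model` flag; the model is handed to `get_template` unconditionally and the template decides whether it needs one. A runnable sketch of the control-flow change, using stand-ins (not the real swift internals) with the same names:

```python
# Stand-ins for illustration only; the real objects live in swift.
TEMPLATE_MAPPING = {'deepseek-vl': {'use_model': True}, 'qwen': {}}

def get_template(template_type, tokenizer, model=None, **kwargs):
    info = TEMPLATE_MAPPING[template_type]
    # New behavior: the template itself decides whether to keep the model.
    used = model if info.get('use_model', False) else None
    return {'type': template_type, 'model': used, **kwargs}

# Old caller: if TEMPLATE_MAPPING[t].get('use_model'): kwargs['model'] = model
# New caller: always hand the model over.
print(get_template('qwen', tokenizer=None, model=object()))
```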
2 changes: 1 addition & 1 deletion swift/llm/utils/client_utils.py
@@ -33,7 +33,7 @@ def _parse_stream_data(data: bytes) -> Optional[str]:
     data = data.strip()
     if len(data) == 0:
         return
-    assert data.startswith('data:')
+    assert data.startswith('data:'), f'data: {data}'
     return data[5:].strip()
 
 
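Reviewer note: the client_utils.py change only enriches the assertion message — when a server returns a non-SSE body, the failure now shows the offending payload instead of a bare `AssertionError`. A minimal sketch of the same `data:` framing (string input assumed; the real helper takes bytes and also handles blank keep-alive lines):

```python
from typing import Optional

def parse_sse_line(line: str) -> Optional[str]:
    line = line.strip()
    if not line:
        return None  # blank keep-alive frame
    # Mirror the commit: echo the payload so failures are debuggable.
    assert line.startswith('data:'), f'data: {line}'
    return line[5:].strip()

print(parse_sse_line('data: {"text": "hi"}'))  # -> {"text": "hi"}
```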
9 changes: 5 additions & 4 deletions swift/llm/utils/model.py
@@ -914,6 +914,9 @@ def get_model_tokenizer_from_repo(model_dir: str,
         with context:
             model = automodel_class.from_pretrained(
                 model_dir, config=model_config, torch_dtype=torch_dtype, trust_remote_code=True, **model_kwargs)
+    if is_training:
+        model.train()
+        model.requires_grad_(True)
     model.is_gptq = is_gptq
     model.is_awq = is_awq
     model.is_aqlm = is_aqlm
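Reviewer note: the new `is_training` branch appears to guard against loading paths that return a model in eval mode with gradients disabled (quantized checkpoints in particular), which would otherwise make fine-tuning silently train nothing — that rationale is an inference, not stated in the commit. A small torch illustration of the effect:

```python
import torch.nn as nn

# Simulate a loader that returns a frozen, eval-mode model.
model = nn.Linear(4, 4).requires_grad_(False).eval()

is_training = True
if is_training:
    model.train()               # restore training-mode behavior (dropout etc.)
    model.requires_grad_(True)  # re-enable gradients for fine-tuning

assert model.training and all(p.requires_grad for p in model.parameters())
```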
@@ -2153,7 +2156,7 @@ def _output_device_map_hook(module, input, output):
     hf_model_id='01-ai/Yi-1.5-9B-Chat')
 @register_model(
     ModelType.yi_1_5_9b_chat_16k,
-    '01ai/Yi-1.5-9B-Chat',
+    '01ai/Yi-1.5-9B-Chat-16K',
     LoRATM.llama,
     TemplateType.yi1_5,
     support_flash_attn=True,
@@ -3537,7 +3540,6 @@ def _new_forward(*args, **kwargs) -> Tensor:
     TemplateType.deepseek_vl,
     support_flash_attn=True,
     tags=['multi-modal', 'vision'],
-    requires=['attrdict'],
     hf_model_id='deepseek-ai/deepseek-vl-7b-chat')
 @register_model(
     ModelType.deepseek_vl_1_3b_chat,
@@ -3546,7 +3548,6 @@ def _new_forward(*args, **kwargs) -> Tensor:
     TemplateType.deepseek_vl,
     support_flash_attn=True,
     tags=['multi-modal', 'vision'],
-    requires=['attrdict'],
     hf_model_id='deepseek-ai/deepseek-vl-1.3b-chat')
 def get_model_tokenizer_deepseek_vl(model_dir: str,
                                     torch_dtype: Dtype,
@@ -4003,7 +4004,7 @@ def _qwen_vl_audio_decode(self, *args, skip_special_tokens=False, **kwargs) -> str:
     ModelType.qwen_vl_chat,
     'qwen/Qwen-VL-Chat',
     LoRATM.qwen,
-    TemplateType.qwenvl,
+    TemplateType.qwen_vl,
     support_flash_attn=True,
     tags=['multi-modal', 'vision'],
     hf_model_id='Qwen/Qwen-VL-Chat')