# DeepSeek-MoE-16b-chat FastAPI Deployment and Invocation

## DeepSeek-MoE-16b-chat Introduction

The currently released version of DeepSeek MoE has 16 billion parameters, of which only about 2.8 billion are actually activated. Compared with DeepSeek's own 7B dense model, the two trade wins and losses across 19 datasets but are close overall. Compared with Llama 2-7B, another dense model, DeepSeek MoE shows a clear advantage on math and code tasks. Yet while both dense models need more than 180 TFLOPs per 4k tokens, DeepSeek MoE needs only 74.4 TFLOPs, about 40% of that.
## Environment Setup

Rent a machine on the AutoDL platform with **two 24 GB GPUs (48 GB in total), e.g. dual RTX 3090**. As shown in the figure below, select the image PyTorch-->2.1.0-->3.10(ubuntu22.04)-->12.1.
Next, open JupyterLab on the server you just rented, and open a terminal in it to set up the environment, download the model, and run the demo.


Change the pip source and install the required packages:

```shell
# Upgrade pip
python -m pip install --upgrade pip
# Switch to the Tsinghua PyPI mirror to speed up installation
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install fastapi==0.104.1
pip install uvicorn==0.24.0.post1
pip install requests==2.25.1
pip install modelscope==1.9.5
pip install transformers==4.35.2
pip install streamlit==1.24.0
pip install sentencepiece==0.1.99
pip install accelerate==0.24.1
pip install transformers_stream_generator==0.0.4
```
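
Before downloading the model, you can optionally confirm that both GPUs and the PyTorch build in the image are visible from the environment. This is only a quick sanity check, not a required step:

```shell
# Both GPUs should be listed, and torch should report 2 CUDA devices
nvidia-smi
python -c "import torch; print(torch.__version__, torch.cuda.device_count())"
```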

## Model Download

Use the `snapshot_download` function from `modelscope` to download the model. The first argument is the model name, and the `cache_dir` argument is the download path.

Create a `download.py` file under the `/root/autodl-tmp` directory, paste the following code into it, and remember to save the file. Then run `python /root/autodl-tmp/download.py` to start the download. The model is about 30 GB, and downloading it takes roughly 10 to 20 minutes.

```python
import torch
from modelscope import snapshot_download, AutoModel, AutoTokenizer
from modelscope import GenerationConfig
model_dir = snapshot_download('deepseek-ai/deepseek-moe-16b-chat', cache_dir='/root/autodl-tmp', revision='master')
```

## Code Preparation

Create an `api.py` file under the `/root/autodl-tmp` directory, paste the following code into it, and remember to save the file. The code below is commented in detail; if anything is unclear, feel free to open an issue.
```python
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import uvicorn
import json
import datetime
import torch

# Device settings
DEVICE = "cuda"  # use CUDA
DEVICE_ID = "0"  # CUDA device ID, empty if not set
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE  # combined CUDA device string

# Free GPU memory
def torch_gc():
    if torch.cuda.is_available():  # check whether CUDA is available
        with torch.cuda.device(CUDA_DEVICE):  # select the CUDA device
            torch.cuda.empty_cache()  # clear the CUDA cache
            torch.cuda.ipc_collect()  # collect CUDA IPC memory

# Create the FastAPI application
app = FastAPI()

# Endpoint handling POST requests
@app.post("/")
async def create_item(request: Request):
    global model, tokenizer  # use the global model and tokenizer inside the function
    json_post_raw = await request.json()  # get the JSON body of the POST request
    json_post = json.dumps(json_post_raw)  # serialize the JSON data to a string
    json_post_list = json.loads(json_post)  # parse the string back into a Python object
    prompt = json_post_list.get('prompt')  # read the prompt from the request
    max_length = json_post_list.get('max_length')  # read the maximum generation length from the request
    # Build the messages
    messages = [
        {"role": "user", "content": prompt}
    ]
    # Build the model input with the chat template
    input_tensor = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
    # Generate the output with the model
    outputs = model.generate(input_tensor.to(model.device), max_new_tokens=max_length)
    result = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
    now = datetime.datetime.now()  # get the current time
    time = now.strftime("%Y-%m-%d %H:%M:%S")  # format the time as a string
    # Build the JSON response
    answer = {
        "response": result,
        "status": 200,
        "time": time
    }
    # Build the log message
    log = "[" + time + "] " + 'prompt:"' + prompt + '", response:"' + repr(result) + '"'
    print(log)  # print the log
    torch_gc()  # free GPU memory
    return answer  # return the response

# Main entry point
if __name__ == '__main__':
    model_name_or_path = '/root/autodl-tmp/deepseek-ai/deepseek-moe-16b-chat'
    # Load the pretrained tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_name_or_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
    model.generation_config = GenerationConfig.from_pretrained(model_name_or_path)
    model.generation_config.pad_token_id = model.generation_config.eos_token_id
    model.eval()  # put the model in evaluation mode
    # Start the FastAPI application
    # Port 6006 can be mapped from the AutoDL instance to the local machine, so the API can be called locally
    uvicorn.run(app, host='0.0.0.0', port=6006, workers=1)  # serve the app on the given host and port
```

## API Deployment

Enter the following commands in the terminal to start the API service:

```shell
cd /root/autodl-tmp
python api.py
```

Once loading completes, output like the following indicates that the service started successfully.


The service listens on port 6006 by default and is called via the POST method. You can invoke it with curl, as shown below:

```shell
curl -X POST "http://127.0.0.1:6006" \
     -H 'Content-Type: application/json' \
     -d '{"prompt": "你好"}'
```

You can also call it from Python with the `requests` library, as shown below:

```python
import requests
import json

def get_completion(prompt):
    headers = {'Content-Type': 'application/json'}
    data = {"prompt": prompt}
    response = requests.post(url='http://127.0.0.1:6006', headers=headers, data=json.dumps(data))
    return response.json()['response']

if __name__ == '__main__':
    print(get_completion('你好'))
```

The returned value looks like this:

```text
{
  'response': '你好!有什么我可以帮助你的吗?',
  'status': 200,
  'time': '2023-12-01 17:06:10'
}
```
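
Because `api.py` also reads an optional `max_length` field from the request body and passes it to `max_new_tokens`, you can cap the length of the generated reply, for example:

```shell
curl -X POST "http://127.0.0.1:6006" \
     -H 'Content-Type: application/json' \
     -d '{"prompt": "请介绍一下你自己", "max_length": 256}'
```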
 |

# DeepSeek-MoE-16b-chat Transformers Deployment and Invocation

## DeepSeek-MoE-16b-chat Introduction

The currently released version of DeepSeek MoE has 16 billion parameters, of which only about 2.8 billion are actually activated. Compared with DeepSeek's own 7B dense model, the two trade wins and losses across 19 datasets but are close overall. Compared with Llama 2-7B, another dense model, DeepSeek MoE shows a clear advantage on math and code tasks. Yet while both dense models need more than 180 TFLOPs per 4k tokens, DeepSeek MoE needs only 74.4 TFLOPs, about 40% of that.

## Environment Setup

Rent a machine on the AutoDL platform with **two 24 GB GPUs (48 GB in total), e.g. dual RTX 3090**. As shown in the figure below, select the image PyTorch-->2.1.0-->3.10(ubuntu22.04)-->12.1.



Next, open `JupyterLab` on the server you just rented, and open a terminal in it to set up the environment, download the model, and run the `demo`.

Change the pip source and install the required packages:

```shell
# Upgrade pip
python -m pip install --upgrade pip
# Switch to the Tsinghua PyPI mirror to speed up installation
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple

pip install modelscope transformers sentencepiece
pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.4.2/flash_attn-2.4.2+cu118torch2.1cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
```
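
The flash-attention wheel above is prebuilt for Python 3.10 and torch 2.1. An optional quick check that it installed and imports correctly:

```shell
python -c "import flash_attn; print(flash_attn.__version__)"
```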

## Model Download

Use the `snapshot_download` function from `modelscope` to download the model. The first argument is the model name, and the `cache_dir` argument is the download path.

Create a `download.py` file under the `/root/autodl-tmp` directory, paste the following code into it, and remember to save the file. Then run `python /root/autodl-tmp/download.py` to start the download. The model is about 30 GB, and downloading it takes roughly 10 to 20 minutes.

```python
import torch
from modelscope import snapshot_download, AutoModel, AutoTokenizer
import os
model_dir = snapshot_download('deepseek-ai/deepseek-moe-16b-chat', cache_dir='/root/autodl-tmp', revision='master')
```

## Code Preparation

Create a `trans.py` file under the `/root/autodl-tmp` directory and paste the following code into it.

```python
# Use AutoTokenizer and AutoModelForCausalLM from Hugging Face's transformers library to load the tokenizer and the chat model
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
import torch

# Local path the model was downloaded to
model_dir = '/root/autodl-tmp/deepseek-ai/deepseek-moe-16b-chat'

# Load the tokenizer from the local path; trust_remote_code=True allows running the custom code shipped with the model
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# Load the model from the local path with AutoModelForCausalLM, in bfloat16 and spread across the available GPUs
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto")
model.generation_config = GenerationConfig.from_pretrained(model_dir)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Put the model in evaluation mode before generating
model.eval()

# Small multi-turn chat helper: append the user turn, build the input with the chat template, generate, and keep the history
def chat(history, prompt):
    history.append({"role": "user", "content": prompt})
    input_tensor = tokenizer.apply_chat_template(history, add_generation_prompt=True, return_tensors="pt")
    outputs = model.generate(input_tensor.to(model.device), max_new_tokens=512)
    response = tokenizer.decode(outputs[0][input_tensor.shape[1]:], skip_special_tokens=True)
    history.append({"role": "assistant", "content": response})
    return response

history = []
# First turn
print(chat(history, "你好"))
# Second turn
print(chat(history, "请介绍一下你自己"))
# Third turn
print(chat(history, "请帮我使用python语言写一段冒泡排序的代码"))
```
### Deployment

Enter the following commands in the terminal to run trans.py, which deploys and invokes DeepSeek-MoE-16b-chat through Transformers:

```shell
cd /root/autodl-tmp
python trans.py
```

Seeing `loading checkpoint` in the terminal means the model is loading; wait for loading to finish and the conversation to be generated, as shown below.
