
Commit

android device support and test.
wangzhaode committed Mar 28, 2023
1 parent 8dc71bb commit 45edaf1
Showing 4 changed files with 106 additions and 35 deletions.
38 changes: 26 additions & 12 deletions CMakeLists.txt
@@ -3,7 +3,7 @@ project(chatglm-mnn)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")

option(MINI_MEM_MODE "Using mini memory mode for small memory device." OFF)
option(BUILD_FOR_ANDROID "Build for android with mini memory mode." OFF)

# include dir
include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
@@ -17,16 +17,30 @@ FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
# compile dynamic lib
add_library(chat SHARED ${SRCS})
target_link_libraries(chat MNN MNN_Express)

if (MINI_MEM_MODE)
target_compile_options(chat PRIVATE -DMINI_MEM_MODE)
endif()
# target_link_libraries(chat MNN MNN_Express MNN_CL) # if using OPENCL

# demo targets
add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
target_link_libraries(cli_demo chat)

add_executable(web_demo ${CMAKE_CURRENT_LIST_DIR}/demo/web_demo.cpp)
# target_link_libraries(web_demo chat pthread ssl crypto)
target_link_libraries(web_demo chat pthread)
if (BUILD_FOR_ANDROID)
target_compile_options(chat PRIVATE -DMINI_MEM_MODE)
add_library(MNN SHARED IMPORTED)
add_library(MNN_Express SHARED IMPORTED)
set_target_properties(
MNN
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/libMNN.so
)
set_target_properties(
MNN_Express
PROPERTIES IMPORTED_LOCATION
${CMAKE_CURRENT_LIST_DIR}/libs/libMNN_Express.so
)
# just cli demo
add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
target_link_libraries(cli_demo chat log)
else()
# cli demo
add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
target_link_libraries(cli_demo chat)
# web demo
add_executable(web_demo ${CMAKE_CURRENT_LIST_DIR}/demo/web_demo.cpp)
target_link_libraries(web_demo chat pthread)
endif()
22 changes: 18 additions & 4 deletions README.md
@@ -7,28 +7,32 @@
2. The `Embedding` operation is called only a few times, so loading the needed rows on demand with `fseek`/`fread` reduces memory usage (see the sketch after this list);
3. The `lm_head` operation is `[num, 4096] @ [4096, 130528]`; since only the last token's logits are needed, it is rewritten as `[130528, 4096] @ [4096, 1]`;
4. The original model requires a lot of GPU memory. Splitting it by layer into 28 sub-models lets the computation be distributed dynamically between GPU and CPU according to the user's available GPU memory, making full use of the memory and compute of both; even a GPU with little memory can accelerate generation.
5. On edge devices the sub-models can be loaded and computed one at a time, so even an Android device with `2G` of memory can run inference (slowly).
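
As a rough illustration of item 2, the sketch below reads a single embedding row on demand with `fseek`/`fread` instead of keeping the whole table in memory. The file name, the data layout (row-major `float32`, one 4096-dim row per token id), and the lack of error handling are assumptions for illustration, not the repository's actual implementation.

```cpp
#include <cstdio>
#include <vector>

constexpr int HIDDEN_SIZE = 4096;

// Read the embedding row of one token id directly from disk, so the
// full [vocab, 4096] table never has to reside in memory.
std::vector<float> load_embedding_row(const char* path /* hypothetical file */, int token_id) {
    std::vector<float> row(HIDDEN_SIZE);
    FILE* file = fopen(path, "rb");
    if (!file) return row;
    fseek(file, (long)token_id * HIDDEN_SIZE * sizeof(float), SEEK_SET);
    fread(row.data(), sizeof(float), HIDDEN_SIZE, file);
    fclose(file);
    return row;
}
```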

Two demo forms are currently supported: command-line chat and Web UI chat
![web_demo](./resource/web/web_demo.png)

## Speed

Test platform:
Mobile: loading the segmented sub-models one at a time allows inference on devices with more than `2G` of memory; experimental testing shows poor performance, currently about 63 `s/word`

PC test platform:
- Memory: 32G (+32G Swap)
- CPU: AMD Ryzen 9 3900X 12-Core Processor
- GPU: GeForce RTX 2080 Ti

### FP Model
Only the floating-point models are tested (CPU: fp32 / GPU: fp16). With the input `你好` and identical replies, the number of words generated per second (word/s) compares as follows:
Only the floating-point models are tested (CPU: fp32 / GPU: fp16). With the input `你好` and identical replies, the time per generated word (`s/word`) compares as follows:

| impl | GPU + CPU | CPU only |
|---------|---------------|-----------|
| MNN | 3.424 | 1.140 |
| Pytorch | out of memory | 0.744 |
| MNN | 0.292 | 0.877 |
| Pytorch | out of memory | 1.344 |

### Quantize Model
`TODO`


## Usage
### 1. Compile MNN library
Compile MNN from source
@@ -51,6 +55,7 @@ cd resource/models
```

### 3. Build and Run
Mac/Linux/Windows:
```bash
mkdir build
cd build
@@ -59,6 +64,15 @@ make -j8
./cli_demo # cli demo
./web_demo # web ui demo
```

Android:
```bash
mkdir build
cd build
../android_build.sh
make -j8
```

### 4. Using CUDA
By default the `CPU` is used. To use `CUDA`, add the macro `-DMNN_CUDA=ON` when compiling MNN, and specify the amount of GPU memory when creating `ChatGLM`, as follows:
```cpp
// ...
```
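
For reference, a minimal usage sketch. The default constructor and the exact call sequence are assumptions for illustration; `init(float gpu_memory)`, `chat()`, and `response()` are the methods visible in `src/chat.cpp` below.

```cpp
#include "chat.hpp"

int main() {
    ChatGLM chatglm;
    // Pass the available GPU memory in GB; blocks that fit are run on the
    // CUDA backend, the remaining blocks fall back to the CPU.
    chatglm.init(8.0f);
    chatglm.chat(); // interactive loop; or: chatglm.response("你好", &std::cout)
    return 0;
}
```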
4 changes: 3 additions & 1 deletion android_build.sh
@@ -1,5 +1,7 @@
cmake .. \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_STL=c++_static \
-DANDROID_ABI="arm64-v8a" \
-DANDROID_NATIVE_API_LEVEL=android-21 \
-DCMAKE_BUILD_TYPE=Release \
-DMINI_MEM_MODE=ON
-DBUILD_FOR_ANDROID=ON
77 changes: 59 additions & 18 deletions src/chat.cpp
@@ -15,6 +15,7 @@

#include "chat.hpp"
#include "cppjieba/Jieba.hpp"
#include <MNN/expr/ExecutorScope.hpp>

void ChatGLM::chat() {
while (true) {
@@ -28,7 +29,7 @@ void ChatGLM::chat() {
}

std::string ChatGLM::response(const std::string& input_str, std::ostream* os) {
// AUTOTIME;
AUTOTIME;
// init status
mSeqLen = 0, mContextLen = -1, mMaskIdx = -1;
if (mHistoryVars.empty()) mHistoryVars.resize(LAYER_SIZE);
@@ -95,11 +96,13 @@ void ChatGLM::init(float gpu_memory) {
BackendConfig cpuBackendConfig;
config.type = MNN_FORWARD_CPU;
config.numThread = 4;
cpuBackendConfig.precision = BackendConfig::Precision_Low;
config.backendConfig = &cpuBackendConfig;
mCPURtmgr.reset(Executor::RuntimeManager::createRuntimeManager(config));
BackendConfig gpuBackendConfig;
config.type = MNN_FORWARD_CUDA;
config.backupType = MNN_FORWARD_OPENCL;
config.numThread = 1;
gpuBackendConfig.precision = BackendConfig::Precision_Low;
config.backendConfig = &gpuBackendConfig;
mGPURtmgr.reset(Executor::RuntimeManager::createRuntimeManager(config));
@@ -115,11 +118,11 @@ void ChatGLM::init(float gpu_memory) {
printf("Done!\n");
// 2. load models
mModules.resize(LAYER_SIZE + 1);
int gpu_run_layers = (gpu_memory - 2) * 1024.0 / 385.0;
char buffer[50];
#ifdef MINI_MEM_MODE
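// Mini memory mode: preload only the first block here; the remaining
// blocks are loaded from disk one at a time inside forward().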
loadModel("../resource/models/glm_block_0.mnn", false, 0);
#else
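// Estimate how many blocks fit on the GPU: keep roughly 2G of headroom
// and assume about 385M per block (as implied by the constants below).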
int gpu_run_layers = (gpu_memory - 2) * 1024.0 / 385.0;
char buffer[50];
for (int i = 0; i < LAYER_SIZE; i++) {
sprintf(buffer, "../resource/models/glm_block_%d.mnn", i);
loadModel(buffer, i <= gpu_run_layers, i);
@@ -131,12 +134,12 @@

void ChatGLM::loadModel(const char* fileName, bool cuda, int i) {
// AUTOTIME;
Module::Config config;
config.shapeMutable = true;
#ifndef MINI_MEM_MODE
config.rearrange = true;
printf("load %s model ... ", fileName);
#endif
Module::Config config;
config.shapeMutable = true;
config.rearrange = true;
auto rtmgr = cuda ? mGPURtmgr : mCPURtmgr;
std::shared_ptr<Module> net(Module::load({}, {}, fileName, rtmgr, &config));
mModules[i] = std::move(net);
@@ -211,29 +214,29 @@ VARP ChatGLM::gen_position_ids(const std::vector<int>& input_ids) {
}

int ChatGLM::forward(const std::vector<int>& input_ids) {
AUTOTIME;
mSeqLen += input_ids.size();
auto hidden_states = gen_embedding(input_ids);
auto attention_mask = gen_attention_mask(input_ids);
auto position_ids = gen_position_ids(input_ids);
#ifdef MINI_MEM_MODE
char buffer[50];
int i = 0;
std::thread load_lm(&ChatGLM::loadModel, this, "../resource/models/lm.mnn", false, LAYER_SIZE);
for (; i < LAYER_SIZE; i++) {
AUTOTIME;
for (int i = 0; i < LAYER_SIZE; i++) {
int loadIdx = i < LAYER_SIZE - 1 ? i + 1 : 0;
sprintf(buffer, "../resource/8bit_models/glm_block_%d.mnn", loadIdx);
sprintf(buffer, "../resource/models/glm_block_%d.mnn", loadIdx);
std::thread load_next_model(&ChatGLM::loadModel, this, buffer, false, loadIdx);
auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
hidden_states = outputs[0];
mHistoryVars[i] = outputs[1];
{
// AUTOTIME;
auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
hidden_states = outputs[0];
mHistoryVars[i] = outputs[1];
}
load_next_model.join();
mModules[i].reset();
}
load_lm.join();
#else
for (int i = 0; i < LAYER_SIZE; i++) {
AUTOTIME;
// AUTOTIME;
auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
hidden_states = outputs[0];
mHistoryVars[i] = outputs[1];
@@ -249,10 +252,48 @@ int ChatGLM::var_to_token(VARP var) {
var = _Gather(var, _Scalar<int>(num - 1));
}
var = _Reshape(var, {HIDDEN_SIZE, 1});
#ifdef MINI_MEM_MODE
// naive impl to save memory : gemm + argmax
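// Stream the [VOCAB_SIZE, HIDDEN_SIZE] lm_head weights from disk in TILE-row
// chunks, dot each row with the hidden state, and keep the argmax token id.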
auto ptr = var->readMap<float>();
constexpr int TILE = 512;
FILE* file = fopen("../resource/models/slim_lm.bin", "rb");
std::vector<float> buffer(TILE * HIDDEN_SIZE);
int id = -1;
float max_score = 0.f;
for (size_t i = 0; i < VOCAB_SIZE / TILE; i++) {
fseek(file, i * TILE * HIDDEN_SIZE * sizeof(float), SEEK_SET);
fread(reinterpret_cast<char*>(buffer.data()), 1, TILE * HIDDEN_SIZE * sizeof(float), file);
for (int j = 0; j < TILE; j++) {
float sum = 0.f;
for (int k = 0; k < HIDDEN_SIZE; k++) {
sum += (buffer[j * HIDDEN_SIZE + k]) * ptr[k];
}
if (sum > max_score) {
max_score = sum;
id = i * TILE + j;
}
}
}
{
int i = VOCAB_SIZE / TILE;
constexpr int tile = VOCAB_SIZE % TILE;
fseek(file, i * TILE * HIDDEN_SIZE * sizeof(float), SEEK_SET);
fread(reinterpret_cast<char*>(buffer.data()), 1, tile * HIDDEN_SIZE * sizeof(float), file);
for (int j = 0; j < tile; j++) {
float sum = 0.f;
for (int k = 0; k < HIDDEN_SIZE; k++) {
sum += (buffer[j * HIDDEN_SIZE + k]) * ptr[k];
}
if (sum > max_score) {
max_score = sum;
id = i * TILE + j;
}
}
}
fclose(file);
#else
auto outputs = mModules.back()->onForward({var});
int id = outputs[0]->readMap<int>()[0];
#ifdef MINI_MEM_MODE
mModules.back().reset();
#endif
// printf("### %d\n", id);
return id;
