From 45edaf1e84801a795efce720e44bcdf5026aeafa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=81=E8=A1=8C?=
Date: Tue, 28 Mar 2023 20:56:00 +0800
Subject: [PATCH] Android device support and test.

---
 CMakeLists.txt   | 38 ++++++++++++++++--------
 README.md        | 22 +++++++++++---
 android_build.sh |  4 ++-
 src/chat.cpp     | 77 +++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 106 insertions(+), 35 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4641d4a..f252113d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ project(chatglm-mnn)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 
-option(MINI_MEM_MODE "Using mini memory mode for small memory device." OFF)
+option(BUILD_FOR_ANDROID "Build for Android with mini memory mode." OFF)
 
 # include dir
 include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
@@ -17,16 +17,30 @@
 FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
 # compile dynamic lib
 add_library(chat SHARED ${SRCS})
 target_link_libraries(chat MNN MNN_Express)
-
-if (MINI_MEM_MODE)
-    target_compile_options(chat PRIVATE -DMINI_MEM_MODE)
-endif()
 # target_link_libraries(chat MNN MNN_Express MNN_CL) # if using OPENCL
-# demo targets
-add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
-target_link_libraries(cli_demo chat)
-
-add_executable(web_demo ${CMAKE_CURRENT_LIST_DIR}/demo/web_demo.cpp)
-# target_link_libraries(web_demo chat pthread ssl crypto)
-target_link_libraries(web_demo chat pthread)
+if (BUILD_FOR_ANDROID)
+    target_compile_options(chat PRIVATE -DMINI_MEM_MODE)
+    add_library(MNN SHARED IMPORTED)
+    add_library(MNN_Express SHARED IMPORTED)
+    set_target_properties(
+        MNN
+        PROPERTIES IMPORTED_LOCATION
+        ${CMAKE_CURRENT_LIST_DIR}/libs/libMNN.so
+        )
+    set_target_properties(
+        MNN_Express
+        PROPERTIES IMPORTED_LOCATION
+        ${CMAKE_CURRENT_LIST_DIR}/libs/libMNN_Express.so
+        )
+    # just cli demo
+    add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
+    target_link_libraries(cli_demo chat log)
+else()
+    # cli demo
+    add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
+    target_link_libraries(cli_demo chat)
+    # web demo
+    add_executable(web_demo ${CMAKE_CURRENT_LIST_DIR}/demo/web_demo.cpp)
+    target_link_libraries(web_demo chat pthread)
+endif()
\ No newline at end of file
diff --git a/README.md b/README.md
index d4b7db47..e526d1fe 100644
--- a/README.md
+++ b/README.md
@@ -7,28 +7,32 @@
 2. The `Embedding` op is invoked only a few times, so its table is loaded on demand with `fseek`/`fread` to reduce memory usage;
 3. The `lm_head` op is `[num, 4096] @ [4096, 130528]`; it is rewritten as `[130528, 4096] @ [4096, 1]`;
 4. The original model needs a lot of GPU memory; it is split into 28 per-layer models so the work can be distributed dynamically between GPU and CPU according to the user's available GPU memory, making full use of the memory and compute of both. Even a GPU with little memory can speed up generation.
+5. For edge devices the per-layer models can be loaded and computed one at a time, so even an Android device with `2G` of RAM can run inference (slowly).
 
 Two demos are currently provided: command-line chat and Web UI chat.
 
 ![web_demo](./resource/web/web_demo.png)
 
 ## Speed
 
-Test platform:
+Mobile: loading the split models one by one allows inference on devices with more than `2G` of RAM; experimental performance is poor, currently about 63 `s/word`.
+
+PC test platform:
 - Memory: 32G (+32G Swap)
 - CPU: AMD Ryzen 9 3900X 12-Core Processor
 - GPU: GeForce RTX 2080 Ti
 
 ### FP Model
-Only the floating-point model is tested (CPU: fp32 / GPU: fp16). With the input `你好` and identical replies, the words generated per second (word/s) compare as follows:
+Only the floating-point model is tested (CPU: fp32 / GPU: fp16). With the input `你好` and identical replies, the time per generated word (`s/word`) compares as follows:
 
 | impl    | GPU + CPU     | CPU only  |
 |---------|---------------|-----------|
-| MNN     | 3.424         | 1.140     |
-| Pytorch | out of memory | 0.744     |
+| MNN     | 0.292         | 0.877     |
+| Pytorch | out of memory | 1.344     |
 
 ### Quantize Model
 `TODO`
+
 ## Usage
 ### 1. Compile MNN library
 Compile MNN from source
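Item 2 of the optimization list above relies on a simple access pattern: the word-embedding table stays on disk and only the rows for the current input ids are read with `fseek`/`fread`. The sketch below illustrates that pattern only and is not the repository's code; the file name, the flat fp32 `[vocab, hidden]` row layout, and the `load_embedding_rows` helper are assumptions for illustration (the hidden size of 4096 follows item 3).

```cpp
#include <cstdio>
#include <vector>

// Read one hidden-size row per token id from a flat fp32 [vocab, hidden] file,
// instead of keeping the whole embedding table resident in memory.
// Hypothetical helper: the file name and layout are assumptions, not the real format.
std::vector<float> load_embedding_rows(const std::vector<int>& input_ids,
                                       const char* path = "word_embeddings.bin",
                                       int hidden = 4096) {
    std::vector<float> rows(input_ids.size() * hidden);
    FILE* fp = std::fopen(path, "rb");
    if (!fp) return {};
    for (size_t i = 0; i < input_ids.size(); i++) {
        long offset = (long)input_ids[i] * hidden * (long)sizeof(float);
        std::fseek(fp, offset, SEEK_SET);                          // jump to this token's row
        std::fread(&rows[i * hidden], sizeof(float), hidden, fp);  // read `hidden` floats
    }
    std::fclose(fp);
    return rows;
}
```

Each row is only 4096 × 4 B = 16 KB in fp32, so a prompt touches at most a few hundred kilobytes of reads, while keeping the full `[130528, 4096]` table in memory would cost on the order of 2 GB.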
@@ -51,6 +55,7 @@ cd resource/models
 ```
 
 ### 3. Build and Run
+Mac/Linux/Windows:
 ```bash
 mkdir build
 cd build
@@ -59,6 +64,15 @@ make -j8
 ./cli_demo # cli demo
 ./web_demo # web ui demo
 ```
+
+Android:
+```bash
+mkdir build
+cd build
+../android_build.sh
+make -j8
+```
+
 #### 4. Using CUDA
 By default the demos run on `CPU`. To use `CUDA`, add the macro `-DMNN_CUDA=ON` when compiling MNN, then specify the GPU memory size when creating `ChatGLM`, as follows:
 ```cpp
diff --git a/android_build.sh b/android_build.sh
index a5065236..2e1cc23d 100755
--- a/android_build.sh
+++ b/android_build.sh
@@ -1,5 +1,7 @@
 cmake .. \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DANDROID_STL=c++_static \
+-DANDROID_ABI="arm64-v8a" \
+-DANDROID_NATIVE_API_LEVEL=android-21 \
 -DCMAKE_BUILD_TYPE=Release \
--DMINI_MEM_MODE=ON
+-DBUILD_FOR_ANDROID=ON
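In the `src/chat.cpp` diff below, `MINI_MEM_MODE` turns `forward()` into a pipelined loop: while block `i` runs, block `i+1` (wrapping back to block 0 on the last layer) is loaded on a background `std::thread`, and block `i` is released immediately afterwards, so only about two of the 28 transformer blocks are resident at once. Here is a minimal sketch of that double-buffering pattern; `Model`, `loadModel`, and `runModel` are placeholders standing in for MNN's `Module::load` and `onForward`, not the project's actual API.

```cpp
#include <memory>
#include <string>
#include <thread>
#include <vector>

struct Model { std::string path; };  // placeholder for an MNN Module

static std::unique_ptr<Model> loadModel(const std::string& path) {
    return std::unique_ptr<Model>(new Model{path});  // stands in for Module::load(...)
}
static void runModel(const Model&) { /* stands in for onForward(...) */ }

int main() {
    const int LAYER_SIZE = 28;
    std::vector<std::unique_ptr<Model>> blocks(LAYER_SIZE);
    blocks[0] = loadModel("glm_block_0.mnn");  // block 0 is preloaded, as in init()
    for (int i = 0; i < LAYER_SIZE; i++) {
        int next = (i + 1) % LAYER_SIZE;       // wrap to block 0 for the next token
        std::thread loader([&blocks, next] {
            blocks[next] = loadModel("glm_block_" + std::to_string(next) + ".mnn");
        });
        runModel(*blocks[i]);                  // compute block i while block i+1 loads
        loader.join();
        blocks[i].reset();                     // free block i before moving on
    }
    return 0;
}
```

The trade-off is the one the README quotes: every generated token re-reads all 28 block files from disk, which is why the on-device figure is around 63 `s/word`.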
diff --git a/src/chat.cpp b/src/chat.cpp
index 9e9e39b0..77cd0ead 100644
--- a/src/chat.cpp
+++ b/src/chat.cpp
@@ -15,6 +15,7 @@
 #include "chat.hpp"
 #include "cppjieba/Jieba.hpp"
+#include <thread>
 
 void ChatGLM::chat() {
     while (true) {
@@ -28,7 +29,7 @@ void ChatGLM::chat() {
 }
 
 std::string ChatGLM::response(const std::string& input_str, std::ostream* os) {
-    // AUTOTIME;
+    AUTOTIME;
     // init status
     mSeqLen = 0, mContextLen = -1, mMaskIdx = -1;
     if (mHistoryVars.empty()) mHistoryVars.resize(LAYER_SIZE);
@@ -95,11 +96,13 @@ void ChatGLM::init(float gpu_memory) {
     BackendConfig cpuBackendConfig;
     config.type = MNN_FORWARD_CPU;
     config.numThread = 4;
+    cpuBackendConfig.precision = BackendConfig::Precision_Low;
     config.backendConfig = &cpuBackendConfig;
     mCPURtmgr.reset(Executor::RuntimeManager::createRuntimeManager(config));
     BackendConfig gpuBackendConfig;
     config.type = MNN_FORWARD_CUDA;
     config.backupType = MNN_FORWARD_OPENCL;
+    config.numThread = 1;
     gpuBackendConfig.precision = BackendConfig::Precision_Low;
     config.backendConfig = &gpuBackendConfig;
     mGPURtmgr.reset(Executor::RuntimeManager::createRuntimeManager(config));
@@ -115,11 +118,11 @@ void ChatGLM::init(float gpu_memory) {
     printf("Done!\n");
     // 2. load models
     mModules.resize(LAYER_SIZE + 1);
+    int gpu_run_layers = (gpu_memory - 2) * 1024.0 / 385.0;
+    char buffer[50];
 #ifdef MINI_MEM_MODE
     loadModel("../resource/models/glm_block_0.mnn", false, 0);
 #else
-    int gpu_run_layers = (gpu_memory - 2) * 1024.0 / 385.0;
-    char buffer[50];
     for (int i = 0; i < LAYER_SIZE; i++) {
         sprintf(buffer, "../resource/models/glm_block_%d.mnn", i);
         loadModel(buffer, i <= gpu_run_layers, i);
@@ -131,12 +134,12 @@
 void ChatGLM::loadModel(const char* fileName, bool cuda, int i) {
     // AUTOTIME;
-    Module::Config config;
-    config.shapeMutable = true;
 #ifndef MINI_MEM_MODE
-    config.rearrange = true;
     printf("load %s model ... ", fileName);
 #endif
+    Module::Config config;
+    config.shapeMutable = true;
+    config.rearrange = true;
     auto rtmgr = cuda ? mGPURtmgr : mCPURtmgr;
     std::shared_ptr<Module> net(Module::load({}, {}, fileName, rtmgr, &config));
     mModules[i] = std::move(net);
@@ -211,29 +214,29 @@ VARP ChatGLM::gen_position_ids(const std::vector<int>& input_ids) {
 }
 
 int ChatGLM::forward(const std::vector<int>& input_ids) {
+    AUTOTIME;
     mSeqLen += input_ids.size();
     auto hidden_states = gen_embedding(input_ids);
     auto attention_mask = gen_attention_mask(input_ids);
     auto position_ids = gen_position_ids(input_ids);
 #ifdef MINI_MEM_MODE
     char buffer[50];
-    int i = 0;
-    std::thread load_lm(&ChatGLM::loadModel, this, "../resource/models/lm.mnn", false, LAYER_SIZE);
-    for (; i < LAYER_SIZE; i++) {
-        AUTOTIME;
+    for (int i = 0; i < LAYER_SIZE; i++) {
         int loadIdx = i < LAYER_SIZE - 1 ? i + 1 : 0;
-        sprintf(buffer, "../resource/8bit_models/glm_block_%d.mnn", loadIdx);
+        sprintf(buffer, "../resource/models/glm_block_%d.mnn", loadIdx);
         std::thread load_next_model(&ChatGLM::loadModel, this, buffer, false, loadIdx);
-        auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
-        hidden_states = outputs[0];
-        mHistoryVars[i] = outputs[1];
+        {
+            // AUTOTIME;
+            auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
+            hidden_states = outputs[0];
+            mHistoryVars[i] = outputs[1];
+        }
         load_next_model.join();
         mModules[i].reset();
     }
-    load_lm.join();
 #else
     for (int i = 0; i < LAYER_SIZE; i++) {
-        AUTOTIME;
+        // AUTOTIME;
         auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
         hidden_states = outputs[0];
         mHistoryVars[i] = outputs[1];
@@ -249,10 +252,48 @@ int ChatGLM::var_to_token(VARP var) {
         var = _Gather(var, _Scalar<int>(num - 1));
     }
     var = _Reshape(var, {HIDDEN_SIZE, 1});
+#ifdef MINI_MEM_MODE
+    // naive impl to save memory : gemm + argmax
+    auto ptr = var->readMap<float>();
+    constexpr int TILE = 512;
+    FILE* file = fopen("../resource/models/slim_lm.bin", "rb");
+    std::vector<float> buffer(TILE * HIDDEN_SIZE);
+    int id = -1;
+    float max_score = 0.f;
+    for (size_t i = 0; i < VOCAB_SIZE / TILE; i++) {
+        fseek(file, i * TILE * HIDDEN_SIZE * sizeof(float), SEEK_SET);
+        fread(reinterpret_cast<char*>(buffer.data()), 1, TILE * HIDDEN_SIZE * sizeof(float), file);
+        for (int j = 0; j < TILE; j++) {
+            float sum = 0.f;
+            for (int k = 0; k < HIDDEN_SIZE; k++) {
+                sum += (buffer[j * HIDDEN_SIZE + k]) * ptr[k];
+            }
+            if (sum > max_score) {
+                max_score = sum;
+                id = i * TILE + j;
+            }
+        }
+    }
+    {
+        int i = VOCAB_SIZE / TILE;
+        constexpr int tile = VOCAB_SIZE % TILE;
+        fseek(file, i * TILE * HIDDEN_SIZE * sizeof(float), SEEK_SET);
+        fread(reinterpret_cast<char*>(buffer.data()), 1, tile * HIDDEN_SIZE * sizeof(float), file);
+        for (int j = 0; j < tile; j++) {
+            float sum = 0.f;
+            for (int k = 0; k < HIDDEN_SIZE; k++) {
+                sum += (buffer[j * HIDDEN_SIZE + k]) * ptr[k];
+            }
+            if (sum > max_score) {
+                max_score = sum;
+                id = i * TILE + j;
+            }
+        }
+    }
+    fclose(file);
+#else
     auto outputs = mModules.back()->onForward({var});
     int id = outputs[0]->readMap<int>()[0];
-#ifdef MINI_MEM_MODE
-    mModules.back().reset();
 #endif
     // printf("### %d\n", id);
     return id;
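It is worth spelling out why the streamed `slim_lm.bin` path above fits small devices. It computes exactly the `[130528, 4096] @ [4096, 1]` form of `lm_head` described in item 3 of the README, 512 rows at a time, and it also drops the old `lm.mnn` module and its background `load_lm` thread. The working buffer is 512 × 4096 × 4 B = 8 MiB per tile, whereas materializing the full weight in fp32 would take about 130528 × 4096 × 4 B ≈ 2 GiB; that the file stores plain fp32 rows is an assumption implied by the `sizeof(float)` strides in the code rather than stated anywhere. Note that the file is reopened and fully scanned once per generated token, so this path trades disk bandwidth for memory.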