From 45edaf1e84801a795efce720e44bcdf5026aeafa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=81=E8=A1=8C?=
Date: Tue, 28 Mar 2023 20:56:00 +0800
Subject: [PATCH] Android device support and test.

---
 CMakeLists.txt   | 38 ++++++++++++++++--------
 README.md        | 22 +++++++++++---
 android_build.sh |  4 ++-
 src/chat.cpp     | 77 +++++++++++++++++++++++++++++++++++++-----------
 4 files changed, 106 insertions(+), 35 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f4641d4a..f252113d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ project(chatglm-mnn)
 
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 
-option(MINI_MEM_MODE "Using mini memory mode for small memory device." OFF)
+option(BUILD_FOR_ANDROID "Build for Android with mini memory mode." OFF)
 
 # include dir
 include_directories(${CMAKE_CURRENT_LIST_DIR}/include/)
@@ -17,16 +17,30 @@
 FILE(GLOB SRCS ${CMAKE_CURRENT_LIST_DIR}/src/*.cpp)
 # compile dynamic lib
 add_library(chat SHARED ${SRCS})
 target_link_libraries(chat MNN MNN_Express)
-
-if (MINI_MEM_MODE)
-    target_compile_options(chat PRIVATE -DMINI_MEM_MODE)
-endif()
 # target_link_libraries(chat MNN MNN_Express MNN_CL) # if using OPENCL
-# demo targets
-add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
-target_link_libraries(cli_demo chat)
-
-add_executable(web_demo ${CMAKE_CURRENT_LIST_DIR}/demo/web_demo.cpp)
-# target_link_libraries(web_demo chat pthread ssl crypto)
-target_link_libraries(web_demo chat pthread)
+if (BUILD_FOR_ANDROID)
+    target_compile_options(chat PRIVATE -DMINI_MEM_MODE)
+    add_library(MNN SHARED IMPORTED)
+    add_library(MNN_Express SHARED IMPORTED)
+    set_target_properties(
+        MNN
+        PROPERTIES IMPORTED_LOCATION
+        ${CMAKE_CURRENT_LIST_DIR}/libs/libMNN.so
+        )
+    set_target_properties(
+        MNN_Express
+        PROPERTIES IMPORTED_LOCATION
+        ${CMAKE_CURRENT_LIST_DIR}/libs/libMNN_Express.so
+        )
+    # just cli demo
+    add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
+    target_link_libraries(cli_demo chat log)
+else()
+    # cli demo
+    add_executable(cli_demo ${CMAKE_CURRENT_LIST_DIR}/demo/cli_demo.cpp)
+    target_link_libraries(cli_demo chat)
+    # web demo
+    add_executable(web_demo ${CMAKE_CURRENT_LIST_DIR}/demo/web_demo.cpp)
+    target_link_libraries(web_demo chat pthread)
+endif()
\ No newline at end of file
diff --git a/README.md b/README.md
index d4b7db47..e526d1fe 100644
--- a/README.md
+++ b/README.md
@@ -7,28 +7,32 @@
 2. The `Embedding` op is invoked only a few times, so its table is loaded on demand with `fseek`/`fread` to reduce memory usage;
 3. The `lm_head` op is `[num, 4096] @ [4096, 130528]`; it is rewritten as `[130528, 4096] @ [4096, 1]`;
 4. The original model needs a lot of GPU memory; it is split into 28 per-layer models so the work can be distributed dynamically between GPU and CPU according to the user's available GPU memory, making full use of the memory and compute of both. Even a GPU with little memory can speed up generation.
+5. For edge devices the per-layer models can be loaded and computed one at a time, so even an Android device with `2G` of RAM can run inference (slowly).
 
 Two demos are currently provided: command-line chat and Web UI chat.
 
 ![web_demo](./resource/web/web_demo.png)
 
 ## Speed
 
-Test platform:
+Mobile: loading the split models one by one allows inference on devices with more than `2G` of RAM; experimental performance is poor, currently about 63 `s/word`.
+
+PC test platform:
 - Memory: 32G (+32G Swap)
 - CPU: AMD Ryzen 9 3900X 12-Core Processor
 - GPU: GeForce RTX 2080 Ti
 
 ### FP Model
-Only the floating-point model is tested (CPU: fp32 / GPU: fp16). With the input `你好` and identical replies, the words generated per second (word/s) compare as follows:
+Only the floating-point model is tested (CPU: fp32 / GPU: fp16). With the input `你好` and identical replies, the time per generated word (`s/word`) compares as follows:
 
 | impl    | GPU + CPU     | CPU only  |
 |---------|---------------|-----------|
-| MNN     | 3.424         | 1.140     |
-| Pytorch | out of memory | 0.744     |
+| MNN     | 0.292         | 0.877     |
+| Pytorch | out of memory | 1.344     |
 
 ### Quantize Model
 `TODO`
+
 ## Usage
 ### 1. Compile MNN library
 Compile MNN from source
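Item 2 of the optimization list above relies on a simple access pattern: the word-embedding table stays on disk and only the rows for the current input ids are read with `fseek`/`fread`. The sketch below illustrates that pattern only and is not the repository's code; the file name, the flat fp32 `[vocab, hidden]` row layout, and the `load_embedding_rows` helper are assumptions for illustration (the hidden size of 4096 follows item 3).

```cpp
#include <cstdio>
#include <vector>

// Read one hidden-size row per token id from a flat fp32 [vocab, hidden] file,
// instead of keeping the whole embedding table resident in memory.
// Hypothetical helper: the file name and layout are assumptions, not the real format.
std::vector<float> load_embedding_rows(const std::vector<int>& input_ids,
                                       const char* path = "word_embeddings.bin",
                                       int hidden = 4096) {
    std::vector<float> rows(input_ids.size() * hidden);
    FILE* fp = std::fopen(path, "rb");
    if (!fp) return {};
    for (size_t i = 0; i < input_ids.size(); i++) {
        long offset = (long)input_ids[i] * hidden * (long)sizeof(float);
        std::fseek(fp, offset, SEEK_SET);                          // jump to this token's row
        std::fread(&rows[i * hidden], sizeof(float), hidden, fp);  // read `hidden` floats
    }
    std::fclose(fp);
    return rows;
}
```

Each row is only 4096 × 4 B = 16 KB in fp32, so a prompt touches at most a few hundred kilobytes of reads, while keeping the full `[130528, 4096]` table in memory would cost on the order of 2 GB.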
@@ -51,6 +55,7 @@ cd resource/models
 ```
 
 ### 3. Build and Run
+Mac/Linux/Windows:
 ```bash
 mkdir build
 cd build
@@ -59,6 +64,15 @@ make -j8
 ./cli_demo # cli demo
 ./web_demo # web ui demo
 ```
+
+Android:
+```bash
+mkdir build
+cd build
+../android_build.sh
+make -j8
+```
+
 #### 4. Using CUDA
 By default the demos run on `CPU`. To use `CUDA`, add the macro `-DMNN_CUDA=ON` when compiling MNN, then specify the GPU memory size when creating `ChatGLM`, as follows:
 ```cpp
diff --git a/android_build.sh b/android_build.sh
index a5065236..2e1cc23d 100755
--- a/android_build.sh
+++ b/android_build.sh
@@ -1,5 +1,7 @@
 cmake .. \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DANDROID_STL=c++_static \
+-DANDROID_ABI="arm64-v8a" \
+-DANDROID_NATIVE_API_LEVEL=android-21 \
 -DCMAKE_BUILD_TYPE=Release \
--DMINI_MEM_MODE=ON
+-DBUILD_FOR_ANDROID=ON
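In the `src/chat.cpp` diff below, `MINI_MEM_MODE` turns `forward()` into a pipelined loop: while block `i` runs, block `i+1` (wrapping back to block 0 on the last layer) is loaded on a background `std::thread`, and block `i` is released immediately afterwards, so only about two of the 28 transformer blocks are resident at once. Here is a minimal sketch of that double-buffering pattern; `Model`, `loadModel`, and `runModel` are placeholders standing in for MNN's `Module::load` and `onForward`, not the project's actual API.

```cpp
#include <memory>
#include <string>
#include <thread>
#include <vector>

struct Model { std::string path; };  // placeholder for an MNN Module

static std::unique_ptr<Model> loadModel(const std::string& path) {
    return std::unique_ptr<Model>(new Model{path});  // stands in for Module::load(...)
}
static void runModel(const Model&) { /* stands in for onForward(...) */ }

int main() {
    const int LAYER_SIZE = 28;
    std::vector<std::unique_ptr<Model>> blocks(LAYER_SIZE);
    blocks[0] = loadModel("glm_block_0.mnn");  // block 0 is preloaded, as in init()
    for (int i = 0; i < LAYER_SIZE; i++) {
        int next = (i + 1) % LAYER_SIZE;       // wrap to block 0 for the next token
        std::thread loader([&blocks, next] {
            blocks[next] = loadModel("glm_block_" + std::to_string(next) + ".mnn");
        });
        runModel(*blocks[i]);                  // compute block i while block i+1 loads
        loader.join();
        blocks[i].reset();                     // free block i before moving on
    }
    return 0;
}
```

The trade-off is the one the README quotes: every generated token re-reads all 28 block files from disk, which is why the on-device figure is around 63 `s/word`.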
diff --git a/src/chat.cpp b/src/chat.cpp
index 9e9e39b0..77cd0ead 100644
--- a/src/chat.cpp
+++ b/src/chat.cpp
@@ -15,6 +15,7 @@
 #include "chat.hpp"
 #include "cppjieba/Jieba.hpp"
+#include <thread>
 
 void ChatGLM::chat() {
     while (true) {
@@ -28,7 +29,7 @@ void ChatGLM::chat() {
 }
 
 std::string ChatGLM::response(const std::string& input_str, std::ostream* os) {
-    // AUTOTIME;
+    AUTOTIME;
     // init status
     mSeqLen = 0, mContextLen = -1, mMaskIdx = -1;
     if (mHistoryVars.empty()) mHistoryVars.resize(LAYER_SIZE);
@@ -95,11 +96,13 @@ void ChatGLM::init(float gpu_memory) {
     BackendConfig cpuBackendConfig;
     config.type = MNN_FORWARD_CPU;
     config.numThread = 4;
+    cpuBackendConfig.precision = BackendConfig::Precision_Low;
     config.backendConfig = &cpuBackendConfig;
     mCPURtmgr.reset(Executor::RuntimeManager::createRuntimeManager(config));
     BackendConfig gpuBackendConfig;
     config.type = MNN_FORWARD_CUDA;
     config.backupType = MNN_FORWARD_OPENCL;
+    config.numThread = 1;
     gpuBackendConfig.precision = BackendConfig::Precision_Low;
     config.backendConfig = &gpuBackendConfig;
     mGPURtmgr.reset(Executor::RuntimeManager::createRuntimeManager(config));
@@ -115,11 +118,11 @@ void ChatGLM::init(float gpu_memory) {
     printf("Done!\n");
     // 2. load models
     mModules.resize(LAYER_SIZE + 1);
+    int gpu_run_layers = (gpu_memory - 2) * 1024.0 / 385.0;
+    char buffer[50];
 #ifdef MINI_MEM_MODE
     loadModel("../resource/models/glm_block_0.mnn", false, 0);
 #else
-    int gpu_run_layers = (gpu_memory - 2) * 1024.0 / 385.0;
-    char buffer[50];
     for (int i = 0; i < LAYER_SIZE; i++) {
         sprintf(buffer, "../resource/models/glm_block_%d.mnn", i);
         loadModel(buffer, i <= gpu_run_layers, i);
@@ -131,12 +134,12 @@
 void ChatGLM::loadModel(const char* fileName, bool cuda, int i) {
     // AUTOTIME;
-    Module::Config config;
-    config.shapeMutable = true;
 #ifndef MINI_MEM_MODE
-    config.rearrange = true;
     printf("load %s model ... ", fileName);
 #endif
+    Module::Config config;
+    config.shapeMutable = true;
+    config.rearrange = true;
     auto rtmgr = cuda ? mGPURtmgr : mCPURtmgr;
     std::shared_ptr<Module> net(Module::load({}, {}, fileName, rtmgr, &config));
     mModules[i] = std::move(net);
@@ -211,29 +214,29 @@ VARP ChatGLM::gen_position_ids(const std::vector<int>& input_ids) {
 }
 
 int ChatGLM::forward(const std::vector<int>& input_ids) {
+    AUTOTIME;
     mSeqLen += input_ids.size();
     auto hidden_states = gen_embedding(input_ids);
     auto attention_mask = gen_attention_mask(input_ids);
     auto position_ids = gen_position_ids(input_ids);
 #ifdef MINI_MEM_MODE
     char buffer[50];
-    int i = 0;
-    std::thread load_lm(&ChatGLM::loadModel, this, "../resource/models/lm.mnn", false, LAYER_SIZE);
-    for (; i < LAYER_SIZE; i++) {
-        AUTOTIME;
+    for (int i = 0; i < LAYER_SIZE; i++) {
         int loadIdx = i < LAYER_SIZE - 1 ? i + 1 : 0;
-        sprintf(buffer, "../resource/8bit_models/glm_block_%d.mnn", loadIdx);
+        sprintf(buffer, "../resource/models/glm_block_%d.mnn", loadIdx);
         std::thread load_next_model(&ChatGLM::loadModel, this, buffer, false, loadIdx);
-        auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
-        hidden_states = outputs[0];
-        mHistoryVars[i] = outputs[1];
+        {
+            // AUTOTIME;
+            auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
+            hidden_states = outputs[0];
+            mHistoryVars[i] = outputs[1];
+        }
         load_next_model.join();
         mModules[i].reset();
     }
-    load_lm.join();
 #else
     for (int i = 0; i < LAYER_SIZE; i++) {
-        AUTOTIME;
+        // AUTOTIME;
         auto outputs = mModules[i]->onForward({hidden_states, attention_mask, position_ids, mHistoryVars[i]});
         hidden_states = outputs[0];
         mHistoryVars[i] = outputs[1];
@@ -249,10 +252,48 @@ int ChatGLM::var_to_token(VARP var) {
         var = _Gather(var, _Scalar<int>(num - 1));
     }
     var = _Reshape(var, {HIDDEN_SIZE, 1});
+#ifdef MINI_MEM_MODE
+    // naive impl to save memory : gemm + argmax
+    auto ptr = var->readMap<float>();
+    constexpr int TILE = 512;
+    FILE* file = fopen("../resource/models/slim_lm.bin", "rb");
+    std::vector<float> buffer(TILE * HIDDEN_SIZE);
+    int id = -1;
+    float max_score = 0.f;
+    for (size_t i = 0; i < VOCAB_SIZE / TILE; i++) {
+        fseek(file, i * TILE * HIDDEN_SIZE * sizeof(float), SEEK_SET);
+        fread(reinterpret_cast<char*>(buffer.data()), 1, TILE * HIDDEN_SIZE * sizeof(float), file);
+        for (int j = 0; j < TILE; j++) {
+            float sum = 0.f;
+            for (int k = 0; k < HIDDEN_SIZE; k++) {
+                sum += (buffer[j * HIDDEN_SIZE + k]) * ptr[k];
+            }
+            if (sum > max_score) {
+                max_score = sum;
+                id = i * TILE + j;
+            }
+        }
+    }
+    {
+        int i = VOCAB_SIZE / TILE;
+        constexpr int tile = VOCAB_SIZE % TILE;
+        fseek(file, i * TILE * HIDDEN_SIZE * sizeof(float), SEEK_SET);
+        fread(reinterpret_cast<char*>(buffer.data()), 1, tile * HIDDEN_SIZE * sizeof(float), file);
+        for (int j = 0; j < tile; j++) {
+            float sum = 0.f;
+            for (int k = 0; k < HIDDEN_SIZE; k++) {
+                sum += (buffer[j * HIDDEN_SIZE + k]) * ptr[k];
+            }
+            if (sum > max_score) {
+                max_score = sum;
+                id = i * TILE + j;
+            }
+        }
+    }
+    fclose(file);
+#else
     auto outputs = mModules.back()->onForward({var});
     int id = outputs[0]->readMap<int>()[0];
-#ifdef MINI_MEM_MODE
-    mModules.back().reset();
 #endif
     // printf("### %d\n", id);
     return id;
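It is worth spelling out why the streamed `slim_lm.bin` path above fits small devices. It computes exactly the `[130528, 4096] @ [4096, 1]` form of `lm_head` described in item 3 of the README, 512 rows at a time, and it also drops the old `lm.mnn` module and its background `load_lm` thread. The working buffer is 512 × 4096 × 4 B = 8 MiB per tile, whereas materializing the full weight in fp32 would take about 130528 × 4096 × 4 B ≈ 2 GiB; that the file stores plain fp32 rows is an assumption implied by the `sizeof(float)` strides in the code rather than stated anywhere. Note that the file is reopened and fully scanned once per generated token, so this path trades disk bandwidth for memory.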