Commit
low level api & optimize convert
wildkid1024 committed Jun 26, 2023
1 parent 35d3719 commit 56ced61
Showing 6 changed files with 432 additions and 3 deletions.
117 changes: 117 additions & 0 deletions cli_low_api.py
@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse

sys.path.append('./build-py')
import pyfastllm as fastllm  # or: import fastllm

logging.info(f"python compiler: {platform.python_compiler()}")

def args_parser():
parser = argparse.ArgumentParser(description='pyfastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0, help='model type, default 0; may be 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='path to the model file')
    parser.add_argument('-t', '--threads', type=int, default=4, help='number of threads to use')
    parser.add_argument('-l', '--low', action='store_true', help='use low-memory mode')
args = parser.parse_args()
return args

# Use this function with caution: it still has bugs and is only meant as a low-level API usage example; do not use it in production.
def response(model, prompt_input:str, stream_output:bool=False):
gmask_token_id = 130001
bos_token_id = 130004
eos_token_id = 130005

input_ids = model.weight.tokenizer.encode(prompt_input)
gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id])
input_ids = fastllm.cat([input_ids, gmask_bos], 0)

seq_len = input_ids.count(0)
vmask = [0] * (seq_len * seq_len)
vpids = [0] * (seq_len * 2)
for i in range(seq_len-1):
vmask[i*seq_len + seq_len -1] = 1
vpids[i] = i
vpids[seq_len - 1] = seq_len - 2
vpids[seq_len * 2 - 1] = 1
attention_mask = fastllm.Tensor(fastllm.float32, [seq_len, seq_len], vmask)
position_ids = fastllm.Tensor(fastllm.float32, [2, seq_len], vpids)

pastKeyValues = []
for _ in range(model.block_cnt):
pastKeyValues.append([fastllm.Tensor(fastllm.float32), fastllm.Tensor(fastllm.float32)])

ret_str = ""
ret_len = 1
mask_ids = -1
results = []
penalty_factor = fastllm.Tensor()

while True:
ret, pastKeyValues = model(input_ids, attention_mask, position_ids, penalty_factor, pastKeyValues)
if ret == eos_token_id:
break

results.append(ret)
cur_str = model.weight.tokenizer.decode(fastllm.Tensor(fastllm.float32, [len(results)], results))
ret_str += cur_str

print(cur_str, end="")
sys.stdout.flush()
if stream_output:
yield cur_str

ret_len += 1
results = []

if mask_ids == -1:
mask_ids = seq_len - 2

input_ids = fastllm.Tensor(fastllm.float32, [1, 1], [ret])
attention_mask = fastllm.Tensor()
position_ids = fastllm.Tensor(fastllm.float32, [2, 1], [mask_ids, ret_len])

print()
return ret_str


LLM_TYPE = ""
def print_back(idx:int, content: str):
if idx == 0:
print(f"{LLM_TYPE}:{content}", end='')
elif idx > 0:
print(f"{content}", end='')
elif idx == -1:
print()

sys.stdout.flush()

def main(args):
model_path = args.path
OLD_API = True
if OLD_API:
model = fastllm.ChatGLMModel()
model.load_weights(model_path)
model.warmup()
else:
global LLM_TYPE
LLM_TYPE = fastllm.get_llm_type(model_path)
print(f"llm model: {LLM_TYPE}")
model = fastllm.create_llm(model_path)

prompt = ""
while prompt != "exit":
prompt = input("User: ")
# model.response(prompt, print_back)
outputs = response(model, prompt_input=prompt)

for output in outputs:
# print()
print(output)
sys.stdout.flush()

if __name__ == "__main__":
args = args_parser()
main(args)
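For context, a minimal driver sketch (not part of the commit) showing how the response() generator above can be consumed with streaming enabled; the weight-file path and the importability of cli_low_api are assumptions:

import sys
sys.path.append('./build-py')
import pyfastllm as fastllm

from cli_low_api import response  # the low-level generator defined above

model = fastllm.ChatGLMModel()
model.load_weights("./chatglm-6b.flm")  # hypothetical path to a converted weight file
model.warmup()

# response() prints each chunk itself; with stream_output=True it also yields
# the chunk so the caller can forward it elsewhere (e.g. over a socket).
pieces = []
for chunk in response(model, "Hello", stream_output=True):
    pieces.append(chunk)
answer = "".join(pieces)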
3 changes: 2 additions & 1 deletion include/fastllm.h
@@ -83,7 +83,8 @@ namespace fastllm {
NONE = 0, LINEAR = 1, EMBEDDING = 2
};

struct Data {
class Data {
public:
        bool lockInCPU = false; // if locked in CPU, the data must not be moved to other devices
        WeightType weightType = WeightType::NONE; // weight type; NONE means not a weight (or an unknown weight)

106 changes: 106 additions & 0 deletions pyfastllm/fastllm/convert.py
@@ -0,0 +1,106 @@
import sys
import logging
import struct
import numpy as np
import argparse
from .utils import torch2flm

try:
import torch
from transformers import AutoTokenizer, AutoModel # chatglm
from transformers import LlamaTokenizer, LlamaForCausalLM # alpaca
from transformers import AutoModelForCausalLM, AutoTokenizer # baichuan, moss
from peft import PeftModel
except Exception as e:
    logging.error(f"Make sure transformers and peft are installed: {e}")
sys.exit(1)

MODEL_DICT = {
"alpaca":{
"tokenizer": "minlik/chinese-alpaca-33b-merged",
"model": "minlik/chinese-alpaca-33b-merged"
},
"baichuan7B":{
"model": "baichuan-inc/baichuan-7B",
"tokenizer": "baichuan-inc/baichuan-7B",
"peft": "hiyouga/baichuan-7b-sft",
},
"chatglm6B":{
"tokenizer": "THUDM/chatglm-6b",
"model": "THUDM/chatglm-6b"
},
"moss":{
"model": "fnlp/moss-moon-003-sft",
"tokenizer": "fnlp/moss-moon-003-sft",
}
}

def parse_args():
    # -p: model path or HF hub path
    # -o / --out_path: export path
    # -q: quantization bit width
parser = argparse.ArgumentParser(description='build fastllm libs')
parser.add_argument('-o', dest='export_path', default=None,
help='output export path')
    parser.add_argument('-p', dest='model_path', nargs='+', default=None,
                        help='a list with "tokenizer", "model", "peft model", such as: -p THUDM/chatglm-6b THUDM/chatglm-6b')
parser.add_argument('-m', dest='model', default='',
help='model name with(alpaca, baichuan7B, chatglm6B, moss)')
args = parser.parse_args()
return args


def convert(model, tokenizer, export_path):
    torch2flm.tofile(export_path, model, tokenizer)


def alpaca(model_path):
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path).float()
return model, tokenizer

def baichuan7B(model_path, peft_path):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(model, peft_path).float()
layers = model.model.model.layers
for i in range(len(layers)):
layers[i].self_attn.W_pack.weight += torch.mm(layers[i].self_attn.W_pack.lora_B.default.weight, layers[i].self_attn.W_pack.lora_A.default.weight) * layers[i].self_attn.W_pack.scaling["default"]

return model, tokenizer

def chatglm6B(model_path, ):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
return model, tokenizer

def moss(model_path, ):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
return model, tokenizer

def main(args):
    assert args.model in MODEL_DICT, f"Model {args.model} is not supported yet!"


model_args = {}
model_args["model_path"] = MODEL_DICT[args.model].get("model")
    if "peft" in MODEL_DICT[args.model]:
model_args["peft_path"] = MODEL_DICT[args.model].get("peft")

if args.model_path:
model_args["model_path"] = args.model_path[0]
if len(args.model_path) > 2:
model_args["peft_path"] = args.model_path[2]

model, tokenizer = globals().get(args.model)(**model_args)
export_path = args.export_path or f"{args.model}-fp32.bin"
torch2flm.tofile(export_path, model.model, tokenizer)

if __name__ == "__main__":
args = parse_args()
main(args)
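As a sketch of how the helpers above compose outside the CLI (not part of the commit; the import paths are assumptions based on the package layout):

# Hypothetical direct use of the converter helpers, bypassing argparse:
# load chatglm-6b from the HF hub and export a float32 weight file.
from pyfastllm.fastllm import convert as cv        # assumed import path
from pyfastllm.fastllm.utils import torch2flm      # assumed import path

model, tokenizer = cv.chatglm6B("THUDM/chatglm-6b")
torch2flm.tofile("chatglm6B-fp32.bin", model, tokenizer)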
78 changes: 78 additions & 0 deletions pyfastllm/fastllm/utils/torch2flm.py
@@ -0,0 +1,78 @@
import struct
import numpy as np

def write_str(fp, s):
fp.write(struct.pack('i', len(s)))
fp.write(s.encode())

def write_kv(fp, key, value):
write_str(fp, key)
write_str(fp, value)


def write_dict(fp, kv_dict):
fp.write(struct.pack('i', len(kv_dict)))
for k, v in kv_dict.items():
write_kv(fp, str(k), str(v))


def tofile(export_path,
model,
tokenizer = None,
pre_prompt = None,
user_role = None,
bot_role = None,
history_sep = None):
    state_dict = model.state_dict()

with open(export_path, "wb") as fp:
# 0. version id
fp.write(struct.pack('i', 2))

        # 0.1 model info
        modelInfo = model.config.__dict__
        modelInfo["pre_prompt"] = pre_prompt or None
        modelInfo["user_role"] = user_role or None
        modelInfo["bot_role"] = bot_role or None
        modelInfo["history_sep"] = history_sep or None
        write_dict(fp, modelInfo)


# 1. vocab
if tokenizer:
if (hasattr(tokenizer, "sp_model")):
piece_size = tokenizer.sp_model.piece_size()
fp.write(struct.pack('i', piece_size))
for i in range(piece_size):
s = tokenizer.sp_model.id_to_piece(i).encode()
fp.write(struct.pack('i', len(s)))
for c in s:
fp.write(struct.pack('i', c))
fp.write(struct.pack('i', i))
else:
vocab = tokenizer.get_vocab()
fp.write(struct.pack('i', len(vocab)))
for v in vocab.keys():
s = v.encode()
fp.write(struct.pack('i', len(s)))
for c in s:
fp.write(struct.pack('i', c))
fp.write(struct.pack('i', vocab[v]))
else:
fp.write(struct.pack('i', 0))

        # 2. weight
        fp.write(struct.pack('i', len(state_dict)))
        tot = 0
        for key in state_dict:
            cur = state_dict[key].numpy().astype(np.float32)
            fp.write(struct.pack('i', len(key)))
            fp.write(key.encode())
            fp.write(struct.pack('i', len(cur.shape)))
            for i in cur.shape:
                fp.write(struct.pack('i', i))
            fp.write(struct.pack('i', 0))
            fp.write(cur.data)
            tot += 1
            print("output (", tot, "/", len(state_dict), end=" )\r")
        print("\nfinish.")
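For reference, a minimal reader sketch (not part of this commit) that parses back the header written by tofile above (version id, model-info dict, and vocab), assuming the same struct layout:

import struct

def read_int(fp):
    return struct.unpack('i', fp.read(4))[0]

def read_str(fp):
    return fp.read(read_int(fp)).decode()

def read_header(path):
    """Parse the header written by tofile: version, model info, vocab."""
    with open(path, "rb") as fp:
        version = read_int(fp)
        # model info: count, then (key, value) string pairs written by write_dict
        info = {}
        for _ in range(read_int(fp)):
            key = read_str(fp)
            info[key] = read_str(fp)
        # vocab: count, then per token its byte length, one int per byte, and the token id
        vocab = {}
        for _ in range(read_int(fp)):
            length = read_int(fp)
            token = bytes(read_int(fp) for _ in range(length))
            vocab[read_int(fp)] = token
        return version, info, vocab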
4 changes: 2 additions & 2 deletions src/models/chatglm.cpp
@@ -124,7 +124,7 @@ TimeRecord batchRecord;
unitLen = 128;
#endif
while ((pastKey.dims.size() == 0 && (pastKey.expansionDims.size() == 0 || k.dims[1] > pastKey.expansionDims[1]))
|| (pastKey.dims.size() > 0 && pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1])) {
|| (pastKey.dims.size() > 0 && (pastKey.expansionDims.size() == 0 || pastKey.dims[1] + k.dims[1] > pastKey.expansionDims[1]))) {
std::vector <int> newDims;
if (pastKey.Count(0) == 0 || pastKey.dims.size() == 0) {
newDims = std::vector <int> {k.dims[0], ((k.dims[1] - 1) / unitLen + 1) * unitLen, k.dims[2]};
@@ -139,7 +139,7 @@ TimeRecord batchRecord;
}

while ((pastValue.dims.size() == 0 && (pastValue.expansionDims.size() == 0 || v.dims[1] > pastValue.expansionDims[1]))
|| (pastValue.dims.size() > 0 && pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1])) {
|| (pastValue.dims.size() > 0 && (pastValue.expansionDims.size() == 0 || pastValue.dims[1] + v.dims[1] > pastValue.expansionDims[1]) )) {
std::vector <int> newDims;
if (pastValue.Count(0) == 0 || pastValue.dims.size() == 0) {
newDims = std::vector <int> {v.dims[0], ((v.dims[1] - 1) / unitLen + 1) * unitLen, v.dims[2]};
