forked from ztxz16/fastllm
Commit 56ced61 (1 parent: 35d3719)
Showing 6 changed files with 432 additions and 3 deletions.
@@ -0,0 +1,117 @@
# -*- coding: utf-8 -*-
import sys
import platform
import logging
import argparse

sys.path.append('./build-py')
import pyfastllm as fastllm  # or simply: import fastllm

logging.info(f"python compiler: {platform.python_compiler()}")


def args_parser():
    parser = argparse.ArgumentParser(description='pyfastllm')
    parser.add_argument('-m', '--model', type=int, required=False, default=0,
                        help='model type, default 0; one of 0 (chatglm), 1 (moss), 2 (vicuna), 3 (baichuan)')
    parser.add_argument('-p', '--path', type=str, required=True, default='', help='path to the model file')
    parser.add_argument('-t', '--threads', type=int, default=4, help='number of threads to use')
    parser.add_argument('-l', '--low', action='store_true', help='use low-memory mode')
    args = parser.parse_args()
    return args


# Use this function with caution: it still has bugs and is only an example of the
# low-level API. Do not use it in production.
def response(model, prompt_input: str, stream_output: bool = False):
    # ChatGLM-6B special token ids
    gmask_token_id = 130001
    bos_token_id = 130004
    eos_token_id = 130005

    input_ids = model.weight.tokenizer.encode(prompt_input)
    gmask_bos = fastllm.Tensor(fastllm.float32, [1, 2], [gmask_token_id, bos_token_id])
    input_ids = fastllm.cat([input_ids, gmask_bos], 0)

    # Build the attention mask and position ids for the prompt.
    seq_len = input_ids.count(0)
    vmask = [0] * (seq_len * seq_len)
    vpids = [0] * (seq_len * 2)
    for i in range(seq_len - 1):
        vmask[i * seq_len + seq_len - 1] = 1
        vpids[i] = i
    vpids[seq_len - 1] = seq_len - 2
    vpids[seq_len * 2 - 1] = 1
    attention_mask = fastllm.Tensor(fastllm.float32, [seq_len, seq_len], vmask)
    position_ids = fastllm.Tensor(fastllm.float32, [2, seq_len], vpids)

    # One empty (key, value) cache pair per transformer block.
    pastKeyValues = []
    for _ in range(model.block_cnt):
        pastKeyValues.append([fastllm.Tensor(fastllm.float32), fastllm.Tensor(fastllm.float32)])

    ret_str = ""
    ret_len = 1
    mask_ids = -1
    results = []
    penalty_factor = fastllm.Tensor()

    # NOTE: the yield below makes this function a generator, so the decode loop
    # only runs while the caller iterates over it.
    while True:
        ret, pastKeyValues = model(input_ids, attention_mask, position_ids, penalty_factor, pastKeyValues)
        if ret == eos_token_id:
            break

        results.append(ret)
        cur_str = model.weight.tokenizer.decode(fastllm.Tensor(fastllm.float32, [len(results)], results))
        ret_str += cur_str

        print(cur_str, end="")
        sys.stdout.flush()
        if stream_output:
            yield cur_str

        ret_len += 1
        results = []

        if mask_ids == -1:
            mask_ids = seq_len - 2

        # Decode one token at a time from here on.
        input_ids = fastllm.Tensor(fastllm.float32, [1, 1], [ret])
        attention_mask = fastllm.Tensor()
        position_ids = fastllm.Tensor(fastllm.float32, [2, 1], [mask_ids, ret_len])

    print()
    return ret_str


LLM_TYPE = ""


def print_back(idx: int, content: str):
    if idx == 0:
        print(f"{LLM_TYPE}:{content}", end='')
    elif idx > 0:
        print(f"{content}", end='')
    elif idx == -1:
        print()

    sys.stdout.flush()


def main(args):
    model_path = args.path
    OLD_API = True
    if OLD_API:
        model = fastllm.ChatGLMModel()
        model.load_weights(model_path)
        model.warmup()
    else:
        global LLM_TYPE
        LLM_TYPE = fastllm.get_llm_type(model_path)
        print(f"llm model: {LLM_TYPE}")
        model = fastllm.create_llm(model_path)

    prompt = ""
    while prompt != "exit":
        prompt = input("User: ")
        # model.response(prompt, print_back)
        outputs = response(model, prompt_input=prompt)

        for output in outputs:
            # print()
            print(output)
            sys.stdout.flush()


if __name__ == "__main__":
    args = args_parser()
    main(args)
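
Because response() contains a yield, calling it returns a generator and the decode loop only runs as the caller iterates it. A minimal streaming sketch, assuming a model loaded exactly as in main(); the prompt string is an arbitrary example and the function keeps the bugs noted in its comment:

# Hypothetical streaming use of the low-level response() generator above.
# `model` is assumed to be a fastllm.ChatGLMModel already loaded via load_weights().
for piece in response(model, "Hello", stream_output=True):
    print(piece, end="", flush=True)
print()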
@@ -0,0 +1,106 @@
import sys
import logging
import struct
import numpy as np
import argparse
from .utils import torch2flm

try:
    import torch
    from transformers import AutoTokenizer, AutoModel  # chatglm
    from transformers import LlamaTokenizer, LlamaForCausalLM  # alpaca
    from transformers import AutoModelForCausalLM, AutoTokenizer  # baichuan, moss
    from peft import PeftModel
except Exception as e:
    logging.error("Make sure that you installed transformers and peft!!!")
    sys.exit(1)

MODEL_DICT = {
    "alpaca": {
        "tokenizer": "minlik/chinese-alpaca-33b-merged",
        "model": "minlik/chinese-alpaca-33b-merged"
    },
    "baichuan7B": {
        "model": "baichuan-inc/baichuan-7B",
        "tokenizer": "baichuan-inc/baichuan-7B",
        "peft": "hiyouga/baichuan-7b-sft",
    },
    "chatglm6B": {
        "tokenizer": "THUDM/chatglm-6b",
        "model": "THUDM/chatglm-6b"
    },
    "moss": {
        "model": "fnlp/moss-moon-003-sft",
        "tokenizer": "fnlp/moss-moon-003-sft",
    }
}


def parse_args():
    # -p model path or Hugging Face hub id
    # -o --out_path export path
    # -q quantization bit width
    parser = argparse.ArgumentParser(description='build fastllm libs')
    parser.add_argument('-o', dest='export_path', default=None,
                        help='output export path')
    parser.add_argument('-p', dest='model_path', nargs='+',
                        help='a list with "tokenizer", "model", "peft model", such as: -p THUDM/chatglm-6b THUDM/chatglm-6b')
    parser.add_argument('-m', dest='model', default='',
                        help='model name (one of: alpaca, baichuan7B, chatglm6B, moss)')
    args = parser.parse_args()
    return args


def convert(model, tokenizer, export_path):
    torch2flm.tofile(export_path, model, tokenizer)


def alpaca(model_path):
    tokenizer = LlamaTokenizer.from_pretrained(model_path)
    model = LlamaForCausalLM.from_pretrained(model_path).float()
    return model, tokenizer


def baichuan7B(model_path, peft_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
    model = PeftModel.from_pretrained(model, peft_path).float()
    # Merge the LoRA weights into the packed attention projection.
    layers = model.model.model.layers
    for i in range(len(layers)):
        layers[i].self_attn.W_pack.weight += torch.mm(layers[i].self_attn.W_pack.lora_B.default.weight, layers[i].self_attn.W_pack.lora_A.default.weight) * layers[i].self_attn.W_pack.scaling["default"]

    return model, tokenizer


def chatglm6B(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
    model = model.eval()
    return model, tokenizer


def moss(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float()
    model = model.eval()
    return model, tokenizer


def main(args):
    assert args.model in MODEL_DICT, f"Not Support {args.model} Yet!!!"

    model_args = {}
    model_args["model_path"] = MODEL_DICT[args.model].get("model")
    if "peft" in MODEL_DICT[args.model]:
        model_args["peft_path"] = MODEL_DICT[args.model].get("peft")

    if args.model_path:
        model_args["model_path"] = args.model_path[0]
        if len(args.model_path) > 2:
            model_args["peft_path"] = args.model_path[2]

    model, tokenizer = globals().get(args.model)(**model_args)
    export_path = args.export_path or f"{args.model}-fp32.bin"
    torch2flm.tofile(export_path, model.model, tokenizer)


if __name__ == "__main__":
    args = parse_args()
    main(args)
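
For reference, a minimal sketch of driving this converter programmatically, bypassing argparse; chatglm6B() and torch2flm.tofile() are the functions defined in this commit, while the output file name is an arbitrary placeholder:

# Hypothetical direct use of the helpers above (output path is a placeholder).
model, tokenizer = chatglm6B("THUDM/chatglm-6b")
torch2flm.tofile("chatglm6B-fp32.bin", model, tokenizer)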
@@ -0,0 +1,78 @@
import struct
import numpy as np


def write_str(fp, s):
    fp.write(struct.pack('i', len(s)))
    fp.write(s.encode())


def write_kv(fp, key, value):
    write_str(fp, key)
    write_str(fp, value)


def write_dict(fp, kv_dict):
    fp.write(struct.pack('i', len(kv_dict)))
    for k, v in kv_dict.items():
        write_kv(fp, str(k), str(v))


def tofile(export_path,
           model,
           tokenizer=None,
           pre_prompt=None,
           user_role=None,
           bot_role=None,
           history_sep=None):
    state_dict = model.state_dict()

    with open(export_path, "wb") as fp:
        # 0. version id
        fp.write(struct.pack('i', 2))

        # 0.1 model info
        model_info = model.config.__dict__
        model_info["pre_prompt"] = pre_prompt or None
        model_info["user_role"] = user_role or None
        model_info["bot_role"] = bot_role or None
        model_info["history_sep"] = history_sep or None
        write_dict(fp, model_info)

        # 1. vocab
        if tokenizer:
            if hasattr(tokenizer, "sp_model"):
                # sentencepiece-based tokenizer
                piece_size = tokenizer.sp_model.piece_size()
                fp.write(struct.pack('i', piece_size))
                for i in range(piece_size):
                    s = tokenizer.sp_model.id_to_piece(i).encode()
                    fp.write(struct.pack('i', len(s)))
                    for c in s:
                        fp.write(struct.pack('i', c))
                    fp.write(struct.pack('i', i))
            else:
                vocab = tokenizer.get_vocab()
                fp.write(struct.pack('i', len(vocab)))
                for v in vocab.keys():
                    s = v.encode()
                    fp.write(struct.pack('i', len(s)))
                    for c in s:
                        fp.write(struct.pack('i', c))
                    fp.write(struct.pack('i', vocab[v]))
        else:
            fp.write(struct.pack('i', 0))

        # 2. weight
        fp.write(struct.pack('i', len(state_dict)))
        tot = 0
        for key in state_dict:
            cur = state_dict[key].numpy().astype(np.float32)
            fp.write(struct.pack('i', len(key)))
            fp.write(key.encode())
            fp.write(struct.pack('i', len(cur.shape)))
            for i in cur.shape:
                fp.write(struct.pack('i', i))
            fp.write(struct.pack('i', 0))
            fp.write(cur.data)
            tot += 1
            print("output (", tot, "/", len(state_dict), end=" )\r")
        print("\nfinish.")
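
The writer above defines the export header layout only implicitly; the reader sketch below is purely illustrative, mirroring write_str and write_dict (it assumes the native 4-byte 'i' layout used by struct.pack above and is not an official parser):

import struct

def read_str(fp):
    # Mirrors write_str: a 4-byte int length followed by the raw encoded bytes.
    (n,) = struct.unpack('i', fp.read(4))
    return fp.read(n).decode()

def read_dict(fp):
    # Mirrors write_dict: an int count, then that many (key, value) string pairs.
    (n,) = struct.unpack('i', fp.read(4))
    out = {}
    for _ in range(n):
        key = read_str(fp)
        out[key] = read_str(fp)
    return out

# Hypothetical usage against a file produced by tofile() above:
# with open("chatglm6B-fp32.bin", "rb") as fp:
#     (version,) = struct.unpack('i', fp.read(4))   # section 0: version id (2)
#     model_info = read_dict(fp)                    # section 0.1: model info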