diff --git a/Intel_device_demo/README.md b/Intel_device_demo/README.md
index 847aa663..197cc7d1 100644
--- a/Intel_device_demo/README.md
+++ b/Intel_device_demo/README.md
@@ -10,5 +10,7 @@
 - 其他理论支持 OpenVINO 加速的Intel 工具套件。
 ## 2. 文件目录
+- IPEX_llm_xxx_demo: IPEX-LLM 是一个为Intel XPU(Xeon/Core/Flex/Arc/PVC)打造的低精度轻量级大语言模型库,在Intel平台上具有广泛的模型支持、最低的延迟和最小的内存占用,实现加速模型部署示例。
 - OpenVINO_demo: 使用 Intel OpenVINO 推理加速框架,实现加速模型部署示例。
-- Pytorch_demo (暂未推出) : 使用 Intel Pytorch Extension 实现在 Pytorch 环境上开发(适用于 Intel Arc 系列 GPU)
\ No newline at end of file
+- Pytorch_demo (暂未推出) : 使用 Intel Pytorch Extension 实现在 Pytorch 环境上开发(适用于 Intel Arc 系列 GPU)
+
diff --git a/Intel_device_demo/ipex_llm_cpu_demo/api_server.py b/Intel_device_demo/ipex_llm_cpu_demo/api_server.py
new file mode 100644
index 00000000..ef131707
--- /dev/null
+++ b/Intel_device_demo/ipex_llm_cpu_demo/api_server.py
@@ -0,0 +1,531 @@
+"""
+This script implements an API for the ChatGLM3-6B model,
+formatted similarly to OpenAI's API (https://platform.openai.com/docs/api-reference/chat).
+It's designed to be run as a web server using FastAPI and uvicorn,
+making the ChatGLM3-6B model accessible through the OpenAI client.
+
+Key Components and Features:
+- Model and Tokenizer Setup: Configures the model and tokenizer paths and loads them.
+- FastAPI Configuration: Sets up a FastAPI application with CORS middleware for handling cross-origin requests.
+- API Endpoints:
+  - "/v1/models": Lists the available models, specifically ChatGLM3-6B.
+  - "/v1/chat/completions": Processes chat completion requests with options for streaming and regular responses.
+  - "/v1/embeddings": Processes embedding requests for a list of text inputs.
+- Token Limit Caution: In the OpenAI API, 'max_tokens' is equivalent to HuggingFace's 'max_new_tokens', not 'max_length'.
+For instance, setting 'max_tokens' to 8192 for a 6B model would result in an error, because the model cannot output
+that many tokens once the history and prompt tokens are accounted for.
+- Stream Handling and Custom Functions: Manages streaming responses and custom function calls within chat responses.
+- Pydantic Models: Defines structured models for requests and responses, enhancing API documentation and type safety.
+- Main Execution: Initializes the model and tokenizer, and starts the FastAPI app on the designated host and port.
+
+Note:
+    This script doesn't include the setup for special tokens or multi-GPU support by default.
+    Users need to configure their special tokens and can enable multi-GPU support as per the provided instructions.
+    Embedding models are only supported on a single GPU.
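+
+Example (a minimal sketch, assuming the defaults in this file: the model selected via the MODEL_PATH
+environment variable and uvicorn listening on 0.0.0.0:8000). Once the server is running, it can be
+queried with the standard OpenAI Python client:
+
+    from openai import OpenAI
+    client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")
+    resp = client.chat.completions.create(
+        model="chatglm3-6b",
+        messages=[{"role": "user", "content": "Hello"}],
+    )
+    print(resp.choices[0].message.content)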
+ +""" + +import os +import time +import tiktoken +import torch +import uvicorn + +from fastapi import FastAPI, HTTPException, Response +from fastapi.middleware.cors import CORSMiddleware + +from contextlib import asynccontextmanager +from typing import List, Literal, Optional, Union +from loguru import logger +from pydantic import BaseModel, Field +from ipex_llm.transformers import AutoModel +from transformers import AutoTokenizer +from utils import process_response, generate_chatglm3, generate_stream_chatglm3 +# from sentence_transformers import SentenceTransformer + +from sse_starlette.sse import EventSourceResponse + +# Set up limit request time +EventSourceResponse.DEFAULT_PING_INTERVAL = 1000 + +# set LLM path +MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b') +TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", MODEL_PATH) + +# set Embedding Model path +EMBEDDING_PATH = os.environ.get('EMBEDDING_PATH', 'BAAI/bge-large-zh-v1.5') + + +@asynccontextmanager +async def lifespan(app: FastAPI): + yield + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + +app = FastAPI(lifespan=lifespan) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +class ModelCard(BaseModel): + id: str + object: str = "model" + created: int = Field(default_factory=lambda: int(time.time())) + owned_by: str = "owner" + root: Optional[str] = None + parent: Optional[str] = None + permission: Optional[list] = None + + +class ModelList(BaseModel): + object: str = "list" + data: List[ModelCard] = [] + + +class FunctionCallResponse(BaseModel): + name: Optional[str] = None + arguments: Optional[str] = None + + +class ChatMessage(BaseModel): + role: Literal["user", "assistant", "system", "function"] + content: str = None + name: Optional[str] = None + function_call: Optional[FunctionCallResponse] = None + + +class DeltaMessage(BaseModel): + role: Optional[Literal["user", "assistant", "system"]] = None + content: Optional[str] = None + function_call: Optional[FunctionCallResponse] = None + + +## for Embedding +class EmbeddingRequest(BaseModel): + input: List[str] + model: str + + +class CompletionUsage(BaseModel): + prompt_tokens: int + completion_tokens: int + total_tokens: int + + +class EmbeddingResponse(BaseModel): + data: list + model: str + object: str + usage: CompletionUsage + + +# for ChatCompletionRequest + +class UsageInfo(BaseModel): + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens: Optional[int] = 0 + + +class ChatCompletionRequest(BaseModel): + model: str + messages: List[ChatMessage] + temperature: Optional[float] = 0.8 + top_p: Optional[float] = 0.8 + max_tokens: Optional[int] = None + stream: Optional[bool] = False + tools: Optional[Union[dict, List[dict]]] = None + repetition_penalty: Optional[float] = 1.1 + + +class ChatCompletionResponseChoice(BaseModel): + index: int + message: ChatMessage + finish_reason: Literal["stop", "length", "function_call"] + + +class ChatCompletionResponseStreamChoice(BaseModel): + delta: DeltaMessage + finish_reason: Optional[Literal["stop", "length", "function_call"]] + index: int + + +class ChatCompletionResponse(BaseModel): + model: str + id: str + object: Literal["chat.completion", "chat.completion.chunk"] + choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]] + created: Optional[int] = Field(default_factory=lambda: int(time.time())) + usage: Optional[UsageInfo] = None + + 
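+
+# The routes below mirror the OpenAI REST surface exposed by this demo:
+#   GET  /health               - health check
+#   GET  /v1/models            - lists the single served model (chatglm3-6b)
+#   POST /v1/chat/completions  - chat completions, streaming and non-streaming
+#   POST /v1/embeddings        - embeddings for a list of input strings
+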
+@app.get("/health")
+async def health() -> Response:
+    """Health check."""
+    return Response(status_code=200)
+
+
+@app.post("/v1/embeddings", response_model=EmbeddingResponse)
+async def get_embeddings(request: EmbeddingRequest):
+    embeddings = [embedding_model.encode(text) for text in request.input]
+    embeddings = [embedding.tolist() for embedding in embeddings]
+
+    def num_tokens_from_string(string: str) -> int:
+        """
+        Returns the number of tokens in a text string,
+        using the cl100k_base tokenizer.
+        """
+        encoding = tiktoken.get_encoding('cl100k_base')
+        num_tokens = len(encoding.encode(string))
+        return num_tokens
+
+    response = {
+        "data": [
+            {
+                "object": "embedding",
+                "embedding": embedding,
+                "index": index
+            }
+            for index, embedding in enumerate(embeddings)
+        ],
+        "model": request.model,
+        "object": "list",
+        "usage": CompletionUsage(
+            prompt_tokens=sum(len(text.split()) for text in request.input),
+            completion_tokens=0,
+            total_tokens=sum(num_tokens_from_string(text) for text in request.input),
+        )
+    }
+    return response
+
+
+@app.get("/v1/models", response_model=ModelList)
+async def list_models():
+    model_card = ModelCard(
+        id="chatglm3-6b"
+    )
+    return ModelList(
+        data=[model_card]
+    )
+
+
+@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
+async def create_chat_completion(request: ChatCompletionRequest):
+    global model, tokenizer
+
+    if len(request.messages) < 1 or request.messages[-1].role == "assistant":
+        raise HTTPException(status_code=400, detail="Invalid request")
+
+    gen_params = dict(
+        messages=request.messages,
+        temperature=request.temperature,
+        top_p=request.top_p,
+        max_tokens=request.max_tokens or 1024,
+        echo=False,
+        stream=request.stream,
+        repetition_penalty=request.repetition_penalty,
+        tools=request.tools,
+    )
+    logger.debug(f"==== request ====\n{gen_params}")
+
+    if request.stream:
+
+        # Use stream mode to read the first few characters; if it is not a function call, stream the output directly.
+        predict_stream_generator = predict_stream(request.model, gen_params)
+        output = next(predict_stream_generator)
+        if not contains_custom_function(output):
+            return EventSourceResponse(predict_stream_generator, media_type="text/event-stream")
+
+        # Otherwise, obtain the full result at once and determine whether tools need to be called.
+        logger.debug(f"First result output:\n{output}")
+
+        function_call = None
+        if output and request.tools:
+            try:
+                function_call = process_response(output, use_tool=True)
+            except Exception:
+                logger.warning("Failed to parse tool call")
+
+        # Call the function
+        if isinstance(function_call, dict):
+            function_call = FunctionCallResponse(**function_call)
+
+            """
+            In this demo, we did not register any tools.
+            You can use the tools that have been implemented in our `tools_using_demo` and implement your own streaming tool implementation here.
+ Similar to the following method: + function_args = json.loads(function_call.arguments) + tool_response = dispatch_tool(tool_name: str, tool_params: dict) + """ + tool_response = "" + + if not gen_params.get("messages"): + gen_params["messages"] = [] + + gen_params["messages"].append(ChatMessage( + role="assistant", + content=output, + )) + gen_params["messages"].append(ChatMessage( + role="function", + name=function_call.name, + content=tool_response, + )) + + # Streaming output of results after function calls + generate = predict(request.model, gen_params) + return EventSourceResponse(generate, media_type="text/event-stream") + + else: + # Handled to avoid exceptions in the above parsing function process. + generate = parse_output_text(request.model, output) + return EventSourceResponse(generate, media_type="text/event-stream") + + # Here is the handling of stream = False + response = generate_chatglm3(model, tokenizer, gen_params) + + # Remove the first newline character + if response["text"].startswith("\n"): + response["text"] = response["text"][1:] + response["text"] = response["text"].strip() + + usage = UsageInfo() + function_call, finish_reason = None, "stop" + if request.tools: + try: + function_call = process_response(response["text"], use_tool=True) + except: + logger.warning("Failed to parse tool call, maybe the response is not a tool call or have been answered.") + + if isinstance(function_call, dict): + finish_reason = "function_call" + function_call = FunctionCallResponse(**function_call) + + message = ChatMessage( + role="assistant", + content=response["text"], + function_call=function_call if isinstance(function_call, FunctionCallResponse) else None, + ) + + logger.debug(f"==== message ====\n{message}") + + choice_data = ChatCompletionResponseChoice( + index=0, + message=message, + finish_reason=finish_reason, + ) + task_usage = UsageInfo.model_validate(response["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): + setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) + + return ChatCompletionResponse( + model=request.model, + id="", # for open_source model, id is empty + choices=[choice_data], + object="chat.completion", + usage=usage + ) + + +async def predict(model_id: str, params: dict): + global model, tokenizer + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(role="assistant"), + finish_reason=None + ) + chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk") + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + previous_text = "" + for new_response in generate_stream_chatglm3(model, tokenizer, params): + decoded_unicode = new_response["text"] + delta_text = decoded_unicode[len(previous_text):] + previous_text = decoded_unicode + + finish_reason = new_response["finish_reason"] + if len(delta_text) == 0 and finish_reason != "function_call": + continue + + function_call = None + if finish_reason == "function_call": + try: + function_call = process_response(decoded_unicode, use_tool=True) + except: + logger.warning( + "Failed to parse tool call, maybe the response is not a tool call or have been answered.") + + if isinstance(function_call, dict): + function_call = FunctionCallResponse(**function_call) + + delta = DeltaMessage( + content=delta_text, + role="assistant", + function_call=function_call if isinstance(function_call, FunctionCallResponse) else None, + ) + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + 
delta=delta, + finish_reason=finish_reason + ) + chunk = ChatCompletionResponse( + model=model_id, + id="", + choices=[choice_data], + object="chat.completion.chunk" + ) + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(), + finish_reason="stop" + ) + chunk = ChatCompletionResponse( + model=model_id, + id="", + choices=[choice_data], + object="chat.completion.chunk" + ) + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + yield '[DONE]' + + +def predict_stream(model_id, gen_params): + """ + The function call is compatible with stream mode output. + + The first seven characters are determined. + If not a function call, the stream output is directly generated. + Otherwise, the complete character content of the function call is returned. + + :param model_id: + :param gen_params: + :return: + """ + output = "" + is_function_call = False + has_send_first_chunk = False + for new_response in generate_stream_chatglm3(model, tokenizer, gen_params): + decoded_unicode = new_response["text"] + delta_text = decoded_unicode[len(output):] + output = decoded_unicode + + # When it is not a function call and the character length is> 7, + # try to judge whether it is a function call according to the special function prefix + if not is_function_call and len(output) > 7: + + # Determine whether a function is called + is_function_call = contains_custom_function(output) + if is_function_call: + continue + + # Non-function call, direct stream output + finish_reason = new_response["finish_reason"] + + # Send an empty string first to avoid truncation by subsequent next() operations. + if not has_send_first_chunk: + message = DeltaMessage( + content="", + role="assistant", + function_call=None, + ) + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=message, + finish_reason=finish_reason + ) + chunk = ChatCompletionResponse( + model=model_id, + id="", + choices=[choice_data], + created=int(time.time()), + object="chat.completion.chunk" + ) + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + send_msg = delta_text if has_send_first_chunk else output + has_send_first_chunk = True + message = DeltaMessage( + content=send_msg, + role="assistant", + function_call=None, + ) + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=message, + finish_reason=finish_reason + ) + chunk = ChatCompletionResponse( + model=model_id, + id="", + choices=[choice_data], + created=int(time.time()), + object="chat.completion.chunk" + ) + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + if is_function_call: + yield output + else: + yield '[DONE]' + + +async def parse_output_text(model_id: str, value: str): + """ + Directly output the text content of value + + :param model_id: + :param value: + :return: + """ + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(role="assistant", content=value), + finish_reason=None + ) + chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk") + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(), + finish_reason="stop" + ) + chunk = ChatCompletionResponse(model=model_id, id="", choices=[choice_data], object="chat.completion.chunk") + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + yield '[DONE]' + + +def contains_custom_function(value: str) -> 
bool:
+    """
+    Determine whether the output is a function call, based on a special function-name prefix.
+
+    For example, the functions defined in "tools_using_demo/tool_register.py" are all "get_xxx" and start with "get_".
+
+    [Note] This is not a rigorous check, only a heuristic for reference.
+
+    :param value:
+    :return:
+    """
+    return bool(value and 'get_' in value)
+
+if __name__ == "__main__":
+    # Load LLM
+    tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH, trust_remote_code=True)
+    model = AutoModel.from_pretrained(MODEL_PATH,
+                                      load_in_4bit=True,
+                                      trust_remote_code=True)
+    # Load the embedding model (required by the /v1/embeddings route)
+    # embedding_model = SentenceTransformer(EMBEDDING_PATH, device="cuda")
+    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)
diff --git a/Intel_device_demo/ipex_llm_cpu_demo/chatglm3_infer.py b/Intel_device_demo/ipex_llm_cpu_demo/chatglm3_infer.py
new file mode 100644
index 00000000..b8ca6dcc
--- /dev/null
+++ b/Intel_device_demo/ipex_llm_cpu_demo/chatglm3_infer.py
@@ -0,0 +1,32 @@
+import time
+from ipex_llm.transformers import AutoModel
+from transformers import AutoTokenizer
+
+# ChatGLM3 chat format: a user turn followed by the assistant tag
+CHATGLM_V3_PROMPT_FORMAT = "<|user|>\n{prompt}\n<|assistant|>"
+
+# Please specify the local path to the chatglm3-6b model
+
+model_path = "D:/AI/ChatGLM3/model/chatglm3-6b/"
+
+# Load the ChatGLM3-6B model and quantize it to INT4
+model = AutoModel.from_pretrained(model_path,
+                                  load_in_4bit=True,
+                                  trust_remote_code=True)
+# Load the tokenizer
+tokenizer = AutoTokenizer.from_pretrained(model_path,
+                                          trust_remote_code=True)
+# Prepare a ChatGLM3-format prompt
+prompt = CHATGLM_V3_PROMPT_FORMAT.format(prompt="Who are you?")
+# Encode the prompt
+input_ids = tokenizer.encode(prompt, return_tensors="pt")
+st = time.time()
+# Run inference and generate tokens
+output = model.generate(input_ids, max_new_tokens=32)
+end = time.time()
+# Decode the generated tokens and display them
+output_str = tokenizer.decode(output[0], skip_special_tokens=True)
+print(f'Inference time: {end-st} s')
+print('-'*20, 'Prompt', '-'*20)
+print(prompt)
+print('-'*20, 'Output', '-'*20)
+print(output_str)
diff --git a/Intel_device_demo/ipex_llm_cpu_demo/chatglm3_web_demo.py b/Intel_device_demo/ipex_llm_cpu_demo/chatglm3_web_demo.py
new file mode 100644
index 00000000..c19744a0
--- /dev/null
+++ b/Intel_device_demo/ipex_llm_cpu_demo/chatglm3_web_demo.py
@@ -0,0 +1,93 @@
+"""
+This script creates an interactive web demo for the ChatGLM3-6B model using Streamlit,
+a Python library for quickly building interactive web apps for machine learning models.
+It's designed to showcase the capabilities of the ChatGLM3-6B model in a user-friendly interface,
+allowing users to interact with the model through a chat-like interface.
+
+Usage:
+- Run `streamlit run chatglm3_web_demo.py` to start the web server.
+- Interact with the model by typing questions and receiving responses.
+
+Requirements:
+- Streamlit (a recent version providing st.chat_message, st.chat_input and st.rerun) should be installed.
+
+Note: Chat messages are rendered with st.markdown,
+ensuring that the chat interface displays formatted text correctly.
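+
+Configuration:
+- The MODEL_PATH environment variable selects the checkpoint (defaults to 'THUDM/chatglm3-6b').
+- The model is loaded in 4-bit precision via ipex_llm and cached with st.cache_resource,
+  so it is only loaded once per server process.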
+
+"""
+
+import os
+import streamlit as st
+from ipex_llm.transformers import AutoModel
+from transformers import AutoTokenizer
+
+
+st.set_page_config(
+    page_title="ChatGLM3-6B+BigDL-LLM demo",
+    page_icon=":robot:",
+    layout="wide"
+)
+
+MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/chatglm3-6b')
+
+@st.cache_resource
+def get_model():
+    model = AutoModel.from_pretrained(MODEL_PATH,
+                                      load_in_4bit=True,
+                                      trust_remote_code=True)
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH,
+                                              trust_remote_code=True)
+    return tokenizer, model
+
+tokenizer, model = get_model()
+
+if "history" not in st.session_state:
+    st.session_state.history = []
+if "past_key_values" not in st.session_state:
+    st.session_state.past_key_values = None
+
+max_length = st.sidebar.slider("max_length", 0, 32768, 8192, step=1)
+top_p = st.sidebar.slider("top_p", 0.0, 1.0, 0.8, step=0.01)
+temperature = st.sidebar.slider("temperature", 0.0, 1.0, 0.6, step=0.01)
+
+buttonClean = st.sidebar.button("Clear chat history", key="clean")
+if buttonClean:
+    st.session_state.history = []
+    st.session_state.past_key_values = None
+    st.rerun()
+
+for i, message in enumerate(st.session_state.history):
+    if message["role"] == "user":
+        with st.chat_message(name="user", avatar="user"):
+            st.markdown(message["content"])
+    else:
+        with st.chat_message(name="assistant", avatar="assistant"):
+            st.markdown(message["content"])
+
+with st.chat_message(name="user", avatar="user"):
+    input_placeholder = st.empty()
+with st.chat_message(name="assistant", avatar="assistant"):
+    message_placeholder = st.empty()
+
+prompt_text = st.chat_input("Please enter your question.")
+
+if prompt_text:
+
+    input_placeholder.markdown(prompt_text)
+    history = st.session_state.history
+    past_key_values = st.session_state.past_key_values
+    # Stream the response token by token, carrying the KV cache across turns
+    for response, history, past_key_values in model.stream_chat(
+        tokenizer,
+        prompt_text,
+        history,
+        past_key_values=past_key_values,
+        max_length=max_length,
+        top_p=top_p,
+        temperature=temperature,
+        return_past_key_values=True,
+    ):
+        message_placeholder.markdown(response)
+
+    st.session_state.history = history
+    st.session_state.past_key_values = past_key_values
\ No newline at end of file
diff --git a/Intel_device_demo/ipex_llm_cpu_demo/generate.py b/Intel_device_demo/ipex_llm_cpu_demo/generate.py
new file mode 100644
index 00000000..d266d5c0
--- /dev/null
+++ b/Intel_device_demo/ipex_llm_cpu_demo/generate.py
@@ -0,0 +1,56 @@
+import torch
+import time
+import argparse
+import numpy as np
+
+from ipex_llm.transformers import AutoModel
+# The checkpoint is downloaded from ModelScope (see model_hub='modelscope' below),
+# so the ModelScope AutoTokenizer is used rather than the Hugging Face one.
+from modelscope import AutoTokenizer
+
+# You can tune the prompt for your own model;
+# the prompt format here follows https://github.com/THUDM/ChatGLM3/blob/main/PROMPT.md
+CHATGLM_V3_PROMPT_FORMAT = "<|user|>\n{prompt}\n<|assistant|>"
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Predict Tokens using `generate()` API for ModelScope ChatGLM3 model')
+    parser.add_argument('--repo-id-or-model-path', type=str, default="ZhipuAI/chatglm3-6b",
+                        help='The ModelScope repo id for the ChatGLM3 model to be downloaded'
+                             ', or the path to the ModelScope checkpoint folder')
+    parser.add_argument('--prompt', type=str, default="AI是什么?",
+                        help='Prompt to infer')
+    parser.add_argument('--n-predict', type=int, default=32,
+                        help='Max tokens to predict')
+
+    args = parser.parse_args()
+    model_path = args.repo_id_or_model_path
+
+    # Load the model in 4-bit precision,
+    # which converts the relevant layers in the model into
INT4 format + # It is important to set `model_hub='modelscope'`, otherwise model hub is default to be huggingface + model = AutoModel.from_pretrained(model_path, + load_in_4bit=True, + trust_remote_code=True, + model_hub='modelscope') + + # Load tokenizer + tokenizer = AutoTokenizer.from_pretrained(model_path, + trust_remote_code=True) + + # Generate predicted tokens + with torch.inference_mode(): + prompt = CHATGLM_V3_PROMPT_FORMAT.format(prompt=args.prompt) + input_ids = tokenizer.encode(prompt, return_tensors="pt") + st = time.time() + # if your selected model is capable of utilizing previous key/value attentions + # to enhance decoding speed, but has `"use_cache": false` in its model config, + # it is important to set `use_cache=True` explicitly in the `generate` function + # to obtain optimal performance with IPEX-LLM INT4 optimizations + output = model.generate(input_ids, + max_new_tokens=args.n_predict) + end = time.time() + output_str = tokenizer.decode(output[0], skip_special_tokens=True) + print(f'Inference time: {end - st} s') + print('-' * 20, 'Prompt', '-' * 20) + print(prompt) + print('-' * 20, 'Output', '-' * 20) + print(output_str) \ No newline at end of file diff --git a/Intel_device_demo/ipex_llm_cpu_demo/openai_api_request.py b/Intel_device_demo/ipex_llm_cpu_demo/openai_api_request.py new file mode 100644 index 00000000..572329d3 --- /dev/null +++ b/Intel_device_demo/ipex_llm_cpu_demo/openai_api_request.py @@ -0,0 +1,89 @@ +""" +This script is an example of using the OpenAI API to create various interactions with a ChatGLM3 model. +It includes functions to: + +1. Conduct a basic chat session, asking about weather conditions in multiple cities. +2. Initiate a simple chat in Chinese, asking the model to tell a short story. +3. Retrieve and print embeddings for a given text input. + +Each function demonstrates a different aspect of the API's capabilities, showcasing how to make requests +and handle responses. +""" + +from openai import OpenAI +import time + +base_url = "http://127.0.0.1:8000/v1/" +client = OpenAI(api_key="EMPTY", base_url=base_url) + + +def function_chat(): + messages = [{"role": "user", "content": "What's the weather like in San Francisco, Tokyo, and Paris?"}] + tools = [ + { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "location": { + "type": "string", + "description": "The city and state, e.g. San Francisco, CA", + }, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["location"], + }, + }, + } + ] + + response = client.chat.completions.create( + model="chatglm3-6b", + messages=messages, + tools=tools, + tool_choice="auto", + ) + if response: + content = response.choices[0].message.content + print(content) + else: + print("Error:", response.status_code) + + +def simple_chat(use_stream=True): + messages = [ + { + "role": "system", + "content": "You are ChatGLM3, a large language model trained by Zhipu.AI. Follow the user's " + "instructions carefully. 
Respond using markdown.", + }, + { + "role": "user", + "content": "你好,请你用生动的话语给我讲一个小故事吧" + } + ] + response = client.chat.completions.create( + model="chatglm3-6b", + messages=messages, + stream=use_stream, + max_tokens=256, + temperature=0.8, + presence_penalty=1.1, + top_p=0.8) + if response: + if use_stream: + for chunk in response: + print(chunk.choices[0].delta.content) + else: + content = response.choices[0].message.content + print(content) + else: + print("Error:", response.status_code) + + +if __name__ == "__main__": + simple_chat(use_stream=False) + simple_chat(use_stream=True) diff --git a/Intel_device_demo/ipex_llm_cpu_demo/utils.py b/Intel_device_demo/ipex_llm_cpu_demo/utils.py new file mode 100644 index 00000000..1d102ebf --- /dev/null +++ b/Intel_device_demo/ipex_llm_cpu_demo/utils.py @@ -0,0 +1,186 @@ +import gc +import json +import torch +from transformers import PreTrainedModel, PreTrainedTokenizer +from transformers.generation.logits_process import LogitsProcessor +from typing import Union, Tuple + + +class InvalidScoreLogitsProcessor(LogitsProcessor): + def __call__( + self, input_ids: torch.LongTensor, scores: torch.FloatTensor + ) -> torch.FloatTensor: + if torch.isnan(scores).any() or torch.isinf(scores).any(): + scores.zero_() + scores[..., 5] = 5e4 + return scores + + +def process_response(output: str, use_tool: bool = False) -> Union[str, dict]: + content = "" + for response in output.split("<|assistant|>"): + metadata, content = response.split("\n", maxsplit=1) + if not metadata.strip(): + content = content.strip() + content = content.replace("[[训练时间]]", "2023年") + else: + if use_tool: + content = "\n".join(content.split("\n")[1:-1]) + + def tool_call(**kwargs): + return kwargs + + parameters = eval(content) + content = { + "name": metadata.strip(), + "arguments": json.dumps(parameters, ensure_ascii=False) + } + else: + content = { + "name": metadata.strip(), + "content": content + } + return content + + +@torch.inference_mode() +def generate_stream_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict): + messages = params["messages"] + tools = params["tools"] + temperature = float(params.get("temperature", 1.0)) + repetition_penalty = float(params.get("repetition_penalty", 1.0)) + top_p = float(params.get("top_p", 1.0)) + max_new_tokens = int(params.get("max_tokens", 256)) + echo = params.get("echo", True) + messages = process_chatglm_messages(messages, tools=tools) + query, role = messages[-1]["content"], messages[-1]["role"] + + inputs = tokenizer.build_chat_input(query, history=messages[:-1], role=role) + inputs = inputs.to(model.device) + input_echo_len = len(inputs["input_ids"][0]) + + if input_echo_len >= model.config.seq_length: + print(f"Input length larger than {model.config.seq_length}") + + eos_token_id = [ + tokenizer.eos_token_id, + tokenizer.get_command("<|user|>"), + ] + + gen_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": True if temperature > 1e-5 else False, + "top_p": top_p, + "repetition_penalty": repetition_penalty, + "logits_processor": [InvalidScoreLogitsProcessor()], + } + if temperature > 1e-5: + gen_kwargs["temperature"] = temperature + + total_len = 0 + for total_ids in model.stream_generate(**inputs, eos_token_id=eos_token_id, **gen_kwargs): + total_ids = total_ids.tolist()[0] + total_len = len(total_ids) + if echo: + output_ids = total_ids[:-1] + else: + output_ids = total_ids[input_echo_len:-1] + + response = tokenizer.decode(output_ids) + if response and response[-1] != "�": + response, 
stop_found = apply_stopping_strings(response, ["<|observation|>"]) + + yield { + "text": response, + "usage": { + "prompt_tokens": input_echo_len, + "completion_tokens": total_len - input_echo_len, + "total_tokens": total_len, + }, + "finish_reason": "function_call" if stop_found else None, + } + + if stop_found: + break + + # Only last stream result contains finish_reason, we set finish_reason as stop + ret = { + "text": response, + "usage": { + "prompt_tokens": input_echo_len, + "completion_tokens": total_len - input_echo_len, + "total_tokens": total_len, + }, + "finish_reason": "stop", + } + yield ret + + gc.collect() + torch.cuda.empty_cache() + + +def process_chatglm_messages(messages, tools=None): + _messages = messages + messages = [] + if tools: + messages.append( + { + "role": "system", + "content": "Answer the following questions as best as you can. You have access to the following tools:", + "tools": tools + } + ) + + for m in _messages: + role, content, func_call = m.role, m.content, m.function_call + if role == "function": + messages.append( + { + "role": "observation", + "content": content + } + ) + + elif role == "assistant" and func_call is not None: + for response in content.split("<|assistant|>"): + metadata, sub_content = response.split("\n", maxsplit=1) + messages.append( + { + "role": role, + "metadata": metadata, + "content": sub_content.strip() + } + ) + else: + messages.append({"role": role, "content": content}) + return messages + + +def generate_chatglm3(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict): + for response in generate_stream_chatglm3(model, tokenizer, params): + pass + return response + + +def apply_stopping_strings(reply, stop_strings) -> Tuple[str, bool]: + stop_found = False + for string in stop_strings: + idx = reply.find(string) + if idx != -1: + reply = reply[:idx] + stop_found = True + break + + if not stop_found: + # If something like "\nYo" is generated just before "\nYou: is completed, trim it + for string in stop_strings: + for j in range(len(string) - 1, 0, -1): + if reply[-j:] == string[:j]: + reply = reply[:-j] + break + else: + continue + + break + + return reply, stop_found
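For reference (not part of the patch above): once api_server.py is running, the /v1/embeddings route can be exercised with the same OpenAI Python client that openai_api_request.py uses for chat. The sketch below assumes the commented-out SentenceTransformer import and the `embedding_model = SentenceTransformer(EMBEDDING_PATH, ...)` line in api_server.py's __main__ block have been re-enabled (with the device adjusted for a CPU-only setup), since get_embeddings references embedding_model.

    from openai import OpenAI

    client = OpenAI(api_key="EMPTY", base_url="http://127.0.0.1:8000/v1/")

    # Request embeddings for a batch of strings from the demo server.
    # encoding_format="float" asks for plain float lists, which is what this server returns.
    resp = client.embeddings.create(
        model="chatglm3-6b",
        input=["hello world", "ipex-llm runs ChatGLM3 on Intel CPUs"],
        encoding_format="float",
    )
    for item in resp.data:
        print(item.index, len(item.embedding))
    print(resp.usage)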