
Commit dc988a3

Normalize and denormalize llamacpp streaming reply
Originally, I wanted to add normalizers to convert the `im_start`/`im_end` tags, but we worked around that by setting llamacpp to use the OpenAI format. We'll still need a normalizer for the vllm provider, though.

At the moment we really need the denormalizer so that the blocking pipeline can return a stream of `ModelResponse`s and the denormalizer can convert them to the `CreateChatCompletionStreamResponse` structure that is then serialized to the client. This avoids any guessing or special casing that would otherwise be needed in `llamacpp_stream_generator`, which previously expected an `Iterator[CreateChatCompletionStreamResponse]`.

Another change that simplifies the logic: `llamacpp_stream_generator` now accepts an `AsyncIterator` instead of the plain `Iterator` that the llamacpp completion handler used to return. Again, this lets us pass the iterator straight from the blocking pipeline. On the completion side we add a simple sync-to-async wrapper.

Fixes: stacklok#94
1 parent 9954610 commit dc988a3
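In short, the streaming path now looks roughly like the sketch below. This is a minimal illustration rather than the exact provider code: `stream_to_client` is a made-up name, and the output-pipeline step is only a placeholder comment because no output pipeline exists yet.

```python
from typing import AsyncIterator

from litellm import ModelResponse
from llama_cpp.llama_types import CreateChatCompletionStreamResponse

from codegate.providers.llamacpp.normalizer import LLamaCppOutputNormalizer


async def stream_to_client(
    llamacpp_chunks: AsyncIterator[CreateChatCompletionStreamResponse],
) -> AsyncIterator[CreateChatCompletionStreamResponse]:
    """Sketch of the flow: normalize -> (future) output pipeline -> denormalize."""
    normalizer = LLamaCppOutputNormalizer()
    # llama.cpp chunks -> normalized ModelResponse stream
    normalized: AsyncIterator[ModelResponse] = normalizer.normalize_streaming(llamacpp_chunks)
    # a future output pipeline would transform the normalized stream here (no-op today)
    # normalized ModelResponse stream -> llama.cpp chunks, ready for SSE serialization
    async for chunk in normalizer.denormalize_streaming(normalized):
        yield chunk
```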


8 files changed, +309 -58 lines changed


src/codegate/pipeline/codegate_system_prompt/codegate.py

Lines changed: 2 additions & 3 deletions
@@ -17,8 +17,7 @@ class CodegateSystemPrompt(PipelineStep):
 
     def __init__(self, system_prompt_message: Optional[str] = None):
         self._system_message = ChatCompletionSystemMessage(
-            content=system_prompt_message,
-            role="system"
+            content=system_prompt_message, role="system"
         )
 
     @property
@@ -29,7 +28,7 @@ def name(self) -> str:
         return "codegate-system-prompt"
 
     async def process(
-        self, request: ChatCompletionRequest, context: PipelineContext
+        self, request: ChatCompletionRequest, context: PipelineContext
     ) -> PipelineResult:
         """
         Process the completion request and add a system prompt if the user message contains

src/codegate/providers/base.py

Lines changed: 21 additions & 2 deletions
@@ -49,6 +49,20 @@ def _setup_routes(self) -> None:
     def provider_route_name(self) -> str:
         pass
 
+    async def _run_output_stream_pipeline(
+        self,
+        normalized_stream: AsyncIterator[ModelResponse],
+    ) -> AsyncIterator[ModelResponse]:
+        # we don't have a pipeline for output stream yet
+        return normalized_stream
+
+    def _run_output_pipeline(
+        self,
+        normalized_response: ModelResponse,
+    ) -> ModelResponse:
+        # we don't have a pipeline for output yet
+        return normalized_response
+
     async def _run_input_pipeline(
         self, normalized_request: ChatCompletionRequest, is_fim_request: bool
     ) -> PipelineResult:
@@ -149,8 +163,13 @@ async def complete(
             provider_request, api_key=api_key, stream=streaming
         )
         if not streaming:
-            return self._output_normalizer.denormalize(model_response)
-        return self._output_normalizer.denormalize_streaming(model_response)
+            normalized_response = self._output_normalizer.normalize(model_response)
+            pipeline_output = self._run_output_pipeline(normalized_response)
+            return self._output_normalizer.denormalize(pipeline_output)
+
+        normalized_stream = self._output_normalizer.normalize_streaming(model_response)
+        pipeline_output_stream = await self._run_output_stream_pipeline(normalized_stream)
+        return self._output_normalizer.denormalize_streaming(pipeline_output_stream)
 
     def get_routes(self) -> APIRouter:
         return self.router
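`_run_output_pipeline` and `_run_output_stream_pipeline` are deliberate pass-throughs for now. Just to illustrate the intended shape, a future output-stream step could be wired in along these lines (purely hypothetical; no such step exists in this commit):

```python
from typing import AsyncIterator

from litellm import ModelResponse


async def example_output_stream_pipeline(
    normalized_stream: AsyncIterator[ModelResponse],
) -> AsyncIterator[ModelResponse]:
    """Hypothetical output-stream step operating on normalized chunks."""
    async for chunk in normalized_stream:
        # a real step would inspect or rewrite the normalized chunk here
        yield chunk
```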

src/codegate/providers/litellmshim/generators.py

Lines changed: 0 additions & 1 deletion
@@ -1,4 +1,3 @@
-import asyncio
 import json
 from typing import Any, AsyncIterator
 

src/codegate/providers/llamacpp/completion_handler.py

Lines changed: 24 additions & 8 deletions
@@ -4,21 +4,24 @@
 
 from fastapi.responses import StreamingResponse
 from litellm import ChatCompletionRequest, ModelResponse
+from llama_cpp.llama_types import (
+    CreateChatCompletionStreamResponse,
+)
 
 from codegate.config import Config
 from codegate.inference.inference_engine import LlamaCppInferenceEngine
 from codegate.providers.base import BaseCompletionHandler
 
 
-async def llamacpp_stream_generator(stream: Iterator[Any]) -> AsyncIterator[str]:
+async def llamacpp_stream_generator(
+    stream: AsyncIterator[CreateChatCompletionStreamResponse],
+) -> AsyncIterator[str]:
     """OpenAI-style SSE format"""
     try:
-        for chunk in stream:
-            if hasattr(chunk, "model_dump_json"):
-                chunk = chunk.model_dump_json(exclude_none=True, exclude_unset=True)
+        async for chunk in stream:
+            chunk = json.dumps(chunk)
             try:
-                yield f"data:{json.dumps(chunk)}\n\n"
-                await asyncio.sleep(0)
+                yield f"data:{chunk}\n\n"
             except Exception as e:
                 yield f"data:{str(e)}\n\n"
     except Exception as e:
@@ -27,6 +30,18 @@ async def llamacpp_stream_generator(stream: Iterator[Any]) -> AsyncIterator[str]
     yield "data: [DONE]\n\n"
 
 
+async def convert_to_async_iterator(
+    sync_iterator: Iterator[CreateChatCompletionStreamResponse],
+) -> AsyncIterator[CreateChatCompletionStreamResponse]:
+    """
+    Convert a synchronous iterator to an asynchronous iterator. This makes the logic easier
+    because both the pipeline and the completion handler can use async iterators.
+    """
+    for item in sync_iterator:
+        yield item
+        await asyncio.sleep(0)
+
+
 class LlamaCppCompletionHandler(BaseCompletionHandler):
     def __init__(self):
         self.inference_engine = LlamaCppInferenceEngine()
@@ -53,9 +68,10 @@ async def execute_completion(
             Config.get_config().chat_model_n_gpu_layers,
             **request,
         )
-        return response
 
-    def create_streaming_response(self, stream: Iterator[Any]) -> StreamingResponse:
+        return convert_to_async_iterator(response) if stream else response
+
+    def create_streaming_response(self, stream: AsyncIterator[Any]) -> StreamingResponse:
         """
         Create a streaming response from a stream generator. The StreamingResponse
         is the format that FastAPI expects for streaming responses.
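The sync-to-async wrapper is easy to exercise on its own. A quick standalone sketch (the chunk dict below is invented, only there to have something to iterate over):

```python
import asyncio

from codegate.providers.llamacpp.completion_handler import convert_to_async_iterator


async def main() -> None:
    # Invented, minimal stand-in for the sync chunk iterator llama.cpp returns
    # when stream=True; real chunks also carry id/model/created fields.
    sync_chunks = iter([{"choices": [{"index": 0, "delta": {"content": "hi"}}]}])
    async for chunk in convert_to_async_iterator(sync_chunks):
        print(chunk)


asyncio.run(main())
```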

src/codegate/providers/llamacpp/normalizer.py

Lines changed: 95 additions & 7 deletions
@@ -1,6 +1,13 @@
-from typing import Any, AsyncIterable, AsyncIterator, Dict, Iterable, Iterator, Union
+from typing import Any, AsyncIterable, AsyncIterator, Dict, Union
 
 from litellm import ChatCompletionRequest, ModelResponse
+from litellm.types.utils import Delta, StreamingChoices
+from llama_cpp.llama_types import (
+    ChatCompletionStreamResponseChoice,
+    ChatCompletionStreamResponseDelta,
+    ChatCompletionStreamResponseDeltaEmpty,
+    CreateChatCompletionStreamResponse,
+)
 
 from codegate.providers.normalizer import ModelInputNormalizer, ModelOutputNormalizer
 
@@ -32,16 +39,97 @@ def denormalize(self, data: ChatCompletionRequest) -> Dict:
         return data
 
 
+class ModelToLlamaCpp(AsyncIterator[CreateChatCompletionStreamResponse]):
+    def __init__(self, normalized_reply: AsyncIterable[ModelResponse]):
+        self.normalized_reply = normalized_reply
+        self._aiter = normalized_reply.__aiter__()
+
+    def __aiter__(self):
+        return self
+
+    @staticmethod
+    def _create_delta(
+        choice_delta: Delta,
+    ) -> Union[ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty]:
+        if not choice_delta:
+            return ChatCompletionStreamResponseDeltaEmpty()
+        return ChatCompletionStreamResponseDelta(
+            content=choice_delta.content,
+            role=choice_delta.role,
+        )
+
+    async def __anext__(self) -> CreateChatCompletionStreamResponse:
+        try:
+            chunk = await self._aiter.__anext__()
+            return CreateChatCompletionStreamResponse(
+                id=chunk["id"],
+                model=chunk["model"],
+                object="chat.completion.chunk",
+                created=chunk["created"],
+                choices=[
+                    ChatCompletionStreamResponseChoice(
+                        index=choice.index,
+                        delta=self._create_delta(choice.delta),
+                        finish_reason=choice.finish_reason,
+                        logprobs=None,
+                    )
+                    for choice in chunk["choices"]
+                ],
+            )
+        except StopAsyncIteration:
+            raise StopAsyncIteration
+
+
+class LlamaCppToModel(AsyncIterator[ModelResponse]):
+    def __init__(self, normalized_reply: AsyncIterable[CreateChatCompletionStreamResponse]):
+        self.normalized_reply = normalized_reply
+        self._aiter = normalized_reply.__aiter__()
+
+    def __aiter__(self):
+        return self
+
+    @staticmethod
+    def _create_delta(
+        choice_delta: Union[
+            ChatCompletionStreamResponseDelta, ChatCompletionStreamResponseDeltaEmpty
+        ]
+    ) -> Delta:
+        if not choice_delta:  # Handles empty dict case
+            return Delta(content=None, role=None)
+        return Delta(content=choice_delta.get("content"), role=choice_delta.get("role"))
+
+    async def __anext__(self) -> ModelResponse:
+        try:
+            chunk = await self._aiter.__anext__()
+            return ModelResponse(
+                id=chunk["id"],
+                choices=[
+                    StreamingChoices(
+                        finish_reason=choice.get("finish_reason", None),
+                        index=choice["index"],
+                        delta=self._create_delta(choice.get("delta")),
+                        logprobs=None,
+                    )
+                    for choice in chunk["choices"]
+                ],
+                created=chunk["created"],
+                model=chunk["model"],
+                object=chunk["object"],
+            )
+        except StopAsyncIteration:
+            raise StopAsyncIteration
+
+
 class LLamaCppOutputNormalizer(ModelOutputNormalizer):
     def normalize_streaming(
         self,
-        model_reply: Union[AsyncIterable[Any], Iterable[Any]],
-    ) -> Union[AsyncIterator[ModelResponse], Iterator[ModelResponse]]:
+        llamacpp_stream: AsyncIterable[CreateChatCompletionStreamResponse],
+    ) -> AsyncIterator[ModelResponse]:
         """
         Normalize the output stream. This is a pass-through for liteLLM output normalizer
         as the liteLLM output is already in the normalized format.
         """
-        return model_reply
+        return LlamaCppToModel(llamacpp_stream)
 
     def normalize(self, model_reply: Any) -> ModelResponse:
         """
@@ -59,10 +147,10 @@ def denormalize(self, normalized_reply: ModelResponse) -> Any:
 
     def denormalize_streaming(
         self,
-        normalized_reply: Union[AsyncIterable[ModelResponse], Iterable[ModelResponse]],
-    ) -> Union[AsyncIterator[Any], Iterator[Any]]:
+        model_stream: AsyncIterable[ModelResponse],
+    ) -> AsyncIterator[CreateChatCompletionStreamResponse]:
         """
         Denormalize the output stream from the completion function to the format
         expected by the client
         """
-        return normalized_reply
+        return ModelToLlamaCpp(model_stream)
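A rough round-trip through the two adapters, going llama.cpp chunk -> `ModelResponse` -> llama.cpp chunk via the normalizer (sketch only; the chunk contents are invented, and the exact field handling depends on the installed litellm and llama-cpp-python versions):

```python
import asyncio

from codegate.providers.llamacpp.normalizer import LLamaCppOutputNormalizer


async def main() -> None:
    async def fake_llamacpp_stream():
        # Invented chunk shaped like CreateChatCompletionStreamResponse.
        yield {
            "id": "chunk-1",
            "model": "some-model",
            "object": "chat.completion.chunk",
            "created": 0,
            "choices": [
                {
                    "index": 0,
                    "delta": {"role": "assistant", "content": "Hello"},
                    "finish_reason": None,
                }
            ],
        }

    normalizer = LLamaCppOutputNormalizer()
    # llama.cpp chunks -> ModelResponse stream (LlamaCppToModel) ...
    normalized = normalizer.normalize_streaming(fake_llamacpp_stream())
    # ... and back to llama.cpp chunks (ModelToLlamaCpp).
    async for chunk in normalizer.denormalize_streaming(normalized):
        print(chunk["id"], chunk["choices"][0]["delta"])


asyncio.run(main())
```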

src/codegate/server.py

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@
 from codegate.config import Config
 from codegate.pipeline.base import PipelineStep, SequentialPipelineProcessor
 from codegate.pipeline.codegate_system_prompt.codegate import CodegateSystemPrompt
-from codegate.pipeline.secrets.secrets import CodegateSecrets
-from codegate.pipeline.secrets.signatures import CodegateSignatures
 from codegate.pipeline.version.version import CodegateVersion
 from codegate.providers.anthropic.provider import AnthropicProvider
 from codegate.providers.llamacpp.provider import LlamaCppProvider

tests/pipeline/codegate_system_prompt/test_codegate_system_prompt.py

Lines changed: 27 additions & 35 deletions
@@ -24,61 +24,50 @@ def test_init_with_system_message(self):
         step = CodegateSystemPrompt(system_prompt_message=test_message)
         assert step._system_message["content"] == test_message
 
-    @pytest.mark.parametrize("user_message,expected_modification", [
-        # Test cases with different scenarios
-        ("Hello CodeGate", True),
-        ("CODEGATE in uppercase", True),
-        ("No matching message", False),
-        ("codegate with lowercase", True)
-    ])
-    async def test_process_system_prompt_insertion(
-        self,
-        user_message,
-        expected_modification
-    ):
+    @pytest.mark.parametrize(
+        "user_message,expected_modification",
+        [
+            # Test cases with different scenarios
+            ("Hello CodeGate", True),
+            ("CODEGATE in uppercase", True),
+            ("No matching message", False),
+            ("codegate with lowercase", True),
+        ],
+    )
+    async def test_process_system_prompt_insertion(self, user_message, expected_modification):
         """
         Test system prompt insertion based on message content
         """
         # Prepare mock request with user message
-        mock_request = {
-            "messages": [
-                {"role": "user", "content": user_message}
-            ]
-        }
+        mock_request = {"messages": [{"role": "user", "content": user_message}]}
         mock_context = Mock(spec=PipelineContext)
 
         # Create system prompt step
         system_prompt = "Security analysis system prompt"
         step = CodegateSystemPrompt(system_prompt_message=system_prompt)
 
         # Mock the get_last_user_message method
-        step.get_last_user_message = Mock(
-            return_value=(user_message, 0)
-        )
+        step.get_last_user_message = Mock(return_value=(user_message, 0))
 
         # Process the request
         result = await step.process(ChatCompletionRequest(**mock_request), mock_context)
 
         if expected_modification:
             # Check that system message was inserted
-            assert len(result.request['messages']) == 2
-            assert result.request['messages'][0]['role'] == 'system'
-            assert result.request['messages'][0]['content'] == system_prompt
-            assert result.request['messages'][1]['role'] == 'user'
-            assert result.request['messages'][1]['content'] == user_message
+            assert len(result.request["messages"]) == 2
+            assert result.request["messages"][0]["role"] == "system"
+            assert result.request["messages"][0]["content"] == system_prompt
+            assert result.request["messages"][1]["role"] == "user"
+            assert result.request["messages"][1]["content"] == user_message
         else:
             # Ensure no modification occurred
-            assert len(result.request['messages']) == 1
+            assert len(result.request["messages"]) == 1
 
     async def test_no_system_message_configured(self):
         """
         Test behavior when no system message is configured
         """
-        mock_request = {
-            "messages": [
-                {"role": "user", "content": "CodeGate test"}
-            ]
-        }
+        mock_request = {"messages": [{"role": "user", "content": "CodeGate test"}]}
         mock_context = Mock(spec=PipelineContext)
 
         # Create step without system message
@@ -90,10 +79,13 @@ async def test_no_system_message_configured(self):
         # Verify request remains unchanged
         assert result.request == mock_request
 
-    @pytest.mark.parametrize("edge_case", [
-        None,  # No messages
-        [],  # Empty messages list
-    ])
+    @pytest.mark.parametrize(
+        "edge_case",
+        [
+            None,  # No messages
+            [],  # Empty messages list
+        ],
+    )
     async def test_edge_cases(self, edge_case):
         """
         Test edge cases with None or empty message list
