Commit b549eab

Author: Luke Hinds (committed)

vLLM Provider

1 parent 189aee9 · commit b549eab

17 files changed: +322 −101 lines changed

config.yaml.example

Lines changed: 37 additions & 14 deletions
@@ -1,19 +1,42 @@
-# Example configuration file
-# Copy this file to config.yaml and modify as needed
+# Codegate Example Configuration
 
-# Server configuration
-port: 8989
-host: "localhost"
+# Network settings
+port: 8989  # Port to listen on (1-65535)
+host: "localhost"  # Host to bind to (use localhost for all interfaces)
 
 # Logging configuration
-log_level: "INFO"  # ERROR, WARNING, INFO, DEBUG
-log_format: "JSON"  # JSON, TEXT
+log_level: "INFO"  # One of: ERROR, WARNING, INFO, DEBUG
 
-# Prompts configuration
-# Option 1: Define prompts directly in the config file
-prompts:
-  my_system_prompt: "Custom system prompt defined in config"
-  another_prompt: "Another custom prompt"
+# Note: This configuration can be overridden by:
+# 1. CLI arguments (--port, --host, --log-level)
+# 2. Environment variables (CODEGATE_APP_PORT, CODEGATE_APP_HOST, CODEGATE_APP_LOG_LEVEL)
 
-# Option 2: Reference a separate prompts file
-# prompts: "prompts.yaml"  # Path to prompts file (relative to config file or absolute)
+# Provider URLs
+provider_urls:
+  openai: "https://api.openai.com/v1"
+  anthropic: "https://api.anthropic.com/v1"
+  vllm: "http://localhost:8000"  # Base URL without /v1 path, it will be added automatically
+
+# Note: Provider URLs can be overridden by environment variables:
+#   CODEGATE_PROVIDER_OPENAI_URL
+#   CODEGATE_PROVIDER_ANTHROPIC_URL
+#   CODEGATE_PROVIDER_VLLM_URL
+# Or by CLI flags:
+#   --vllm-url
+#   --openai-url
+#   --anthropic-url
+
+# Embedding model configuration
+
+####
+# Inference model configuration
+##
+
+# Model to use for chatting
+chat_model_path: "./models/qwen2.5-coder-1.5b-instruct-q5_k_m.gguf"
+
+# Context length of the model
+chat_model_n_ctx: 32768
+
+# Number of layers to offload to GPU. If -1, all layers are offloaded.
+chat_model_n_gpu_layers: -1
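As a quick illustration of the override order described in the comments above, a minimal sketch (not part of the commit) of an environment variable taking precedence over the vllm URL in config.yaml; the host name "gpu-box" is made up:

    import os

    from codegate.config import Config

    # Assumption for illustration: a config.yaml with the provider_urls
    # block shown above sits in the working directory.
    os.environ["CODEGATE_PROVIDER_VLLM_URL"] = "http://gpu-box:8000"

    cfg = Config.load(config_path="config.yaml")
    print(cfg.provider_urls["vllm"])  # expected: http://gpu-box:8000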

src/codegate/cli.py

Lines changed: 33 additions & 1 deletion
@@ -2,7 +2,7 @@
 
 import sys
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional
 
 import click
 
@@ -88,17 +88,47 @@ def show_prompts(prompts: Optional[Path]) -> None:
     default=None,
     help="Path to YAML prompts file",
 )
+@click.option(
+    "--vllm-url",
+    type=str,
+    default=None,
+    help="vLLM provider URL (default: http://localhost:8000/v1)",
+)
+@click.option(
+    "--openai-url",
+    type=str,
+    default=None,
+    help="OpenAI provider URL (default: https://api.openai.com/v1)",
+)
+@click.option(
+    "--anthropic-url",
+    type=str,
+    default=None,
+    help="Anthropic provider URL (default: https://api.anthropic.com/v1)",
+)
 def serve(
     port: Optional[int],
     host: Optional[str],
     log_level: Optional[str],
     log_format: Optional[str],
    config: Optional[Path],
     prompts: Optional[Path],
+    vllm_url: Optional[str],
+    openai_url: Optional[str],
+    anthropic_url: Optional[str],
 ) -> None:
     """Start the codegate server."""
     logger = None
     try:
+        # Create provider URLs dict from CLI options
+        cli_provider_urls: Dict[str, str] = {}
+        if vllm_url:
+            cli_provider_urls["vllm"] = vllm_url
+        if openai_url:
+            cli_provider_urls["openai"] = openai_url
+        if anthropic_url:
+            cli_provider_urls["anthropic"] = anthropic_url
+
         # Load configuration with priority resolution
         cfg = Config.load(
             config_path=config,
@@ -107,6 +137,7 @@ def serve(
             cli_host=host,
             cli_log_level=log_level,
             cli_log_format=log_format,
+            cli_provider_urls=cli_provider_urls,
         )
 
         logger = setup_logging(cfg.log_level, cfg.log_format)
@@ -118,6 +149,7 @@ def serve(
                 "log_level": cfg.log_level.value,
                 "log_format": cfg.log_format.value,
                 "prompts_loaded": len(cfg.prompts.prompts),
+                "provider_urls": cfg.provider_urls,
             },
         )
 
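A small sketch (assumed usage, not part of the commit) of exercising the new flags through click's test runner; in a shell this would correspond to something like "codegate serve --vllm-url http://localhost:8000", assuming "codegate" is the installed entry point:

    from click.testing import CliRunner

    from codegate.cli import serve

    runner = CliRunner()
    # --help exits before the server starts, so this only verifies that the
    # new provider-URL options are registered on the serve command.
    result = runner.invoke(serve, ["--help"])
    assert "--vllm-url" in result.output
    assert "--openai-url" in result.output
    assert "--anthropic-url" in result.output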

src/codegate/config.py

Lines changed: 31 additions & 4 deletions
@@ -3,7 +3,7 @@
 import os
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Optional, Union
+from typing import Dict, Optional, Union
 
 import yaml
 
@@ -32,6 +32,15 @@ class Config:
     chat_model_n_ctx: int = 32768
     chat_model_n_gpu_layers: int = -1
 
+    # Provider URLs with defaults
+    provider_urls: Dict[str, str] = field(
+        default_factory=lambda: {
+            "openai": "https://api.openai.com/v1",
+            "anthropic": "https://api.anthropic.com/v1",
+            "vllm": "http://localhost:8000",  # Base URL without /v1 path
+        }
+    )
+
     def __post_init__(self) -> None:
         """Validate configuration after initialization."""
         if not isinstance(self.port, int) or not (1 <= self.port <= 65535):
@@ -95,19 +104,23 @@ def from_file(cls, config_path: Union[str, Path]) -> "Config":
                    prompts_path = Path(config_path).parent / prompts_path
                prompts_config = PromptConfig.from_file(prompts_path)
 
+            # Get provider URLs from config
+            provider_urls = cls.provider_urls.copy()
+            if "provider_urls" in config_data:
+                provider_urls.update(config_data.pop("provider_urls"))
+
            return cls(
                port=config_data.get("port", cls.port),
                host=config_data.get("host", cls.host),
                log_level=config_data.get("log_level", cls.log_level.value),
                log_format=config_data.get("log_format", cls.log_format.value),
                model_base_path=config_data.get("chat_model_path", cls.model_base_path),
-                chat_model_n_ctx=config_data.get(
-                    "chat_model_n_ctx", cls.chat_model_n_ctx
-                ),
+                chat_model_n_ctx=config_data.get("chat_model_n_ctx", cls.chat_model_n_ctx),
                chat_model_n_gpu_layers=config_data.get(
                    "chat_model_n_gpu_layers", cls.chat_model_n_gpu_layers
                ),
                prompts=prompts_config,
+                provider_urls=provider_urls,
            )
        except yaml.YAMLError as e:
            raise ConfigurationError(f"Failed to parse config file: {e}")
@@ -138,6 +151,12 @@ def from_env(cls) -> "Config":
                    os.environ["CODEGATE_PROMPTS_FILE"]
                )  # noqa: E501
 
+            # Load provider URLs from environment variables
+            for provider in config.provider_urls.keys():
+                env_var = f"CODEGATE_PROVIDER_{provider.upper()}_URL"
+                if env_var in os.environ:
+                    config.provider_urls[provider] = os.environ[env_var]
+
            return config
        except ValueError as e:
            raise ConfigurationError(f"Invalid environment variable value: {e}")
@@ -151,6 +170,7 @@ def load(
        cli_host: Optional[str] = None,
        cli_log_level: Optional[str] = None,
        cli_log_format: Optional[str] = None,
+        cli_provider_urls: Optional[Dict[str, str]] = None,
    ) -> "Config":
        """Load configuration with priority resolution.
 
@@ -167,6 +187,7 @@ def load(
            cli_host: Optional CLI host override
            cli_log_level: Optional CLI log level override
            cli_log_format: Optional CLI log format override
+            cli_provider_urls: Optional dict of provider URLs from CLI
 
        Returns:
            Config: Resolved configuration
@@ -198,6 +219,10 @@ def load(
        if "CODEGATE_PROMPTS_FILE" in os.environ:
            config.prompts = env_config.prompts
 
+        # Override provider URLs from environment
+        for provider, url in env_config.provider_urls.items():
+            config.provider_urls[provider] = url
+
        # Override with CLI arguments
        if cli_port is not None:
            config.port = cli_port
@@ -209,6 +234,8 @@ def load(
            config.log_format = LogFormat(cli_log_format)
        if prompts_path is not None:
            config.prompts = PromptConfig.from_file(prompts_path)
+        if cli_provider_urls is not None:
+            config.provider_urls.update(cli_provider_urls)
 
        # Set the __config class attribute
        Config.__config = config
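In Config.load, CLI-supplied provider URLs are applied last, after the environment overrides. A minimal sketch of that precedence (assumed usage; the host names are made up):

    import os

    from codegate.config import Config

    # Environment beats the built-in default for the vllm URL...
    os.environ["CODEGATE_PROVIDER_VLLM_URL"] = "http://env-host:8000"
    cfg = Config.load(config_path=None)
    assert cfg.provider_urls["vllm"] == "http://env-host:8000"

    # ...and a CLI-supplied URL (cli_provider_urls, built in serve()) beats
    # the environment value.
    cfg = Config.load(config_path=None, cli_provider_urls={"vllm": "http://cli-host:8000"})
    assert cfg.provider_urls["vllm"] == "http://cli-host:8000"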

src/codegate/pipeline/fim/secret_analyzer.py

Lines changed: 2 additions & 4 deletions
@@ -28,9 +28,7 @@ def name(self) -> str:
         return "fim-secret-analyzer"
 
     async def process(
-        self,
-        request: ChatCompletionRequest,
-        context: PipelineContext
+        self, request: ChatCompletionRequest, context: PipelineContext
     ) -> PipelineResult:
         # We should call here Secrets Blocking module to see if the request messages contain secrets
         # messages_contain_secrets = [analyze_msg_secrets(msg) for msg in request.messages]
@@ -39,7 +37,7 @@ async def process(
         # For the moment to test shortcutting just treat all messages as if they contain secrets
         message_with_secrets = False
         if message_with_secrets:
-            logger.info('Blocking message with secrets.')
+            logger.info("Blocking message with secrets.")
             return PipelineResult(
                 response=PipelineResponse(
                     step_name=self.name,

src/codegate/providers/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2,10 +2,12 @@
 from codegate.providers.base import BaseProvider
 from codegate.providers.openai.provider import OpenAIProvider
 from codegate.providers.registry import ProviderRegistry
+from codegate.providers.vllm.provider import VLLMProvider
 
 __all__ = [
     "BaseProvider",
     "ProviderRegistry",
     "OpenAIProvider",
     "AnthropicProvider",
+    "VLLMProvider",
 ]
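The new provider is re-exported from the package root. A minimal sketch (assumed, since vllm/provider.py is not shown in this excerpt) of constructing it the same way as the other providers:

    from codegate.providers import VLLMProvider

    # Assumption: VLLMProvider mirrors AnthropicProvider's constructor, where
    # the pipeline processors are optional and default to None.
    provider = VLLMProvider()
    print(provider.provider_route_name)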

src/codegate/providers/anthropic/completion_handler.py

Lines changed: 3 additions & 3 deletions
@@ -27,7 +27,7 @@ async def execute_completion(
         For more details, refer to the
         [LiteLLM Documentation](https://docs.litellm.ai/docs/providers/anthropic).
         """
-        model_in_request = request['model']
-        if not model_in_request.startswith('anthropic/'):
-            request['model'] = f'anthropic/{model_in_request}'
+        model_in_request = request["model"]
+        if not model_in_request.startswith("anthropic/"):
+            request["model"] = f"anthropic/{model_in_request}"
         return await super().execute_completion(request, api_key, stream)
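Only the quote style changes in this hunk; for context, the effect of the prefixing logic is to rewrite a bare model name so LiteLLM routes it to Anthropic. A standalone illustration (the model name is just an example):

    # Mirrors the startswith/prefix logic shown above; illustrative only.
    request = {"model": "claude-3-5-sonnet"}
    if not request["model"].startswith("anthropic/"):
        request["model"] = f"anthropic/{request['model']}"
    assert request["model"] == "anthropic/claude-3-5-sonnet"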

src/codegate/providers/anthropic/provider.py

Lines changed: 5 additions & 5 deletions
@@ -11,17 +11,17 @@
 
 class AnthropicProvider(BaseProvider):
     def __init__(
-            self,
-            pipeline_processor: Optional[SequentialPipelineProcessor] = None,
-            fim_pipeline_processor: Optional[SequentialPipelineProcessor] = None
-    ):
+        self,
+        pipeline_processor: Optional[SequentialPipelineProcessor] = None,
+        fim_pipeline_processor: Optional[SequentialPipelineProcessor] = None,
+    ):
         completion_handler = AnthropicCompletion(stream_generator=anthropic_stream_generator)
         super().__init__(
             AnthropicInputNormalizer(),
             AnthropicOutputNormalizer(),
             completion_handler,
             pipeline_processor,
-            fim_pipeline_processor
+            fim_pipeline_processor,
         )
 
     @property

src/codegate/providers/base.py

Lines changed: 9 additions & 11 deletions
@@ -50,17 +50,15 @@ def provider_route_name(self) -> str:
         pass
 
     async def _run_input_pipeline(
-        self,
-        normalized_request: ChatCompletionRequest,
-        is_fim_request: bool
+        self, normalized_request: ChatCompletionRequest, is_fim_request: bool
     ) -> PipelineResult:
         # Decide which pipeline processor to use
         if is_fim_request:
             pipeline_processor = self._fim_pipelin_processor
-            logger.info('FIM pipeline selected for execution.')
+            logger.info("FIM pipeline selected for execution.")
         else:
             pipeline_processor = self._pipeline_processor
-            logger.info('Chat completion pipeline selected for execution.')
+            logger.info("Chat completion pipeline selected for execution.")
         if pipeline_processor is None:
             return PipelineResult(request=normalized_request)
 
@@ -92,21 +90,21 @@ def _is_fim_request_body(self, data: Dict) -> bool:
         Determine from the raw incoming data if it's a FIM request.
         Used by: OpenAI and Anthropic
         """
-        messages = data.get('messages', [])
+        messages = data.get("messages", [])
         if not messages:
             return False
 
-        first_message_content = messages[0].get('content')
+        first_message_content = messages[0].get("content")
         if first_message_content is None:
             return False
 
-        fim_stop_sequences = ['</COMPLETION>', '<COMPLETION>', '</QUERY>', '<QUERY>']
+        fim_stop_sequences = ["</COMPLETION>", "<COMPLETION>", "</QUERY>", "<QUERY>"]
         if isinstance(first_message_content, str):
             msg_prompt = first_message_content
         elif isinstance(first_message_content, list):
-            msg_prompt = first_message_content[0].get('text', '')
+            msg_prompt = first_message_content[0].get("text", "")
         else:
-            logger.warning(f'Could not determine if message was FIM from data: {data}')
+            logger.warning(f"Could not determine if message was FIM from data: {data}")
             return False
         return all([stop_sequence in msg_prompt for stop_sequence in fim_stop_sequences])
 
@@ -121,7 +119,7 @@ def _is_fim_request(self, request: Request, data: Dict) -> bool:
         return self._is_fim_request_body(data)
 
     async def complete(
-            self, data: Dict, api_key: Optional[str], is_fim_request: bool
+        self, data: Dict, api_key: Optional[str], is_fim_request: bool
     ) -> Union[ModelResponse, AsyncIterator[ModelResponse]]:
         """
         Main completion flow with pipeline integration
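A standalone illustration of the heuristic in _is_fim_request_body (the payloads and the helper below are made up for this example): a request counts as fill-in-the-middle only when the first message contains every one of the completion/query markers.

    # Mirrors the check in _is_fim_request_body; not an import of the real method.
    fim_stop_sequences = ["</COMPLETION>", "<COMPLETION>", "</QUERY>", "<QUERY>"]

    fim_body = {
        "messages": [
            {
                "role": "user",
                "content": "<QUERY>def add(a, b):</QUERY><COMPLETION>    return a + b</COMPLETION>",
            }
        ]
    }
    chat_body = {"messages": [{"role": "user", "content": "Explain this function."}]}

    def looks_like_fim(data: dict) -> bool:
        prompt = data["messages"][0]["content"]
        return all(seq in prompt for seq in fim_stop_sequences)

    assert looks_like_fim(fim_body) is True
    assert looks_like_fim(chat_body) is False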

src/codegate/providers/llamacpp/completion_handler.py

Lines changed: 14 additions & 10 deletions
@@ -1,5 +1,5 @@
-import json
 import asyncio
+import json
 from typing import Any, AsyncIterator, Iterator, Optional, Union
 
 from fastapi.responses import StreamingResponse
@@ -39,16 +39,20 @@
         """
         model_path = f"{Config.get_config().model_base_path}/{request['model']}.gguf"
 
-        if 'prompt' in request:
-            response = await self.inference_engine.complete(model_path,
-                                                            Config.get_config().chat_model_n_ctx,
-                                                            Config.get_config().chat_model_n_gpu_layers,
-                                                            **request)
+        if "prompt" in request:
+            response = await self.inference_engine.complete(
+                model_path,
+                Config.get_config().chat_model_n_ctx,
+                Config.get_config().chat_model_n_gpu_layers,
+                **request,
+            )
         else:
-            response = await self.inference_engine.chat(model_path,
-                                                        Config.get_config().chat_model_n_ctx,
-                                                        Config.get_config().chat_model_n_gpu_layers,
-                                                        **request)
+            response = await self.inference_engine.chat(
+                model_path,
+                Config.get_config().chat_model_n_ctx,
+                Config.get_config().chat_model_n_gpu_layers,
+                **request,
+            )
         return response
 
     def create_streaming_response(self, stream: Iterator[Any]) -> StreamingResponse:
