LiteLLM Minor Fixes & Improvements (09/19/2024) (BerriAI#5793)
* fix(model_prices_and_context_window.json): add cost tracking for more vertex llama3.1 models

adds the 8b and 70b models

* fix(proxy/utils.py): handle data being None in pre-call hooks

* fix(proxy/): create views on initial proxy startup

fixes the base case, where a user starts the proxy for the first time

Fixes BerriAI#5756

* build(config.yml): fix vertex version for test

* feat(ui/): support enabling/disabling slack alerting

Allows an admin to turn Slack alerting on/off through the UI

* feat(rerank/main.py): support langfuse logging

* fix(proxy/utils.py): fix linting errors

* fix(langfuse.py): log clean metadata

* test(tests): replace deprecated openai model
krrishdholakia authored Sep 20, 2024
1 parent 696fc38 commit 3933fba
Showing 22 changed files with 644 additions and 93 deletions.
1 change: 1 addition & 0 deletions .circleci/config.yml
@@ -558,6 +558,7 @@ jobs:
pip install "anyio==3.7.1"
pip install "asyncio==3.4.3"
pip install "PyGithub==1.59.1"
pip install "google-cloud-aiplatform==1.59.0"
- run:
name: Build Docker image
command: docker build -t my-app:latest -f Dockerfile.database .
25 changes: 14 additions & 11 deletions db_scripts/create_views.py
@@ -51,22 +51,25 @@ async def check_view_exists():

print("LiteLLM_VerificationTokenView Created!") # noqa

sql_query = """
CREATE MATERIALIZED VIEW IF NOT EXISTS "MonthlyGlobalSpend" AS
try:
await db.query_raw("""SELECT 1 FROM "MonthlyGlobalSpend" LIMIT 1""")
print("MonthlyGlobalSpend Exists!") # noqa
except Exception as e:
sql_query = """
CREATE OR REPLACE VIEW "MonthlyGlobalSpend" AS
SELECT
DATE_TRUNC('day', "startTime") AS date,
SUM("spend") AS spend
DATE("startTime") AS date,
SUM("spend") AS spend
FROM
"LiteLLM_SpendLogs"
"LiteLLM_SpendLogs"
WHERE
"startTime" >= CURRENT_DATE - INTERVAL '30 days'
"startTime" >= (CURRENT_DATE - INTERVAL '30 days')
GROUP BY
DATE_TRUNC('day', "startTime");
"""
# Execute the queries
await db.execute_raw(query=sql_query)
DATE("startTime");
"""
await db.execute_raw(query=sql_query)

print("MonthlyGlobalSpend Created!") # noqa
print("MonthlyGlobalSpend Created!") # noqa

try:
await db.query_raw("""SELECT 1 FROM "Last30dKeysBySpend" LIMIT 1""")
Expand Down
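The new bootstrap logic follows a probe-then-create pattern: cheaply query the view, and only run the DDL when the probe fails. A minimal sketch of the pattern, assuming `db` is a connected Prisma client exposing `query_raw`/`execute_raw`; the `ensure_view` helper itself is hypothetical, not part of the diff:

```python
# Hypothetical helper distilling the probe-then-create pattern used above.
async def ensure_view(db, view_name: str, create_sql: str) -> None:
    try:
        # SELECT 1 is a cheap existence probe; it raises if the view is missing.
        await db.query_raw(f'SELECT 1 FROM "{view_name}" LIMIT 1')
        print(f"{view_name} exists!")  # noqa
    except Exception:
        # Fresh database (first proxy startup): create the view now.
        await db.execute_raw(create_sql)
        print(f"{view_name} created!")  # noqa
```

This is what fixes the BerriAI#5756 base case: a first-time proxy startup no longer assumes the views already exist.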
57 changes: 40 additions & 17 deletions litellm/integrations/langfuse.py
@@ -10,6 +10,7 @@
 import litellm
 from litellm._logging import verbose_logger
 from litellm.litellm_core_utils.redact_messages import redact_user_api_key_info
+from litellm.secret_managers.main import str_to_bool


 class LangFuseLogger:
@@ -66,6 +67,11 @@ def __init__(
         project_id = None

         if os.getenv("UPSTREAM_LANGFUSE_SECRET_KEY") is not None:
+            upstream_langfuse_debug = (
+                str_to_bool(self.upstream_langfuse_debug)
+                if self.upstream_langfuse_debug is not None
+                else None
+            )
             self.upstream_langfuse_secret_key = os.getenv(
                 "UPSTREAM_LANGFUSE_SECRET_KEY"
             )
@@ -80,7 +86,11 @@ def __init__(
                 secret_key=self.upstream_langfuse_secret_key,
                 host=self.upstream_langfuse_host,
                 release=self.upstream_langfuse_release,
-                debug=self.upstream_langfuse_debug,
+                debug=(
+                    upstream_langfuse_debug
+                    if upstream_langfuse_debug is not None
+                    else False
+                ),
             )
         else:
             self.upstream_langfuse = None
@@ -175,6 +185,7 @@ def log_event(
             pass

         # end of processing langfuse ########################
+
         if (
             level == "ERROR"
             and status_message is not None
@@ -208,6 +219,11 @@
         ):
             input = prompt
             output = response_obj["text"]
+        elif response_obj is not None and isinstance(
+            response_obj, litellm.RerankResponse
+        ):
+            input = prompt
+            output = response_obj.results
         elif (
             kwargs.get("call_type") is not None
             and kwargs.get("call_type") == "pass_through_endpoint"
@@ -283,14 +299,14 @@ def _log_langfuse_v1(
         input,
         response_obj,
     ):
-        from langfuse.model import CreateGeneration, CreateTrace
+        from langfuse.model import CreateGeneration, CreateTrace  # type: ignore

         verbose_logger.warning(
             "Please upgrade langfuse to v2.0.0 or higher: https://github.com/langfuse/langfuse-python/releases/tag/v2.0.1"
         )

-        trace = self.Langfuse.trace(
-            CreateTrace(
+        trace = self.Langfuse.trace(  # type: ignore
+            CreateTrace(  # type: ignore
                 name=metadata.get("generation_name", "litellm-completion"),
                 input=input,
                 output=output,
@@ -336,6 +352,7 @@ def _log_langfuse_v2(
         try:
             tags = []
             try:
+                optional_params.pop("metadata")
                 metadata = copy.deepcopy(
                     metadata
                 )  # Avoid modifying the original metadata
@@ -361,7 +378,7 @@
                 langfuse.version.__version__
             ) >= Version("2.7.3")

-        print_verbose(f"Langfuse Layer Logging - logging to langfuse v2 ")
+        print_verbose("Langfuse Layer Logging - logging to langfuse v2 ")

         if supports_tags:
             metadata_tags = metadata.pop("tags", [])
@@ -519,25 +536,31 @@ def _log_langfuse_v2(
                 if key.lower() not in ["authorization", "cookie", "referer"]:
                     clean_headers[key] = value

-            clean_metadata["request"] = {
-                "method": method,
-                "url": url,
-                "headers": clean_headers,
-            }
+            # clean_metadata["request"] = {
+            #     "method": method,
+            #     "url": url,
+            #     "headers": clean_headers,
+            # }
             trace = self.Langfuse.trace(**trace_params)

             # Log provider specific information as a span
             log_provider_specific_information_as_span(trace, clean_metadata)

             generation_id = None
             usage = None
-            if response_obj is not None and response_obj.get("id", None) is not None:
-                generation_id = litellm.utils.get_logging_id(start_time, response_obj)
-                usage = {
-                    "prompt_tokens": response_obj.usage.prompt_tokens,
-                    "completion_tokens": response_obj.usage.completion_tokens,
-                    "total_cost": cost if supports_costs else None,
-                }
+            if response_obj is not None:
+                if response_obj.get("id", None) is not None:
+                    generation_id = litellm.utils.get_logging_id(
+                        start_time, response_obj
+                    )
+                _usage_obj = getattr(response_obj, "usage", None)
+
+                if _usage_obj:
+                    usage = {
+                        "prompt_tokens": _usage_obj.prompt_tokens,
+                        "completion_tokens": _usage_obj.completion_tokens,
+                        "total_cost": cost if supports_costs else None,
+                    }
             generation_name = clean_metadata.pop("generation_name", None)
             if generation_name is None:
                 # if `generation_name` is None, use sensible default values
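The usage handling above no longer assumes every response is chat-completion shaped. A minimal sketch of the guard, with `build_usage` as a hypothetical stand-alone version of the logic in the diff:

```python
# Hypothetical distillation of the guard above: read `usage` defensively so
# response types without it (e.g. rerank responses) don't raise AttributeError.
def build_usage(response_obj, cost=None, supports_costs=True):
    _usage_obj = getattr(response_obj, "usage", None)
    if not _usage_obj:
        return None  # e.g. litellm.RerankResponse carries no token usage
    return {
        "prompt_tokens": _usage_obj.prompt_tokens,
        "completion_tokens": _usage_obj.completion_tokens,
        "total_cost": cost if supports_costs else None,
    }
```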
7 changes: 4 additions & 3 deletions litellm/llms/cohere/rerank.py
@@ -66,9 +66,6 @@ def rerank(

         request_data_dict = request_data.dict(exclude_none=True)

-        if _is_async:
-            return self.async_rerank(request_data_dict=request_data_dict, api_key=api_key, api_base=api_base, headers=headers)  # type: ignore # Call async method
-
         ## LOGGING
         litellm_logging_obj.pre_call(
             input=request_data_dict,
@@ -79,6 +76,10 @@
"headers": headers,
},
)

if _is_async:
return self.async_rerank(request_data_dict=request_data_dict, api_key=api_key, api_base=api_base, headers=headers) # type: ignore # Call async method

client = _get_httpx_client()
response = client.post(
api_base,
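The change here is pure ordering: pre-call logging used to sit after the async early-return, so async rerank calls skipped it. A toy illustration of the fix; all names below are hypothetical stand-ins, not LiteLLM APIs:

```python
import asyncio

def log_pre_call(request: str) -> None:
    print(f"pre-call: {request}")  # shared logging hook

def sync_call(request: str) -> str:
    return f"sync result for {request}"

async def async_call(request: str) -> str:
    return f"async result for {request}"

def handle(request: str, is_async: bool):
    log_pre_call(request)  # now runs before the branch, so both paths log
    if is_async:
        return async_call(request)  # coroutine for the caller to await
    return sync_call(request)

print(handle("rerank docs", is_async=False))
print(asyncio.run(handle("rerank docs", is_async=True)))
```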
@@ -175,7 +175,7 @@ def completion(
                 client=client,
                 timeout=timeout,
                 encoding=encoding,
-                custom_llm_provider="vertex_ai_beta",
+                custom_llm_provider="vertex_ai",
             )

         except Exception as e:
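With this change, responses from Vertex partner models should carry `vertex_ai` rather than `vertex_ai_beta` as their provider label. A hedged spot-check, assuming Vertex AI credentials are configured and that `_hidden_params` exposes the provider on the response object as in current litellm builds:

```python
import litellm

# Model name taken from the pricing entries below; requires Vertex AI access.
response = litellm.completion(
    model="vertex_ai/meta/llama3-8b-instruct-maas",
    messages=[{"role": "user", "content": "hi"}],
)
# The provider label recorded on the response should now be "vertex_ai".
print(response._hidden_params.get("custom_llm_provider"))
```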
20 changes: 20 additions & 0 deletions litellm/model_prices_and_context_window_backup.json
@@ -2350,6 +2350,26 @@
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/meta/llama3-70b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
"max_output_tokens": 32000,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"litellm_provider": "vertex_ai-llama_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/meta/llama3-8b-instruct-maas": {
"max_tokens": 32000,
"max_input_tokens": 32000,
"max_output_tokens": 32000,
"input_cost_per_token": 0.0,
"output_cost_per_token": 0.0,
"litellm_provider": "vertex_ai-llama_models",
"mode": "chat",
"source": "https://cloud.google.com/vertex-ai/generative-ai/pricing#partner-models"
},
"vertex_ai/mistral-large@latest": {
"max_tokens": 8191,
"max_input_tokens": 128000,
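Once merged, these entries are queryable through `litellm.model_cost`, the same map the cost-tracking code reads. A small sanity check, assuming an installed litellm version that bundles this backup file:

```python
import litellm

info = litellm.model_cost["vertex_ai/meta/llama3-70b-instruct-maas"]
# Costs are 0.0 here because these Vertex MaaS models are not billed per token.
print(info["max_tokens"], info["input_cost_per_token"])  # 32000 0.0
```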
14 changes: 7 additions & 7 deletions litellm/proxy/_new_secret_config.yaml
@@ -19,11 +19,11 @@ model_list:
   - model_name: o1-preview
     litellm_params:
       model: o1-preview
+  - model_name: rerank-english-v3.0
+    litellm_params:
+      model: cohere/rerank-english-v3.0
+      api_key: os.environ/COHERE_API_KEY

 litellm_settings:
   cache: true
-  # cache_params:
-  #   type: "redis"
-  #   service_name: "mymaster"
-  #   sentinel_nodes:
-  #     - ["localhost", 26379]
+  success_callback: ["langfuse"]
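With the rerank model registered and `success_callback: ["langfuse"]` enabled, the SDK-side equivalent looks roughly like this; it assumes COHERE_API_KEY plus Langfuse keys are set in the environment, and `top_n` follows Cohere's rerank parameters:

```python
import litellm

litellm.success_callback = ["langfuse"]  # mirrors the config above

response = litellm.rerank(
    model="cohere/rerank-english-v3.0",
    query="What is the capital of France?",
    documents=["Paris is the capital of France.", "Berlin is in Germany."],
    top_n=1,
)
print(response.results)  # logged to Langfuse as `output`, per the diff above
```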