
Commit 705f10e

Authored by claramoreirag, actions-user, MiNeves00, and diogoncalves
LLMstudio tracker 1.0.5, LLMstudio proxy 1.0.4, LLMstudio core 1.0.2 (#192)
* feat: allow for url to be used instead of just host and port
* feat: allow url instead of just host and port on proxy and tracker config
* style: lint
* [fix] bump prerelease version in pyproject.toml
* [fix] bump prerelease version in pyproject.toml
* fix: llmstudio-tracker pyproject.toml

Signed-off-by: Clara Moreira Gadelho <56916880+claramoreirag@users.noreply.github.com>

* feat: added support for o1-preview and o1-mini (#193)
* feat: added support for o1-preview and o1-mini
* chore: added o1 preview and mini for azure to config
* chore: bump core

Signed-off-by: Diogo Goncalves <diogoncalves@users.noreply.github.com>

* fix: proxy version

Signed-off-by: Diogo Goncalves <diogoncalves@users.noreply.github.com>

* chore: isort fix
* chore: pyproject.toml
* chore: update pyproject.toml

---------

Signed-off-by: Clara Moreira Gadelho <56916880+claramoreirag@users.noreply.github.com>
Signed-off-by: Diogo Goncalves <diogoncalves@users.noreply.github.com>
Co-authored-by: GitHub Actions <actions@github.com>
Co-authored-by: Miguel Neves <61327611+MiNeves00@users.noreply.github.com>
Co-authored-by: Diogo Goncalves <diogoncalves@users.noreply.github.com>
Co-authored-by: Diogo Goncalves <diogo.goncalves@tensorops.ai>
1 parent 8f26ce1 commit 705f10e

File tree

10 files changed: +138 -103 lines changed


examples/core.py

Lines changed: 72 additions & 58 deletions
@@ -11,20 +11,7 @@ def run_provider(provider, model, api_key, **kwargs):
     llm = LLMCore(provider=provider, api_key=api_key, **kwargs)
 
     latencies = {}
-    chat_request = {
-        "chat_input": "Hello, my name is Json",
-        "model": model,
-        "is_stream": False,
-        "retries": 0,
-        "parameters": {
-            "temperature": 0,
-            "max_tokens": 100,
-            "response_format": {"type": "json_object"},
-            "functions": None,
-        }
-    }
-
-
+    chat_request = build_chat_request(model, chat_input="Hello, my name is Jason Json", is_stream=False)
 
     import asyncio
     response_async = asyncio.run(llm.achat(**chat_request))
@@ -34,18 +21,7 @@ def run_provider(provider, model, api_key, **kwargs):
     # stream
     print("\nasync stream")
     async def async_stream():
-        chat_request = {
-            "chat_input": "Hello, my name is Json",
-            "model": model,
-            "is_stream": True,
-            "retries": 0,
-            "parameters": {
-                "temperature": 0,
-                "max_tokens": 100,
-                "response_format": {"type": "json_object"},
-                "functions": None,
-            }
-        }
+        chat_request = build_chat_request(model, chat_input="Hello, my name is Tom Json", is_stream=True)
 
         response_async = await llm.achat(**chat_request)
         async for p in response_async:
@@ -61,36 +37,14 @@ async def async_stream():
 
 
     print("# Now sync calls")
-    chat_request = {
-        "chat_input": "Hello, my name is Json",
-        "model": model,
-        "is_stream": False,
-        "retries": 0,
-        "parameters": {
-            "temperature": 0,
-            "max_tokens": 100,
-            "response_format": {"type": "json_object"},
-            "functions": None,
-        }
-    }
+    chat_request = build_chat_request(model, chat_input="Hello, my name is Alice Json", is_stream=False)
 
     response_sync = llm.chat(**chat_request)
     pprint(response_sync)
     latencies["sync (ms)"]= response_sync.metrics["latency_s"]*1000
 
     print("# Now sync calls streaming")
-    chat_request = {
-        "chat_input": "Hello, my name is Json",
-        "model": model,
-        "is_stream": True,
-        "retries": 0,
-        "parameters": {
-            "temperature": 0,
-            "max_tokens": 100,
-            "response_format": {"type": "json_object"},
-            "functions": None,
-        }
-    }
+    chat_request = build_chat_request(model, chat_input="Hello, my name is Mary Json", is_stream=True)
 
     response_sync_stream = llm.chat(**chat_request)
     for p in response_sync_stream:
@@ -101,16 +55,58 @@ async def async_stream():
         pprint(p)
     latencies["sync stream (ms)"]= p.metrics["latency_s"]*1000
 
-    print(f"\n\n###EPORT for {provider}, {model} ###")
+    print(f"\n\n###REPORT for <{provider}>, <{model}> ###")
     return latencies
 
+def build_chat_request(model: str, chat_input: str, is_stream: bool, max_tokens: int=1000):
+    if model == "o1-preview" or model == "o1-mini":
+        chat_request = {
+            "chat_input": chat_input,
+            "model": model,
+            "is_stream": is_stream,
+            "retries": 0,
+            "parameters": {
+                "max_completion_tokens": max_tokens
+            }
+        }
+    else:
+        chat_request = {
+            "chat_input": chat_input,
+            "model": model,
+            "is_stream": is_stream,
+            "retries": 0,
+            "parameters": {
+                "temperature": 0,
+                "max_tokens": max_tokens,
+                "response_format": {"type": "json_object"},
+                "functions": None,
+            }
+        }
+    return chat_request
+
+
+
+
 
 provider = "openai"
 model = "gpt-4o-mini"
 for _ in range(1):
     latencies = run_provider(provider=provider, model=model, api_key=os.environ["OPENAI_API_KEY"])
     pprint(latencies)
 
+
+provider = "openai"
+model = "o1-preview"
+for _ in range(1):
+    latencies = run_provider(provider=provider, model=model, api_key=os.environ["OPENAI_API_KEY"])
+    pprint(latencies)
+
+provider = "openai"
+model = "o1-mini"
+for _ in range(1):
+    latencies = run_provider(provider=provider, model=model, api_key=os.environ["OPENAI_API_KEY"])
+    pprint(latencies)
+
 # provider = "anthropic"
 # model = "claude-3-opus-20240229"
 # for _ in range(1):
@@ -126,6 +122,24 @@ async def async_stream():
                              api_version=os.environ["AZURE_API_VERSION"],
                              api_endpoint=os.environ["AZURE_API_ENDPOINT"])
     pprint(latencies)
+
+provider = "azure"
+model = "o1-preview"
+for _ in range(1):
+    latencies = run_provider(provider=provider, model=model,
+                             api_key=os.environ["AZURE_API_KEY"],
+                             api_version=os.environ["AZURE_API_VERSION"],
+                             api_endpoint=os.environ["AZURE_API_ENDPOINT"])
+    pprint(latencies)
+
+provider = "azure"
+model = "o1-mini"
+for _ in range(1):
+    latencies = run_provider(provider=provider, model=model,
+                             api_key=os.environ["AZURE_API_KEY"],
+                             api_version=os.environ["AZURE_API_VERSION"],
+                             api_endpoint=os.environ["AZURE_API_ENDPOINT"])
+    pprint(latencies)
 
 # provider = "azure"
 # model = "gpt-4o"
@@ -137,10 +151,10 @@ async def async_stream():
 # pprint(latencies)
 
 
-provider = "vertexai"
-model = "gemini-1.5-pro-latest"
-for _ in range(1):
-    latencies = run_provider(provider=provider, model=model,
-                             api_key=os.environ["GOOGLE_API_KEY"],
-                             )
-    pprint(latencies)
+# provider = "vertexai"
+# model = "gemini-1.5-pro-latest"
+# for _ in range(1):
+#     latencies = run_provider(provider=provider, model=model,
+#                              api_key=os.environ["GOOGLE_API_KEY"],
+#                              )
+# pprint(latencies)
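
For orientation, the example script now funnels every request through the new build_chat_request helper: o1-preview and o1-mini receive only a max_completion_tokens parameter, while every other model keeps the previous temperature, max_tokens, JSON response_format and functions settings. A condensed, self-contained sketch of that branching (same logic as the helper in the diff above, reshaped slightly for brevity; the __main__ demo calls are illustrative only):

from pprint import pprint

def build_chat_request(model: str, chat_input: str, is_stream: bool, max_tokens: int = 1000):
    # o1-family models accept only "max_completion_tokens"; other models keep
    # the JSON-mode parameters the example used before this commit.
    if model in ("o1-preview", "o1-mini"):
        parameters = {"max_completion_tokens": max_tokens}
    else:
        parameters = {
            "temperature": 0,
            "max_tokens": max_tokens,
            "response_format": {"type": "json_object"},
            "functions": None,
        }
    return {
        "chat_input": chat_input,
        "model": model,
        "is_stream": is_stream,
        "retries": 0,
        "parameters": parameters,
    }

if __name__ == "__main__":
    pprint(build_chat_request("o1-mini", "Hello, my name is Json", is_stream=False))
    pprint(build_chat_request("gpt-4o-mini", "Hello, my name is Json", is_stream=False))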

libs/core/llmstudio_core/config.yaml

Lines changed: 20 additions & 0 deletions
@@ -204,6 +204,16 @@ providers:
     keys:
       - OPENAI_API_KEY
     models:
+      o1-preview:
+        mode: chat
+        max_completion_tokens: 128000
+        input_token_cost: 0.000015
+        output_token_cost: 0.000060
+      o1-mini:
+        mode: chat
+        max_completion_tokens: 128000
+        input_token_cost: 0.000003
+        output_token_cost: 0.000012
       gpt-4o-mini:
         mode: chat
         max_tokens: 128000
@@ -280,6 +290,16 @@ providers:
       - AZURE_API_ENDPOINT
       - AZURE_API_VERSION
     models:
+      o1-preview:
+        mode: chat
+        max_completion_tokens: 128000
+        input_token_cost: 0.0000165
+        output_token_cost: 0.000066
+      o1-mini:
+        mode: chat
+        max_completion_tokens: 128000
+        input_token_cost: 0.0000033
+        output_token_cost: 0.0000132
       gpt-4o-mini:
         mode: chat
         max_tokens: 128000
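
The new entries also carry pricing, expressed in USD per token, so a request's cost is simply input_tokens * input_token_cost + output_tokens * output_token_cost. A small illustrative calculation using the rates added above (the estimate_cost helper is hypothetical, not part of LLMstudio):

# Per-token USD rates copied from the config.yaml entries added above.
RATES = {
    "openai/o1-preview": (0.000015, 0.000060),
    "openai/o1-mini": (0.000003, 0.000012),
    "azure/o1-preview": (0.0000165, 0.000066),
    "azure/o1-mini": (0.0000033, 0.0000132),
}

def estimate_cost(model: str, input_tokens: int, output_tokens: int) -> float:
    # cost = prompt tokens * input rate + completion tokens * output rate
    input_rate, output_rate = RATES[model]
    return input_tokens * input_rate + output_tokens * output_rate

# 1,000 prompt tokens and 500 completion tokens on openai/o1-mini:
# 1000 * 0.000003 + 500 * 0.000012 = 0.009 USD
print(f"{estimate_cost('openai/o1-mini', 1000, 500):.6f}")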

libs/core/llmstudio_core/utils.py

Lines changed: 5 additions & 4 deletions
@@ -3,7 +3,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 import yaml
-from pydantic import BaseModel, ValidationError
+from pydantic import BaseModel, Field, ValidationError
 
 
 class OpenAIToolParameters(BaseModel):
@@ -30,9 +30,10 @@ class CostRange(BaseModel):
 
 class ModelConfig(BaseModel):
     mode: str
-    max_tokens: int
-    input_token_cost: Union[float, List[CostRange]]
-    output_token_cost: Union[float, List[CostRange]]
+    max_tokens: Optional[int] = Field(default=None, alias="max_completion_tokens")
+    max_completion_tokens: Optional[int] = None
+    input_token_cost: Union[float, List["CostRange"]]
+    output_token_cost: Union[float, List["CostRange"]]
 
 
 class ProviderConfig(BaseModel):
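
The ModelConfig change makes both limits optional, so a config entry can declare either max_tokens (existing models) or max_completion_tokens (the new o1 entries) without failing validation. Purely as an illustration of how a caller might pick whichever limit is set, and assuming nothing about LLMstudio's internals, a hypothetical helper:

from typing import Optional

def effective_token_limit(max_tokens: Optional[int],
                          max_completion_tokens: Optional[int]) -> Optional[int]:
    # Hypothetical: prefer the o1-style completion limit when present,
    # otherwise fall back to the classic max_tokens value.
    if max_completion_tokens is not None:
        return max_completion_tokens
    return max_tokens

print(effective_token_limit(None, 128000))   # o1-style entry -> 128000
print(effective_token_limit(128000, None))   # classic entry  -> 128000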

libs/core/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llmstudio-core"
-version = "1.0.1"
+version = "1.0.2"
 description = "LLMStudio core capabilities for routing llm calls for any vendor. No proxy server required. For that use llmstudio[proxy]"
 authors = ["Cláudio Lemos <claudio.lemos@tensorops.ai>"]
 license = "MIT"

libs/proxy/llmstudio_proxy/provider.py

Lines changed: 14 additions & 16 deletions
@@ -18,26 +18,24 @@ class ProxyConfig(BaseModel):
 
     def __init__(self, **data):
         super().__init__(**data)
-        if (self.host is None and self.port is None) and self.url is None:
-            raise ValueError(
-                "Either both 'host' and 'port' must be provided, or 'url' must be specified."
-            )
+        if self.url is None:
+            if self.host is not None and self.port is not None:
+                self.url = f"http://{self.host}:{self.port}"
+            else:
+                raise ValueError(
+                    "You must provide either both 'host' and 'port', or 'url'."
+                )
 
 
 class LLMProxyProvider(Provider):
     def __init__(self, provider: str, proxy_config: ProxyConfig):
         self.provider = provider
+        self.engine_url = proxy_config.url
 
-        self.engine_host = proxy_config.host
-        self.engine_port = proxy_config.port
-        if is_server_running(host=self.engine_host, port=self.engine_port):
-            print(
-                f"Connected to LLMStudio Proxy @ {self.engine_host}:{self.engine_port}"
-            )
+        if is_server_running(url=self.engine_url):
+            print(f"Connected to LLMStudio Proxy @ {self.engine_url}")
         else:
-            raise Exception(
-                f"LLMStudio Proxy is not running @ {self.engine_host}:{self.engine_port}"
-            )
+            raise Exception(f"LLMStudio Proxy is not running @ {self.engine_url}")
 
     @staticmethod
     def _provider_config_name():
@@ -53,7 +51,7 @@ def chat(
         **kwargs,
     ) -> Union[ChatCompletion]:
         response = requests.post(
-            f"http://{self.engine_host}:{self.engine_port}/api/engine/chat/{self.provider}",
+            f"{self.engine_url}/api/engine/chat/{self.provider}",
             json={
                 "chat_input": chat_input,
                 "model": model,
@@ -109,7 +107,7 @@ async def async_non_stream(
     ):
         response = await asyncio.to_thread(
             requests.post,
-            f"http://{self.engine_host}:{self.engine_port}/api/engine/chat/{self.provider}",
+            f"{self.engine_url}/api/engine/chat/{self.provider}",
             json={
                 "chat_input": chat_input,
                 "model": model,
@@ -131,7 +129,7 @@ async def async_stream(
    ):
        response = await asyncio.to_thread(
            requests.post,
-            f"http://{self.engine_host}:{self.engine_port}/api/engine/chat/{self.provider}",
+            f"{self.engine_url}/api/engine/chat/{self.provider}",
            json={
                "chat_input": chat_input,
                "model": model,

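The ProxyConfig change reduces to one resolution rule: an explicit url is kept as-is, otherwise host and port are combined into http://{host}:{port}, and providing neither is an error. A standalone sketch of that rule (a plain function rather than the pydantic model above; the example host, port, and URL values are arbitrary):

from typing import Optional

def resolve_proxy_url(url: Optional[str] = None,
                      host: Optional[str] = None,
                      port: Optional[str] = None) -> str:
    # Mirror of the rule ProxyConfig.__init__ now applies.
    if url is not None:
        return url
    if host is not None and port is not None:
        return f"http://{host}:{port}"
    raise ValueError("You must provide either both 'host' and 'port', or 'url'.")

print(resolve_proxy_url(url="https://proxy.internal.example.com"))  # explicit URL wins
print(resolve_proxy_url(host="localhost", port="8001"))             # -> http://localhost:8001

Every request path in LLMProxyProvider then builds on the single engine_url attribute instead of separate host and port attributes.
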
libs/proxy/llmstudio_proxy/server.py

Lines changed: 3 additions & 3 deletions
@@ -165,9 +165,9 @@ def run_proxy_app(started_event: Event):
         print(f"Error running LLMstudio Proxy: {e}")
 
 
-def is_server_running(host, port, path="/health"):
+def is_server_running(url, path="/health"):
     try:
-        response = requests.get(f"http://{host}:{port}{path}")
+        response = requests.get(f"{url}{path}")
         if response.status_code == 200 and response.json().get("status") == "healthy":
             return True
     except requests.ConnectionError:
@@ -176,7 +176,7 @@ def is_server_running(host, port, path="/health"):
 
 
 def start_server_component(host, port, run_func, server_name):
-    if not is_server_running(host, port):
+    if not is_server_running(url=f"http://{host}:{port}"):
         started_event = Event()
         thread = Thread(target=run_func, daemon=True, args=(started_event,))
         thread.start()
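
After the signature change, callers hand is_server_running a full base URL and the health path is appended to it; start_server_component keeps its host/port arguments and simply builds the URL at the call site. A compact standalone version of the updated check (same logic as the diff; the example URL and the trailing return False are illustrative):

import requests

def is_server_running(url, path="/health"):
    # Accept only a 200 response whose JSON body reports status == "healthy".
    try:
        response = requests.get(f"{url}{path}")
        if response.status_code == 200 and response.json().get("status") == "healthy":
            return True
    except requests.ConnectionError:
        pass
    return False

# Host/port callers build the URL first, as start_server_component now does:
print(is_server_running(url="http://localhost:50001"))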

libs/proxy/pyproject.toml

Lines changed: 3 additions & 3 deletions
@@ -1,8 +1,8 @@
 [tool.poetry]
 name = "llmstudio-proxy"
-version = "1.0.3a0"
-description = ""
-authors = ["Diogo Goncalves <diogo.goncalves@tensorops.ai>"]
+version = "1.0.4"
+description = "LLMstudio Proxy is the module of LLMstudio that allows calling any LLM as a Service Provider in proxy server. By leveraging LLMstudio Proxy, you can have a centralized point for connecting to any provider running independently from your application."
+authors = ["Claudio Lemos <claudio.lemos@tensorops.ai>"]
 readme = "README.md"
 
 [tool.poetry.dependencies]

libs/tracker/llmstudio_tracker/server.py

Lines changed: 3 additions & 3 deletions
@@ -66,9 +66,9 @@ def run_tracker_app(started_event: Event):
         print(f"Error running LLMstudio Tracking: {e}")
 
 
-def is_server_running(host, port, path="/health"):
+def is_server_running(url, path="/health"):
     try:
-        response = requests.get(f"http://{host}:{port}{path}")
+        response = requests.get(f"{url}{path}")
         if response.status_code == 200 and response.json().get("status") == "healthy":
             return True
     except requests.ConnectionError:
@@ -77,7 +77,7 @@ def is_server_running(host, port, path="/health"):
 
 
 def start_server_component(host, port, run_func, server_name):
-    if not is_server_running(host, port):
+    if not is_server_running(url=f"http://{host}:{port}"):
         started_event = Event()
         thread = Thread(target=run_func, daemon=True, args=(started_event,))
         thread.start()
