Commit

fix exception mapping for streaming
krrishdholakia committed Sep 23, 2023
1 parent f984e5f commit 889679a
Showing 8 changed files with 760 additions and 94 deletions.
Binary file modified litellm/__pycache__/main.cpython-311.pyc
Binary file modified litellm/__pycache__/utils.cpython-311.pyc
6 changes: 4 additions & 2 deletions litellm/llms/replicate.py
@@ -77,14 +77,16 @@ def handle_prediction_response_streaming(prediction_url, api_token, print_verbos
     }
     status = ""
     while True and (status not in ["succeeded", "failed", "canceled"]):
-        time.sleep(0.0001)
+        time.sleep(0.0001) # prevent being rate limited by replicate
         response = requests.get(prediction_url, headers=headers)
         if response.status_code == 200:
             response_data = response.json()
+            status = response_data['status']
+            print(f"response data: {response_data}")
             if "output" in response_data:
                 output_string = "".join(response_data['output'])
                 new_output = output_string[len(previous_output):]
-                yield new_output
+                yield {"output": new_output, "status": status}
                 previous_output = output_string
             status = response_data['status']
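
With the loop now yielding {"output": ..., "status": ...} dicts instead of bare strings, a caller can react to a failed prediction instead of silently streaming partial text. A minimal, hypothetical consumer sketch (collect_stream and the stubbed chunks are illustrative, not part of the commit):

def collect_stream(chunks):
    text = ""
    for chunk in chunks:
        if chunk["status"] == "failed":
            # surface the failure so the exception-mapping layer can translate it
            raise RuntimeError("Replicate prediction failed")
        text += chunk["output"]
    return text

# usage with stubbed chunks in the new format
print(collect_stream([
    {"output": "Hello", "status": "processing"},
    {"output": " world", "status": "succeeded"},
]))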

12 changes: 6 additions & 6 deletions litellm/main.py
@@ -485,11 +485,11 @@ def completion(
         # Setting the relevant API KEY for replicate, replicate defaults to using os.environ.get("REPLICATE_API_TOKEN")
         replicate_key = None
         replicate_key = (
-            get_secret("REPLICATE_API_KEY")
-            or get_secret("REPLICATE_API_TOKEN")
-            or api_key
+            api_key
             or litellm.replicate_key
-            or litellm.api_key
+            or litellm.api_key
+            or get_secret("REPLICATE_API_KEY")
+            or get_secret("REPLICATE_API_TOKEN")
         )
 
         model_response = replicate.completion(
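
The reordering changes which credential wins: Python's or-chain returns the first truthy value, so an api_key passed by the caller now takes precedence over module-level settings and environment secrets, rather than the other way around. A standalone sketch of the same pattern (resolve_key is an illustrative name, not litellm code):

import os

def resolve_key(api_key=None, module_key=None):
    # first truthy value wins, mirroring the reordered chain above
    return (
        api_key
        or module_key
        or os.environ.get("REPLICATE_API_KEY")
        or os.environ.get("REPLICATE_API_TOKEN")
    )

assert resolve_key(api_key="explicit-key") == "explicit-key"  # caller-supplied key beats env vars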
@@ -575,7 +575,7 @@ def completion(
 
         if "stream" in optional_params and optional_params["stream"] == True:
             # don't try to access stream object,
-            response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph-alpha", logging_obj=logging)
+            response = CustomStreamWrapper(model_response, model, custom_llm_provider="aleph_alpha", logging_obj=logging)
             return response
         response = model_response
     elif model in litellm.openrouter_models or custom_llm_provider == "openrouter":
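
The "aleph-alpha" to "aleph_alpha" rename matters because the stream wrapper appears to dispatch on an exact provider string, so a hyphenated name would skip the provider-specific chunk handling. A self-contained sketch of that kind of dispatch (route_chunk and its branches are illustrative, not litellm internals):

def route_chunk(chunk, custom_llm_provider):
    # exact string comparison: "aleph-alpha" would not match this branch
    if custom_llm_provider == "aleph_alpha":
        return f"[aleph_alpha] {chunk}"
    if custom_llm_provider == "vertex_ai":
        return f"[vertex_ai] {chunk}"
    return chunk  # generic fallthrough

print(route_chunk("hello", "aleph_alpha"))  # provider-specific handling
print(route_chunk("hello", "aleph-alpha"))  # falls through -- the mismatch this commit fixes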
@@ -769,7 +769,7 @@ def completion(
         if stream:
             model_response = chat.send_message_streaming(prompt, **optional_params)
             response = CustomStreamWrapper(
-                model_response, model, custom_llm_provider="vertexai", logging_obj=logging
+                model_response, model, custom_llm_provider="vertex_ai", logging_obj=logging
             )
             return response
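
The same underscore fix applies to the Vertex AI streaming path. A hedged end-to-end usage sketch (the model id and prompt are placeholders, not taken from this commit): with the provider strings corrected and the Replicate generator yielding status, a streaming failure should surface as a raised exception rather than as malformed chunks.

from litellm import completion

try:
    response = completion(
        model="replicate/llama-2-70b-chat",  # placeholder model id
        messages=[{"role": "user", "content": "Hello"}],
        stream=True,
    )
    for chunk in response:
        print(chunk)
except Exception as e:
    print(f"streaming error surfaced as exception: {e}")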

18 changes: 0 additions & 18 deletions litellm/tests/test_completion.py
@@ -643,24 +643,6 @@ def test_completion_sagemaker():
 
 # test_completion_sagemaker()
 
-def test_completion_sagemaker_stream():
-    litellm.set_verbose = False
-    try:
-        response = completion(
-            model="sagemaker/jumpstart-dft-meta-textgeneration-llama-2-7b",
-            messages=messages,
-            temperature=0.2,
-            max_tokens=80,
-            stream=True,
-        )
-        # Add any assertions here to check the response
-        for chunk in response:
-            print(chunk)
-    except Exception as e:
-        pytest.fail(f"Error occurred: {e}")
-
-# test_completion_sagemaker_stream()
 
 def test_completion_bedrock_titan():
     try:
         response = completion(