7 changes: 6 additions & 1 deletion dspy/streaming/streamify.py
@@ -185,7 +185,12 @@ async def async_streamer(*args, **kwargs):
             else:
                 # We are receiving a chunk from the LM's response stream, delegate it to the listeners to
                 # determine if we should yield a value to the user.
-                for listener in predict_id_to_listener[value.predict_id]:
+                # Reorder listeners so that active (buffering) listeners are processed first
+                # This ensures buffered chunks are flushed in the correct order.
+                listeners = predict_id_to_listener[value.predict_id]
+                listeners.sort(key=lambda x: x.stream_start, reverse=True)
+
+                for listener in listeners:
                     # In some special cases such as Citation API, it is possible that multiple listeners
                     # return values at the same time due to the chunk buffer of the listener.
                     if output := listener.receive(value):
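A quick way to see what the new sort accomplishes: `stream_start` is a boolean, so sorting with `reverse=True` moves listeners that have already started buffering ahead of idle ones, and Python's stable sort preserves the original order within each group. A minimal sketch with a hypothetical `FakeListener` class (not DSPy's actual listener type):

```python
# Hypothetical stand-in for a stream listener; only the stream_start flag matters here.
from dataclasses import dataclass

@dataclass
class FakeListener:
    name: str
    stream_start: bool  # True once the listener has begun buffering its field

listeners = [
    FakeListener("answer", stream_start=False),
    FakeListener("citations", stream_start=True),
    FakeListener("reasoning", stream_start=False),
]

# Same sort as in the diff: active (buffering) listeners come first; ties keep input order.
listeners.sort(key=lambda x: x.stream_start, reverse=True)
print([listener.name for listener in listeners])  # ['citations', 'answer', 'reasoning']
```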
25 changes: 21 additions & 4 deletions dspy/streaming/streaming_listener.py
@@ -97,8 +97,6 @@ def receive(self, chunk: ModelResponseStream):

         try:
             chunk_message = chunk.choices[0].delta.content
-            if chunk_message is None:
-                return
         except Exception:
             return
 
@@ -112,6 +110,20 @@ def receive(self, chunk: ModelResponseStream):
                     is_last_chunk=self.stream_end,
                 )
 
+        # If we receive an empty chunk but streaming has started, flush the buffer.
+        # LiteLLM does not send completely empty chunks (https://github.com/BerriAI/litellm/blob/main/litellm/litellm_core_utils/model_response_utils.py#L10),
+        # so empty content means it has other native fields such as provider_specific_fields.
+        if not chunk_message:

Collaborator:
Is this correct? From the log shared internally, the LM can produce citation chunks in between field chunks, so could that lead to an unintended flush?

@TomeHirata (Collaborator, Author), Oct 3, 2025:
Flush is for finding the end token, right? It is true that native chunks are passed between field chunks, but when a citation chunk arrives, the end token (like [[ ##) shouldn't be in the queue yet, so it's a safe time to flush the tokens for the ongoing string field.

Collaborator:
I am not entirely sure about the LM streaming order, that is, how it interleaves normal text with native features/events, but suppose it looks like this:

  1. text: what
  2. text: the
  3. citation:
  4. text: jesus?
  5. text: [[ ##
  6. citation:
  7. text: completed ## ]]

Then at step 6, [[ ## will be yielded by the flush call.

@TomeHirata (Collaborator, Author):
In that example, does it mean completed ## ]] is supported by the citation at #5? I'm assuming that the LM produces the native response and the string response in the right order. If it makes mistakes, then yes, it won't work well.

+            if self.stream_start:
+                if token := self._get_last_token():
+                    return StreamResponse(
+                        self.predict_name,
+                        self.signature_field_name,
+                        token,
+                        is_last_chunk=False,
+                    )
+            return
+
         if chunk_message and start_identifier in chunk_message:
             # If the cache is hit, the chunk_message could be the full response. When it happens we can
             # directly end the stream listening. In some models like gemini, each stream chunk can be multiple
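To make the behavior debated in the thread above concrete, here is a minimal, self-contained sketch (a hypothetical `TinyListener`, not DSPy's `StreamListener`): recent field tokens are withheld in a queue so an end marker could be suppressed before reaching the user, and an empty-content (native) chunk triggers a flush of whatever is buffered, which stays safe as long as the end marker only arrives once the field is finished.

```python
from queue import Queue

class TinyListener:
    """Hypothetical sketch of the buffer-and-flush idea; not DSPy's implementation."""

    def __init__(self, hold: int = 3):
        self.hold = hold                  # number of most recent tokens to withhold
        self.field_end_queue = Queue()

    def receive(self, token: str) -> str:
        """Buffer the token; yield the oldest one once more than `hold` are held."""
        self.field_end_queue.put(token)
        if self.field_end_queue.qsize() > self.hold:
            return self.field_end_queue.get()
        return ""

    def flush(self) -> str:
        """Join and clear the buffer, e.g. when a native (empty-content) chunk arrives."""
        out = "".join(self.field_end_queue.queue)
        self.field_end_queue = Queue()
        return out

listener = TinyListener()
emitted = []
for tok in ["water ", "boils ", "at ", "100", "°C."]:
    if out := listener.receive(tok):
        emitted.append(out)
# A citation chunk arrives with empty text content: flush the buffered field tokens.
emitted.append(listener.flush())
print("".join(emitted))  # water boils at 100°C.
```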
@@ -192,8 +204,7 @@ def flush(self) -> str:
         are in the buffer because we don't directly yield the tokens received by the stream listener
         with the purpose to not yield the end_identifier tokens, e.g., "[[ ## ... ## ]]" for ChatAdapter.
         """
-        last_tokens = "".join(self.field_end_queue.queue)
-        self.field_end_queue = Queue()
+        last_tokens = self._get_last_token()
         if isinstance(settings.adapter, JSONAdapter):
             match = re.search(r'",|"\s*}', last_tokens)
             if match:
@@ -215,6 +226,12 @@
f"{', '.join([a.__name__ for a in ADAPTER_SUPPORT_STREAMING])}"
)

def _get_last_token(self) -> str:
last_token = "".join(self.field_end_queue.queue)
self.field_end_queue = Queue()
return last_token


@property
def _output_type(self) -> type | None:
try:
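As a rough sketch of the extracted helper (a hypothetical `BufferSketch` class, not the real `StreamListener`): `_get_last_token` joins everything currently held in `field_end_queue` and swaps in a fresh queue so the same tokens are never yielded twice, after which the existing JSONAdapter branch in `flush()` trims trailing JSON delimiters from the joined string.

```python
import re
from queue import Queue

class BufferSketch:
    """Hypothetical stand-in showing only the buffer-join-and-reset behavior."""

    def __init__(self):
        self.field_end_queue = Queue()

    def _get_last_token(self) -> str:
        last_token = "".join(self.field_end_queue.queue)
        self.field_end_queue = Queue()  # reset so these tokens are never yielded again
        return last_token

buf = BufferSketch()
for tok in ['water boils at 100°C."', ' }']:
    buf.field_end_queue.put(tok)

last_tokens = buf._get_last_token()
# Same trimming idea as the JSONAdapter branch in flush(): drop the trailing '",' or '"}'.
match = re.search(r'",|"\s*}', last_tokens)
if match:
    last_tokens = last_tokens[: match.start()]
print(last_tokens)                    # water boils at 100°C.
print(buf.field_end_queue.qsize())    # 0 (queue was reset)
```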
4 changes: 4 additions & 0 deletions tests/streaming/test_streaming.py
@@ -1004,12 +1004,15 @@ async def citation_stream(*args, **kwargs):
     output = program(documents=docs, question="What temperature does water boil?")
     citation_chunks = []
     answer_chunks = []
+    entire_chunks = []
     final_prediction = None
     async for value in output:
         if isinstance(value, dspy.streaming.StreamResponse) and value.signature_field_name == "citations":
             citation_chunks.append(value)
+            entire_chunks.append(f"[{value.chunk[0].document_index}]")
         elif isinstance(value, dspy.streaming.StreamResponse) and value.signature_field_name == "answer":
             answer_chunks.append(value.chunk)
+            entire_chunks.append(value.chunk)
         elif isinstance(value, dspy.Prediction):
             final_prediction = value
 
@@ -1023,6 +1026,7 @@

     # Verify the answer chunks are correct
     assert "".join(answer_chunks) == "According to the references, water boils at 100°C."
+    assert "".join(entire_chunks) == "According to the references,[0] water boils at 100°C."
 
     # Test that prediction contains the expected fields
     assert final_prediction is not None
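For reference, the new assertion only checks interleaving order: citation chunks are rendered as their document index in brackets and mixed with answer text in arrival order. A tiny standalone illustration with made-up chunk data:

```python
# Made-up chunk sequence mirroring the test's expectation; field names are illustrative.
chunks = [
    ("answer", "According to the references,"),
    ("citations", 0),                     # citation chunk pointing at documents[0]
    ("answer", " water boils at 100°C."),
]

entire_chunks = []
for field, payload in chunks:
    if field == "citations":
        entire_chunks.append(f"[{payload}]")
    else:
        entire_chunks.append(payload)

assert "".join(entire_chunks) == "According to the references,[0] water boils at 100°C."
```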