@@ -43,7 +43,7 @@ class LlamaIndexRag(BaseAgent):
         agent (OpenAIAgent): Agent that uses the query engine to answer questions.
     """
 
-    def __init__(self, vector_id: str, temperature: float, model: str, buffer: int = 40, max_tokens: int = 100,
+    def __init__(self, vector_id: str, temperature: float, model: str, buffer: int = 20, max_tokens: int = 100,
                  provider_config: dict = None):
         """
         Initialize the LlamaIndexRag instance.
@@ -52,7 +52,7 @@ def __init__(self, vector_id: str, temperature: float, model: str, buffer: int =
             vector_id (str): Identifier for the vector store.
             temperature (float): Temperature setting for the language model.
             model (str): The name of the OpenAI model to use.
-            buffer (int, optional): Size of the token buffer for streaming responses. Defaults to 40.
+            buffer (int, optional): Size of the token buffer for streaming responses. Defaults to 20.
             max_tokens (int, optional): Maximum number of tokens for the language model output. Defaults to 100.
         """
         super().__init__()
@@ -176,7 +176,7 @@ async def generate(self, message: List[dict], **kwargs) -> AsyncGenerator[Tuple[
             if latency < 0:
                 latency = time.time() - start_time
             buffer += token
-            if len(buffer.split()) >= self.buffer:
+            if len(buffer.split()) >= self.buffer or buffer[-1] in {'.', '!', '?'}:
                 yield buffer.strip(), False, latency, False
                 logger.info(f"LLM BUFFER FULL BUFFER OUTPUT: {buffer}")
                 buffer = ""
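
The third hunk changes when the streaming buffer is flushed: instead of waiting until the buffer holds self.buffer words (now defaulting to 20 rather than 40), it also flushes as soon as the buffered text ends in sentence-final punctuation, so complete sentences reach the consumer sooner. Below is a minimal, self-contained sketch of that condition, assuming tokens arrive as non-empty string chunks as they do in the generate() loop; flush_chunks and the sample token stream are illustrative, not part of the codebase.

def flush_chunks(tokens, buffer_size=20):
    """Yield buffered text whenever the word count reaches buffer_size
    or the buffer ends in sentence-final punctuation."""
    buffer = ""
    for token in tokens:
        buffer += token
        # Same condition as the diff: flush on size OR end of sentence.
        # buffer[-1] is safe only because each token is assumed non-empty.
        if len(buffer.split()) >= buffer_size or buffer[-1] in {'.', '!', '?'}:
            yield buffer.strip()
            buffer = ""
    if buffer:  # emit any trailing partial sentence
        yield buffer.strip()

print(list(flush_chunks(["Hello", " there.", " How", " are", " you?"])))
# -> ['Hello there.', 'How are you?']

With the old size-only condition, a short sentence like "Hello there." would sit in the buffer until 20 words accumulated; with the punctuation check it is emitted immediately.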