
Commit f408c8f

Changes to make the RAG output smaller and match the pre-defined configuration
1 parent: 2bace31

File tree

3 files changed, +17 -5 lines:

.gitignore (+1)
bolna/agent_manager/task_manager.py (+13 -2)
bolna/agent_types/llama_index_rag_agent.py (+3 -3)

.gitignore (+1)

@@ -25,6 +25,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+.vscode
 
 # PyInstaller
 # Usually these files are written by a python script from a template

bolna/agent_manager/task_manager.py (+13 -2)

@@ -575,8 +575,8 @@ def __get_agent_object(self, llm, agent_type, assistant_config=None):
                 vector_id=vector_store_config.get("vector_id"),
                 temperature=extra_config.get("temperature", 0.1),
                 model=extra_config.get("model", "gpt-3.5-turbo-16k"),
-                buffer=40,
-                max_tokens=100,  # You might want to make this configurable
+                buffer = self.task_config["tools_config"]["synthesizer"].get('buffer_size'),
+                max_tokens = self.llm_agent_config['extra_config']['max_tokens'],
                 provider_config=vector_store_config
             )
             logger.info("Llama-index rag agent is created")
@@ -1074,6 +1074,17 @@ async def __do_llm_generation(self, messages, meta_info, next_step, should_bypas
             logger.info(f"Got llm latencies {self.llm_latencies}")
 
             llm_response += " " + data
+
+            # Checking the size of the llm_response as it should be less than 20 words
+            # if len(llm_response.strip()) > 20:
+            #     # Let's make a gpt3.5 turbo openai async call to summarize the text and make it shorter
+            #     summary_prompt = f"Summarize the following text in less than 100 characters and the answer should be precise:\n\n{llm_response}"
+            #     summary_response = ""
+            #     async for summarized_response in self.tools['llm_agent'].generate([{'role': 'user', 'content': summary_prompt}]):
+            #         logger.info(f"Summarized response : {summary_response}")
+
+            #     llm_response = summarized_response
+
             logger.info(f"Got a response from LLM {llm_response}")
             if end_of_llm_stream:
                 meta_info["end_of_llm_stream"] = True
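The commented-out guard added above is left disabled in this commit; as written it checks character length (len(llm_response.strip()) > 20) while the comment talks about words, and summary_response is never assigned from the stream. Purely as a hedged sketch of what the block might look like if enabled at this point in the method, assuming self.tools['llm_agent'].generate() is an async generator yielding plain text chunks (other agents may yield tuples that would need unpacking):

            # Hedged sketch only, not part of the commit: cap long responses by summarizing them.
            # Assumes self.tools['llm_agent'].generate() yields plain text chunks.
            if len(llm_response.split()) > 20:  # word count, matching the intent of the comment
                summary_prompt = (
                    "Summarize the following text in less than 100 characters "
                    f"and the answer should be precise:\n\n{llm_response}"
                )
                summary_response = ""
                async for chunk in self.tools['llm_agent'].generate(
                        [{'role': 'user', 'content': summary_prompt}]):
                    summary_response += chunk  # accumulate the streamed summary
                logger.info(f"Summarized response : {summary_response}")
                llm_response = summary_response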

bolna/agent_types/llama_index_rag_agent.py (+3 -3)

@@ -43,7 +43,7 @@ class LlamaIndexRag(BaseAgent):
         agent (OpenAIAgent): Agent that uses the query engine to answer questions.
     """
 
-    def __init__(self, vector_id: str, temperature: float, model: str, buffer: int = 40, max_tokens: int = 100,
+    def __init__(self, vector_id: str, temperature: float, model: str, buffer: int = 20, max_tokens: int = 100,
                  provider_config: dict = None):
         """
         Initialize the LlamaIndexRag instance.
@@ -52,7 +52,7 @@ def __init__(self, vector_id: str, temperature: float, model: str, buffer: int =
             vector_id (str): Identifier for the vector store.
             temperature (float): Temperature setting for the language model.
             model (str): The name of the OpenAI model to use.
-            buffer (int, optional): Size of the token buffer for streaming responses. Defaults to 40.
+            buffer (int, optional): Size of the token buffer for streaming responses. Defaults to 20.
             max_tokens (int, optional): Maximum number of tokens for the language model output. Defaults to 100.
         """
         super().__init__()
@@ -176,7 +176,7 @@ async def generate(self, message: List[dict], **kwargs) -> AsyncGenerator[Tuple[
             if latency < 0:
                 latency = time.time() - start_time
             buffer += token
-            if len(buffer.split()) >= self.buffer:
+            if len(buffer.split()) >= self.buffer or buffer[-1] in {'.', '!', '?'}:
                 yield buffer.strip(), False, latency, False
                 logger.info(f"LLM BUFFER FULL BUFFER OUTPUT: {buffer}")
                 buffer = ""
