
Commit f408c8f

Changes to make the RAG output smaller and match the pre-defined configuration
1 parent: 2bace31

File tree

3 files changed, +17 -5 lines:

.gitignore (+1)
bolna/agent_manager/task_manager.py (+13 -2)
bolna/agent_types/llama_index_rag_agent.py (+3 -3)

.gitignore (+1)

@@ -25,6 +25,7 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+.vscode
 
 # PyInstaller
 # Usually these files are written by a python script from a template

bolna/agent_manager/task_manager.py (+13 -2)

@@ -575,8 +575,8 @@ def __get_agent_object(self, llm, agent_type, assistant_config=None):
                 vector_id=vector_store_config.get("vector_id"),
                 temperature=extra_config.get("temperature", 0.1),
                 model=extra_config.get("model", "gpt-3.5-turbo-16k"),
-                buffer=40,
-                max_tokens=100,  # You might want to make this configurable
+                buffer = self.task_config["tools_config"]["synthesizer"].get('buffer_size'),
+                max_tokens = self.llm_agent_config['extra_config']['max_tokens'],
                 provider_config=vector_store_config
             )
             logger.info("Llama-index rag agent is created")
@@ -1074,6 +1074,17 @@ async def __do_llm_generation(self, messages, meta_info, next_step, should_bypas
             logger.info(f"Got llm latencies {self.llm_latencies}")
 
             llm_response += " " + data
+
+            # Checking the size of the llm_response as it should be less than 20 words
+            # if len(llm_response.strip()) > 20:
+            #     # Let's make a gpt3.5 turbo openai async call to summarize the text and make it shorter
+            #     summary_prompt = f"Summarize the following text in less than 100 characters and the answer should be precise:\n\n{llm_response}"
+            #     summary_response = ""
+            #     async for summarized_response in self.tools['llm_agent'].generate([{'role': 'user', 'content': summary_prompt}]):
+            #         logger.info(f"Summarized response : {summary_response}")
+
+            #     llm_response = summarized_response
+
             logger.info(f"Got a response from LLM {llm_response}")
             if end_of_llm_stream:
                 meta_info["end_of_llm_stream"] = True
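The commented-out guard added above is left disabled in this commit; as written it checks character length (len(llm_response.strip()) > 20) while the comment talks about words, and summary_response is never assigned from the stream. Purely as a hedged sketch of what the block might look like if enabled at this point in the method, assuming self.tools['llm_agent'].generate() is an async generator yielding plain text chunks (other agents may yield tuples that would need unpacking):

            # Hedged sketch only, not part of the commit: cap long responses by summarizing them.
            # Assumes self.tools['llm_agent'].generate() yields plain text chunks.
            if len(llm_response.split()) > 20:  # word count, matching the intent of the comment
                summary_prompt = (
                    "Summarize the following text in less than 100 characters "
                    f"and the answer should be precise:\n\n{llm_response}"
                )
                summary_response = ""
                async for chunk in self.tools['llm_agent'].generate(
                        [{'role': 'user', 'content': summary_prompt}]):
                    summary_response += chunk  # accumulate the streamed summary
                logger.info(f"Summarized response : {summary_response}")
                llm_response = summary_response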

bolna/agent_types/llama_index_rag_agent.py (+3 -3)

@@ -43,7 +43,7 @@ class LlamaIndexRag(BaseAgent):
         agent (OpenAIAgent): Agent that uses the query engine to answer questions.
     """
 
-    def __init__(self, vector_id: str, temperature: float, model: str, buffer: int = 40, max_tokens: int = 100,
+    def __init__(self, vector_id: str, temperature: float, model: str, buffer: int = 20, max_tokens: int = 100,
                  provider_config: dict = None):
         """
         Initialize the LlamaIndexRag instance.
@@ -52,7 +52,7 @@ def __init__(self, vector_id: str, temperature: float, model: str, buffer: int =
             vector_id (str): Identifier for the vector store.
             temperature (float): Temperature setting for the language model.
             model (str): The name of the OpenAI model to use.
-            buffer (int, optional): Size of the token buffer for streaming responses. Defaults to 40.
+            buffer (int, optional): Size of the token buffer for streaming responses. Defaults to 20.
             max_tokens (int, optional): Maximum number of tokens for the language model output. Defaults to 100.
         """
         super().__init__()
@@ -176,7 +176,7 @@ async def generate(self, message: List[dict], **kwargs) -> AsyncGenerator[Tuple[
             if latency < 0:
                 latency = time.time() - start_time
             buffer += token
-            if len(buffer.split()) >= self.buffer:
+            if len(buffer.split()) >= self.buffer or buffer[-1] in {'.', '!', '?'}:
                 yield buffer.strip(), False, latency, False
                 logger.info(f"LLM BUFFER FULL BUFFER OUTPUT: {buffer}")
                 buffer = ""
