From 6004fc4bd361272b500c8e1f68f2a8321ac51b5b Mon Sep 17 00:00:00 2001
From: Mohamed ASSOUKTI <mohamed.assoukti@partnre.com>
Date: Tue, 19 Aug 2025 10:56:44 +0200
Subject: [PATCH 1/2] [DERCBOT-1609] Structuring the LLM response

---
 .../models/engines-configurations.ts          | 115 +++++--
 .../admin/bot/rag/BotRAGConfiguration.kt      |   1 +
 .../kotlin/engine/config/RAGAnswerHandler.kt  |  25 +-
 .../responses/RAGResponse.kt                  |   3 +-
 .../orchestratorclient/responses/models.kt    |  15 +-
 .../models/llm/OllamaLLMSetting.kt            |   1 -
 .../models/rag/rag_models.py                  |  55 +++-
 .../routers/responses/responses.py            |   8 +-
 .../services/langchain/rag_chain.py           | 294 ++++++++++--------
 .../server/tests/services/test_rag_chain.py   | 141 ++++++---
 10 files changed, 429 insertions(+), 229 deletions(-)
diff --git a/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts b/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts
index 925980ad0e..d862251d20 100644
--- a/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts
+++ b/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts
@@ -27,35 +27,102 @@ import {
   PromptDefinitionFormatter
 } from '../../../shared/model/ai-settings';
 
-export const QuestionCondensingDefaultPrompt: string = `Given a chat history and the latest user question which might reference context in the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.`;
+export const QuestionCondensingDefaultPrompt: string = `You are a helpful assistant that reformulates questions.
 
-export const QuestionAnsweringDefaultPrompt: string = `# TOCK (The Open Conversation Kit) chatbot
-
-## General context
-
-You are a chatbot designed to provide short conversational messages in response to user queries.
-
-## Guidelines
-
-Incorporate any relevant details from the provided context into your answers, ensuring they are directly related to the user's query.
+You are given:
+- The conversation history between the user and the assistant
+- The most recent user question
 
-## Style and format
+Your task:
+- Reformulate the user’s latest question into a clear, standalone query.
+- Incorporate relevant context from the conversation history.
+- Do NOT answer the question.
+- If the history does not provide additional context, keep the question as is.
 
-Your tone is empathetic, informative and polite.
+Return only the reformulated question.`;
 
-## Additional instructions
-
-Use the following pieces of retrieved context to answer the question.
-If you dont know the answer, answer (exactly) with "{{no_answer}}".
-Answer in {{locale}}.
-
-## Context
-
-{{context}}
-
-## Question
+export const QuestionAnsweringDefaultPrompt: string = `# TOCK (The Open Conversation Kit) chatbot
 
-{{question}}
+## Instructions:
+You must answer STRICTLY in valid JSON format (no extra text, no explanations).
+Use only the following context and the rules below to answer the question.
+
+### Rules for JSON output:
+
+- If the answer is found in the context:
+  - "status": "found_in_context"
+
+- If the answer is NOT found in the context:
+  - "status": "not_found_in_context"
+  - "answer":
+    - The "answer" must not be a generic refusal. Instead, generate a helpful and intelligent response:
+        - If a similar or related element exists in the context (e.g., another product, service, or regulation with a close name, date, or wording), suggest it naturally in the answer.
+        - If no similar element exists, politely acknowledge the lack of information while encouraging clarification or rephrasing.
+    - Always ensure the response is phrased in a natural and user-friendly way, rather than a dry "not found in context".
+
+- If the question matches a special case defined below:
+  - "status": "<the corresponding case code>"
+
+And for all cases (MANDATORY):
+  - "answer": "<the best possible answer in {{ locale }}>"
+  - "topic": "<exactly ONE topic chosen STRICTLY from the predefined list below. If no exact match is possible, set 'unknown'>"
+  - "suggested_topics": ["<zero or more free-form suggestions if topic is unknown>"]
+
+Exception: If the question is small talk (limited to conversational rituals such as greetings (e.g., “hello”, “hi”) and farewells or leave-takings (e.g., “goodbye”, “see you”)), you may ignore the context and generate a natural small-talk response in the "answer". In this case:
+  - "status": "small_talk"
+  - "topic": "<e.g., greetings>"
+  - "suggested_topics": []
+  - "context": []
+
+### Context tracing requirements (MANDATORY):
+- You MUST include **every** chunk from the input context in the "context" array, in the same order they appear. **No chunk may be omitted**.
+- If explicit chunk identifiers are present in the context, use them; otherwise assign sequential numbers starting at 1.
+- For each chunk object:
+  - "chunk": "<chunk_identifier_or_sequential_number>"
+  - "sentences": ["<verbatim sentence(s) from this chunk used to answer the question>"] — leave empty \`[]\` if none.
+  - "reason": null if the chunk contributed; otherwise a concise explanation of why this chunk is not relevant to the question (e.g., "general background only", "different product", "no data for the asked period", etc.).
+- If there are zero chunks in the context, return \`"context": []\`.
+
+### Predefined list of topics (use EXACT spelling, no variations):
+
+## Context:
+{{ context }}
+
+## Conversation history
+{{ chat_history }}
+
+## User question
+{{ question }}
+
+## Output format (JSON only):
+Return your response in the following format:
+
+{
+  "status": "found_in_context" | "not_found_in_context" | "small_talk",
+  "answer": "TEXTUAL_ANSWER",
+  "topic": "EXACT_TOPIC_FROM_LIST_OR_UNKNOWN",
+  "suggested_topics": [
+    "SUGGESTED_TOPIC_1",
+    "SUGGESTED_TOPIC_2"
+  ],
+  "context": [
+    {
+      "chunk": "1",
+      "sentences": ["SENTENCE_1", "SENTENCE_2"],
+      "reason": null
+    },
+    {
+      "chunk": "2",
+      "sentences": [],
+      "reason": "General description; no details related to the question."
+    },
+    {
+      "chunk": "3",
+      "sentences": ["SENTENCE_X"],
+      "reason": null
+    }
+  ]
+}
 `;
 
 export const QuestionCondensing_prompt: ProvidersConfigurationParam[] = [
diff --git a/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt b/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt
index 0665f3a1a5..6680b11237 100644
--- a/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt
+++ b/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt
@@ -36,6 +36,7 @@ data class BotRAGConfiguration(
     val llmSetting: LLMSetting? = null,
     val emSetting: EMSetting,
     val indexSessionId: String? = null,
+    @Deprecated("Replaced by LLM answer status")
     val noAnswerSentence: String,
     val noAnswerStoryId: String? = null,
     val documentsRequired: Boolean = true,
diff --git a/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt b/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt
index c43698900d..c3f97cfb23 100644
--- a/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt
+++ b/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt
@@ -31,9 +31,9 @@ import ai.tock.bot.engine.action.SendSentenceWithFootnotes
 import ai.tock.bot.engine.dialog.Dialog
 import ai.tock.bot.engine.user.PlayerType
 import ai.tock.genai.orchestratorclient.requests.*
+import ai.tock.genai.orchestratorclient.responses.LLMAnswer
 import ai.tock.genai.orchestratorclient.responses.ObservabilityInfo
 import ai.tock.genai.orchestratorclient.responses.RAGResponse
-import ai.tock.genai.orchestratorclient.responses.TextWithFootnotes
 import ai.tock.genai.orchestratorclient.retrofit.GenAIOrchestratorBusinessError
 import ai.tock.genai.orchestratorclient.retrofit.GenAIOrchestratorValidationError
 import ai.tock.genai.orchestratorclient.services.RAGService
@@ -60,7 +60,7 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
             BotRepository.saveMetric(createMetric(MetricType.STORY_HANDLED))
 
             // Call RAG Api - Gen AI Orchestrator
-            val (answer, debug, noAnswerStory, observabilityInfo) = rag(this)
+            val (answer, footnotes, debug, noAnswerStory, observabilityInfo) = rag(this)
 
             // Add debug data if available and if debugging is enabled
             if (debug != null) {
@@ -75,14 +75,18 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
                 val modifiedObservabilityInfo = observabilityInfo?.let { updateObservabilityInfo(this, it) }
 
                 send(
-                    SendSentenceWithFootnotes(
-                        botId, connectorId, userId, text = answer.text, footnotes = answer.footnotes.map {
+                    action = SendSentenceWithFootnotes(
+                        playerId = botId,
+                        applicationId = connectorId,
+                        recipientId = userId,
+                        text = answer.answer,
+                        footnotes = footnotes?.map {
                             Footnote(
                                 it.identifier, it.title, it.url,
                                 if(action.metadata.sourceWithContent) it.content else null,
                                 it.score
                             )
-                        }.toMutableList(),
+                        }?.toMutableList() ?: mutableListOf<Footnote>(),
                         // modifiedObservabilityInfo includes the public langfuse URL if filled.
                         metadata = ActionMetadata(isGenAiRagAnswer = true, observabilityInfo = modifiedObservabilityInfo)
                     )
@@ -116,13 +120,13 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
     private fun ragStoryRedirection(botBus: BotBus, response: RAGResponse?): StoryDefinition? {
         return with(botBus) {
             botDefinition.ragConfiguration?.let { ragConfig ->
-                if (response?.answer?.text.equals(ragConfig.noAnswerSentence, ignoreCase = true)) {
+                if (response?.answer?.status.equals("not_found_in_context", ignoreCase = true)) {
                     // Save no answer metric
                     saveRagMetric(IndicatorValues.NO_ANSWER)
 
                     // Switch to no answer story if configured
                     if (!ragConfig.noAnswerStoryId.isNullOrBlank()) {
-                        logger.info { "The RAG response is equal to the configured no-answer sentence, so switch to the no-answer story." }
+                        logger.info { "Switch to the no-answer RAG story." }
                         getNoAnswerRAGStory(ragConfig)
                     } else null
                 } else {
@@ -221,7 +225,7 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
                 )
 
                 // Handle RAG response
-                return RAGResult(response?.answer, response?.debug, ragStoryRedirection(this, response), response?.observabilityInfo)
+                return RAGResult(response?.answer, response?.footnotes, response?.debug, ragStoryRedirection(this, response), response?.observabilityInfo)
             } catch (exc: Exception) {
                 logger.error { exc }
                 // Save failure metric
@@ -232,7 +236,7 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
                     RAGResult(noAnswerStory = getNoAnswerRAGStory(ragConfiguration))
                 }
                 else RAGResult(
-                    answer = TextWithFootnotes(text = technicalErrorMessage),
+                    answer = LLMAnswer(status="error", answer = technicalErrorMessage),
                     debug = when(exc) {
                         is GenAIOrchestratorBusinessError -> RAGError(exc.message, exc.error)
                         is GenAIOrchestratorValidationError -> RAGError(exc.message, exc.detail)
@@ -282,7 +286,8 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
  * Aggregation of RAG answer, debug and the no answer Story.
  */
 data class RAGResult(
-    val answer: TextWithFootnotes? = null,
+    val answer: LLMAnswer? = null,
+    val footnotes: List<ai.tock.genai.orchestratorclient.responses.Footnote>? = null,
     val debug: Any? = null,
     val noAnswerStory: StoryDefinition? = null,
     val observabilityInfo: ObservabilityInfo? = null,
diff --git a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/RAGResponse.kt b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/RAGResponse.kt
index a4dfde4c3e..fb699fd7f3 100644
--- a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/RAGResponse.kt
+++ b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/RAGResponse.kt
@@ -17,7 +17,8 @@
 package ai.tock.genai.orchestratorclient.responses
 
 data class RAGResponse(
-    val answer: TextWithFootnotes,
+    val answer: LLMAnswer,
+    val footnotes: List<Footnote> = emptyList(),
     val debug: Any? = null,
     val observabilityInfo: ObservabilityInfo? = null,
 )
diff --git a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt
index 4131d334fd..fa0314a143 100644
--- a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt
+++ b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt
@@ -17,9 +17,18 @@
 package ai.tock.genai.orchestratorclient.responses
 
 
-data class TextWithFootnotes(
-    val text: String,
-    val footnotes: List<Footnote> = emptyList(),
+data class ChunkSentences(
+    val chunk: String? = null,
+    val sentences: List<String>? = emptyList(),
+    val reason: String? = null,
+)
+
+data class LLMAnswer(
+    val status: String,
+    val answer: String,
+    val topic: String? = null,
+    val suggestedTopics: List<String>? = null,
+    val context: List<ChunkSentences>? = null,
 )
 
 data class Footnote(
diff --git a/gen-ai/orchestrator-core/src/main/kotlin/ai/tock/genai/orchestratorcore/models/llm/OllamaLLMSetting.kt b/gen-ai/orchestrator-core/src/main/kotlin/ai/tock/genai/orchestratorcore/models/llm/OllamaLLMSetting.kt
index 0d72aa1de3..28077e3a3f 100644
--- a/gen-ai/orchestrator-core/src/main/kotlin/ai/tock/genai/orchestratorcore/models/llm/OllamaLLMSetting.kt
+++ b/gen-ai/orchestrator-core/src/main/kotlin/ai/tock/genai/orchestratorcore/models/llm/OllamaLLMSetting.kt
@@ -28,4 +28,3 @@ data class OllamaLLMSetting<T>(
     }
 }
 
-// TODO MASS : Check Compile + TU (car dernier commit)
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py
index a20a78e73c..c52d7a42df 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py
@@ -52,16 +52,55 @@ class Footnote(Source):
 
     identifier: str = Field(description='Footnote identifier', examples=['1'])
 
+class ChunkInfos(BaseModel):
+    """A model representing information about a chunk used in the RAG context."""
 
-class TextWithFootnotes(BaseModel):
-    """Text with its footnotes. Used for RAG response"""
-
-    text: str = Field(
-        description='Text with footnotes used to list outside sources',
-        examples=['This is page content [1], and this is more content [2]'],
+    chunk: Optional[str] = Field(
+        description='Unique identifier of the chunk.',
+        examples=['cd6d8221-ba9f-44da-86ee-0e25a3c9a5c7'],
+        default=None
+    )
+    sentences: Optional[List[str]] = Field(
+        description='List of verbatim sentences from the chunk that were used by the LLM.',
+        default=None
     )
-    footnotes: set[Footnote] = Field(description='Set of footnotes')
+    reason: Optional[str] = Field(
+        description='Reason why the chunk was not used (e.g., irrelevant, general background).',
+        default=None
+    )
+
+
+class LLMAnswer(BaseModel):
+    """
+    A model representing the structured answer generated by the LLM
+    in response to a user query, based on the provided RAG context.
+    """
 
+    status: Optional[str] = Field(
+        description="The status of the answer generation. "
+                    "Possible values: 'found_in_context', 'not_found_in_context', 'small_talk', "
+                    "or other case-specific codes.",
+        default=None
+    )
+    answer: Optional[str] = Field(
+        description="The textual answer generated by the LLM, in the user's locale.",
+        default=None
+    )
+    topic: Optional[str] = Field(
+        description="The main topic assigned to the answer. Must be one of the predefined list "
+                    "of topics, or 'unknown' if no match is possible.",
+        default=None
+    )
+    suggested_topics: Optional[List[str]] = Field(
+        description="A list of suggested alternative or related topics, "
+                    "used when the main topic is 'unknown'.",
+        default=None
+    )
+    context: Optional[List[ChunkInfos]] = Field(
+        description="The list of chunks from the context that contributed to or were considered "
+                    "in the LLM's answer. Each entry contains identifiers, sentences, and reasons.",
+        default=None
+    )
 
 @unique
 class ChatMessageType(str, Enum):
@@ -154,4 +193,4 @@ class RAGDebugData(QADebugData):
             'Question: Hello, how to plan a trip to Morocco ?. Answer in French.'
         ],
     )
-    answer: str = Field(description='The RAG answer.')
+    answer: LLMAnswer = Field(description='The RAG answer.')
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/responses/responses.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/responses/responses.py
index fe633d2f56..bbab4cf8bb 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/responses/responses.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/responses/responses.py
@@ -25,9 +25,8 @@
     ErrorInfo,
 )
 from gen_ai_orchestrator.models.llm.llm_provider import LLMProvider
-from gen_ai_orchestrator.models.rag.rag_models import Source, TextWithFootnotes
+from gen_ai_orchestrator.models.rag.rag_models import Source, LLMAnswer, Footnote
 from gen_ai_orchestrator.models.observability.observability_provider import ObservabilityProvider
-from gen_ai_orchestrator.models.rag.rag_models import TextWithFootnotes
 from gen_ai_orchestrator.models.vector_stores.vectore_store_provider import VectorStoreProvider
 
 
@@ -122,9 +121,10 @@ class ObservabilityInfo(BaseModel):
 class RAGResponse(BaseModel):
     """The RAG response model"""
 
-    answer: TextWithFootnotes = Field(
-        description='The RAG answer, with outside sources.'
+    answer: Optional[LLMAnswer] = Field(
+        description='The RAG answer'
     )
+    footnotes: set[Footnote] = Field(description='Set of footnotes')
     debug: Optional[Any] = Field(
         description='Debug data',
         examples=[{'action': 'retrieve', 'result': 'OK', 'errors': []}],
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py
index ed7e666cdb..fe8d289118 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py
@@ -17,31 +17,30 @@
 It uses LangChain to perform a Conversational Retrieval Chain
 """
 
+import json
 import logging
 import time
 from functools import partial
 from logging import ERROR, WARNING
-from typing import List, Optional
+from operator import itemgetter
+from typing import List, Optional, Tuple
 
-from langchain.chains.conversational_retrieval.base import (
-    ConversationalRetrievalChain,
-)
 from langchain.retrievers.contextual_compression import (
     ContextualCompressionRetriever,
 )
 from langchain_community.chat_message_histories import ChatMessageHistory
 from langchain_core.callbacks import BaseCallbackHandler
 from langchain_core.documents import Document
-from langchain_core.output_parsers import StrOutputParser
+from langchain_core.messages import HumanMessage, AIMessage
+from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.prompts import PromptTemplate as LangChainPromptTemplate
 from langchain_core.runnables import (
     RunnableParallel,
     RunnablePassthrough,
-    RunnableSerializable,
+    RunnableSerializable, RunnableConfig, RunnableLambda,
 )
 from langchain_core.vectorstores import VectorStoreRetriever
-from langfuse.callback import CallbackHandler as LangfuseCallbackHandler
 from typing_extensions import Any
 
 from gen_ai_orchestrator.errors.exceptions.exceptions import (
@@ -68,11 +67,10 @@
     RAGDebugData,
     RAGDocument,
     RAGDocumentMetadata,
-    TextWithFootnotes,
+    LLMAnswer,
 )
 from gen_ai_orchestrator.routers.requests.requests import RAGRequest
 from gen_ai_orchestrator.routers.responses.responses import (
-    ObservabilityInfo,
     RAGResponse,
 )
 from gen_ai_orchestrator.services.langchain.callbacks.rag_callback_handler import (
@@ -109,7 +107,7 @@ async def execute_rag_chain(
     Args:
         request: The RAG request
         debug: True if RAG data debug should be returned with the response.
-        custom_observability_handler: Custom observability handler
+        custom_observability_handler: Custom observability handler (Used in the tooling run_experiment.py script)
     Returns:
         The RAG response (Answer and document sources)
     """
@@ -120,96 +118,105 @@ async def execute_rag_chain(
     conversational_retrieval_chain = create_rag_chain(request=request)
 
     message_history = ChatMessageHistory()
-    session_id = None
-    user_id = None
-    tags = []
     if request.dialog:
         for msg in request.dialog.history:
             if ChatMessageType.HUMAN == msg.type:
                 message_history.add_user_message(msg.text)
             else:
                 message_history.add_ai_message(msg.text)
-        session_id = (request.dialog.dialog_id,)
-        user_id = (request.dialog.user_id,)
-        tags = (request.dialog.tags,)
 
-    logger.debug(
-        'RAG chain - Use chat history: %s',
-        'Yes' if len(message_history.messages) > 0 else 'No',
-    )
+    logger.debug('RAG chain - Use chat history: %s', len(message_history.messages) > 0)
+    logger.debug('RAG chain - Use RAGCallbackHandler for debugging : %s', debug)
+
+    records_handler, observability_handler = get_callback_handlers(request, debug)
+
+    callbacks = [
+        handler
+        for handler in (records_handler, observability_handler, custom_observability_handler)
+        if handler is not None
+    ]
 
     inputs = {
         **request.question_answering_prompt.inputs,
         'chat_history': message_history.messages,
     }
 
-    logger.debug(
-        'RAG chain - Use RAGCallbackHandler for debugging : %s',
-        debug,
-    )
-
-    callback_handlers = []
-    records_callback_handler = RAGCallbackHandler()
-    observability_handler = None
-    if debug:
-        # Debug callback handler
-        callback_handlers.append(records_callback_handler)
-    if custom_observability_handler is not None:
-        callback_handlers.append(custom_observability_handler)
-    if request.observability_setting is not None:
-        # Langfuse callback handler
-        observability_handler = create_observability_callback_handler(
-            observability_setting=request.observability_setting,
-            trace_name=ObservabilityTrace.RAG.value,
-            session_id=session_id,
-            user_id=user_id,
-            tags=tags,
-        )
-        callback_handlers.append(observability_handler)
-
     response = await conversational_retrieval_chain.ainvoke(
         input=inputs,
-        config={'callbacks': callback_handlers},
+        config=RunnableConfig(callbacks=callbacks)
     )
+    llm_answer = LLMAnswer(**response['answer'])
 
     # RAG Guard
-    rag_guard(inputs, response, request.documents_required)
+    rag_guard(inputs, llm_answer, response, request.documents_required)
 
     # Guardrail
     if request.guardrail_setting:
         guardrail = get_guardrail_factory(
             setting=request.guardrail_setting
         ).get_parser()
-        guardrail_output = guardrail.parse(response['answer'])
+        guardrail_output = guardrail.parse(llm_answer.answer)
         check_guardrail_output(guardrail_output)
 
     # Calculation of RAG processing time
     rag_duration = '{:.2f}'.format(time.time() - start_time)
     logger.info('RAG chain - End of execution. (Duration : %s seconds)', rag_duration)
 
+    # Group contexts by chunk id
+    contexts_by_chunk = {
+        ctx.chunk: ctx
+        for ctx in (llm_answer.context or [])
+        if ctx.sentences
+    }
+
     # Returning RAG response
     return RAGResponse(
-        answer=TextWithFootnotes(
-            text=response['answer'],
-            footnotes=set(
-                map(
-                    lambda doc: Footnote(
-                        identifier=doc.metadata['id'],
-                        title=doc.metadata['title'],
-                        url=doc.metadata['source'],
-                        content=get_source_content(doc),
-                        score=doc.metadata.get('retriever_score', None),
-                    ),
-                    response['documents'],
-                )
-            ),
-        ),
+        answer=llm_answer,
+        footnotes={
+            Footnote(
+                identifier=doc.metadata['id'],
+                title=doc.metadata['title'],
+                url=doc.metadata['source'],
+                content=get_source_content(doc),
+                score=doc.metadata.get('retriever_score', None),
+            )
+            for doc in response["documents"]
+            if doc.metadata['id'] in contexts_by_chunk
+        },
         observability_info=get_observability_info(observability_handler),
-        debug=get_rag_debug_data(request, records_callback_handler, rag_duration)
+        debug=get_rag_debug_data(request, records_handler, rag_duration)
         if debug
         else None,
     )
 
+def get_callback_handlers(request, debug) -> Tuple[
+    Optional[RAGCallbackHandler],
+    Optional[object],
+]:
+    records_handler = RAGCallbackHandler() if debug else None
+    observability_handler = None
+
+    if request.observability_setting is not None:
+        if request.dialog:
+            session_id = request.dialog.dialog_id
+            user_id = request.dialog.user_id
+            tags = request.dialog.tags
+        else:
+            session_id = None
+            user_id = None
+            tags = None
+        observability_handler = create_observability_callback_handler(
+            observability_setting=request.observability_setting,
+            trace_name=ObservabilityTrace.RAG.value,
+            session_id=session_id,
+            user_id=user_id,
+            tags=tags,
+        )
+
+    return (
+        records_handler,
+        observability_handler,
+    )
 
 def get_source_content(doc: Document) -> str:
     """
@@ -279,31 +286,62 @@ def create_rag_chain(
     if question_condensing_llm_factory is not None:
         question_condensing_llm = question_condensing_llm_factory.get_language_model()
     question_answering_llm = question_answering_llm_factory.get_language_model()
-    rag_prompt = build_rag_prompt(request)
 
-    # Construct the RAG chain using the prompt and LLM,
-    # This chain will consume the documents retrieved by the retriever as input.
-    rag_chain = construct_rag_chain(question_answering_llm, rag_prompt)
+    # Fallback in case of missing condensing LLM setting using the answering LLM setting.
+    if question_condensing_llm is not None:
+        condensing_llm = question_condensing_llm
+    else :
+        condensing_llm = question_answering_llm
 
     # Build the chat chain for question contextualization
-    chat_chain = build_question_condensation_chain(
-        question_condensing_llm
-        if question_condensing_llm is not None
-        else question_answering_llm,
-        request.question_condensing_prompt,
-    )
+    chat_chain = build_question_condensation_chain(condensing_llm, request.question_condensing_prompt)
+    rag_prompt = build_rag_prompt(request)
 
     # Function to contextualize the question based on chat history
     contextualize_question_fn = partial(contextualize_question, chat_chain=chat_chain)
 
-    # Final RAG chain with retriever and source documents
-    rag_chain_with_retriever = (
-        contextualize_question_fn
-        | RunnableParallel({'documents': retriever, 'question': RunnablePassthrough()})
-        | RunnablePassthrough.assign(answer=rag_chain)
-    )
-
-    return rag_chain_with_retriever
+    # Calculate the condensed question
+    with_condensed_question = RunnableParallel({
+        "condensed_question": contextualize_question_fn,
+        "question": itemgetter("question"),
+        "chat_history": itemgetter("chat_history"),
+    })
+
+    def retrieve_with_variants(inputs):
+        variants = [
+            # inputs["question"], Deactivated. It's an example to prove the multi retriever process
+            inputs["condensed_question"]
+        ]
+        docs = []
+        for v in variants:
+            docs.extend(retriever.invoke(v))
+        # Deduplicate docs
+        unique_docs = {d.metadata['id']: d for d in docs}
+
+        # TODO [DERCBOT-1649] Apply the RRF Algo on unique_docs.
+        return list(unique_docs.values())
+
+    # Build the RAG inputs
+    rag_inputs = with_condensed_question | RunnableParallel({
+        "question": itemgetter("condensed_question"),
+        "chat_history": itemgetter("chat_history"),
+        "documents": RunnableLambda(retrieve_with_variants),
+    })
+
+    return rag_inputs | RunnablePassthrough.assign(answer=(
+            {
+                "context": lambda x: json.dumps([
+                    {
+                        "chunk_id": doc.metadata['id'],
+                        "chunk_text": doc.page_content,
+                    }
+                    for doc in x["documents"]
+                ], ensure_ascii=False, indent=2),
+                "chat_history": format_chat_history,
+            }
+            | rag_prompt
+            | question_answering_llm
+            | JsonOutputParser(pydantic_object=LLMAnswer, name="rag_chain_output")))
 
 
 def build_rag_prompt(request: RAGRequest) -> LangChainPromptTemplate:
@@ -316,25 +354,14 @@ def build_rag_prompt(request: RAGRequest) -> LangChainPromptTemplate:
         partial_variables=request.question_answering_prompt.inputs,
     )
 
-
-def construct_rag_chain(llm, rag_prompt):
-    """
-    Construct the RAG chain from LLM and prompt.
-    """
-    return (
-        {
-            'context': lambda inputs: '\n\n'.join(
-                doc.page_content for doc in inputs['documents']
-            ),
-            'question': lambda inputs: inputs[
-                'question'
-            ],  # Override the user's original question with the condensed one
-        }
-        | rag_prompt
-        | llm
-        | StrOutputParser(name='rag_chain_output')
-    )
-
+def format_chat_history(x):
+    messages = []
+    for msg in x["chat_history"]:
+        if isinstance(msg, HumanMessage):
+            messages.append({"user": msg.content})
+        elif isinstance(msg, AIMessage):
+            messages.append({"assistant": msg.content})
+    return json.dumps(messages, ensure_ascii=False, indent=2)
 
 def build_question_condensation_chain(
     llm, prompt: Optional[PromptTemplate]
@@ -342,14 +369,27 @@ def build_question_condensation_chain(
     """
     Build the chat chain for contextualizing questions.
     """
+    # TODO: deprecated fallback. Every Gen AI configuration should now provide this prompt, since it is mandatory in the RAG configuration.
     if prompt is None:
         # Default prompt
         prompt = PromptTemplate(
             formatter=PromptFormatter.F_STRING,
             inputs={},
-            template='Given a chat history and the latest user question which might reference context in \
-the chat history, formulate a standalone question which can be understood without the chat history. \
-Do NOT answer the question, just reformulate it if needed and otherwise return it as is.',
+            template="""
+You are a helpful assistant that reformulates questions.
+
+You are given:
+- The conversation history between the user and the assistant
+- The most recent user question
+
+Your task:
+- Reformulate the user’s latest question into a clear, standalone query.
+- Incorporate relevant context from the conversation history.
+- Do NOT answer the question.
+- If the history does not provide additional context, keep the question as is.
+
+Return only the reformulated question.
+"""
         )
 
     return (
@@ -373,51 +413,39 @@ def contextualize_question(inputs: dict, chat_chain) -> str:
         return chat_chain
     return inputs['question']
 
-
-def rag_guard(inputs, response, documents_required):
+def rag_guard(question, answer, response, documents_required):
     """
     Validates the RAG system's response based on the presence or absence of source documents
     and the `documentsRequired` setting.
 
     Args:
-        inputs: question answering prompt inputs
+        question: user question
+        answer: the LLM answer
         response: the RAG response
         documents_required (bool): Specifies whether documents are mandatory for the response.
     """
 
-    no_docs_retrieved = response['documents'] == []
-    no_docs_but_required = no_docs_retrieved and documents_required
-    chain_can_give_no_answer_reply = 'no_answer' in inputs
-    chain_reply_no_answer = False
-
-    if chain_can_give_no_answer_reply:
-        chain_reply_no_answer = response['answer'] == inputs['no_answer']
-
-    if no_docs_but_required:
-        if chain_can_give_no_answer_reply and chain_reply_no_answer:
-            # We expect the chain to use its non-response value, and it has done so, which is the expected behavior.
-            return
-        # Everything else isn't expected
-        message = 'The RAG system cannot provide an answer when no documents are found and documents are required'
-        rag_log(level=ERROR, message=message, inputs=inputs, response=response)
+    if documents_required and answer.status == "found_in_context" and len(response['documents']) == 0:
+        message = 'No documents were retrieved, yet an answer was attempted.'
+        rag_log(level=ERROR, message=message, question=question, answer=answer.answer, response=response)
         raise GenAIGuardCheckException(ErrorInfo(cause=message))
 
-    if chain_reply_no_answer and not no_docs_retrieved:
-        # If the chain responds with its non-response value and the documents are retrieved,
-        # so we remove them from the RAG response.
-        message = 'The RAG gives no answer for user question, but some documents has been found!'
-        rag_log(level=WARNING, message=message, inputs=inputs, response=response)
+    if answer.status == "not_found_in_context" and len(response['documents']) > 0:
+        # The answer was not found in context although some documents were retrieved, so we remove them from the RAG response.
+        message = 'No answer found in the retrieved context. The documents are therefore removed from the RAG response.'
+        rag_log(level=WARNING, message=message, question=question, answer=answer.answer, response=response)
         response['documents'] = []
 
 
-def rag_log(level, message, inputs, response):
+def rag_log(level, message, question, answer, response):
     """
     RAG logging
 
     Args:
         level: logging level
         message: message to log
-        inputs: question answering prompt inputs
+        question: user question
+        answer: LLM answer
         response: the RAG response
     """
 
@@ -427,9 +455,9 @@ def rag_log(level, message, inputs, response):
         'RAG chain - question="%(question)s", answer="%(answer)s", documents="%(documents)s"',
         {
             'message': message,
-            'question': inputs['question'],
-            'answer': response['answer'],
-            'documents': response['documents'],
+            'question': question,
+            'answer': answer,
+            'documents': len(response['documents']),
         },
     )
 
@@ -451,6 +479,8 @@ def get_rag_documents(handler: RAGCallbackHandler) -> List[RAGDocument]:
         for doc in handler.records['documents']
     ]
 
+def get_llm_answer(rag_chain_output) -> LLMAnswer:
+    return LLMAnswer(**json.loads(rag_chain_output.strip().removeprefix("```json").removesuffix("```").strip()))
 
 def get_rag_debug_data(
     request: RAGRequest, records_callback_handler: RAGCallbackHandler, rag_duration
@@ -470,7 +500,7 @@ def get_rag_debug_data(
         documents=get_rag_documents(records_callback_handler),
         document_index_name=request.document_index_name,
         document_search_params=request.document_search_params,
-        answer=records_callback_handler.records['rag_chain_output'],
+        answer=get_llm_answer(records_callback_handler.records['rag_chain_output']),
         duration=rag_duration,
     )
 
diff --git a/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py b/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py
index 7a1b5edb1b..8d88a9629c 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py
@@ -28,9 +28,9 @@
 from gen_ai_orchestrator.models.guardrail.bloomz.bloomz_guardrail_setting import (
     BloomzGuardrailSetting,
 )
+from gen_ai_orchestrator.models.rag.rag_models import LLMAnswer
 from gen_ai_orchestrator.routers.requests.requests import RAGRequest
 from gen_ai_orchestrator.services.langchain import rag_chain
-
 from gen_ai_orchestrator.services.langchain.factories.langchain_factory import (
     get_guardrail_factory,
 )
@@ -48,19 +48,19 @@
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.RAGCallbackHandler')
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.rag_guard')
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.RAGResponse')
-@patch('gen_ai_orchestrator.services.langchain.rag_chain.TextWithFootnotes')
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.RAGDebugData')
+@patch('gen_ai_orchestrator.services.langchain.rag_chain.get_llm_answer')
 @pytest.mark.asyncio
 async def test_rag_chain(
-    mocked_rag_debug_data,
-    mocked_text_with_footnotes,
-    mocked_rag_response,
-    mocked_rag_guard,
-    mocked_callback_init,
-    mocked_create_rag_chain,
-    mocked_get_callback_handler_factory,
-    mocked_get_document_compressor_factory,
-    mocked_guardrail_parse,
+        mocked_get_llm_answer,
+        mocked_rag_debug_data,
+        mocked_rag_response,
+        mocked_rag_guard,
+        mocked_callback_init,
+        mocked_create_rag_chain,
+        mocked_get_callback_handler_factory,
+        mocked_get_document_compressor_factory,
+        mocked_guardrail_parse,
 ):
     """Test the full execute_qa_chain method by mocking all external calls."""
     # Build a test RAGRequest
@@ -90,7 +90,7 @@ async def test_rag_chain(
 {question}
 
 Answer in {locale}:""",
-            'inputs' : {
+            'inputs': {
                 'question': 'How to get started playing guitar ?',
                 'no_answer': 'Sorry, I don t know.',
                 'locale': 'French',
@@ -154,9 +154,19 @@ async def test_rag_chain(
     }
     docs = [Document(
         page_content='some page content',
-        metadata={'id':'123-abc', 'title':'my-title', 'source': None},
+        metadata={'id': '123-abc', 'title': 'my-title', 'source': None},
     )]
-    response = {'answer': 'an answer from llm', 'documents': docs}
+    response = {
+        'answer': {
+            'status': '',
+            'answer': 'an answer from llm',
+            'topic': None,
+            'suggested_topics': None,
+            'context': []
+        },
+        'documents': docs
+    }
+    llm_answer = LLMAnswer(**response['answer'])
 
     # Setup mock factories/init return value
     observability_factory_instance = mocked_get_callback_handler_factory.return_value
@@ -186,10 +196,8 @@ async def test_rag_chain(
     )
     # Assert the response is build using the expected settings
     mocked_rag_response.assert_called_once_with(
-        # TextWithFootnotes must be mocked or mapping the footnotes will fail
-        answer=mocked_text_with_footnotes(
-            text=mocked_rag_answer['answer'], footnotes=[]
-        ),
+        answer=llm_answer,
+        footnotes=set(),
         debug=mocked_rag_debug_data(request, mocked_rag_answer, mocked_callback, 1),
         observability_info=None
     )
@@ -199,23 +207,32 @@ async def test_rag_chain(
     # Assert the rag guardrail is called
     mocked_guardrail_parse.assert_called_once_with(
         os.path.join(request.guardrail_setting.api_base, 'guardrail'),
-        json={'text': [mocked_rag_answer['answer']]},
+        json={'text': [mocked_rag_answer['answer']['answer']]},
     )
     # Assert the rag guard is called
     mocked_rag_guard.assert_called_once_with(
-        inputs, response, request.documents_required
+        inputs, llm_answer, response, request.documents_required
     )
 
+
 @patch('gen_ai_orchestrator.services.langchain.impls.guardrail.bloomz_guardrail.requests.post')
 def test_guardrail_parse_succeed_with_toxicities_encountered(
-    mocked_guardrail_response,
+        mocked_guardrail_response,
 ):
     guardrail = get_guardrail_factory(
         BloomzGuardrailSetting(
             provider='BloomzGuardrail', max_score=0.5, api_base='http://test-guard.com'
         )
     ).get_parser()
-    rag_response = {'answer': 'This is a sample text.'}
+    rag_response = {
+        'answer': {
+            'status': '',
+            'answer': 'This is a sample text.',
+            'topic': None,
+            'suggested_topics': None,
+            'context': []
+        }
+    }
 
     mocked_response = MagicMock()
     mocked_response.status_code = 200
@@ -231,11 +248,11 @@ def test_guardrail_parse_succeed_with_toxicities_encountered(
     }
 
     mocked_guardrail_response.return_value = mocked_response
-    guardrail_output = guardrail.parse(rag_response['answer'])
+    guardrail_output = guardrail.parse(rag_response['answer']['answer'])
 
     mocked_guardrail_response.assert_called_once_with(
         os.path.join(guardrail.endpoint, 'guardrail'),
-        json={'text': [rag_response['answer']]},
+        json={'text': [rag_response['answer']['answer']]},
     )
     assert guardrail_output == {
         'content': 'This is a sample text.',
@@ -251,21 +268,29 @@ def test_guardrail_parse_fail(mocked_guardrail_response):
             provider='BloomzGuardrail', max_score=0.5, api_base='http://test-guard.com'
         )
     ).get_parser()
-    rag_response = {'answer': 'This is a sample text.'}
+    rag_response = {
+        'answer': {
+            'status': '',
+            'answer': 'This is a sample text.',
+            'topic': None,
+            'suggested_topics': None,
+            'context': []
+        }
+    }
 
     mocked_response = MagicMock()
     mocked_response.status_code = 500
     mocked_guardrail_response.return_value = mocked_response
 
     with pytest.raises(
-        HTTPError,
-        match=f"Error {mocked_response.status_code}. Bloomz guardrail didn't respond as expected.",
+            HTTPError,
+            match=f"Error {mocked_response.status_code}. Bloomz guardrail didn't respond as expected.",
     ):
-        guardrail.parse(rag_response['answer'])
+        guardrail.parse(rag_response['answer']['answer'])
 
     mocked_guardrail_response.assert_called_once_with(
         os.path.join(guardrail.endpoint, 'guardrail'),
-        json={'text': [rag_response['answer']]},
+        json={'text': [rag_response['answer']['answer']]},
     )
 
 
@@ -409,59 +434,83 @@ def test_check_guardrail_output_is_ok():
 
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.rag_log')
 def test_rag_guard_fails_if_no_docs_in_valid_answer(mocked_log):
-    inputs = {'no_answer': "Sorry, I don't know."}
+    question = 'Hi!'
     response = {
-        'answer': 'a valid answer',
+        'answer': {
+            'status': 'found_in_context',
+            'answer': 'a valid answer'
+        },
         'documents': [],
     }
     try:
-        rag_chain.rag_guard(inputs, response,documents_required=True)
+        rag_chain.rag_guard(question, LLMAnswer(**response['answer']), response, documents_required=True)
     except Exception as e:
         assert isinstance(e, GenAIGuardCheckException)
 
 
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.rag_log')
 def test_rag_guard_accepts_no_answer_even_with_docs(mocked_log):
-    inputs = {'no_answer': "Sorry, I don't know."}
+    question = 'Hi!'
     response = {
-        'answer': "Sorry, I don't know.",
+        'answer': {
+            'status': 'not_found_in_context',
+            'answer': 'Sorry, I don t know.',
+            'context': [
+                {
+                    'chunk': 1,
+                    'sentences': ["str1"],
+                }
+            ]
+        },
         'documents': ['a doc as a string'],
     }
-    rag_chain.rag_guard(inputs, response, documents_required=True)
-    assert response['documents'] == ['a doc as a string']
+    rag_chain.rag_guard(question, LLMAnswer(**response['answer']), response, documents_required=True)
+    # No answer found in the retrieved context. The documents are therefore removed from the RAG response.
+    assert response['documents'] == []
 
 
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.rag_log')
 def test_rag_guard_valid_answer_with_docs(mocked_log):
-    inputs = {'no_answer': "Sorry, I don't know."}
+    question = 'Hi!'
     response = {
-        'answer': 'a valid answer',
+        'answer': {
+            'status': 'found_in_context',
+            'answer': 'a valid answer',
+        },
         'documents': ['doc1', 'doc2'],
     }
-    rag_chain.rag_guard(inputs, response, documents_required=True)
+    rag_chain.rag_guard(question, LLMAnswer(**response['answer']), response, documents_required=True)
     assert response['documents'] == ['doc1', 'doc2']
 
+
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.rag_log')
 def test_rag_guard_no_answer_with_no_docs(mocked_log):
-    inputs = {'no_answer': "Sorry, I don't know."}
+    question = 'Hi!'
     response = {
-        'answer': "Sorry, I don't know.",
+        'answer': {
+            'status': 'not_found_in_context',
+            'answer': 'Sorry, I don t know.'
+        },
         'documents': [],
     }
-    rag_chain.rag_guard(inputs, response, documents_required=True)
+    rag_chain.rag_guard(question, LLMAnswer(**response['answer']), response, documents_required=True)
     assert response['documents'] == []
 
+
 @patch('gen_ai_orchestrator.services.langchain.rag_chain.rag_log')
 def test_rag_guard_without_no_answer_input(mocked_log):
     """Test that __rag_guard handles missing no_answer input correctly."""
-    inputs = {}  # No 'no_answer' key
+    question = 'Hi!'
     response = {
-        'answer': 'some answer',
+        'answer': {
+            'status': 'found_in_context',
+            'answer': 'a valid answer',
+        },
         'documents': [],
     }
     with pytest.raises(GenAIGuardCheckException) as exc:
-        rag_chain.rag_guard(inputs, response, documents_required=True)
+        rag_chain.rag_guard(question, LLMAnswer(**response['answer']), response, documents_required=True)
 
     mocked_log.assert_called_once()
 
-    assert isinstance(exc.value, GenAIGuardCheckException)
\ No newline at end of file
+    assert isinstance(exc.value, GenAIGuardCheckException)

From 3800867223a8037fa10f9fab6fa145bba33c6e78 Mon Sep 17 00:00:00 2001
From: Mohamed ASSOUKTI <mohamed.assoukti@partnre.com>
Date: Tue, 16 Sep 2025 16:36:44 +0200
Subject: [PATCH 2/2] [DERCBOT-1649] RAG - Multi-query retrieval

---
 .../model/genai/BotRAGConfigurationDTO.kt     |   3 +
 .../src/test/kotlin/service/RAGServiceTest.kt |   3 +-
 .../service/RAGValidationServiceTest.kt       |   1 +
 .../models/engines-configurations.ts          |  29 +++--
 .../rag/rag-settings/models/rag-settings.ts   |   1 +
 .../rag-settings/rag-settings.component.html  |  21 ++++
 .../rag-settings/rag-settings.component.ts    |   9 +-
 ...-message-sentence-footnotes.component.html |   6 +-
 .../web/src/app/shared/model/dialog-data.ts   |   1 +
 .../kotlin/GoogleChatFootnoteFormatterTest.kt |  20 ++--
 .../main/kotlin/MattermostMessageConverter.kt |   3 +-
 .../src/main/kotlin/model/Footnote.kt         |   6 +-
 .../src/main/kotlin/model/ClientFootnote.kt   |   3 +-
 .../tock/bot/connector/web/send/Footnote.kt   |   6 +-
 .../src/main/kotlin/WebMessageProcessor.kt    |   3 +-
 .../test/kotlin/WebConnectorResponseTest.kt   |   4 +-
 .../admin/bot/rag/BotRAGConfiguration.kt      |   3 +-
 .../src/main/kotlin/engine/action/Footnote.kt |   6 +-
 .../kotlin/engine/config/RAGAnswerHandler.kt  |  10 +-
 .../orchestratorclient/requests/RAGRequest.kt |   3 +-
 .../orchestratorclient/responses/models.kt    |   1 +
 .../models/rag/rag_models.py                  |   1 +
 .../routers/requests/requests.py              |   7 ++
 .../services/langchain/rag_chain.py           | 112 +++++++++++-------
 .../secret_manager/secret_manager_service.py  |   2 +-
 .../configurations/test_configurations.py     |   3 +
 .../server/tests/services/test_qa_chain.py    |   1 +
 .../server/tests/services/test_rag_chain.py   |   3 +-
 28 files changed, 189 insertions(+), 82 deletions(-)

diff --git a/bot/admin/server/src/main/kotlin/model/genai/BotRAGConfigurationDTO.kt b/bot/admin/server/src/main/kotlin/model/genai/BotRAGConfigurationDTO.kt
index 2580de0953..04f4478b42 100644
--- a/bot/admin/server/src/main/kotlin/model/genai/BotRAGConfigurationDTO.kt
+++ b/bot/admin/server/src/main/kotlin/model/genai/BotRAGConfigurationDTO.kt
@@ -47,6 +47,7 @@ data class BotRAGConfigurationDTO(
     val documentsRequired: Boolean = true,
     val debugEnabled: Boolean,
     val maxDocumentsRetrieved: Int,
+    val maxDocumentsInContext: Int,
     val maxMessagesFromHistory: Int,
 ) {
     constructor(configuration: BotRAGConfiguration) : this(
@@ -67,6 +68,7 @@ data class BotRAGConfigurationDTO(
         documentsRequired = configuration.documentsRequired,
         debugEnabled = configuration.debugEnabled,
         maxDocumentsRetrieved = configuration.maxDocumentsRetrieved,
+        maxDocumentsInContext = configuration.maxDocumentsInContext,
         maxMessagesFromHistory = configuration.maxMessagesFromHistory,
     )
 
@@ -102,6 +104,7 @@ data class BotRAGConfigurationDTO(
             documentsRequired = documentsRequired,
             debugEnabled = debugEnabled,
             maxDocumentsRetrieved = maxDocumentsRetrieved,
+            maxDocumentsInContext = maxDocumentsInContext,
             maxMessagesFromHistory = maxMessagesFromHistory,
         )
 }
diff --git a/bot/admin/server/src/test/kotlin/service/RAGServiceTest.kt b/bot/admin/server/src/test/kotlin/service/RAGServiceTest.kt
index 77048c5f6d..1abd0eda67 100644
--- a/bot/admin/server/src/test/kotlin/service/RAGServiceTest.kt
+++ b/bot/admin/server/src/test/kotlin/service/RAGServiceTest.kt
@@ -66,7 +66,7 @@ class RAGServiceTest : AbstractTest() {
         const val INDEX_SESSION_ID = "1010101"
 
         private val DEFAULT_RAG_CONFIG = BotRAGConfigurationDTO(
-             id = "ragId",
+            id = "ragId",
             namespace = NAMESPACE,
             botId = BOT_ID,
             enabled = false,
@@ -95,6 +95,7 @@ class RAGServiceTest : AbstractTest() {
             documentsRequired = true,
             debugEnabled = false,
             maxDocumentsRetrieved = 2,
+            maxDocumentsInContext = 3,
             maxMessagesFromHistory = 2,
         )
 
diff --git a/bot/admin/server/src/test/kotlin/service/RAGValidationServiceTest.kt b/bot/admin/server/src/test/kotlin/service/RAGValidationServiceTest.kt
index f8ff1905ff..e29f2f0ead 100644
--- a/bot/admin/server/src/test/kotlin/service/RAGValidationServiceTest.kt
+++ b/bot/admin/server/src/test/kotlin/service/RAGValidationServiceTest.kt
@@ -88,6 +88,7 @@ class RAGValidationServiceTest {
         documentsRequired = true,
         debugEnabled = false,
         maxDocumentsRetrieved = 2,
+        maxDocumentsInContext = 3,
         maxMessagesFromHistory = 2,
     )
 
diff --git a/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts b/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts
index d862251d20..254b3f40cd 100644
--- a/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts
+++ b/bot/admin/web/src/app/rag/rag-settings/models/engines-configurations.ts
@@ -27,19 +27,32 @@ import {
   PromptDefinitionFormatter
 } from '../../../shared/model/ai-settings';
 
-export const QuestionCondensingDefaultPrompt: string = `You are a helpful assistant that reformulates questions.
+export const QuestionCondensingDefaultPrompt: string = `# Question Reformulation Assistant
+
+## Context
+You are a helpful assistant that reformulates questions.
 
 You are given:
 - The conversation history between the user and the assistant
 - The most recent user question
 
-Your task:
-- Reformulate the user’s latest question into a clear, standalone query.
-- Incorporate relevant context from the conversation history.
-- Do NOT answer the question.
-- If the history does not provide additional context, keep the question as is.
-
-Return only the reformulated question.`;
+## Task
+1. Reformulate the user’s latest question into a clear, standalone query.
+2. Incorporate relevant context from the conversation history.
+3. Enrich the reformulation with the business/domain lexicon whenever relevant.
+4. Expand any acronym into its full meaning, and also keep the acronym in parentheses.
+ - Example: "PTZ" → "Prêt à Taux Zéro (PTZ)"
+5. If the user provides the full term without acronym, add the acronym in parentheses if it is commonly used in the business domain.
+ - Example: "Prêt à Taux Zéro" → "Prêt à Taux Zéro (PTZ)"
+6. Do NOT answer the question.
+
+## Business/domain lexicon
+PTZ : Prêt à Taux Zéro
+Éco-PTZ : Éco-Prêt à Taux Zéro
+
+## Output
+Return only the reformulated question.
+`;
 
 export const QuestionAnsweringDefaultPrompt: string = `# TOCK (The Open Conversation Kit) chatbot
 
diff --git a/bot/admin/web/src/app/rag/rag-settings/models/rag-settings.ts b/bot/admin/web/src/app/rag/rag-settings/models/rag-settings.ts
index 52b4dbb9fc..8dae52cdb9 100644
--- a/bot/admin/web/src/app/rag/rag-settings/models/rag-settings.ts
+++ b/bot/admin/web/src/app/rag/rag-settings/models/rag-settings.ts
@@ -40,6 +40,7 @@ export interface RagSettings {
   indexName: string;
 
   maxDocumentsRetrieved: number;
+  maxDocumentsInContext: number;
 
   documentsRequired: boolean;
 }
diff --git a/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.html b/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.html
index 51ae47b2d3..bc7f665ac0 100644
--- a/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.html
+++ b/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.html
@@ -408,6 +408,27 @@ <h1 class="flex-grow-1">Rag settings</h1>
                 />
               </tock-form-control>
             </div>
+            <div class="col-6 px-3">
+              <tock-form-control
+                label="Max documents in LLM context"
+                name="maxDocumentsInContext"
+                [controls]="maxDocumentsInContext"
+                [required]="false"
+                [boldLabel]="false"
+                information="Maximum number of documents to be provided as context to the LLM."
+              >
+                <input
+                  type="number"
+                  step="1"
+                  min="0"
+                  max="50"
+                  nbInput
+                  fieldSize="small"
+                  fullWidth
+                  formControlName="maxDocumentsInContext"
+                />
+              </tock-form-control>
+            </div>
 
             <div class="col-6 px-3">
               <tock-form-control
diff --git a/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.ts b/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.ts
index 0f9a827271..0a077e94ef 100644
--- a/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.ts
+++ b/bot/admin/web/src/app/rag/rag-settings/rag-settings.component.ts
@@ -54,6 +54,7 @@ interface RagSettingsForm {
   indexName: FormControl<string>;
 
   maxDocumentsRetrieved: FormControl<number>;
+  maxDocumentsInContext: FormControl<number>;
 
   documentsRequired: FormControl<boolean>;
 
@@ -217,6 +218,7 @@ export class RagSettingsComponent implements OnInit, OnDestroy {
 
     documentsRequired: new FormControl(undefined),
     maxDocumentsRetrieved: new FormControl(undefined),
+    maxDocumentsInContext: new FormControl(undefined),
 
     questionCondensingLlmProvider: new FormControl(undefined, [Validators.required]),
     questionCondensingLlmSetting: new FormGroup({}),
@@ -275,6 +277,10 @@ export class RagSettingsComponent implements OnInit, OnDestroy {
     return this.form.get('maxDocumentsRetrieved') as FormControl;
   }
 
+  get maxDocumentsInContext(): FormControl {
+    return this.form.get('maxDocumentsInContext') as FormControl;
+  }
+
   get indexName(): FormControl {
     return this.form.get('indexName') as FormControl;
   }
@@ -396,7 +402,8 @@ export class RagSettingsComponent implements OnInit, OnDestroy {
       documentsRequired: true,
       debugEnabled: false,
       maxMessagesFromHistory: 5,
-      maxDocumentsRetrieved: 4
+      maxDocumentsRetrieved: 4,
+      maxDocumentsInContext: 4,
     });
   }
 
diff --git a/bot/admin/web/src/app/shared/components/chat-ui/chat-ui-message/chat-ui-message-sentence-footnotes/chat-ui-message-sentence-footnotes.component.html b/bot/admin/web/src/app/shared/components/chat-ui/chat-ui-message/chat-ui-message-sentence-footnotes/chat-ui-message-sentence-footnotes.component.html
index 121bfaa88b..8307f42028 100644
--- a/bot/admin/web/src/app/shared/components/chat-ui/chat-ui-message/chat-ui-message-sentence-footnotes/chat-ui-message-sentence-footnotes.component.html
+++ b/bot/admin/web/src/app/shared/components/chat-ui/chat-ui-message/chat-ui-message-sentence-footnotes/chat-ui-message-sentence-footnotes.component.html
@@ -50,10 +50,12 @@
       </span>
 
       <nb-icon
-        *ngIf="footNote.score"
+        *ngIf="footNote.score != null || footNote.rrfScore != null"
         icon="trophy"
         class="font-size-xsmall align-center ml-2"
-        nbTooltip="Compressor score : {{ footNote.score.toFixed(2) }}"
+        [nbTooltip]="(footNote.score != null ? 'Compressor score : ' + footNote.score.toFixed(3) : '') +
+                     (footNote.score != null && footNote.rrfScore != null ? ' | ' : '') +
+                     (footNote.rrfScore != null ? 'RRF score : ' + footNote.rrfScore.toFixed(3) : '')"
       ></nb-icon>
     </span>
 
diff --git a/bot/admin/web/src/app/shared/model/dialog-data.ts b/bot/admin/web/src/app/shared/model/dialog-data.ts
index 06a8f7b3d6..3b58389198 100644
--- a/bot/admin/web/src/app/shared/model/dialog-data.ts
+++ b/bot/admin/web/src/app/shared/model/dialog-data.ts
@@ -263,6 +263,7 @@ export interface Footnote {
   url: string;
   content?: string;
   score?: number;
+  rrfScore?: number;
   _showFullContent?: boolean;
   identifier: string;
 }
diff --git a/bot/connector-google-chat/src/test/kotlin/GoogleChatFootnoteFormatterTest.kt b/bot/connector-google-chat/src/test/kotlin/GoogleChatFootnoteFormatterTest.kt
index 5ca2760b33..875b0216bc 100644
--- a/bot/connector-google-chat/src/test/kotlin/GoogleChatFootnoteFormatterTest.kt
+++ b/bot/connector-google-chat/src/test/kotlin/GoogleChatFootnoteFormatterTest.kt
@@ -63,8 +63,8 @@ class GoogleChatFootnoteFormatterTest {
                 "keeps footnotes with same URL but different titles",
                 "Check this out",
                 listOf(
-                    Footnote("id1", "Title A", "https://example.com", null, null),
-                    Footnote("id2", "Title B", "https://example.com", null, null)
+                    Footnote("id1", "Title A", "https://example.com", null, null,null),
+                    Footnote("id2", "Title B", "https://example.com", null, null,null)
                 ),
                 """
                 Check this out
@@ -78,8 +78,8 @@ class GoogleChatFootnoteFormatterTest {
                 "formats mix of footnotes with and without URL",
                 "Here's some info",
                 listOf(
-                    Footnote("id1", "Google", "https://google.com", null, null),
-                    Footnote("id2", "Just text", null, null, null)
+                    Footnote("id1", "Google", "https://google.com", null,null, null),
+                    Footnote("id2", "Just text", null, null, null,null)
                 ),
                 """
                 Here's some info
@@ -93,8 +93,8 @@ class GoogleChatFootnoteFormatterTest {
                 "does not deduplicate footnotes with same title and different URLs",
                 "Interesting links",
                 listOf(
-                    Footnote("id1", "Duplicate", "https://a.com", null, null),
-                    Footnote("id2", "Duplicate", "https://b.com", null, null)
+                    Footnote("id1", "Duplicate", "https://a.com", null, null,null),
+                    Footnote("id2", "Duplicate", "https://b.com", null, null,null)
                 ),
                 """
                 Interesting links
@@ -108,8 +108,8 @@ class GoogleChatFootnoteFormatterTest {
                 "deduplicates footnotes based only on url and title ignoring other fields",
                 "References",
                 listOf(
-                    Footnote("id1", "Doc", "https://doc.com", "Content A", 0.9f),
-                    Footnote("id2", "Doc", "https://doc.com", "Content B", 0.2f)
+                    Footnote("id1", "Doc", "https://doc.com", "Content A", 0.9f,null),
+                    Footnote("id2", "Doc", "https://doc.com", "Content B", 0.2f,null)
                 ),
                 """
                 References
@@ -126,8 +126,8 @@ class GoogleChatFootnoteFormatterTest {
                 "generates numbered links with and without URLs",
                 "Sources below",
                 listOf(
-                    Footnote("id1", "Tock", "https://tock.ai", null, null),
-                    Footnote("id2", "Offline doc", null, null, null)
+                    Footnote("id1", "Tock", "https://tock.ai", null, null,null),
+                    Footnote("id2", "Offline doc", null, null, null,null)
                 ),
                 """
                 Sources below
diff --git a/bot/connector-mattermost/src/main/kotlin/MattermostMessageConverter.kt b/bot/connector-mattermost/src/main/kotlin/MattermostMessageConverter.kt
index aacdfc26e5..f92c53ceb8 100644
--- a/bot/connector-mattermost/src/main/kotlin/MattermostMessageConverter.kt
+++ b/bot/connector-mattermost/src/main/kotlin/MattermostMessageConverter.kt
@@ -59,7 +59,8 @@ internal object MattermostMessageConverter {
                             footnote.title,
                             footnote.url,
                             footnote.content,
-                            footnote.score
+                            footnote.score,
+                            footnote.rrfScore
                         )
                     })
             }
diff --git a/bot/connector-mattermost/src/main/kotlin/model/Footnote.kt b/bot/connector-mattermost/src/main/kotlin/model/Footnote.kt
index 161256b54f..a85ff77ae2 100644
--- a/bot/connector-mattermost/src/main/kotlin/model/Footnote.kt
+++ b/bot/connector-mattermost/src/main/kotlin/model/Footnote.kt
@@ -38,7 +38,11 @@ data class Footnote(
      */
     val content: String?,
     /**
-     * A footnote score
+     * A footnote score (Compressor)
      */
     val score: Float?,
+    /**
+     * A footnote score (RRF - Reciprocal Rank Fusion)
+     */
+    val rrfScore: Float?,
 )
\ No newline at end of file
diff --git a/bot/connector-rest-client/src/main/kotlin/model/ClientFootnote.kt b/bot/connector-rest-client/src/main/kotlin/model/ClientFootnote.kt
index b8b0f1bf0f..b4599940a6 100644
--- a/bot/connector-rest-client/src/main/kotlin/model/ClientFootnote.kt
+++ b/bot/connector-rest-client/src/main/kotlin/model/ClientFootnote.kt
@@ -24,5 +24,6 @@ data class ClientFootnote(
     val title: String,
     val url: String?,
     val content: String?,
-    val score: Float?
+    val score: Float?,
+    val rrfScore: Float?
 )
diff --git a/bot/connector-web-model/src/main/kotlin/ai/tock/bot/connector/web/send/Footnote.kt b/bot/connector-web-model/src/main/kotlin/ai/tock/bot/connector/web/send/Footnote.kt
index 7250d57512..f189b8ae07 100644
--- a/bot/connector-web-model/src/main/kotlin/ai/tock/bot/connector/web/send/Footnote.kt
+++ b/bot/connector-web-model/src/main/kotlin/ai/tock/bot/connector/web/send/Footnote.kt
@@ -38,7 +38,11 @@ data class Footnote(
      */
     val content: String?,
     /**
-     * A footnote score
+     * A footnote score (Compressor)
      */
     val score: Float?,
+    /**
+     * A footnote score (RRF - Reciprocal Rank Fusion)
+     */
+    val rrfScore: Float?,
 )
diff --git a/bot/connector-web/src/main/kotlin/WebMessageProcessor.kt b/bot/connector-web/src/main/kotlin/WebMessageProcessor.kt
index 35ca55e8c5..2c759f91cd 100644
--- a/bot/connector-web/src/main/kotlin/WebMessageProcessor.kt
+++ b/bot/connector-web/src/main/kotlin/WebMessageProcessor.kt
@@ -44,7 +44,8 @@ internal class WebMessageProcessor(private val processMarkdown: Boolean) {
                             footnote.title,
                             footnote.url,
                             footnote.content?.let { postProcess(it) },
-                            footnote.score
+                            footnote.score,
+                            footnote.rrfScore
                         )
                     })
             }
diff --git a/bot/connector-web/src/test/kotlin/WebConnectorResponseTest.kt b/bot/connector-web/src/test/kotlin/WebConnectorResponseTest.kt
index 32c972c60a..72f0c44740 100644
--- a/bot/connector-web/src/test/kotlin/WebConnectorResponseTest.kt
+++ b/bot/connector-web/src/test/kotlin/WebConnectorResponseTest.kt
@@ -51,8 +51,8 @@ internal class WebConnectorResponseTest {
             responses = listOf(
                 WebMessageContent(
                     text = "Text with 2 footnotes", footnotes = listOf(
-                        Footnote("e122e97a5cc7", "title 1", url = "https://doc.tock.ai", content = "content 1", score = null),
-                        Footnote("fcad492fdb99", "title 2", url = "https://github.com/theopenconversationkit/tock", content = "content 2", score = null)
+                        Footnote("e122e97a5cc7", "title 1", url = "https://doc.tock.ai", content = "content 1", score = null, rrfScore = null),
+                        Footnote("fcad492fdb99", "title 2", url = "https://github.com/theopenconversationkit/tock", content = "content 2", score = null, rrfScore = null)
                     )
                 )
             )
diff --git a/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt b/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt
index 6680b11237..27a0240728 100644
--- a/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt
+++ b/bot/engine/src/main/kotlin/admin/bot/rag/BotRAGConfiguration.kt
@@ -41,7 +41,8 @@ data class BotRAGConfiguration(
     val noAnswerStoryId: String? = null,
     val documentsRequired: Boolean = true,
     val debugEnabled: Boolean = false,
-    val maxDocumentsRetrieved: Int = 4,
+    val maxDocumentsRetrieved: Int = 4, // Max documents retrieved from vector db per query
+    val maxDocumentsInContext: Int = 4, // Max documents used in LLM context
     val maxMessagesFromHistory: Int = 5,
 ) {
     @Deprecated("use BotRAGConfiguration#questionAnsweringLlmSetting")
diff --git a/bot/engine/src/main/kotlin/engine/action/Footnote.kt b/bot/engine/src/main/kotlin/engine/action/Footnote.kt
index 9677a719ba..069338a8ae 100644
--- a/bot/engine/src/main/kotlin/engine/action/Footnote.kt
+++ b/bot/engine/src/main/kotlin/engine/action/Footnote.kt
@@ -38,7 +38,11 @@ data class Footnote(
      */
     val content: String?,
     /**
-     * A footnote score
+     * A footnote score (Compressor)
      */
     val score: Float?,
+    /**
+     * A footnote score (RRF - Reciprocal Rank Fusion)
+     */
+    val rrfScore: Float?,
 )
diff --git a/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt b/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt
index c3f97cfb23..0e14280414 100644
--- a/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt
+++ b/bot/engine/src/main/kotlin/engine/config/RAGAnswerHandler.kt
@@ -82,11 +82,14 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
                         text = answer.answer,
                         footnotes = footnotes?.map {
                             Footnote(
-                                it.identifier, it.title, it.url,
+                                it.identifier,
+                                it.title,
+                                it.url,
                                 if(action.metadata.sourceWithContent) it.content else null,
-                                it.score
+                                it.score,
+                                it.rrfScore
                             )
-                        }?.toMutableList() ?: mutableListOf<Footnote>(),
+                        }?.toMutableList() ?: mutableListOf(),
                         // modifiedObservabilityInfo includes the public langfuse URL if filled.
                         metadata = ActionMetadata(isGenAiRagAnswer = true, observabilityInfo = modifiedObservabilityInfo)
                     )
@@ -221,6 +224,7 @@ object RAGAnswerHandler : AbstractProactiveAnswerHandler {
                         vectorStoreSetting = vectorStoreSetting,
                         observabilitySetting = botDefinition.observabilityConfiguration?.setting,
                         documentsRequired = ragConfiguration.documentsRequired,
+                        maxDocumentsInContext = ragConfiguration.maxDocumentsInContext
                     ), debug = action.metadata.debugEnabled || ragConfiguration.debugEnabled
                 )
 
diff --git a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/requests/RAGRequest.kt b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/requests/RAGRequest.kt
index c446cec11c..b6f34f372c 100644
--- a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/requests/RAGRequest.kt
+++ b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/requests/RAGRequest.kt
@@ -34,7 +34,8 @@ data class RAGRequest(
     val compressorSetting: DocumentCompressorSetting?,
     val vectorStoreSetting: VectorStoreSetting?,
     val observabilitySetting: ObservabilitySetting?,
-    val documentsRequired: Boolean = true,
+    val documentsRequired: Boolean,
+    val maxDocumentsInContext: Int,
 )
 
 
diff --git a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt
index fa0314a143..75e3772309 100644
--- a/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt
+++ b/gen-ai/orchestrator-client/src/main/kotlin/ai/tock/genai/orchestratorclient/responses/models.kt
@@ -37,6 +37,7 @@ data class Footnote(
     val url: String? = null,
     val content: String? = null,
     val score: Float? = null,
+    val rrfScore: Float? = null,
 )
 
 data class ObservabilityInfo(
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py
index c52d7a42df..34d18623e1 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/models/rag/rag_models.py
@@ -29,6 +29,7 @@ class Source(BaseModel):
     url: Optional[AnyUrl] = Field(description='Source url', examples=['https://doc.tock.ai/tock/'], default=None)
     content: str = Field(description='Source content', examples=['Tock: The Open Conversation Kit'])
     score: Optional[float] = Field(description='The compressor score', examples=[0.9149009585380554], default=None)
+    rrf_score: Optional[float] = Field(description='The Reciprocal Rank Fusion (RRF) score', examples=[0.075], default=None)
 
     def __eq__(self, other):
         """
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/requests/requests.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/requests/requests.py
index a99ee25368..1bcee6564e 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/requests/requests.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/routers/requests/requests.py
@@ -89,6 +89,11 @@ class BaseRequest(BaseModel):
         description='Compressor settings, to rerank relevant documents returned by retriever.',
         default=None,
     )
+    max_documents_in_context: int = Field(
+        description='Max documents used in LLM context.',
+        examples=[3],
+        default=4,
+    )
 
 
 class QARequest(BaseRequest):
@@ -121,6 +126,7 @@ class QARequest(BaseRequest):
                         ],
                         'k': 4,
                     },
+                    'max_documents_in_context': 4,
                 }
             ]
         }
@@ -258,6 +264,7 @@ class RAGRequest(BaseRequest):
                         'label': 'entailment',
                         'endpoint': 'https://*********',
                     },
+                    'max_documents_in_context': 4,
                 }
             ]
         }
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py
index fe8d289118..782ad7f08d 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/services/langchain/rag_chain.py
@@ -29,7 +29,7 @@
     ContextualCompressionRetriever,
 )
 from langchain_community.chat_message_histories import ChatMessageHistory
-from langchain_core.callbacks import BaseCallbackHandler
+from langchain_core.callbacks import BaseCallbackHandler, Callbacks
 from langchain_core.documents import Document
 from langchain_core.messages import HumanMessage, AIMessage
 from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
@@ -115,8 +115,6 @@ async def execute_rag_chain(
     logger.info('RAG chain - Start of execution...')
     start_time = time.time()
 
-    conversational_retrieval_chain = create_rag_chain(request=request)
-
     message_history = ChatMessageHistory()
     if request.dialog:
         for msg in request.dialog.history:
@@ -141,6 +139,8 @@ async def execute_rag_chain(
         'chat_history': message_history.messages,
     }
 
+    conversational_retrieval_chain = create_rag_chain(request=request)
+
     response = await conversational_retrieval_chain.ainvoke(
         input=inputs,
         config=RunnableConfig(callbacks=callbacks)
@@ -179,6 +179,7 @@ async def execute_rag_chain(
                 url=doc.metadata['source'],
                 content=get_source_content(doc),
                 score=doc.metadata.get('retriever_score', None),
+                rrf_score=doc.metadata.get('rrf_score', None),
             )
             for doc in response["documents"]
             if doc.metadata['id'] in contexts_by_chunk
@@ -297,35 +298,27 @@ def create_rag_chain(
     chat_chain = build_question_condensation_chain(condensing_llm, request.question_condensing_prompt)
     rag_prompt = build_rag_prompt(request)
 
-    # Function to contextualize the question based on chat history
-    contextualize_question_fn = partial(contextualize_question, chat_chain=chat_chain)
-
     # Calculate the condensed question
     with_condensed_question = RunnableParallel({
-        "condensed_question": contextualize_question_fn,
+        "condensed_question": chat_chain,
         "question": itemgetter("question"),
         "chat_history": itemgetter("chat_history"),
     })
 
-    def retrieve_with_variants(inputs):
-        variants = [
-            # inputs["question"], Deactivated. It's an example to prove the multi retriever process
-            inputs["condensed_question"]
-        ]
-        docs = []
-        for v in variants:
-            docs.extend(retriever.invoke(v))
-        # Deduplicate docs
-        unique_docs = {d.metadata['id']: d for d in docs}
-
-        # TODO [DERCBOT-1649] Apply the RRF Algo on unique_docs.
-        return list(unique_docs.values())
+    def multi_query_retrieve(inputs) -> list[Document]:
+        """Multi-query retrieval.
+        Retrieve documents from the vector database for each variant of the user's question,
+        then apply fusion (e.g., RRF) to produce a ranked list of results.
+        """
+        variants = [inputs["question"], inputs["condensed_question"]]
+        results = [retriever.invoke(input=v) for v in variants]
+        return apply_rrf_ranking(results, k=60, top_n=request.max_documents_in_context)
 
     # Build the RAG inputs
     rag_inputs = with_condensed_question | RunnableParallel({
-        "question": itemgetter("condensed_question"),
+        "condensed_question": itemgetter("condensed_question"),
         "chat_history": itemgetter("chat_history"),
-        "documents": RunnableLambda(retrieve_with_variants),
+        "documents": RunnableLambda(name="multi_query_retrieve", func=multi_query_retrieve),
     })
 
     return rag_inputs | RunnablePassthrough.assign(answer=(
@@ -338,12 +331,64 @@ def retrieve_with_variants(inputs):
                     for doc in x["documents"]
                 ], ensure_ascii=False, indent=2),
                 "chat_history": format_chat_history,
+                "condensed_question": itemgetter("condensed_question"),
             }
             | rag_prompt
             | question_answering_llm
             | JsonOutputParser(pydantic_object=LLMAnswer, name="rag_chain_output")))
 
 
+def apply_rrf_ranking(ranked_results: list[list[Document]], k: int, top_n: int) -> list[Document]:
+    """Apply Reciprocal Rank Fusion (RRF) on multiple ranked result lists.
+
+    Each document is assigned an RRF score based on its rank in the individual lists:
+        score(d) = Σ (1 / (k + rank_q(d)))
+    where rank_q(d) is the 1-based position of the document d in the result list for query q.
+
+    Documents appearing in multiple lists are boosted. Results are deduplicated and
+    sorted by their final RRF score in descending order.
+
+    The fusion effect of RRF is that a doc appearing in several lists (even slightly lower ranked)
+    will usually score higher than one appearing at the very top of only one list.
+    Here’s why:
+        - Suppose docA is ranked 5th for question and 3rd for condensed_question.
+        - Suppose docB is ranked 1st for question but doesn’t appear at all for condensed_question.
+    docA gets: 1/(60+5) + 1/(60+3) ≈ 0.0154 + 0.0159 ≈ 0.0313
+    docB gets: 1/(60+1) ≈ 0.0164
+    Even though docB was “better” for one query, docA wins overall because it’s consistently relevant across variants.
+
+    Args:
+        ranked_results (list[list[Document]]): Lists of ranked documents per query variant.
+        k (int): RRF dampening constant (no default here; the caller passes 60).
+        top_n (int): Number of top documents to return.
+
+    Returns:
+        list[Document]: The top-N fused and ranked documents.
+    """
+
+    # Assign RRF scores
+    scores = {}
+    for results in ranked_results:
+        for rank, doc in enumerate(results, start=1):  # 1-based rank
+            doc_id = doc.metadata["id"]
+            score = 1.0 / (k + rank)
+            scores[doc_id] = scores.get(doc_id, 0) + score
+
+    # Deduplicate documents by id, then sort by RRF score
+    unique_docs = {}
+    for results in ranked_results:
+        for doc in results:
+            unique_docs[doc.metadata["id"]] = doc  # keep doc object
+
+    ranked_docs = sorted(unique_docs.values(), key=lambda d: scores[d.metadata["id"]], reverse=True)
+
+    # Storing RRF score
+    for doc in ranked_docs:
+        doc.metadata["rrf_score"] = scores[doc.metadata["id"]]
+
+    # Return only the top N docs back.
+    return ranked_docs[:top_n]
+
 def build_rag_prompt(request: RAGRequest) -> LangChainPromptTemplate:
     """
     Build the RAG prompt template.
@@ -369,29 +414,6 @@ def build_question_condensation_chain(
     """
     Build the chat chain for contextualizing questions.
     """
-    # TODO deprecated : All Gen configurations are supposed to have this prompt now. It is mandatory in the RAG configuration.
-    if prompt is None:
-        # Default prompt
-        prompt = PromptTemplate(
-            formatter=PromptFormatter.F_STRING,
-            inputs={},
-            template="""
-You are a helpful assistant that reformulates questions.
-
-You are given:
-- The conversation history between the user and the assistant
-- The most recent user question
-
-Your task:
-- Reformulate the user’s latest question into a clear, standalone query.
-- Incorporate relevant context from the conversation history.
-- Do NOT answer the question.
-- If the history does not provide additional context, keep the question as is.
-
-Return only the reformulated question.
-"""
-        )
-
     return (
         ChatPromptTemplate.from_messages(
             [
diff --git a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/utils/secret_manager/secret_manager_service.py b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/utils/secret_manager/secret_manager_service.py
index 5d16081397..25237ef5be 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/utils/secret_manager/secret_manager_service.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/src/gen_ai_orchestrator/utils/secret_manager/secret_manager_service.py
@@ -41,7 +41,7 @@ def fetch_default_vector_store_credentials() -> Optional[Credentials]:
         logger.info("A default Vector Store Credentials have been successfully retrieved.")
         logger.debug(
             'A default Vector Store Credentials have been defined [Credentials=(user:%s, password:%s)] for [Provider=%s]',
-            application_settings.vector_store_provider.value,
+            application_settings.vector_store_provider,
             credentials.username,
             obfuscate(credentials.password),
         )
diff --git a/gen-ai/orchestrator-server/src/main/python/server/tests/configurations/test_configurations.py b/gen-ai/orchestrator-server/src/main/python/server/tests/configurations/test_configurations.py
index c6608ceda2..2f02456c6f 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/tests/configurations/test_configurations.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/tests/configurations/test_configurations.py
@@ -88,6 +88,7 @@ def test_fetch_gcp_secret_credentials(mock_get_credentials, mock_gcp_secretmanag
     mock_get_credentials.return_value = my_credentials
 
     # Call the function to fetch the credentials of the default vector store
+    fetch_default_vector_store_credentials.cache_clear()
     credentials = fetch_default_vector_store_credentials()
 
     # Check test results
@@ -111,6 +112,7 @@ def test_fetch_gcp_secret_credentials(mock_get_credentials, mock_gcp_secretmanag
        ))
 def test_fetch_default_credentials(mock_get_credentials, mock_boto3_client):
         # Call the function to fetch the credentials of the default vector store
+        fetch_default_vector_store_credentials.cache_clear()
         credentials = fetch_default_vector_store_credentials()
 
         # Check test results
@@ -143,6 +145,7 @@ def test_fetch_bad_credentials(mock_get_credentials, mock_boto3_client):
     mock_get_credentials.return_value = open_search_credentials
 
     # Call the function to fetch the credentials of the default vector store
+    fetch_default_vector_store_credentials.cache_clear()
     credentials = fetch_default_vector_store_credentials()
 
     # Check test results
diff --git a/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_qa_chain.py b/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_qa_chain.py
index 1199998702..c964763943 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_qa_chain.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_qa_chain.py
@@ -67,6 +67,7 @@ async def test_qa_chain(
             'k': 4,
         },
         'documents_required': True,
+        'max_documents_in_context': 4,
     }
     request = QARequest(**query_dict)
 
diff --git a/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py b/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py
index 8d88a9629c..d79dc668e7 100644
--- a/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py
+++ b/gen-ai/orchestrator-server/src/main/python/server/tests/services/test_rag_chain.py
@@ -143,6 +143,7 @@ async def test_rag_chain(
             'endpoint': 'http://test-rerank.com',
         },
         'documents_required': True,
+        'max_documents_in_context': 4,
     }
     request = RAGRequest(**query_dict)
     inputs = {
@@ -457,7 +458,7 @@ def test_rag_guard_accepts_no_answer_even_with_docs(mocked_log):
             'answer': 'Sorry, I don t know.',
             'context': [
                 {
-                    'chunk': 1,
+                    'chunk': '1',
                     'sentences': ["str1"],
                 }
             ]