Merge pull request #10 from TogetherCrew/feat/subquery-generator

Feat: subquery generator
TogetherCrew · Jan 8, 2024 · b9a4d47 · b9a4d47
2 parents ab04e8b + 34ceb7c
commit b9a4d47
Show file tree

Hide file tree

Showing 10 changed files with 365 additions and 130 deletions.
diff --git a/.env.example b/.env.example
@@ -0,0 +1,25 @@
+CHUNK_SIZE=
+COHERE_API_KEY=
+D_RETRIEVER_SEARCH=
+EMBEDDING_DIM=
+K1_RETRIEVER_SEARCH=
+K2_RETRIEVER_SEARCH=
+MONGODB_HOST=
+MONGODB_PASS=
+MONGODB_PORT=
+MONGODB_USER=
+NEO4J_DB=
+NEO4J_HOST=
+NEO4J_PASSWORD=
+NEO4J_PORT=
+NEO4J_PROTOCOL=
+NEO4J_USER=
+OPENAI_API_KEY=
+POSTGRES_HOST=
+POSTGRES_PASS=
+POSTGRES_PORT=
+POSTGRES_USER=
+RABBIT_HOST=
+RABBIT_PASSWORD=
+RABBIT_PORT=
+RABBIT_USER=
diff --git a/.gitignore b/.gitignore
@@ -159,4 +159,5 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-hivemind-bot-env/*
+hivemind-bot-env/*
+main.ipynb
diff --git a/bot/retrievers/summary_retriever_base.py b/bot/retrievers/summary_retriever_base.py
@@ -34,7 +34,7 @@ def __init__(
             the embedding model to use for doing embedding on the query string
             default would be CohereEmbedding that we've written
         """
-        self.index = self._setup_index(table_name, dbname)
+        self.index = self._setup_index(table_name, dbname, embedding_model)
         self.embedding_model = embedding_model
 
     def get_similar_nodes(
@@ -62,10 +62,14 @@ def get_similar_nodes(
 
         return nodes
 
-    def _setup_index(self, table_name: str, dbname: str) -> VectorStoreIndex:
+    def _setup_index(
+        self, table_name: str, dbname: str, embedding_model: BaseEmbedding
+    ) -> VectorStoreIndex:
         """
         setup the llama_index VectorStoreIndex
         """
-        pg_vector_access = PGVectorAccess(table_name=table_name, dbname=dbname)
+        pg_vector_access = PGVectorAccess(
+            table_name=table_name, dbname=dbname, embed_model=embedding_model
+        )
         index = pg_vector_access.load_index()
         return index
diff --git a/discord_query.py b/discord_query.py
@@ -1,145 +1,36 @@
-from bot.retrievers.forum_summary_retriever import ForumBasedSummaryRetriever
-from bot.retrievers.process_dates import process_dates
-from bot.retrievers.utils.load_hyperparams import load_hyperparams
 from llama_index import QueryBundle
-from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters
+from llama_index.schema import NodeWithScore
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
-from tc_hivemind_backend.pg_vector_access import PGVectorAccess
+from utils.query_engine.discord_query_engine import prepare_discord_engine_auto_filter
 
 
 def query_discord(
     community_id: str,
     query: str,
-    thread_names: list[str],
-    channel_names: list[str],
-    days: list[str],
-    similarity_top_k: int | None = None,
-) -> str:
+) -> tuple[str, list[NodeWithScore]]:
     """
-    query the discord database using filters given
-    and give an anwer to the given query using the LLM
+    query the llm using the query engine
 
     Parameters
     ------------
-    guild_id : str
-        the discord guild data to query
+    query_engine : BaseQueryEngine
+        the prepared query engine
     query : str
-        the query (question) of the user
-    thread_names : list[str]
-        the given threads to search for
-    channel_names : list[str]
-        the given channels to search for
-    days : list[str]
-        the given days to search for
-    similarity_top_k : int | None
-        the k similar results to use when querying the data
-        if `None` will load from `.env` file
+        the string question
 
     Returns
-    ---------
+    ----------
     response : str
-        the LLM response given the query
+        the LLM response
+    source_nodes : list[llama_index.schema.NodeWithScore]
+        the source nodes that helped in answering the question
     """
-    if similarity_top_k is None:
-        _, similarity_top_k, _ = load_hyperparams()
-
-    table_name = "discord"
-    dbname = f"community_{community_id}"
-
-    pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)
-
-    index = pg_vector.load_index()
-
-    thread_filters: list[ExactMatchFilter] = []
-    channel_filters: list[ExactMatchFilter] = []
-    day_filters: list[ExactMatchFilter] = []
-
-    for channel in channel_names:
-        channel_updated = channel.replace("'", "''")
-        channel_filters.append(ExactMatchFilter(key="channel", value=channel_updated))
-
-    for thread in thread_names:
-        thread_updated = thread.replace("'", "''")
-        thread_filters.append(ExactMatchFilter(key="thread", value=thread_updated))
-
-    for day in days:
-        day_filters.append(ExactMatchFilter(key="date", value=day))
-
-    all_filters: list[ExactMatchFilter] = []
-    all_filters.extend(thread_filters)
-    all_filters.extend(channel_filters)
-    all_filters.extend(day_filters)
-
-    filters = MetadataFilters(filters=all_filters, condition=FilterCondition.OR)
-
-    query_engine = index.as_query_engine(
-        filters=filters, similarity_top_k=similarity_top_k
+    query_engine = prepare_discord_engine_auto_filter(
+        community_id=community_id,
+        query=query,
     )
-
     query_bundle = QueryBundle(
         query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query)
     )
     response = query_engine.query(query_bundle)
-
-    return response.response
-
-
-def query_discord_auto_filter(
-    community_id: str,
-    query: str,
-    similarity_top_k: int | None = None,
-    d: int | None = None,
-) -> str:
-    """
-    get the query results and do the filtering automatically.
-    By automatically we mean, it would first query the summaries
-    to get the metadata filters
-
-    Parameters
-    -----------
-    guild_id : str
-        the discord guild data to query
-    query : str
-        the query (question) of the user
-    similarity_top_k : int | None
-        the value for the initial summary search
-        to get the `k2` count simliar nodes
-        if `None`, then would read from `.env`
-    d : int
-        this would make the secondary search (`query_discord`)
-        to be done on the `metadata.date - d` to `metadata.date + d`
-
-
-    Returns
-    ---------
-    response : str
-        the LLM response given the query
-    """
-    table_name = "discord_summary"
-    dbname = f"community_{community_id}"
-
-    if d is None:
-        _, _, d = load_hyperparams()
-    if similarity_top_k is None:
-        similarity_top_k, _, _ = load_hyperparams()
-
-    discord_retriever = ForumBasedSummaryRetriever(table_name=table_name, dbname=dbname)
-
-    channels, threads, dates = discord_retriever.retreive_metadata(
-        query=query,
-        metadata_group1_key="channel",
-        metadata_group2_key="thread",
-        metadata_date_key="date",
-        similarity_top_k=similarity_top_k,
-    )
-
-    dates_modified = process_dates(list(dates), d)
-
-    response = query_discord(
-        community_id=community_id,
-        query=query,
-        thread_names=list(threads),
-        channel_names=list(channels),
-        days=dates_modified,
-    )
-    return response
+    return response.response, response.source_nodes
diff --git a/docker-compose.test.yml b/docker-compose.test.yml
@@ -31,6 +31,8 @@ services:
       - K1_RETRIEVER_SEARCH=20
       - K2_RETRIEVER_SEARCH=5
       - D_RETRIEVER_SEARCH=7
+      - COHERE_API_KEY=some_credentials
+      - OPENAI_API_KEY=some_credentials2
     volumes:
       - ./coverage:/project/coverage
     depends_on:

diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 numpy
-llama-index>=0.9.21, <1.0.0
+llama-index>=0.9.26, <1.0.0
 pymongo
 python-dotenv
 pgvector
@@ -16,5 +16,6 @@ neo4j>=5.14.1, <6.0.0
 coverage>=7.3.3, <8.0.0
 pytest>=7.4.3, <8.0.0
 python-dotenv==1.0.0
-tc-hivemind-backend==1.0.0
+tc-hivemind-backend==1.1.0
 celery>=5.3.6, <6.0.0
+guidance
diff --git a/subquery.py b/subquery.py
@@ -0,0 +1,111 @@
+from guidance.models import OpenAIChat
+from llama_index import QueryBundle, ServiceContext
+from llama_index.core import BaseQueryEngine
+from llama_index.query_engine import SubQuestionQueryEngine
+from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator
+from llama_index.schema import NodeWithScore
+from llama_index.tools import QueryEngineTool, ToolMetadata
+from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
+from utils.query_engine import prepare_discord_engine_auto_filter
+
+
+def query_multiple_source(
+    query: str,
+    community_id: str,
+    discord: bool,
+    discourse: bool,
+    gdrive: bool,
+    notion: bool,
+    telegram: bool,
+    github: bool,
+) -> tuple[str, list[NodeWithScore]]:
+    """
+    query multiple platforms and get an answer from the multiple
+
+    Parameters
+    ------------
+    query : str
+        the user question
+    community_id : str
+        the community id to get their data
+    discord : bool
+        if `True` then add the engine to the subquery_generator
+    discourse : bool
+        if `True` then add the engine to the subquery_generator
+    gdrive : bool
+        if `True` then add the engine to the subquery_generator
+    notion : bool
+        if `True` then add the engine to the subquery_generator
+    telegram : bool
+        if `True` then add the engine to the subquery_generator
+    github : bool
+        if `True` then add the engine to the subquery_generator
+
+
+    Returns
+    --------
+    response : str,
+        the response to the user query from the LLM
+        using the engines of the given platforms (pltform equal to True)
+    source_nodes : list[NodeWithScore]
+        the list of nodes that were source of answering
+    """
+    query_engine_tools: list[QueryEngineTool] = []
+    tools: list[ToolMetadata] = []
+
+    discord_query_engine: BaseQueryEngine
+    # discourse_query_engine: BaseQueryEngine
+    # gdrive_query_engine: BaseQueryEngine
+    # notion_query_engine: BaseQueryEngine
+    # telegram_query_engine: BaseQueryEngine
+    # github_query_engine: BaseQueryEngine
+
+    # query engine perparation
+    # tools_metadata and query_engine_tools
+    if discord:
+        discord_query_engine = prepare_discord_engine_auto_filter(
+            community_id,
+            query,
+            similarity_top_k=None,
+            d=None,
+        )
+        tool_metadata = ToolMetadata(
+            name="Discord",
+            description="Contains messages and summaries of conversations from the Discord platform of the community",
+        )
+
+        tools.append(tool_metadata)
+        query_engine_tools.append(
+            QueryEngineTool(
+                query_engine=discord_query_engine,
+                metadata=tool_metadata,
+            )
+        )
+
+    if discourse:
+        raise NotImplementedError
+    if gdrive:
+        raise NotImplementedError
+    if notion:
+        raise NotImplementedError
+    if telegram:
+        raise NotImplementedError
+    if github:
+        raise NotImplementedError
+
+    question_gen = GuidanceQuestionGenerator.from_defaults(
+        guidance_llm=OpenAIChat("gpt-3.5-turbo"),
+        verbose=False,
+    )
+    embed_model = CohereEmbedding()
+    service_context = ServiceContext.from_defaults(embed_model=embed_model)
+    s_engine = SubQuestionQueryEngine.from_defaults(
+        question_gen=question_gen,
+        query_engine_tools=query_engine_tools,
+        use_async=False,
+        service_context=service_context,
+    )
+    query_embedding = embed_model.get_text_embedding(text=query)
+    response = s_engine.query(QueryBundle(query_str=query, embedding=query_embedding))
+
+    return response.response, response.source_nodes