Merge pull request #10 from TogetherCrew/feat/subquery-generator
Feat: subquery generator
cyri113 authored Jan 8, 2024
2 parents ab04e8b + 34ceb7c commit b9a4d47
Showing 10 changed files with 365 additions and 130 deletions.
25 changes: 25 additions & 0 deletions .env.example
@@ -0,0 +1,25 @@
CHUNK_SIZE=
COHERE_API_KEY=
D_RETRIEVER_SEARCH=
EMBEDDING_DIM=
K1_RETRIEVER_SEARCH=
K2_RETRIEVER_SEARCH=
MONGODB_HOST=
MONGODB_PASS=
MONGODB_PORT=
MONGODB_USER=
NEO4J_DB=
NEO4J_HOST=
NEO4J_PASSWORD=
NEO4J_PORT=
NEO4J_PROTOCOL=
NEO4J_USER=
OPENAI_API_KEY=
POSTGRES_HOST=
POSTGRES_PASS=
POSTGRES_PORT=
POSTGRES_USER=
RABBIT_HOST=
RABBIT_PASSWORD=
RABBIT_PORT=
RABBIT_USER=
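
For orientation, a minimal sketch of how the retriever settings above might be consumed at runtime with python-dotenv (already pinned in requirements.txt). The variable names come from this file; the helper itself is hypothetical and only mirrors how load_hyperparams is used in discord_query.py further down, returning (k1, k2, d).

import os

from dotenv import load_dotenv


def load_retriever_hyperparams() -> tuple[int, int, int]:
    """Hypothetical helper: read the retriever hyperparameters from .env."""
    load_dotenv()  # pull the .env file into the process environment
    k1 = int(os.environ["K1_RETRIEVER_SEARCH"])
    k2 = int(os.environ["K2_RETRIEVER_SEARCH"])
    d = int(os.environ["D_RETRIEVER_SEARCH"])
    return k1, k2, d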
3 changes: 2 additions & 1 deletion .gitignore
@@ -159,4 +159,5 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

-hivemind-bot-env/*
+hivemind-bot-env/*
+main.ipynb
10 changes: 7 additions & 3 deletions bot/retrievers/summary_retriever_base.py
@@ -34,7 +34,7 @@ def __init__(
             the embedding model to use for doing embedding on the query string
             default would be CohereEmbedding that we've written
         """
-        self.index = self._setup_index(table_name, dbname)
+        self.index = self._setup_index(table_name, dbname, embedding_model)
         self.embedding_model = embedding_model
 
     def get_similar_nodes(
@@ -62,10 +62,14 @@ def get_similar_nodes(
 
         return nodes
 
-    def _setup_index(self, table_name: str, dbname: str) -> VectorStoreIndex:
+    def _setup_index(
+        self, table_name: str, dbname: str, embedding_model: BaseEmbedding
+    ) -> VectorStoreIndex:
         """
         setup the llama_index VectorStoreIndex
         """
-        pg_vector_access = PGVectorAccess(table_name=table_name, dbname=dbname)
+        pg_vector_access = PGVectorAccess(
+            table_name=table_name, dbname=dbname, embed_model=embedding_model
+        )
         index = pg_vector_access.load_index()
         return index
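
The change above threads the caller's embedding model through _setup_index into PGVectorAccess rather than letting the index fall back to a default. A usage sketch of the ForumBasedSummaryRetriever subclass built on this base; the call mirrors the retreive_metadata invocation removed from discord_query.py below, and the query string and ids are illustrative.

from bot.retrievers.forum_summary_retriever import ForumBasedSummaryRetriever

# summaries live in the per-community "discord_summary" table
retriever = ForumBasedSummaryRetriever(
    table_name="discord_summary",
    dbname="community_1234",  # illustrative community id
)
channels, threads, dates = retriever.retreive_metadata(
    query="What did the community discuss this week?",
    metadata_group1_key="channel",
    metadata_group2_key="thread",
    metadata_date_key="date",
    similarity_top_k=20,  # illustrative; normally K1_RETRIEVER_SEARCH
)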
139 changes: 15 additions & 124 deletions discord_query.py
@@ -1,145 +1,36 @@
-from bot.retrievers.forum_summary_retriever import ForumBasedSummaryRetriever
-from bot.retrievers.process_dates import process_dates
-from bot.retrievers.utils.load_hyperparams import load_hyperparams
 from llama_index import QueryBundle
-from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters
+from llama_index.schema import NodeWithScore
 from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
-from tc_hivemind_backend.pg_vector_access import PGVectorAccess
+from utils.query_engine.discord_query_engine import prepare_discord_engine_auto_filter
 
 
 def query_discord(
     community_id: str,
     query: str,
-    thread_names: list[str],
-    channel_names: list[str],
-    days: list[str],
-    similarity_top_k: int | None = None,
-) -> str:
+) -> tuple[str, list[NodeWithScore]]:
     """
-    query the discord database using filters given
-    and give an anwer to the given query using the LLM
+    query the llm using the query engine
 
     Parameters
     ------------
-    guild_id : str
-        the discord guild data to query
+    query_engine : BaseQueryEngine
+        the prepared query engine
     query : str
-        the query (question) of the user
-    thread_names : list[str]
-        the given threads to search for
-    channel_names : list[str]
-        the given channels to search for
-    days : list[str]
-        the given days to search for
-    similarity_top_k : int | None
-        the k similar results to use when querying the data
-        if `None` will load from `.env` file
+        the string question
 
     Returns
-    ---------
+    ----------
     response : str
-        the LLM response given the query
+        the LLM response
+    source_nodes : list[llama_index.schema.NodeWithScore]
+        the source nodes that helped in answering the question
     """
-    if similarity_top_k is None:
-        _, similarity_top_k, _ = load_hyperparams()
-
-    table_name = "discord"
-    dbname = f"community_{community_id}"
-
-    pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname)
-
-    index = pg_vector.load_index()
-
-    thread_filters: list[ExactMatchFilter] = []
-    channel_filters: list[ExactMatchFilter] = []
-    day_filters: list[ExactMatchFilter] = []
-
-    for channel in channel_names:
-        channel_updated = channel.replace("'", "''")
-        channel_filters.append(ExactMatchFilter(key="channel", value=channel_updated))
-
-    for thread in thread_names:
-        thread_updated = thread.replace("'", "''")
-        thread_filters.append(ExactMatchFilter(key="thread", value=thread_updated))
-
-    for day in days:
-        day_filters.append(ExactMatchFilter(key="date", value=day))
-
-    all_filters: list[ExactMatchFilter] = []
-    all_filters.extend(thread_filters)
-    all_filters.extend(channel_filters)
-    all_filters.extend(day_filters)
-
-    filters = MetadataFilters(filters=all_filters, condition=FilterCondition.OR)
-
-    query_engine = index.as_query_engine(
-        filters=filters, similarity_top_k=similarity_top_k
+    query_engine = prepare_discord_engine_auto_filter(
+        community_id=community_id,
+        query=query,
     )
 
     query_bundle = QueryBundle(
         query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query)
     )
     response = query_engine.query(query_bundle)
 
-    return response.response
-
-
-def query_discord_auto_filter(
-    community_id: str,
-    query: str,
-    similarity_top_k: int | None = None,
-    d: int | None = None,
-) -> str:
-    """
-    get the query results and do the filtering automatically.
-    By automatically we mean, it would first query the summaries
-    to get the metadata filters
-
-    Parameters
-    -----------
-    guild_id : str
-        the discord guild data to query
-    query : str
-        the query (question) of the user
-    similarity_top_k : int | None
-        the value for the initial summary search
-        to get the `k2` count simliar nodes
-        if `None`, then would read from `.env`
-    d : int
-        this would make the secondary search (`query_discord`)
-        to be done on the `metadata.date - d` to `metadata.date + d`
-
-    Returns
-    ---------
-    response : str
-        the LLM response given the query
-    """
-    table_name = "discord_summary"
-    dbname = f"community_{community_id}"
-
-    if d is None:
-        _, _, d = load_hyperparams()
-    if similarity_top_k is None:
-        similarity_top_k, _, _ = load_hyperparams()
-
-    discord_retriever = ForumBasedSummaryRetriever(table_name=table_name, dbname=dbname)
-
-    channels, threads, dates = discord_retriever.retreive_metadata(
-        query=query,
-        metadata_group1_key="channel",
-        metadata_group2_key="thread",
-        metadata_date_key="date",
-        similarity_top_k=similarity_top_k,
-    )
-
-    dates_modified = process_dates(list(dates), d)
-
-    response = query_discord(
-        community_id=community_id,
-        query=query,
-        thread_names=list(threads),
-        channel_names=list(channels),
-        days=dates_modified,
-    )
-    return response
+    return response.response, response.source_nodes
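
With the filter construction moved into prepare_discord_engine_auto_filter, calling the entry point reduces to a few lines. A minimal sketch, assuming a populated community_1234 database and the API keys from .env.example:

from discord_query import query_discord

response, source_nodes = query_discord(
    community_id="1234",  # illustrative community id
    query="What was decided about the subquery generator?",
)
print(response)
for node in source_nodes:
    print(node.score, node.node.metadata)  # provenance of the answer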
2 changes: 2 additions & 0 deletions docker-compose.test.yml
@@ -31,6 +31,8 @@ services:
       - K1_RETRIEVER_SEARCH=20
       - K2_RETRIEVER_SEARCH=5
       - D_RETRIEVER_SEARCH=7
+      - COHERE_API_KEY=some_credentials
+      - OPENAI_API_KEY=some_credentials2
     volumes:
       - ./coverage:/project/coverage
     depends_on:
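
Since the tests now exercise the Cohere and OpenAI code paths, the container needs both keys. A hypothetical pre-flight check a test entrypoint might run; the key names come from the compose change above:

import os

for key in ("COHERE_API_KEY", "OPENAI_API_KEY"):
    if not os.getenv(key):
        raise RuntimeError(f"{key} must be set before running the test suite")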
5 changes: 3 additions & 2 deletions requirements.txt
@@ -1,5 +1,5 @@
 numpy
-llama-index>=0.9.21, <1.0.0
+llama-index>=0.9.26, <1.0.0
 pymongo
 python-dotenv
 pgvector
@@ -16,5 +16,6 @@ neo4j>=5.14.1, <6.0.0
 coverage>=7.3.3, <8.0.0
 pytest>=7.4.3, <8.0.0
 python-dotenv==1.0.0
-tc-hivemind-backend==1.0.0
+tc-hivemind-backend==1.1.0
 celery>=5.3.6, <6.0.0
+guidance
111 changes: 111 additions & 0 deletions subquery.py
@@ -0,0 +1,111 @@
from guidance.models import OpenAIChat
from llama_index import QueryBundle, ServiceContext
from llama_index.core import BaseQueryEngine
from llama_index.query_engine import SubQuestionQueryEngine
from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator
from llama_index.schema import NodeWithScore
from llama_index.tools import QueryEngineTool, ToolMetadata
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
from utils.query_engine import prepare_discord_engine_auto_filter


def query_multiple_source(
    query: str,
    community_id: str,
    discord: bool,
    discourse: bool,
    gdrive: bool,
    notion: bool,
    telegram: bool,
    github: bool,
) -> tuple[str, list[NodeWithScore]]:
    """
    query multiple platforms and get an answer from the multiple sources

    Parameters
    ------------
    query : str
        the user question
    community_id : str
        the community id to get their data
    discord : bool
        if `True` then add the engine to the subquery_generator
    discourse : bool
        if `True` then add the engine to the subquery_generator
    gdrive : bool
        if `True` then add the engine to the subquery_generator
    notion : bool
        if `True` then add the engine to the subquery_generator
    telegram : bool
        if `True` then add the engine to the subquery_generator
    github : bool
        if `True` then add the engine to the subquery_generator

    Returns
    --------
    response : str,
        the response to the user query from the LLM
        using the engines of the given platforms (platform equal to True)
    source_nodes : list[NodeWithScore]
        the list of nodes that were source of answering
    """
    query_engine_tools: list[QueryEngineTool] = []
    tools: list[ToolMetadata] = []

    discord_query_engine: BaseQueryEngine
    # discourse_query_engine: BaseQueryEngine
    # gdrive_query_engine: BaseQueryEngine
    # notion_query_engine: BaseQueryEngine
    # telegram_query_engine: BaseQueryEngine
    # github_query_engine: BaseQueryEngine

    # query engine preparation
    # tools_metadata and query_engine_tools
    if discord:
        discord_query_engine = prepare_discord_engine_auto_filter(
            community_id,
            query,
            similarity_top_k=None,
            d=None,
        )
        tool_metadata = ToolMetadata(
            name="Discord",
            description="Contains messages and summaries of conversations from the Discord platform of the community",
        )

        tools.append(tool_metadata)
        query_engine_tools.append(
            QueryEngineTool(
                query_engine=discord_query_engine,
                metadata=tool_metadata,
            )
        )

    if discourse:
        raise NotImplementedError
    if gdrive:
        raise NotImplementedError
    if notion:
        raise NotImplementedError
    if telegram:
        raise NotImplementedError
    if github:
        raise NotImplementedError

    question_gen = GuidanceQuestionGenerator.from_defaults(
        guidance_llm=OpenAIChat("gpt-3.5-turbo"),
        verbose=False,
    )
    embed_model = CohereEmbedding()
    service_context = ServiceContext.from_defaults(embed_model=embed_model)
    s_engine = SubQuestionQueryEngine.from_defaults(
        question_gen=question_gen,
        query_engine_tools=query_engine_tools,
        use_async=False,
        service_context=service_context,
    )
    query_embedding = embed_model.get_text_embedding(text=query)
    response = s_engine.query(QueryBundle(query_str=query, embedding=query_embedding))

    return response.response, response.source_nodes
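
A usage sketch of the new entry point. Only the Discord engine is wired up in this commit, so the other platform flags must stay False (they raise NotImplementedError); the community id is illustrative:

from subquery import query_multiple_source

response, source_nodes = query_multiple_source(
    query="Summarize last week's discussions",
    community_id="1234",
    discord=True,
    discourse=False,
    gdrive=False,
    notion=False,
    telegram=False,
    github=False,
)
print(response)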