From c38c2f2af93c9b56c086427b6295e50a6e7741d4 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 1 Jan 2024 12:42:07 +0330 Subject: [PATCH 01/16] feat: seperate the query engine! --- .gitignore | 3 ++- discord_query.py | 65 ++++++++++++++++++++++++++++++++---------------- 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/.gitignore b/.gitignore index 1cd6533..3a5bd6b 100644 --- a/.gitignore +++ b/.gitignore @@ -159,4 +159,5 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -hivemind-bot-env/* \ No newline at end of file +hivemind-bot-env/* +main.ipynb \ No newline at end of file diff --git a/discord_query.py b/discord_query.py index ae0865a..aedfe79 100644 --- a/discord_query.py +++ b/discord_query.py @@ -2,19 +2,19 @@ from bot.retrievers.process_dates import process_dates from bot.retrievers.utils.load_hyperparams import load_hyperparams from llama_index import QueryBundle +from llama_index.core import BaseQueryEngine from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from tc_hivemind_backend.pg_vector_access import PGVectorAccess -def query_discord( +def create_discord_engine( community_id: str, - query: str, thread_names: list[str], channel_names: list[str], days: list[str], similarity_top_k: int | None = None, -) -> str: +) -> BaseQueryEngine: """ query the discord database using filters given and give an anwer to the given query using the LLM @@ -37,18 +37,16 @@ def query_discord( Returns --------- - response : str - the LLM response given the query + query_engine : BaseQueryEngine + the created query engine with the filters """ - if similarity_top_k is None: - _, similarity_top_k, _ = load_hyperparams() - table_name = "discord" dbname = f"community_{community_id}" pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - index = pg_vector.load_index() + if 
similarity_top_k is None: + _, similarity_top_k, _ = load_hyperparams() thread_filters: list[ExactMatchFilter] = [] channel_filters: list[ExactMatchFilter] = [] @@ -76,22 +74,17 @@ def query_discord( filters=filters, similarity_top_k=similarity_top_k ) - query_bundle = QueryBundle( - query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query) - ) - response = query_engine.query(query_bundle) + return query_engine - return response.response - -def query_discord_auto_filter( +def create_discord_engine_auto_filter( community_id: str, query: str, similarity_top_k: int | None = None, d: int | None = None, -) -> str: +) -> BaseQueryEngine: """ - get the query results and do the filtering automatically. + get the query engine and do the filtering automatically. By automatically we mean, it would first query the summaries to get the metadata filters @@ -106,14 +99,14 @@ def query_discord_auto_filter( to get the `k2` count simliar nodes if `None`, then would read from `.env` d : int - this would make the secondary search (`query_discord`) + this would make the secondary search (`create_discord_engine`) to be done on the `metadata.date - d` to `metadata.date + d` Returns --------- - response : str - the LLM response given the query + query_engine : BaseQueryEngine + the created query engine with the filters """ table_name = "discord_summary" dbname = f"community_{community_id}" @@ -135,11 +128,39 @@ def query_discord_auto_filter( dates_modified = process_dates(list(dates), d) - response = query_discord( + engine = create_discord_engine( community_id=community_id, query=query, thread_names=list(threads), channel_names=list(channels), days=dates_modified, ) + return engine + + +def query_discord( + community_id: str, + query: str, +) -> str: + """ + query the llm using the query engine + + Parameters + ------------ + query_engine : BaseQueryEngine + the prepared query engine + query : str + the string question + """ + query_engine = 
create_discord_engine_auto_filter( + community_id=community_id, + query=query, + ) + + query_bundle = QueryBundle( + query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query) + ) + + response = query_engine.query(query_bundle) + return response From ede353c36147e497495739386bd2ef30a2c91f80 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 1 Jan 2024 14:43:39 +0330 Subject: [PATCH 02/16] feat: Added subquery reponse generator! --- discord_query.py | 142 +-------------------- requirements.txt | 1 + subquery.py | 104 +++++++++++++++ utils/query_engine/__init__.py | 2 + utils/query_engine/discord_query_engine.py | 136 ++++++++++++++++++++ 5 files changed, 245 insertions(+), 140 deletions(-) create mode 100644 subquery.py create mode 100644 utils/query_engine/__init__.py create mode 100644 utils/query_engine/discord_query_engine.py diff --git a/discord_query.py b/discord_query.py index aedfe79..b6ec65b 100644 --- a/discord_query.py +++ b/discord_query.py @@ -1,141 +1,6 @@ -from bot.retrievers.forum_summary_retriever import ForumBasedSummaryRetriever -from bot.retrievers.process_dates import process_dates -from bot.retrievers.utils.load_hyperparams import load_hyperparams +from utils.query_engine.discord_query_engine import prepare_discord_engine_auto_filter from llama_index import QueryBundle -from llama_index.core import BaseQueryEngine -from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters from tc_hivemind_backend.embeddings.cohere import CohereEmbedding -from tc_hivemind_backend.pg_vector_access import PGVectorAccess - - -def create_discord_engine( - community_id: str, - thread_names: list[str], - channel_names: list[str], - days: list[str], - similarity_top_k: int | None = None, -) -> BaseQueryEngine: - """ - query the discord database using filters given - and give an anwer to the given query using the LLM - - Parameters - ------------ - guild_id : str - the discord guild data to query - query : str - the 
query (question) of the user - thread_names : list[str] - the given threads to search for - channel_names : list[str] - the given channels to search for - days : list[str] - the given days to search for - similarity_top_k : int | None - the k similar results to use when querying the data - if `None` will load from `.env` file - - Returns - --------- - query_engine : BaseQueryEngine - the created query engine with the filters - """ - table_name = "discord" - dbname = f"community_{community_id}" - - pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) - index = pg_vector.load_index() - if similarity_top_k is None: - _, similarity_top_k, _ = load_hyperparams() - - thread_filters: list[ExactMatchFilter] = [] - channel_filters: list[ExactMatchFilter] = [] - day_filters: list[ExactMatchFilter] = [] - - for channel in channel_names: - channel_updated = channel.replace("'", "''") - channel_filters.append(ExactMatchFilter(key="channel", value=channel_updated)) - - for thread in thread_names: - thread_updated = thread.replace("'", "''") - thread_filters.append(ExactMatchFilter(key="thread", value=thread_updated)) - - for day in days: - day_filters.append(ExactMatchFilter(key="date", value=day)) - - all_filters: list[ExactMatchFilter] = [] - all_filters.extend(thread_filters) - all_filters.extend(channel_filters) - all_filters.extend(day_filters) - - filters = MetadataFilters(filters=all_filters, condition=FilterCondition.OR) - - query_engine = index.as_query_engine( - filters=filters, similarity_top_k=similarity_top_k - ) - - return query_engine - - -def create_discord_engine_auto_filter( - community_id: str, - query: str, - similarity_top_k: int | None = None, - d: int | None = None, -) -> BaseQueryEngine: - """ - get the query engine and do the filtering automatically. 
- By automatically we mean, it would first query the summaries - to get the metadata filters - - Parameters - ----------- - guild_id : str - the discord guild data to query - query : str - the query (question) of the user - similarity_top_k : int | None - the value for the initial summary search - to get the `k2` count simliar nodes - if `None`, then would read from `.env` - d : int - this would make the secondary search (`create_discord_engine`) - to be done on the `metadata.date - d` to `metadata.date + d` - - - Returns - --------- - query_engine : BaseQueryEngine - the created query engine with the filters - """ - table_name = "discord_summary" - dbname = f"community_{community_id}" - - if d is None: - _, _, d = load_hyperparams() - if similarity_top_k is None: - similarity_top_k, _, _ = load_hyperparams() - - discord_retriever = ForumBasedSummaryRetriever(table_name=table_name, dbname=dbname) - - channels, threads, dates = discord_retriever.retreive_metadata( - query=query, - metadata_group1_key="channel", - metadata_group2_key="thread", - metadata_date_key="date", - similarity_top_k=similarity_top_k, - ) - - dates_modified = process_dates(list(dates), d) - - engine = create_discord_engine( - community_id=community_id, - query=query, - thread_names=list(threads), - channel_names=list(channels), - days=dates_modified, - ) - return engine def query_discord( @@ -152,15 +17,12 @@ def query_discord( query : str the string question """ - query_engine = create_discord_engine_auto_filter( + query_engine = prepare_discord_engine_auto_filter( community_id=community_id, query=query, ) - query_bundle = QueryBundle( query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query) ) - response = query_engine.query(query_bundle) - return response diff --git a/requirements.txt b/requirements.txt index f04f1e7..f41372e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -18,3 +18,4 @@ pytest>=7.4.3, <8.0.0 python-dotenv==1.0.0 tc-hivemind-backend==1.0.0 
celery>=5.3.6, <6.0.0 +guidance diff --git a/subquery.py b/subquery.py new file mode 100644 index 0000000..ad3f64a --- /dev/null +++ b/subquery.py @@ -0,0 +1,104 @@ +from utils.query_engine import prepare_discord_engine_auto_filter +from llama_index.core import BaseQueryEngine +from guidance.llms import OpenAI as GuidanceOpenAI +from llama_index import QueryBundle +from llama_index.tools import QueryEngineTool, ToolMetadata +from llama_index.query_engine import SubQuestionQueryEngine +from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator + +def query_multiple_source( + query: str, + community_id: str, + discord: bool, + discourse: bool, + gdrive: bool, + notion: bool, + telegram: bool, + github: bool, + ) -> str: + """ + query multiple platforms and get an answer from the multiple + + Parameters + ------------ + query : str + the user question + community_id : str + the community id to get their data + discord : bool + if `True` then add the engine to the subquery_generator + discourse : bool + if `True` then add the engine to the subquery_generator + gdrive : bool + if `True` then add the engine to the subquery_generator + notion : bool + if `True` then add the engine to the subquery_generator + telegram : bool + if `True` then add the engine to the subquery_generator + github : bool + if `True` then add the engine to the subquery_generator + + + Returns + -------- + reponse : str + the response to the user query from the LLM + using the engines of the given platforms (pltform equal to True) + """ + query_engine_tools: list[QueryEngineTool] = [] + tools: list[ToolMetadata] = [] + + discord_query_engine: BaseQueryEngine + discourse_query_engine: BaseQueryEngine + gdrive_query_engine: BaseQueryEngine + notion_query_engine: BaseQueryEngine + telegram_query_engine: BaseQueryEngine + github_query_engine: BaseQueryEngine + + # query engine perparation + # tools_metadata and query_engine_tools + if discord: + discord_query_engine = 
prepare_discord_engine_auto_filter( + community_id, + query, + similarity_top_k=None, + d=None, + ) + tool_metadata = ToolMetadata( + name="Discord", + description="Provides the discord platform conversations data." + ) + + tools.append(tool_metadata) + query_engine_tools.append( + QueryEngineTool( + query_engine=discord_query_engine, + metadata=tool_metadata, + ) + ) + + if discourse: + raise NotImplementedError + if gdrive: + raise NotImplementedError + if notion: + raise NotImplementedError + if telegram: + raise NotImplementedError + if github: + raise NotImplementedError + + + question_gen = GuidanceQuestionGenerator.from_defaults( + guidance_llm=GuidanceOpenAI("text-davinci-003"), verbose=False + ) + + s_engine = SubQuestionQueryEngine.from_defaults( + question_gen=question_gen, + query_engine_tools=query_engine_tools, + ) + reponse = s_engine.query( + QueryBundle(query) + ) + + return reponse.response \ No newline at end of file diff --git a/utils/query_engine/__init__.py b/utils/query_engine/__init__.py new file mode 100644 index 0000000..0de4592 --- /dev/null +++ b/utils/query_engine/__init__.py @@ -0,0 +1,2 @@ +# flake8: noqa +from discord_query_engine import prepare_discord_engine_auto_filter \ No newline at end of file diff --git a/utils/query_engine/discord_query_engine.py b/utils/query_engine/discord_query_engine.py new file mode 100644 index 0000000..0af3064 --- /dev/null +++ b/utils/query_engine/discord_query_engine.py @@ -0,0 +1,136 @@ +from bot.retrievers.forum_summary_retriever import ForumBasedSummaryRetriever +from bot.retrievers.process_dates import process_dates +from bot.retrievers.utils.load_hyperparams import load_hyperparams +from llama_index.core import BaseQueryEngine +from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters +from tc_hivemind_backend.pg_vector_access import PGVectorAccess + + +def prepare_discord_engine( + community_id: str, + thread_names: list[str], + channel_names: list[str], + days: 
list[str], + similarity_top_k: int | None = None, +) -> BaseQueryEngine: + """ + query the discord database using filters given + and give an anwer to the given query using the LLM + + Parameters + ------------ + guild_id : str + the discord guild data to query + query : str + the query (question) of the user + thread_names : list[str] + the given threads to search for + channel_names : list[str] + the given channels to search for + days : list[str] + the given days to search for + similarity_top_k : int | None + the k similar results to use when querying the data + if `None` will load from `.env` file + + Returns + --------- + query_engine : BaseQueryEngine + the created query engine with the filters + """ + table_name = "discord" + dbname = f"community_{community_id}" + + pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) + index = pg_vector.load_index() + if similarity_top_k is None: + _, similarity_top_k, _ = load_hyperparams() + + thread_filters: list[ExactMatchFilter] = [] + channel_filters: list[ExactMatchFilter] = [] + day_filters: list[ExactMatchFilter] = [] + + for channel in channel_names: + channel_updated = channel.replace("'", "''") + channel_filters.append(ExactMatchFilter(key="channel", value=channel_updated)) + + for thread in thread_names: + thread_updated = thread.replace("'", "''") + thread_filters.append(ExactMatchFilter(key="thread", value=thread_updated)) + + for day in days: + day_filters.append(ExactMatchFilter(key="date", value=day)) + + all_filters: list[ExactMatchFilter] = [] + all_filters.extend(thread_filters) + all_filters.extend(channel_filters) + all_filters.extend(day_filters) + + filters = MetadataFilters(filters=all_filters, condition=FilterCondition.OR) + + query_engine = index.as_query_engine( + filters=filters, similarity_top_k=similarity_top_k + ) + + return query_engine + + +def prepare_discord_engine_auto_filter( + community_id: str, + query: str, + similarity_top_k: int | None = None, + d: int | None = None, 
+) -> BaseQueryEngine: + """ + get the query engine and do the filtering automatically. + By automatically we mean, it would first query the summaries + to get the metadata filters + + Parameters + ----------- + guild_id : str + the discord guild data to query + query : str + the query (question) of the user + similarity_top_k : int | None + the value for the initial summary search + to get the `k2` count simliar nodes + if `None`, then would read from `.env` + d : int + this would make the secondary search (`prepare_discord_engine`) + to be done on the `metadata.date - d` to `metadata.date + d` + + + Returns + --------- + query_engine : BaseQueryEngine + the created query engine with the filters + """ + table_name = "discord_summary" + dbname = f"community_{community_id}" + + if d is None: + _, _, d = load_hyperparams() + if similarity_top_k is None: + similarity_top_k, _, _ = load_hyperparams() + + discord_retriever = ForumBasedSummaryRetriever(table_name=table_name, dbname=dbname) + + channels, threads, dates = discord_retriever.retreive_metadata( + query=query, + metadata_group1_key="channel", + metadata_group2_key="thread", + metadata_date_key="date", + similarity_top_k=similarity_top_k, + ) + + dates_modified = process_dates(list(dates), d) + + engine = prepare_discord_engine( + community_id=community_id, + query=query, + thread_names=list(threads), + channel_names=list(channels), + days=dates_modified, + ) + return engine \ No newline at end of file From 073cbb7e5c0b5b4c1bd15722b465089c3695fd33 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 1 Jan 2024 15:01:36 +0330 Subject: [PATCH 03/16] fix: wrong import and cleaning code for linters! 
--- subquery.py | 32 ++++++++++------------ utils/query_engine/__init__.py | 2 +- utils/query_engine/discord_query_engine.py | 2 +- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/subquery.py b/subquery.py index ad3f64a..d954934 100644 --- a/subquery.py +++ b/subquery.py @@ -1,21 +1,22 @@ from utils.query_engine import prepare_discord_engine_auto_filter from llama_index.core import BaseQueryEngine -from guidance.llms import OpenAI as GuidanceOpenAI +from guidance.models import OpenAI as GuidanceOpenAI from llama_index import QueryBundle from llama_index.tools import QueryEngineTool, ToolMetadata from llama_index.query_engine import SubQuestionQueryEngine from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator + def query_multiple_source( - query: str, - community_id: str, - discord: bool, - discourse: bool, - gdrive: bool, - notion: bool, - telegram: bool, - github: bool, - ) -> str: + query: str, + community_id: str, + discord: bool, + discourse: bool, + gdrive: bool, + notion: bool, + telegram: bool, + github: bool, +) -> str: """ query multiple platforms and get an answer from the multiple @@ -66,7 +67,7 @@ def query_multiple_source( ) tool_metadata = ToolMetadata( name="Discord", - description="Provides the discord platform conversations data." 
+ description="Provides the discord platform conversations data.", ) tools.append(tool_metadata) @@ -76,7 +77,7 @@ def query_multiple_source( metadata=tool_metadata, ) ) - + if discourse: raise NotImplementedError if gdrive: @@ -87,7 +88,6 @@ def query_multiple_source( raise NotImplementedError if github: raise NotImplementedError - question_gen = GuidanceQuestionGenerator.from_defaults( guidance_llm=GuidanceOpenAI("text-davinci-003"), verbose=False @@ -97,8 +97,6 @@ def query_multiple_source( question_gen=question_gen, query_engine_tools=query_engine_tools, ) - reponse = s_engine.query( - QueryBundle(query) - ) + reponse = s_engine.query(QueryBundle(query)) - return reponse.response \ No newline at end of file + return reponse.response diff --git a/utils/query_engine/__init__.py b/utils/query_engine/__init__.py index 0de4592..115169c 100644 --- a/utils/query_engine/__init__.py +++ b/utils/query_engine/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from discord_query_engine import prepare_discord_engine_auto_filter \ No newline at end of file +from discord_query_engine import prepare_discord_engine_auto_filter diff --git a/utils/query_engine/discord_query_engine.py b/utils/query_engine/discord_query_engine.py index 0af3064..56496cb 100644 --- a/utils/query_engine/discord_query_engine.py +++ b/utils/query_engine/discord_query_engine.py @@ -133,4 +133,4 @@ def prepare_discord_engine_auto_filter( channel_names=list(channels), days=dates_modified, ) - return engine \ No newline at end of file + return engine From 36cd0dcc17fab59cc499f967aa8ceac931537bea Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 1 Jan 2024 15:23:56 +0330 Subject: [PATCH 04/16] feat: Applying cohere embedding! 
--- subquery.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/subquery.py b/subquery.py index d954934..1e3ef9c 100644 --- a/subquery.py +++ b/subquery.py @@ -5,6 +5,7 @@ from llama_index.tools import QueryEngineTool, ToolMetadata from llama_index.query_engine import SubQuestionQueryEngine from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator +from tc_hivemind_backend.embeddings.cohere import CohereEmbedding def query_multiple_source( @@ -97,6 +98,10 @@ def query_multiple_source( question_gen=question_gen, query_engine_tools=query_engine_tools, ) - reponse = s_engine.query(QueryBundle(query)) + reponse = s_engine.query( + QueryBundle( + query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query) + ) + ) return reponse.response From 058b528c1de2fde9fb65d412c2e80d7d790d3f61 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 10:30:48 +0330 Subject: [PATCH 05/16] update: test case for discord secondary search! 
--- .../unit/test_prepare_discord_query_engine.py | 50 +++++++++++++++++++ utils/query_engine/__init__.py | 2 +- utils/query_engine/discord_query_engine.py | 8 ++- 3 files changed, 58 insertions(+), 2 deletions(-) create mode 100644 tests/unit/test_prepare_discord_query_engine.py diff --git a/tests/unit/test_prepare_discord_query_engine.py b/tests/unit/test_prepare_discord_query_engine.py new file mode 100644 index 0000000..7ac1182 --- /dev/null +++ b/tests/unit/test_prepare_discord_query_engine.py @@ -0,0 +1,50 @@ +import unittest +import os +from unittest.mock import patch, Mock +from utils.query_engine.discord_query_engine import prepare_discord_engine +from llama_index.core import BaseQueryEngine +from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters + + +class TestPrepareDiscordEngine(unittest.TestCase): + def setUp(self): + # Set up environment variables for testing + os.environ["CHUNK_SIZE"] = "128" + os.environ["EMBEDDING_DIM"] = "256" + os.environ["K1_RETRIEVER_SEARCH"] = "20" + os.environ["K2_RETRIEVER_SEARCH"] = "5" + os.environ["D_RETRIEVER_SEARCH"] = "3" + + def test_prepare_discord_engine(self): + community_id = "123456" + thread_names = ["thread1", "thread2"] + channel_names = ["channel1", "channel2"] + days = ["2022-01-01", "2022-01-02"] + + # Call the function + query_engine = prepare_discord_engine( + community_id, + thread_names, + channel_names, + days, + testing=True, + ) + + # Assertions + self.assertIsInstance(query_engine, BaseQueryEngine) + + expected_filter = MetadataFilters( + filters=[ + ExactMatchFilter(key="thread", value="thread1"), + ExactMatchFilter(key="thread", value="thread2"), + ExactMatchFilter(key="channel", value="channel1"), + ExactMatchFilter(key="channel", value="channel2"), + ExactMatchFilter(key="date", value="2022-01-01"), + ExactMatchFilter(key="date", value="2022-01-02"), + ], + condition=FilterCondition.OR, + ) + + self.assertEqual(query_engine.retriever._filters, expected_filter) + 
# this is the secondary search, so K2 should be for this + self.assertEqual(query_engine.retriever._similarity_top_k, 5) diff --git a/utils/query_engine/__init__.py b/utils/query_engine/__init__.py index 115169c..fad06f9 100644 --- a/utils/query_engine/__init__.py +++ b/utils/query_engine/__init__.py @@ -1,2 +1,2 @@ # flake8: noqa -from discord_query_engine import prepare_discord_engine_auto_filter +from .discord_query_engine import prepare_discord_engine_auto_filter diff --git a/utils/query_engine/discord_query_engine.py b/utils/query_engine/discord_query_engine.py index 56496cb..4b121df 100644 --- a/utils/query_engine/discord_query_engine.py +++ b/utils/query_engine/discord_query_engine.py @@ -12,6 +12,7 @@ def prepare_discord_engine( channel_names: list[str], days: list[str], similarity_top_k: int | None = None, + **kwarg, ) -> BaseQueryEngine: """ query the discord database using filters given @@ -32,6 +33,9 @@ def prepare_discord_engine( similarity_top_k : int | None the k similar results to use when querying the data if `None` will load from `.env` file + ** kwargs : + testing : bool + whether to setup the PGVectorAccess in testing mode Returns --------- @@ -41,7 +45,9 @@ def prepare_discord_engine( table_name = "discord" dbname = f"community_{community_id}" - pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname) + testing = kwarg.get("testing", False) + + pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname, testing=testing) index = pg_vector.load_index() if similarity_top_k is None: _, similarity_top_k, _ = load_hyperparams() From 0b19e7bf7695395c4f188a46ddc43faf09cb91a2 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 10:49:42 +0330 Subject: [PATCH 06/16] fix: linter issues based on superlinter rules! 
--- discord_query.py | 2 +- subquery.py | 16 ++++++++-------- tests/unit/test_prepare_discord_query_engine.py | 1 - 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/discord_query.py b/discord_query.py index b6ec65b..a630f7e 100644 --- a/discord_query.py +++ b/discord_query.py @@ -1,6 +1,6 @@ -from utils.query_engine.discord_query_engine import prepare_discord_engine_auto_filter from llama_index import QueryBundle from tc_hivemind_backend.embeddings.cohere import CohereEmbedding +from utils.query_engine.discord_query_engine import prepare_discord_engine_auto_filter def query_discord( diff --git a/subquery.py b/subquery.py index 1e3ef9c..cf4fa10 100644 --- a/subquery.py +++ b/subquery.py @@ -1,11 +1,11 @@ -from utils.query_engine import prepare_discord_engine_auto_filter -from llama_index.core import BaseQueryEngine from guidance.models import OpenAI as GuidanceOpenAI from llama_index import QueryBundle -from llama_index.tools import QueryEngineTool, ToolMetadata +from llama_index.core import BaseQueryEngine from llama_index.query_engine import SubQuestionQueryEngine from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator +from llama_index.tools import QueryEngineTool, ToolMetadata from tc_hivemind_backend.embeddings.cohere import CohereEmbedding +from utils.query_engine import prepare_discord_engine_auto_filter def query_multiple_source( @@ -51,11 +51,11 @@ def query_multiple_source( tools: list[ToolMetadata] = [] discord_query_engine: BaseQueryEngine - discourse_query_engine: BaseQueryEngine - gdrive_query_engine: BaseQueryEngine - notion_query_engine: BaseQueryEngine - telegram_query_engine: BaseQueryEngine - github_query_engine: BaseQueryEngine + # discourse_query_engine: BaseQueryEngine + # gdrive_query_engine: BaseQueryEngine + # notion_query_engine: BaseQueryEngine + # telegram_query_engine: BaseQueryEngine + # github_query_engine: BaseQueryEngine # query engine perparation # tools_metadata and query_engine_tools diff 
--git a/tests/unit/test_prepare_discord_query_engine.py b/tests/unit/test_prepare_discord_query_engine.py index 7ac1182..26816f1 100644 --- a/tests/unit/test_prepare_discord_query_engine.py +++ b/tests/unit/test_prepare_discord_query_engine.py @@ -1,6 +1,5 @@ import unittest import os -from unittest.mock import patch, Mock from utils.query_engine.discord_query_engine import prepare_discord_engine from llama_index.core import BaseQueryEngine from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters From da6bb2ca9fa2ca85a4d2bd10eb84274dea59a931 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 11:02:47 +0330 Subject: [PATCH 07/16] fix: isort linter issue! --- tests/unit/test_prepare_discord_query_engine.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_prepare_discord_query_engine.py b/tests/unit/test_prepare_discord_query_engine.py index 26816f1..72ad1a6 100644 --- a/tests/unit/test_prepare_discord_query_engine.py +++ b/tests/unit/test_prepare_discord_query_engine.py @@ -1,8 +1,9 @@ -import unittest import os -from utils.query_engine.discord_query_engine import prepare_discord_engine +import unittest + from llama_index.core import BaseQueryEngine from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters +from utils.query_engine.discord_query_engine import prepare_discord_engine class TestPrepareDiscordEngine(unittest.TestCase): From d75342bae2c112d14d327694ea985d71a88e1d78 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 15:34:27 +0330 Subject: [PATCH 08/16] feat: Added source node returning! 
--- discord_query.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/discord_query.py b/discord_query.py index a630f7e..7a61c32 100644 --- a/discord_query.py +++ b/discord_query.py @@ -1,4 +1,5 @@ from llama_index import QueryBundle +from llama_index.schema import NodeWithScore from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from utils.query_engine.discord_query_engine import prepare_discord_engine_auto_filter @@ -6,7 +7,7 @@ def query_discord( community_id: str, query: str, -) -> str: +) -> tuple[str, list[NodeWithScore]]: """ query the llm using the query engine @@ -25,4 +26,4 @@ def query_discord( query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query) ) response = query_engine.query(query_bundle) - return response + return response.response, response.source_nodes From fe95fbff2c45c59827bb1a2e85178a54d830aebc Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 15:34:51 +0330 Subject: [PATCH 09/16] feat: Added credentials! 
--- .env.example | 26 ++++++++++++++++++++++++++ docker-compose.test.yml | 2 ++ 2 files changed, 28 insertions(+) create mode 100644 .env.example diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..ccba142 --- /dev/null +++ b/.env.example @@ -0,0 +1,26 @@ +PORT= +MONGODB_HOST= +MONGODB_PORT= +MONGODB_USER= +MONGODB_PASS= +NEO4J_PROTOCOL= +NEO4J_HOST= +NEO4J_PORT= +NEO4J_USER= +NEO4J_PASSWORD= +NEO4J_DB= +POSTGRES_HOST= +POSTGRES_USER= +POSTGRES_PASS= +POSTGRES_PORT= +RABBIT_HOST= +RABBIT_PORT= +RABBIT_USER= +RABBIT_PASSWORD= +CHUNK_SIZE= +EMBEDDING_DIM= +K1_RETRIEVER_SEARCH= +K2_RETRIEVER_SEARCH= +D_RETRIEVER_SEARCH= +COHERE_API_KEY= +OPENAI_API_KEY= \ No newline at end of file diff --git a/docker-compose.test.yml b/docker-compose.test.yml index 4e046d9..6375962 100644 --- a/docker-compose.test.yml +++ b/docker-compose.test.yml @@ -31,6 +31,8 @@ services: - K1_RETRIEVER_SEARCH=20 - K2_RETRIEVER_SEARCH=5 - D_RETRIEVER_SEARCH=7 + - COHERE_API_KEY=some_credentials + - OPENAI_API_KEY=some_credentials2 volumes: - ./coverage:/project/coverage depends_on: From 923a3cb964da849dc491e3f3ee962d37b745fafc Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 15:40:50 +0330 Subject: [PATCH 10/16] feat: completing the function doc! --- discord_query.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/discord_query.py b/discord_query.py index 7a61c32..24e762f 100644 --- a/discord_query.py +++ b/discord_query.py @@ -17,6 +17,13 @@ def query_discord( the prepared query engine query : str the string question + + Returns + ---------- + response : str + the LLM response + source_nodes : list[llama_index.schema.NodeWithScore] + the source nodes that helped in answering the question """ query_engine = prepare_discord_engine_auto_filter( community_id=community_id, From 43cb8670c3f3ea57c36caa29f4ede5d5ded218bc Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Tue, 2 Jan 2024 15:48:15 +0330 Subject: [PATCH 11/16] fix: dotenv-linter issue! 
--- .env.example | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/.env.example b/.env.example index ccba142..e73ede4 100644 --- a/.env.example +++ b/.env.example @@ -1,26 +1,25 @@ -PORT= +CHUNK_SIZE= +COHERE_API_KEY= +D_RETRIEVER_SEARCH= +EMBEDDING_DIM= +K1_RETRIEVER_SEARCH= +K2_RETRIEVER_SEARCH= MONGODB_HOST= +MONGODB_PASS= MONGODB_PORT= MONGODB_USER= -MONGODB_PASS= -NEO4J_PROTOCOL= +NEO4J_DB= NEO4J_HOST= +NEO4J_PASSWORD= NEO4J_PORT= +NEO4J_PROTOCOL= NEO4J_USER= -NEO4J_PASSWORD= -NEO4J_DB= +OPENAI_API_KEY= POSTGRES_HOST= -POSTGRES_USER= POSTGRES_PASS= POSTGRES_PORT= +POSTGRES_USER= RABBIT_HOST= +RABBIT_PASSWORD= RABBIT_PORT= RABBIT_USER= -RABBIT_PASSWORD= -CHUNK_SIZE= -EMBEDDING_DIM= -K1_RETRIEVER_SEARCH= -K2_RETRIEVER_SEARCH= -D_RETRIEVER_SEARCH= -COHERE_API_KEY= -OPENAI_API_KEY= \ No newline at end of file From 51c4c23636ae85f80b779d5cb4008709ada8c4b8 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Wed, 3 Jan 2024 08:41:13 +0330 Subject: [PATCH 12/16] feat: update discord platform description! --- subquery.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subquery.py b/subquery.py index cf4fa10..8f3f3d2 100644 --- a/subquery.py +++ b/subquery.py @@ -68,7 +68,7 @@ def query_multiple_source( ) tool_metadata = ToolMetadata( name="Discord", - description="Provides the discord platform conversations data.", + description="Contains messages and summaries of conversations from the Discord platform of the community", ) tools.append(tool_metadata) From 6f51cd5aad10c91408d7449cea6205c2195b756d Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 4 Jan 2024 16:28:16 +0330 Subject: [PATCH 13/16] feat: Added cohere embedding model and updated subquery - We needed to apply the cohere embedding model in our codes. - The `subquery.py` updated based on little experiments (both embedding model and function output updated). 
--- bot/retrievers/summary_retriever_base.py | 10 +++++++--- subquery.py | 23 ++++++++++++---------- utils/query_engine/discord_query_engine.py | 8 +++++++- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/bot/retrievers/summary_retriever_base.py b/bot/retrievers/summary_retriever_base.py index 1cc3420..0095a7f 100644 --- a/bot/retrievers/summary_retriever_base.py +++ b/bot/retrievers/summary_retriever_base.py @@ -34,7 +34,7 @@ def __init__( the embedding model to use for doing embedding on the query string default would be CohereEmbedding that we've written """ - self.index = self._setup_index(table_name, dbname) + self.index = self._setup_index(table_name, dbname, embedding_model) self.embedding_model = embedding_model def get_similar_nodes( @@ -62,10 +62,14 @@ def get_similar_nodes( return nodes - def _setup_index(self, table_name: str, dbname: str) -> VectorStoreIndex: + def _setup_index( + self, table_name: str, dbname: str, embedding_model: BaseEmbedding + ) -> VectorStoreIndex: """ setup the llama_index VectorStoreIndex """ - pg_vector_access = PGVectorAccess(table_name=table_name, dbname=dbname) + pg_vector_access = PGVectorAccess( + table_name=table_name, dbname=dbname, embed_model=embedding_model + ) index = pg_vector_access.load_index() return index diff --git a/subquery.py b/subquery.py index 8f3f3d2..73cbb55 100644 --- a/subquery.py +++ b/subquery.py @@ -1,8 +1,9 @@ from guidance.models import OpenAI as GuidanceOpenAI -from llama_index import QueryBundle +from llama_index import QueryBundle, ServiceContext from llama_index.core import BaseQueryEngine from llama_index.query_engine import SubQuestionQueryEngine from llama_index.question_gen.guidance_generator import GuidanceQuestionGenerator +from llama_index.schema import NodeWithScore from llama_index.tools import QueryEngineTool, ToolMetadata from tc_hivemind_backend.embeddings.cohere import CohereEmbedding from utils.query_engine import prepare_discord_engine_auto_filter @@ -17,7 +18,7 
@@ def query_multiple_source( notion: bool, telegram: bool, github: bool, -) -> str: +) -> tuple[str, list[NodeWithScore]]: """ query multiple platforms and get an answer from the multiple @@ -43,9 +44,11 @@ def query_multiple_source( Returns -------- - reponse : str + response : str, the response to the user query from the LLM using the engines of the given platforms (pltform equal to True) + source_nodes : list[NodeWithScore] + the list of nodes that were source of answering """ query_engine_tools: list[QueryEngineTool] = [] tools: list[ToolMetadata] = [] @@ -93,15 +96,15 @@ def query_multiple_source( question_gen = GuidanceQuestionGenerator.from_defaults( guidance_llm=GuidanceOpenAI("text-davinci-003"), verbose=False ) - + embed_model = CohereEmbedding() + service_context = ServiceContext.from_defaults(embed_model=embed_model) s_engine = SubQuestionQueryEngine.from_defaults( question_gen=question_gen, query_engine_tools=query_engine_tools, + use_async=False, + service_context=service_context, ) - reponse = s_engine.query( - QueryBundle( - query_str=query, embedding=CohereEmbedding().get_text_embedding(text=query) - ) - ) + query_embedding = embed_model.get_text_embedding(text=query) + response = s_engine.query(QueryBundle(query_str=query, embedding=query_embedding)) - return reponse.response + return response.response, response.source_nodes diff --git a/utils/query_engine/discord_query_engine.py b/utils/query_engine/discord_query_engine.py index 4b121df..ca032c3 100644 --- a/utils/query_engine/discord_query_engine.py +++ b/utils/query_engine/discord_query_engine.py @@ -4,6 +4,7 @@ from llama_index.core import BaseQueryEngine from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters from tc_hivemind_backend.pg_vector_access import PGVectorAccess +from tc_hivemind_backend.embeddings.cohere import CohereEmbedding def prepare_discord_engine( @@ -47,7 +48,12 @@ def prepare_discord_engine( testing = kwarg.get("testing", False) - 
pg_vector = PGVectorAccess(table_name=table_name, dbname=dbname, testing=testing) + pg_vector = PGVectorAccess( + table_name=table_name, + dbname=dbname, + testing=testing, + embed_model=CohereEmbedding(), + ) index = pg_vector.load_index() if similarity_top_k is None: _, similarity_top_k, _ = load_hyperparams() From 243de104c62bdc402c9281b43ab11cebf4d224c7 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 4 Jan 2024 16:35:25 +0330 Subject: [PATCH 14/16] fix: isort linter issue based on superlinter rules! --- utils/query_engine/discord_query_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/query_engine/discord_query_engine.py b/utils/query_engine/discord_query_engine.py index ca032c3..6a29833 100644 --- a/utils/query_engine/discord_query_engine.py +++ b/utils/query_engine/discord_query_engine.py @@ -3,8 +3,8 @@ from bot.retrievers.utils.load_hyperparams import load_hyperparams from llama_index.core import BaseQueryEngine from llama_index.vector_stores import ExactMatchFilter, FilterCondition, MetadataFilters -from tc_hivemind_backend.pg_vector_access import PGVectorAccess from tc_hivemind_backend.embeddings.cohere import CohereEmbedding +from tc_hivemind_backend.pg_vector_access import PGVectorAccess def prepare_discord_engine( From e469d3d2fd63b10aebc1730379c787aa1f5ced3c Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Thu, 4 Jan 2024 16:42:14 +0330 Subject: [PATCH 15/16] update: shared codes lib version! We have added the custom embed model support in its newer version. 
--- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f41372e..a780b05 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,6 @@ neo4j>=5.14.1, <6.0.0 coverage>=7.3.3, <8.0.0 pytest>=7.4.3, <8.0.0 python-dotenv==1.0.0 -tc-hivemind-backend==1.0.0 +tc-hivemind-backend==1.1.0 celery>=5.3.6, <6.0.0 guidance From 34ceb7ce111cbb4619b7501839b90775f5cd53a0 Mon Sep 17 00:00:00 2001 From: Mohammad Amin Date: Mon, 8 Jan 2024 09:07:27 +0330 Subject: [PATCH 16/16] update: llama-index lib usage! we updated the library to the newest right version and we chose the right LLM for the guidance. note: the guidance_llm would create the subqueries. --- requirements.txt | 2 +- subquery.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index a780b05..0a35833 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -llama-index>=0.9.21, <1.0.0 +llama-index>=0.9.26, <1.0.0 pymongo python-dotenv pgvector diff --git a/subquery.py b/subquery.py index 73cbb55..58dfd21 100644 --- a/subquery.py +++ b/subquery.py @@ -1,4 +1,4 @@ -from guidance.models import OpenAI as GuidanceOpenAI +from guidance.models import OpenAIChat from llama_index import QueryBundle, ServiceContext from llama_index.core import BaseQueryEngine from llama_index.query_engine import SubQuestionQueryEngine @@ -94,7 +94,8 @@ def query_multiple_source( raise NotImplementedError question_gen = GuidanceQuestionGenerator.from_defaults( - guidance_llm=GuidanceOpenAI("text-davinci-003"), verbose=False + guidance_llm=OpenAIChat("gpt-3.5-turbo"), + verbose=False, ) embed_model = CohereEmbedding() service_context = ServiceContext.from_defaults(embed_model=embed_model)