-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #9 from TogetherCrew/feat/init-bot
feat: initializing the bot!
- Loading branch information
Showing
37 changed files
with
1,195 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
.github/ | ||
|
||
.coverage/ | ||
.coverage | ||
coverage | ||
|
||
venv/ | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
CHUNK_SIZE= | ||
COHERE_API_KEY= | ||
D_RETRIEVER_SEARCH= | ||
EMBEDDING_DIM= | ||
K1_RETRIEVER_SEARCH= | ||
K2_RETRIEVER_SEARCH= | ||
MONGODB_HOST= | ||
MONGODB_PASS= | ||
MONGODB_PORT= | ||
MONGODB_USER= | ||
NEO4J_DB= | ||
NEO4J_HOST= | ||
NEO4J_PASSWORD= | ||
NEO4J_PORT= | ||
NEO4J_PROTOCOL= | ||
NEO4J_USER= | ||
OPENAI_API_KEY= | ||
POSTGRES_HOST= | ||
POSTGRES_PASS= | ||
POSTGRES_PORT= | ||
POSTGRES_USER= | ||
RABBIT_HOST= | ||
RABBIT_PASSWORD= | ||
RABBIT_PORT= | ||
RABBIT_USER= |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
name: Production CI/CD Pipeline | ||
|
||
on: | ||
push: | ||
branches: | ||
- main | ||
|
||
jobs: | ||
ci: | ||
uses: TogetherCrew/operations/.github/workflows/ci.yml@main | ||
secrets: | ||
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
name: Staging CI/CD Pipeline | ||
|
||
on: pull_request | ||
|
||
jobs: | ||
ci: | ||
uses: TogetherCrew/operations/.github/workflows/ci.yml@main | ||
secrets: | ||
CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# It's recommended that we use `bullseye` for Python (alpine isn't suitable as it conflicts with numpy)
FROM python:3.11-bullseye AS base
WORKDIR /project
COPY . .
RUN pip3 install -r requirements.txt

# Test stage: make the entrypoint script executable and run it.
FROM base AS test
RUN chmod +x docker-entrypoint.sh
CMD ["./docker-entrypoint.sh"]

# Production stage: start the Celery worker.
# `-m` is required here: `python3 celery ...` would try to execute a script
# file named `celery` instead of running the installed `celery` module.
FROM base AS prod
CMD ["python3", "-m", "celery", "-A", "celery_app.server", "worker", "-l", "INFO"]
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
from bot.retrievers.summary_retriever_base import BaseSummarySearch | ||
from llama_index.embeddings import BaseEmbedding | ||
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding | ||
|
||
|
||
class ForumBasedSummaryRetriever(BaseSummarySearch):
    """
    summary retriever for forum based data like discord and discourse
    """

    def __init__(
        self,
        table_name: str,
        dbname: str,
        embedding_model: BaseEmbedding | CohereEmbedding | None = None,
    ) -> None:
        """
        the class for forum based data like discord and discourse
        by default CohereEmbedding will be used.

        Parameters
        -------------
        table_name : str
            the table that summary data is saved
        dbname : str
            the database name to access
        embedding_model : BaseEmbedding | CohereEmbedding | None
            the embedding model to use for doing embedding on the query string
            when `None` (the default) a new `CohereEmbedding` is created
        """
        # Build the default lazily: a `CohereEmbedding()` written in the
        # signature is constructed once at import time and then shared by
        # every instance of the class.
        if embedding_model is None:
            embedding_model = CohereEmbedding()
        super().__init__(table_name, dbname, embedding_model=embedding_model)

    def retreive_metadata(
        self,
        query: str,
        metadata_group1_key: str,
        metadata_group2_key: str,
        metadata_date_key: str,
        similarity_top_k: int = 20,
    ) -> tuple[set[str], set[str], set[str]]:
        """
        retrieve the metadata information of the similar nodes with the query

        Parameters
        -----------
        query : str
            the user query to process
        metadata_group1_key : str
            the conversations grouping type 1
            in discord can be `channel`, and in discourse can be `category`
        metadata_group2_key : str
            the conversations grouping type 2
            in discord can be `thread`, and in discourse can be `topic`
        metadata_date_key : str
            the daily metadata saved key
        similarity_top_k : int
            the top k nodes to get as the retriever.
            default is set as 20

        Returns
        ---------
        group1_data : set[str]
            the distinct non-empty `metadata_group1_key` values
            of the similar summary nodes.
            can be an empty set meaning no similar node had a value for it.
        group2_data : set[str]
            the distinct non-empty `metadata_group2_key` values
            of the similar summary nodes.
            can be an empty set meaning no similar node had a value for it.
        dates : set[str]
            the `metadata_date_key` values of the nodes similar to the given query

        Notes
        ------
        the metadata of each node is indexed directly, so a node that lacks
        one of the three keys makes the lookup error propagate to the caller.
        """
        nodes = self.get_similar_nodes(query=query, similarity_top_k=similarity_top_k)

        group1_data: set[str] = set()
        group2_data: set[str] = set()
        dates: set[str] = set()

        for node in nodes:
            metadata = node.metadata
            # falsy group values (e.g. `None` or an empty string) are skipped
            if group1_value := metadata[metadata_group1_key]:
                group1_data.add(group1_value)
            if group2_value := metadata[metadata_group2_key]:
                group2_data.add(group2_value)
            # the date is always collected, whatever its value
            dates.add(metadata[metadata_date_key])

        return group1_data, group2_data, dates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
import logging | ||
from datetime import timedelta | ||
|
||
from dateutil import parser | ||
|
||
|
||
def process_dates(dates: list[str], d: int) -> list[str]: | ||
""" | ||
process the dates to be from `date - d` to `date + d` | ||
Parameters | ||
------------ | ||
dates : list[str] | ||
the list of dates given | ||
d : int | ||
to update the `dates` list to have `-d` and `+d` days | ||
Returns | ||
---------- | ||
dates_modified : list[str] | ||
days added to it | ||
""" | ||
dates_modified: list[str] = [] | ||
if dates != []: | ||
lowest_date = min(parser.parse(date) for date in dates) | ||
greatest_date = max(parser.parse(date) for date in dates) | ||
|
||
delta_days = timedelta(days=d) | ||
|
||
# the date condition | ||
dt = lowest_date - delta_days | ||
while dt <= greatest_date + delta_days: | ||
dates_modified.append(dt.strftime("%Y-%m-%d")) | ||
dt += timedelta(days=1) | ||
else: | ||
logging.warning("No dates given!") | ||
|
||
return dates_modified |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
from llama_index import VectorStoreIndex | ||
from llama_index.embeddings import BaseEmbedding | ||
from llama_index.indices.query.schema import QueryBundle | ||
from llama_index.schema import NodeWithScore | ||
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding | ||
from tc_hivemind_backend.pg_vector_access import PGVectorAccess | ||
|
||
|
||
class BaseSummarySearch:
    """
    similarity search over the summary nodes saved under postgresql
    """

    def __init__(
        self,
        table_name: str,
        dbname: str,
        embedding_model: BaseEmbedding | None = None,
    ) -> None:
        """
        initialize the base summary search class

        In this class we're doing a similarity search
        for available saved nodes under postgresql

        Parameters
        -------------
        table_name : str
            the table that summary data is saved
            *Note:* Don't include the `data_` prefix of the table,
            because llama_index would originally include that.
        dbname : str
            the database name to access
        embedding_model : llama_index.embeddings.BaseEmbedding | None
            the embedding model to use for doing embedding on the query string
            when `None` (the default) a new CohereEmbedding is created
        """
        # Build the default lazily: a `CohereEmbedding()` written in the
        # signature is constructed once at import time and then shared by
        # every instance of the class.
        if embedding_model is None:
            embedding_model = CohereEmbedding()

        self.index = self._setup_index(table_name, dbname, embedding_model)
        self.embedding_model = embedding_model

    def get_similar_nodes(
        self, query: str, similarity_top_k: int = 20
    ) -> list[NodeWithScore]:
        """
        get k similar nodes to the query.
        Note: this function would get the embedding
        for the query to do the similarity search.

        Parameters
        ------------
        query : str
            the user query to process
        similarity_top_k : int
            the top k nodes to get as the retriever.
            default is set as 20

        Returns
        ---------
        nodes : list[NodeWithScore]
            the nodes returned by the index retriever for the embedded query
        """
        retriever = self.index.as_retriever(similarity_top_k=similarity_top_k)

        # embed the query with our own model and hand the vector to the retriever
        query_embedding = self.embedding_model.get_text_embedding(text=query)
        query_bundle = QueryBundle(query_str=query, embedding=query_embedding)

        # NOTE(review): `_retrieve` is a private llama_index method; kept as-is
        # to preserve behavior. Confirm whether the public `retrieve` can be used.
        nodes = retriever._retrieve(query_bundle)

        return nodes

    def _setup_index(
        self, table_name: str, dbname: str, embedding_model: BaseEmbedding
    ) -> VectorStoreIndex:
        """
        setup the llama_index VectorStoreIndex

        the index is loaded through `PGVectorAccess` for the given
        table, database and embedding model
        """
        pg_vector_access = PGVectorAccess(
            table_name=table_name, dbname=dbname, embed_model=embedding_model
        )
        index = pg_vector_access.load_index()
        return index
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import os | ||
|
||
from dotenv import load_dotenv | ||
|
||
|
||
def load_hyperparams() -> tuple[int, int, int]:
    """
    load the k1, k2, and d hyperparams that are used for retrievers

    Returns
    ---------
    k1 : int
        the value for the first summary search
        to get the `k1` count similar nodes
    k2 : int
        the value for the secondary raw search
        to get the `k2` count similar nodes
    d : int
        the before and after day interval

    Raises
    --------
    ValueError
        when one of `K1_RETRIEVER_SEARCH`, `K2_RETRIEVER_SEARCH` or
        `D_RETRIEVER_SEARCH` is missing, empty, or not an integer
    """
    load_dotenv()

    names = ("K1_RETRIEVER_SEARCH", "K2_RETRIEVER_SEARCH", "D_RETRIEVER_SEARCH")
    raw_values = {name: os.getenv(name) for name in names}

    # A key declared with an empty value (`KEY=`) comes back as "" rather than
    # `None`, so treat blank values as missing too instead of letting `int("")`
    # fail with an unhelpful message.
    for name, raw in raw_values.items():
        if raw is None or not raw.strip():
            raise ValueError(f"No `{name}` available in .env file!")

    parsed: list[int] = []
    for name, raw in raw_values.items():
        try:
            parsed.append(int(raw))
        except ValueError as err:
            # name the offending variable so the misconfiguration is easy to find
            raise ValueError(f"`{name}` must be an integer, got {raw!r}!") from err

    k1, k2, d = parsed
    return k1, k2, d
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
from tc_messageBroker import RabbitMQ | ||
from tc_messageBroker.rabbit_mq.event import Event | ||
from tc_messageBroker.rabbit_mq.queue import Queue | ||
|
||
|
||
def job_send(broker_url, port, username, password, res):
    """
    publish a sample result message on the `DISCORD_ANALYZER` queue

    Parameters
    ------------
    broker_url : the RabbitMQ host to connect to
    port : the RabbitMQ port
    username : the RabbitMQ user
    password : the RabbitMQ password
    res : the result that is embedded in the message `data` field
    """
    broker = RabbitMQ(
        broker_url=broker_url,
        port=port,
        username=username,
        password=password,
    )

    # the uuid is a fixed placeholder value
    payload = {
        "uuid": "d99a1490-fba6-11ed-b9a9-0d29e7612dp8",
        "data": f"some results {res}",
    }

    target_queue = Queue.DISCORD_ANALYZER
    broker.connect(target_queue)
    broker.publish(
        queue_name=target_queue,
        event=Event.DISCORD_BOT.FETCH,
        content=payload,
    )
|
||
|
||
if __name__ == "__main__":
    # Manual run: send one message using hard-coded local connection settings.
    # TODO: read from .env
    broker_url, port = "localhost", 5672
    username, password = "root", "pass"
    job_send(broker_url, port, username, password, "CALLED FROM __main__")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
from celery import Celery | ||
from utils.credentials import load_rabbitmq_credentials | ||
|
||
# Load the RabbitMQ connection settings once, at import time.
rabbit_creds = load_rabbitmq_credentials()
user, password = rabbit_creds["user"], rabbit_creds["password"]
host, port = rabbit_creds["host"], rabbit_creds["port"]

# Celery application using RabbitMQ (through pyamqp) as its broker;
# tasks are discovered from the `celery_app` package.
app = Celery(
    "celery_app/tasks",
    broker=f"pyamqp://{user}:{password}@{host}:{port}//",
)
app.autodiscover_tasks(["celery_app"])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from celery_app.job_send import job_send | ||
from celery_app.server import app | ||
from utils.credentials import load_rabbitmq_credentials | ||
|
||
# TODO: Write tasks that match our requirements | ||
|
||
|
||
@app.task
def add(x, y):
    """
    sample task: add `x` and `y`, send the result through `job_send`
    and return it
    """
    creds = load_rabbitmq_credentials()
    username, password = creds["user"], creds["password"]
    broker_url, port = creds["host"], creds["port"]

    total = x + y
    # publish the result on the message broker before returning it
    job_send(broker_url, port, username, password, total)

    return total
|
||
|
||
@app.task
def mul(x, y):
    """
    sample task: return the product of `x` and `y`
    """
    product = x * y
    return product
|
||
|
||
@app.task
def xsum(numbers):
    """
    sample task: return the sum of all the values in `numbers`
    """
    total = sum(numbers)
    return total
Oops, something went wrong.