Merge pull request #9 from TogetherCrew/feat/init-bot
feat: initializing the bot!
cyri113 authored Jan 9, 2024
2 parents 652a0db + b9a4d47 commit 98706b2
Showing 37 changed files with 1,195 additions and 0 deletions.
8 changes: 8 additions & 0 deletions .dockerignore
@@ -0,0 +1,8 @@
.github/

.coverage/
.coverage
coverage

venv/
.env
25 changes: 25 additions & 0 deletions .env.example
@@ -0,0 +1,25 @@
CHUNK_SIZE=
COHERE_API_KEY=
D_RETRIEVER_SEARCH=
EMBEDDING_DIM=
K1_RETRIEVER_SEARCH=
K2_RETRIEVER_SEARCH=
MONGODB_HOST=
MONGODB_PASS=
MONGODB_PORT=
MONGODB_USER=
NEO4J_DB=
NEO4J_HOST=
NEO4J_PASSWORD=
NEO4J_PORT=
NEO4J_PROTOCOL=
NEO4J_USER=
OPENAI_API_KEY=
POSTGRES_HOST=
POSTGRES_PASS=
POSTGRES_PORT=
POSTGRES_USER=
RABBIT_HOST=
RABBIT_PASSWORD=
RABBIT_PORT=
RABBIT_USER=
12 changes: 12 additions & 0 deletions .github/workflows/production.yml
@@ -0,0 +1,12 @@
name: Production CI/CD Pipeline

on:
  push:
    branches:
      - main

jobs:
  ci:
    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
    secrets:
      CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
9 changes: 9 additions & 0 deletions .github/workflows/start.staging.yml
@@ -0,0 +1,9 @@
name: Staging CI/CD Pipeline

on: pull_request

jobs:
  ci:
    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
    secrets:
      CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -158,3 +158,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

hivemind-bot-env/*
main.ipynb
12 changes: 12 additions & 0 deletions Dockerfile
@@ -0,0 +1,12 @@
# It's recommended that we use `bullseye` for Python (alpine isn't suitable as it conflicts with numpy)
FROM python:3.11-bullseye AS base
WORKDIR /project
COPY . .
RUN pip3 install -r requirements.txt

FROM base AS test
RUN chmod +x docker-entrypoint.sh
CMD ["./docker-entrypoint.sh"]

FROM base AS prod
CMD ["python3", "celery", "-A", "celery_app.server", "worker", "-l", "INFO"]
Empty file added bot/__init__.py
Empty file.
Empty file added bot/retrievers/__init__.py
Empty file.
73 changes: 73 additions & 0 deletions bot/retrievers/forum_summary_retriever.py
@@ -0,0 +1,73 @@
from bot.retrievers.summary_retriever_base import BaseSummarySearch
from llama_index.embeddings import BaseEmbedding
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding


class ForumBasedSummaryRetriever(BaseSummarySearch):
    def __init__(
        self,
        table_name: str,
        dbname: str,
        embedding_model: BaseEmbedding | CohereEmbedding = CohereEmbedding(),
    ) -> None:
        """
        The retriever class for forum-based data such as Discord and Discourse.
        By default, CohereEmbedding is used as the embedding model.
        """
        super().__init__(table_name, dbname, embedding_model=embedding_model)

    def retrieve_metadata(
        self,
        query: str,
        metadata_group1_key: str,
        metadata_group2_key: str,
        metadata_date_key: str,
        similarity_top_k: int = 20,
    ) -> tuple[set[str], set[str], set[str]]:
        """
        Retrieve the metadata of the nodes most similar to the query.

        Parameters
        -----------
        query : str
            the user query to process
        metadata_group1_key : str
            the first-level conversation grouping key;
            in Discord this can be `channel`, in Discourse `category`
        metadata_group2_key : str
            the second-level conversation grouping key;
            in Discord this can be `thread`, in Discourse `topic`
        metadata_date_key : str
            the metadata key holding the daily date
        similarity_top_k : int
            the number of top similar nodes for the retriever to return.
            default is 20

        Returns
        ---------
        group1_data : set[str]
            the group1 values (e.g. channels) found on the similar summary nodes.
            can be an empty set, meaning no similar conversations
            were available for that grouping.
        group2_data : set[str]
            the group2 values (e.g. threads) found on the similar summary nodes.
            can be an empty set, meaning no similar conversations
            were available for that grouping.
        dates : set[str]
            the dates of the daily conversations similar to the given query
        """
        nodes = self.get_similar_nodes(query=query, similarity_top_k=similarity_top_k)

        group1_data: set[str] = set()
        group2_data: set[str] = set()
        dates: set[str] = set()

        for node in nodes:
            # skip falsy metadata values (e.g. None or empty strings)
            if node.metadata[metadata_group1_key]:
                group1_data.add(node.metadata[metadata_group1_key])
            if node.metadata[metadata_group2_key]:
                group2_data.add(node.metadata[metadata_group2_key])
            dates.add(node.metadata[metadata_date_key])

        return group1_data, group2_data, dates
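
A minimal usage sketch (the table name, database name, and metadata keys below are hypothetical, chosen to match the docstring's Discord examples; the Postgres and Cohere credentials from `.env.example` are assumed to be set):

from bot.retrievers.forum_summary_retriever import ForumBasedSummaryRetriever

retriever = ForumBasedSummaryRetriever(
    table_name="discord_summary", dbname="community_db"
)
channels, threads, dates = retriever.retrieve_metadata(
    query="How do I set up the bot?",
    metadata_group1_key="channel",
    metadata_group2_key="thread",
    metadata_date_key="date",
)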
39 changes: 39 additions & 0 deletions bot/retrievers/process_dates.py
@@ -0,0 +1,39 @@
import logging
from datetime import timedelta

from dateutil import parser


def process_dates(dates: list[str], d: int) -> list[str]:
    """
    Expand the given dates into a continuous daily range
    from `min(dates) - d` days to `max(dates) + d` days.

    Parameters
    ------------
    dates : list[str]
        the list of dates given
    d : int
        the number of days to extend the range before and after

    Returns
    ----------
    dates_modified : list[str]
        the continuous list of dates, with `d` days
        added before and after the given range
    """
    dates_modified: list[str] = []
    if dates != []:
        lowest_date = min(parser.parse(date) for date in dates)
        greatest_date = max(parser.parse(date) for date in dates)

        delta_days = timedelta(days=d)

        # walk day by day from `lowest_date - d` to `greatest_date + d`
        dt = lowest_date - delta_days
        while dt <= greatest_date + delta_days:
            dates_modified.append(dt.strftime("%Y-%m-%d"))
            dt += timedelta(days=1)
    else:
        logging.warning("No dates given!")

    return dates_modified
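
As a worked example of the expansion: two dates a couple of days apart with d=1 yield the full five-day window.

from bot.retrievers.process_dates import process_dates

dates = process_dates(["2024-01-05", "2024-01-07"], d=1)
# ["2024-01-04", "2024-01-05", "2024-01-06", "2024-01-07", "2024-01-08"]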
75 changes: 75 additions & 0 deletions bot/retrievers/summary_retriever_base.py
@@ -0,0 +1,75 @@
from llama_index import VectorStoreIndex
from llama_index.embeddings import BaseEmbedding
from llama_index.indices.query.schema import QueryBundle
from llama_index.schema import NodeWithScore
from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
from tc_hivemind_backend.pg_vector_access import PGVectorAccess


class BaseSummarySearch:
    def __init__(
        self,
        table_name: str,
        dbname: str,
        embedding_model: BaseEmbedding = CohereEmbedding(),
    ) -> None:
        """
        Initialize the base summary search class.
        This class does a similarity search
        over the nodes saved in postgresql.

        Parameters
        -------------
        table_name : str
            the table the summary data is saved in
            *Note:* Don't include the `data_` prefix of the table,
            because llama_index adds that prefix itself.
        dbname : str
            the database name to access
        embedding_model : llama_index.embeddings.BaseEmbedding
            the embedding model to use for embedding the query string
            the default is the CohereEmbedding that we've written
        """
        self.index = self._setup_index(table_name, dbname, embedding_model)
        self.embedding_model = embedding_model

    def get_similar_nodes(
        self, query: str, similarity_top_k: int = 20
    ) -> list[NodeWithScore]:
        """
        Get the k nodes most similar to the query.
        Note: this function embeds the query
        in order to do the similarity search.

        Parameters
        ------------
        query : str
            the user query to process
        similarity_top_k : int
            the number of top similar nodes for the retriever to return.
            default is 20
        """
        retriever = self.index.as_retriever(similarity_top_k=similarity_top_k)

        query_embedding = self.embedding_model.get_text_embedding(text=query)

        query_bundle = QueryBundle(query_str=query, embedding=query_embedding)
        nodes = retriever.retrieve(query_bundle)

        return nodes

    def _setup_index(
        self, table_name: str, dbname: str, embedding_model: BaseEmbedding
    ) -> VectorStoreIndex:
        """
        Set up the llama_index VectorStoreIndex.
        """
        pg_vector_access = PGVectorAccess(
            table_name=table_name, dbname=dbname, embed_model=embedding_model
        )
        index = pg_vector_access.load_index()
        return index
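
The base class can also be used directly; a short sketch with hypothetical names (note the table is passed without the `data_` prefix, per the docstring):

from bot.retrievers.summary_retriever_base import BaseSummarySearch

search = BaseSummarySearch(table_name="discord_summary", dbname="community_db")
nodes = search.get_similar_nodes(query="deployment schedule", similarity_top_k=5)
for node in nodes:
    print(node.score, node.metadata)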
Empty file added bot/retrievers/utils/__init__.py
Empty file.
34 changes: 34 additions & 0 deletions bot/retrievers/utils/load_hyperparams.py
@@ -0,0 +1,34 @@
import os

from dotenv import load_dotenv


def load_hyperparams() -> tuple[int, int, int]:
    """
    Load the k1, k2, and d hyperparameters that are used for the retrievers.

    Returns
    ---------
    k1 : int
        the number of similar nodes to fetch
        in the first summary search
    k2 : int
        the number of similar nodes to fetch
        in the secondary raw search
    d : int
        the day interval to include before and after
    """
    load_dotenv()

    k1 = os.getenv("K1_RETRIEVER_SEARCH")
    k2 = os.getenv("K2_RETRIEVER_SEARCH")
    d = os.getenv("D_RETRIEVER_SEARCH")

    if k1 is None:
        raise ValueError("No `K1_RETRIEVER_SEARCH` available in .env file!")
    if k2 is None:
        raise ValueError("No `K2_RETRIEVER_SEARCH` available in .env file!")
    if d is None:
        raise ValueError("No `D_RETRIEVER_SEARCH` available in .env file!")

    return int(k1), int(k2), int(d)
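
For example, with K1_RETRIEVER_SEARCH=20, K2_RETRIEVER_SEARCH=5, and D_RETRIEVER_SEARCH=7 in the .env file (illustrative values, not defaults shipped with this PR):

from bot.retrievers.utils.load_hyperparams import load_hyperparams

k1, k2, d = load_hyperparams()  # -> (20, 5, 7) with the values above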
Empty file added celery_app/__init__.py
Empty file.
30 changes: 30 additions & 0 deletions celery_app/job_send.py
@@ -0,0 +1,30 @@
from tc_messageBroker import RabbitMQ
from tc_messageBroker.rabbit_mq.event import Event
from tc_messageBroker.rabbit_mq.queue import Queue


def job_send(broker_url, port, username, password, res):
    rabbit_mq = RabbitMQ(
        broker_url=broker_url, port=port, username=username, password=password
    )

    content = {
        "uuid": "d99a1490-fba6-11ed-b9a9-0d29e7612dp8",
        "data": f"some results {res}",
    }

    rabbit_mq.connect(Queue.DISCORD_ANALYZER)
    rabbit_mq.publish(
        queue_name=Queue.DISCORD_ANALYZER,
        event=Event.DISCORD_BOT.FETCH,
        content=content,
    )


if __name__ == "__main__":
    # TODO: read from .env
    broker_url = "localhost"
    port = 5672
    username = "root"
    password = "pass"
    job_send(broker_url, port, username, password, "CALLED FROM __main__")
11 changes: 11 additions & 0 deletions celery_app/server.py
@@ -0,0 +1,11 @@
from celery import Celery
from utils.credentials import load_rabbitmq_credentials

rabbit_creds = load_rabbitmq_credentials()
user = rabbit_creds["user"]
password = rabbit_creds["password"]
host = rabbit_creds["host"]
port = rabbit_creds["port"]

app = Celery("celery_app/tasks", broker=f"pyamqp://{user}:{password}@{host}:{port}//")
app.autodiscover_tasks(["celery_app"])
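
`utils/credentials.py` is among this PR's 37 files but isn't shown in the excerpt above. Judging from the keys consumed here and the RABBIT_* variables in `.env.example`, a sketch of what it plausibly looks like (an assumption, not the actual file):

import os

from dotenv import load_dotenv


def load_rabbitmq_credentials() -> dict[str, str]:
    """Assumed implementation: read the RABBIT_* variables from the .env file."""
    load_dotenv()
    return {
        "user": os.getenv("RABBIT_USER", ""),
        "password": os.getenv("RABBIT_PASSWORD", ""),
        "host": os.getenv("RABBIT_HOST", ""),
        "port": os.getenv("RABBIT_PORT", ""),
    }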
29 changes: 29 additions & 0 deletions celery_app/tasks.py
@@ -0,0 +1,29 @@
from celery_app.job_send import job_send
from celery_app.server import app
from utils.credentials import load_rabbitmq_credentials

# TODO: Write tasks that match our requirements


@app.task
def add(x, y):
    rabbit_creds = load_rabbitmq_credentials()
    username = rabbit_creds["user"]
    password = rabbit_creds["password"]
    broker_url = rabbit_creds["host"]
    port = rabbit_creds["port"]

    res = x + y
    job_send(broker_url, port, username, password, res)

    return res


@app.task
def mul(x, y):
    return x * y


@app.task
def xsum(numbers):
    return sum(numbers)
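
A brief sketch of exercising these placeholder tasks once a worker (e.g. the Dockerfile's prod stage) is running:

from celery_app.tasks import add, mul, xsum

# queues the task on the worker; `add` also publishes its result to the
# DISCORD_ANALYZER queue via job_send
add.delay(2, 3)
mul.delay(4, 5)
xsum.delay([1, 2, 3])
# reading results back with .get() would additionally require
# configuring a result backend, which this Celery app does not set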