Merge pull request #9 from TogetherCrew/feat/init-bot

feat: initializing the bot!
TogetherCrew · Jan 9, 2024 · 98706b2 · 98706b2
2 parents 652a0db + b9a4d47
commit 98706b2
Show file tree

Hide file tree

Showing 37 changed files with 1,195 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,8 @@
+.github/
+
+.coverage/
+.coverage
+coverage
+
+venv/
+.env
diff --git a/.env.example b/.env.example
@@ -0,0 +1,25 @@
+CHUNK_SIZE=
+COHERE_API_KEY=
+D_RETRIEVER_SEARCH=
+EMBEDDING_DIM=
+K1_RETRIEVER_SEARCH=
+K2_RETRIEVER_SEARCH=
+MONGODB_HOST=
+MONGODB_PASS=
+MONGODB_PORT=
+MONGODB_USER=
+NEO4J_DB=
+NEO4J_HOST=
+NEO4J_PASSWORD=
+NEO4J_PORT=
+NEO4J_PROTOCOL=
+NEO4J_USER=
+OPENAI_API_KEY=
+POSTGRES_HOST=
+POSTGRES_PASS=
+POSTGRES_PORT=
+POSTGRES_USER=
+RABBIT_HOST=
+RABBIT_PASSWORD=
+RABBIT_PORT=
+RABBIT_USER=
diff --git a/.github/workflows/production.yml b/.github/workflows/production.yml
@@ -0,0 +1,12 @@
+name: Production CI/CD Pipeline
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  ci:
+    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
+    secrets:
+      CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
diff --git a/.github/workflows/start.staging.yml b/.github/workflows/start.staging.yml
@@ -0,0 +1,9 @@
+name: Staging CI/CD Pipeline
+
+on: pull_request
+
+jobs:
+  ci:
+    uses: TogetherCrew/operations/.github/workflows/ci.yml@main
+    secrets:
+      CC_TEST_REPORTER_ID: ${{ secrets.CC_TEST_REPORTER_ID }}
diff --git a/.gitignore b/.gitignore
@@ -158,3 +158,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
+
+hivemind-bot-env/*
+main.ipynb
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,12 @@
+# It's recommended that we use `bullseye` for Python (alpine isn't suitable as it conflicts with numpy)
+FROM python:3.11-bullseye AS base
+WORKDIR /project
+COPY . .
+RUN pip3 install -r requirements.txt
+
+FROM base AS test
+RUN chmod +x docker-entrypoint.sh
+CMD ["./docker-entrypoint.sh"]
+
+FROM base AS prod
+# Celery must be started as a module: `python3 celery ...` would look for a script file named `celery`
+CMD ["python3", "-m", "celery", "-A", "celery_app.server", "worker", "-l", "INFO"]
diff --git a/bot/__init__.py b/bot/__init__.py
diff --git a/bot/retrievers/__init__.py b/bot/retrievers/__init__.py
diff --git a/bot/retrievers/forum_summary_retriever.py b/bot/retrievers/forum_summary_retriever.py
@@ -0,0 +1,73 @@
+from bot.retrievers.summary_retriever_base import BaseSummarySearch
+from llama_index.embeddings import BaseEmbedding
+from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
+
+
+class ForumBasedSummaryRetriever(BaseSummarySearch):
+    def __init__(
+        self,
+        table_name: str,
+        dbname: str,
+        embedding_model: BaseEmbedding | CohereEmbedding = CohereEmbedding(),
+    ) -> None:
+        """
+        the class for forum based data like discord and discourse
+        by default CohereEmbedding will be used.
+        """
+        super().__init__(table_name, dbname, embedding_model=embedding_model)
+
+    def retreive_metadata(
+        self,
+        query: str,
+        metadata_group1_key: str,
+        metadata_group2_key: str,
+        metadata_date_key: str,
+        similarity_top_k: int = 20,
+    ) -> tuple[set[str], set[str], set[str]]:
+        """
+        retrieve the metadata information of the similar nodes with the query
+
+        Parameters
+        -----------
+        query : str
+            the user query to process
+        metadata_group1_key : str
+            the conversations grouping type 1
+            in discord can be `channel`, and in discourse can be `category`
+        metadata_group2_key : str
+            the conversations grouping type 2
+            in discord can be `thread`, and in discourse can be `topic`
+        metadata_date_key : str
+            the daily metadata saved key
+        similarity_top_k : int
+            the top k nodes to get as the retriever.
+            default is set as 20
+
+
+        Returns
+        ---------
+        group1_data : set[str]
+            the similar summary nodes having the group1_data.
+            can be an empty set meaning no similar thread
+            conversations for it was available.
+        group2_data : set[str]
+            the similar summary nodes having the group2_data.
+            can be an empty set meaning no similar channel
+            conversations for it was available.
+        dates : set[str]
+            the similar daily conversations to the given query
+        """
+        nodes = self.get_similar_nodes(query=query, similarity_top_k=similarity_top_k)
+
+        group1_data: set[str] = set()
+        dates: set[str] = set()
+        group2_data: set[str] = set()
+
+        for node in nodes:
+            if node.metadata[metadata_group1_key]:
+                group1_data.add(node.metadata[metadata_group1_key])
+            if node.metadata[metadata_group2_key]:
+                group2_data.add(node.metadata[metadata_group2_key])
+            dates.add(node.metadata[metadata_date_key])
+
+        return group1_data, group2_data, dates
diff --git a/bot/retrievers/process_dates.py b/bot/retrievers/process_dates.py
@@ -0,0 +1,39 @@
+import logging
+from datetime import timedelta
+
+from dateutil import parser
+
+
+def process_dates(dates: list[str], d: int) -> list[str]:
+    """
+    process the dates to be from `date - d` to `date + d`
+
+    Parameters
+    ------------
+    dates : list[str]
+        the list of dates given
+    d : int
+        to update the `dates` list to have `-d` and `+d` days
+
+
+    Returns
+    ----------
+    dates_modified : list[str]
+        days added to it
+    """
+    dates_modified: list[str] = []
+    if dates != []:
+        lowest_date = min(parser.parse(date) for date in dates)
+        greatest_date = max(parser.parse(date) for date in dates)
+
+        delta_days = timedelta(days=d)
+
+        # the date condition
+        dt = lowest_date - delta_days
+        while dt <= greatest_date + delta_days:
+            dates_modified.append(dt.strftime("%Y-%m-%d"))
+            dt += timedelta(days=1)
+    else:
+        logging.warning("No dates given!")
+
+    return dates_modified
diff --git a/bot/retrievers/summary_retriever_base.py b/bot/retrievers/summary_retriever_base.py
@@ -0,0 +1,75 @@
+from llama_index import VectorStoreIndex
+from llama_index.embeddings import BaseEmbedding
+from llama_index.indices.query.schema import QueryBundle
+from llama_index.schema import NodeWithScore
+from tc_hivemind_backend.embeddings.cohere import CohereEmbedding
+from tc_hivemind_backend.pg_vector_access import PGVectorAccess
+
+
+class BaseSummarySearch:
+    def __init__(
+        self,
+        table_name: str,
+        dbname: str,
+        embedding_model: BaseEmbedding = CohereEmbedding(),
+    ) -> None:
+        """
+        initialize the base summary search class
+
+        In this class we're doing a similarity search
+        for available saved nodes under postgresql
+
+        Parameters
+        -------------
+        table_name : str
+            the table that summary data is saved
+            *Note:* Don't include the `data_` prefix of the table,
+            cause lamma_index would original include that.
+        dbname : str
+            the database name to access
+        similarity_top_k : int
+            the top k nodes to get as the retriever.
+            default is set as 20
+        embedding_model : llama_index.embeddings.BaseEmbedding
+            the embedding model to use for doing embedding on the query string
+            default would be CohereEmbedding that we've written
+        """
+        self.index = self._setup_index(table_name, dbname, embedding_model)
+        self.embedding_model = embedding_model
+
+    def get_similar_nodes(
+        self, query: str, similarity_top_k: int = 20
+    ) -> list[NodeWithScore]:
+        """
+        get k similar nodes to the query.
+        Note: this funciton wold get the embedding
+        for the query to do the similarity search.
+
+        Parameters
+        ------------
+        query : str
+            the user query to process
+        similarity_top_k : int
+            the top k nodes to get as the retriever.
+            default is set as 20
+        """
+        retriever = self.index.as_retriever(similarity_top_k=similarity_top_k)
+
+        query_embedding = self.embedding_model.get_text_embedding(text=query)
+
+        query_bundle = QueryBundle(query_str=query, embedding=query_embedding)
+        nodes = retriever._retrieve(query_bundle)
+
+        return nodes
+
+    def _setup_index(
+        self, table_name: str, dbname: str, embedding_model: BaseEmbedding
+    ) -> VectorStoreIndex:
+        """
+        setup the llama_index VectorStoreIndex
+        """
+        pg_vector_access = PGVectorAccess(
+            table_name=table_name, dbname=dbname, embed_model=embedding_model
+        )
+        index = pg_vector_access.load_index()
+        return index
diff --git a/bot/retrievers/utils/__init__.py b/bot/retrievers/utils/__init__.py
diff --git a/bot/retrievers/utils/load_hyperparams.py b/bot/retrievers/utils/load_hyperparams.py
@@ -0,0 +1,34 @@
+import os
+
+from dotenv import load_dotenv
+
+
+def load_hyperparams() -> tuple[int, int, int]:
+    """
+    load the k1, k2, and d hyperparams that are used for retrievers
+
+    Returns
+    ---------
+    k1 : int
+        the value for the first summary search
+        to get the `k1` count similar nodes
+    k2 : int
+        the value for the secondary raw search
+        to get the `k2` count simliar nodes
+    d : int
+        the before and after day interval
+    """
+    load_dotenv()
+
+    k1 = os.getenv("K1_RETRIEVER_SEARCH")
+    k2 = os.getenv("K2_RETRIEVER_SEARCH")
+    d = os.getenv("D_RETRIEVER_SEARCH")
+
+    if k1 is None:
+        raise ValueError("No `K1_RETRIEVER_SEARCH` available in .env file!")
+    if k2 is None:
+        raise ValueError("No `K2_RETRIEVER_SEARCH` available in .env file!")
+    if d is None:
+        raise ValueError("No `D_RETRIEVER_SEARCH` available in .env file!")
+
+    return int(k1), int(k2), int(d)
diff --git a/celery_app/__init__.py b/celery_app/__init__.py
diff --git a/celery_app/job_send.py b/celery_app/job_send.py
@@ -0,0 +1,30 @@
+from tc_messageBroker import RabbitMQ
+from tc_messageBroker.rabbit_mq.event import Event
+from tc_messageBroker.rabbit_mq.queue import Queue
+
+
+def job_send(broker_url, port, username, password, res):
+    rabbit_mq = RabbitMQ(
+        broker_url=broker_url, port=port, username=username, password=password
+    )
+
+    content = {
+        "uuid": "d99a1490-fba6-11ed-b9a9-0d29e7612dp8",
+        "data": f"some results {res}",
+    }
+
+    rabbit_mq.connect(Queue.DISCORD_ANALYZER)
+    rabbit_mq.publish(
+        queue_name=Queue.DISCORD_ANALYZER,
+        event=Event.DISCORD_BOT.FETCH,
+        content=content,
+    )
+
+
+if __name__ == "__main__":
+    # TODO: read from .env
+    broker_url = "localhost"
+    port = 5672
+    username = "root"
+    password = "pass"
+    job_send(broker_url, port, username, password, "CALLED FROM __main__")
diff --git a/celery_app/server.py b/celery_app/server.py
@@ -0,0 +1,11 @@
+from celery import Celery
+from utils.credentials import load_rabbitmq_credentials
+
+rabbit_creds = load_rabbitmq_credentials()
+user = rabbit_creds["user"]
+password = rabbit_creds["password"]
+host = rabbit_creds["host"]
+port = rabbit_creds["port"]
+
+app = Celery("celery_app/tasks", broker=f"pyamqp://{user}:{password}@{host}:{port}//")
+app.autodiscover_tasks(["celery_app"])
diff --git a/celery_app/tasks.py b/celery_app/tasks.py
@@ -0,0 +1,29 @@
+from celery_app.job_send import job_send
+from celery_app.server import app
+from utils.credentials import load_rabbitmq_credentials
+
+# TODO: Write tasks that match our requirements
+
+
+@app.task
+def add(x, y):
+    rabbit_creds = load_rabbitmq_credentials()
+    username = rabbit_creds["user"]
+    password = rabbit_creds["password"]
+    broker_url = rabbit_creds["host"]
+    port = rabbit_creds["port"]
+
+    res = x + y
+    job_send(broker_url, port, username, password, res)
+
+    return res
+
+
+@app.task
+def mul(x, y):
+    return x * y
+
+
+@app.task
+def xsum(numbers):
+    return sum(numbers)