gemi (speak): add voice capability to gemi
Akash98Sky committed Jan 13, 2024
1 parent dcd34a1 commit 3df6f61
Showing 9 changed files with 52 additions and 18 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -161,3 +161,6 @@ cython_debug/

# Testing purpose
test.*

# Temporary files.
temp/voice_*.*
8 changes: 6 additions & 2 deletions bot/routes.py
@@ -3,8 +3,7 @@
from typing import Any, Dict, Union
from aiogram import Router
from aiogram.filters import CommandStart
from aiogram.types import Message
from aiogram.enums import ParseMode
from aiogram.types import Message, InputMediaAudio
from aiogram.exceptions import TelegramBadRequest
from aiogram.utils.markdown import bold, italic, pre

@@ -63,6 +62,11 @@ async def echo_handler(message: Message, repo: ChatRepo, prompts: list[Union[str
await sent.delete()
sent = None
await message.reply_media_group(media=reply)
elif isinstance(reply, InputMediaAudio):
if sent:
await sent.delete()
sent = None
await message.reply_voice(voice=reply.media)
elif sent:
response = response + reply
error = None
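For illustration only (not code from this commit): the new InputMediaAudio branch simply unwraps the generated file and hands it to Message.reply_voice. A minimal aiogram v3 sketch of that pattern, with a hypothetical handler name and file path:

from aiogram.types import FSInputFile, InputMediaAudio, Message

async def send_voice_reply(message: Message, audio_path: str) -> None:
    # Wrap a locally generated audio file so it can flow through the same
    # reply-routing code as other media types.
    media = InputMediaAudio(media=FSInputFile(audio_path))
    # reply_voice expects the underlying input file, so unwrap .media before sending.
    await message.reply_voice(voice=media.media)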
32 changes: 26 additions & 6 deletions chat/query_processor.py
@@ -1,21 +1,31 @@
import asyncio
from logging import Logger, getLogger
import aiohttp
from aiogram.types import InputMediaPhoto, BufferedInputFile
import pyttsx3
from aiogram.types import InputMediaPhoto, InputMediaAudio, FSInputFile
from duckduckgo_search import AsyncDDGS
from google.generativeai.generative_models import ChatSession, content_types
import time

from chat.service import ChatService
from prompts.keywords import IMAGE_QUERY, SEARCH_QUERIES
from prompts.keywords import IMAGE_QUERY, SEARCH_QUERIES, VOICE_RESPONSE
from prompts.templates import build_searchengine_response_prompt

logging: Logger = getLogger(__name__)

class QueryProcessor():
__service: ChatService
__voice_engine: pyttsx3.Engine
__query_list__ = [
IMAGE_QUERY,
SEARCH_QUERIES,
VOICE_RESPONSE
]

def __init__(self, service: ChatService):
self.__service = service
self.__voice_engine = pyttsx3.init()
self.__voice_engine.setProperty('voice', self.__voice_engine.getProperty('voices')[1].id)
self.__voice_engine.setProperty('rate', 140)

async def __process_searchengine_query__(self, query: str):
async with AsyncDDGS() as ddgs:
@@ -48,15 +58,22 @@ async def __gen_image_data__(self, query: str):

return images

async def process_response(self, session: ChatSession, messages: list[content_types.PartType]):
async def __gen_voice_data__(self, query: str, chat_id: int):
logging.debug(f"Generate voice query: {query}")
file_path = f"temp/voice_{chat_id}_{int(time.time())}.mp3"
self.__voice_engine.save_to_file(query, file_path)
self.__voice_engine.runAndWait()
return InputMediaAudio(media=FSInputFile(file_path))

async def process_response(self, session: ChatSession, messages: list[content_types.PartType], chat_id: int):
text = ""
has_query = False
response_stream = self.__service.gen_response_stream(prompts=messages, chat=session)

async for res in response_stream:
text += res
if len(text) > 15:
if text.startswith(f"{SEARCH_QUERIES}:") or text.startswith(f"{IMAGE_QUERY}:"):
if len([query for query in self.__query_list__ if text.startswith(f"{query}:")]) > 0:
has_query = True
else:
yield text
@@ -69,9 +86,12 @@ async def process_response(self, session: ChatSession, messages: list[content_ty
if text.startswith(f"{IMAGE_QUERY}:"):
query = text.replace(f"{IMAGE_QUERY}:", "").strip()
yield await self.__gen_image_data__(query)
elif text.startswith(f"{VOICE_RESPONSE}:"):
query = text.replace(f"{VOICE_RESPONSE}:", "").strip()
yield await self.__gen_voice_data__(query, chat_id)
else:
queries = text.replace(f"{SEARCH_QUERIES}:\n-", "").split("\n-")
query_responses_prompt = await self.__gen_live_data_prompt__(queries)
response_stream = self.process_response(session=session, messages=[query_responses_prompt])
response_stream = self.process_response(session=session, messages=[query_responses_prompt], chat_id=chat_id)
async for res in response_stream:
yield res
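For illustration only (not code from this commit): the voice path above relies on pyttsx3's offline TTS engine. A self-contained sketch of the same call sequence, runnable outside the bot; the text and output path are made up, and selecting voices[1] assumes at least two system voices are installed:

import pyttsx3

engine = pyttsx3.init()
voices = engine.getProperty('voices')        # voices discovered by the local TTS driver
engine.setProperty('voice', voices[1].id)    # second installed voice, as in the commit
engine.setProperty('rate', 140)              # speaking rate in words per minute
engine.save_to_file("Hello from gemi!", "temp/voice_demo.mp3")  # queue synthesis to a file
engine.runAndWait()                          # block until the audio file is written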
12 changes: 7 additions & 5 deletions chat/repository.py
@@ -6,18 +6,20 @@
from chat.query_processor import QueryProcessor

class Chat():
__id: int
__session: ChatSession
__processor: QueryProcessor
__sem = asyncio.BoundedSemaphore(1)

def __init__(self, session: ChatSession, processor: QueryProcessor):
def __init__(self, id: int, session: ChatSession, processor: QueryProcessor):
self.__id = id
self.__session = session
self.__processor = processor

async def send_message_async(self, messages: Union[Iterable[content_types.PartType], str]):
async with self.__sem:
# generate new response only if earlier responses are complete
async for reply in self.__processor.process_response(session=self.__session, messages=messages):
async for reply in self.__processor.process_response(session=self.__session, messages=messages, chat_id=self.__id):
yield reply

class ChatRepo():
@@ -26,15 +28,15 @@ class ChatRepo():
__chat_creation_sem = asyncio.BoundedSemaphore(1)
__query_processor: QueryProcessor

def __init__(self, service: ChatService) -> None:
def __init__(self, service: ChatService, processor: QueryProcessor) -> None:
self.__service = service
self.__chats = {}
self.__query_processor = QueryProcessor(service=service)
self.__query_processor = processor

async def get_chat_session(self, chat_id: int):
async with self.__chat_creation_sem:
if chat_id not in self.__chats.keys():
session = self.__service.create_chat_session()
self.__chats[chat_id] = Chat(session=session, processor=self.__query_processor)
self.__chats[chat_id] = Chat(id=chat_id, session=session, processor=self.__query_processor)
return self.__chats[chat_id]

4 changes: 3 additions & 1 deletion containers.py
@@ -1,5 +1,6 @@
from dependency_injector import providers, containers
from bot.bot import TgBot
from chat.query_processor import QueryProcessor
from chat.repository import ChatRepo
from chat.service import ChatService

@@ -9,5 +10,6 @@ class Configs(containers.DeclarativeContainer):

class BotContainer(containers.DeclarativeContainer):
chat_service = providers.Singleton(ChatService, api_key=Configs.chat_config.api_key, bing_cookie=Configs.chat_config.bing_cookie)
chat_repo = providers.Factory(ChatRepo, service=chat_service)
query_processor = providers.Singleton(QueryProcessor, service=chat_service)
chat_repo = providers.Factory(ChatRepo, service=chat_service, processor=query_processor)
tg_bot = providers.Singleton(TgBot, token=Configs.bot_config.token, chat_repo=chat_repo, webhook_host=Configs.bot_config.webhook_host, webhook_secret=Configs.bot_config.webhook_secret)
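For illustration only (not code from this commit): the wiring change registers QueryProcessor as a shared Singleton while ChatRepo remains a Factory, so every repo instance reuses one pyttsx3 engine instead of creating its own. A hedged usage sketch, assuming the Configs values are populated elsewhere before any provider is called:

from containers import BotContainer

processor = BotContainer.query_processor()   # Singleton: same instance on every call
repo_a = BotContainer.chat_repo()            # Factory: a fresh ChatRepo per call...
repo_b = BotContainer.chat_repo()
assert BotContainer.query_processor() is processor  # ...all sharing one QueryProcessor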
3 changes: 2 additions & 1 deletion prompts/keywords.py
@@ -1,4 +1,5 @@
MESSAGE_METADATA = 'message_metadata'
SEARCH_QUERIES = 'search_queries'
SEARCH_RESPONSES = 'search_responses'
IMAGE_QUERY = 'image_query'
IMAGE_QUERY = 'image_query'
VOICE_RESPONSE = 'voice_response'
5 changes: 3 additions & 2 deletions prompts/static.py
@@ -1,6 +1,6 @@
from google.generativeai.generative_models import content_types

from prompts.keywords import IMAGE_QUERY, MESSAGE_METADATA, SEARCH_QUERIES, SEARCH_RESPONSES
from prompts.keywords import IMAGE_QUERY, MESSAGE_METADATA, SEARCH_QUERIES, SEARCH_RESPONSES, VOICE_RESPONSE

CHAT_INIT_HISTORY = [
content_types.ContentDict(parts = ["""
@@ -10,9 +10,10 @@
Here are a few set of rules that you should follow.
""", f"""
Rules:
- A response you generate can be either your answer or a set of search queries to gather missing informations or a image query to generate an image. Never mix the three in a single response.
- A response you generate can be either your answer in text/voice or a set of search queries to gather missing informations or a image query to generate an image. Never mix any of the four types in a single response.
- Ask conversational questions and don't generate any queries until you understand the exact motive of the conversation.
- Generate an image query only when the user asks for an image. The image query should be in the format, "{IMAGE_QUERY}: <image-query>". The <image-query> should contain a detailed description of the image that the user asked for.
- Generate a voice response only when the user asks you to speak or send a voice or audio message. The voice response should be in the format, "{VOICE_RESPONSE}: <voice-response>". The <voice-response> should contain the voice text of your response.
- The search queries message format should be just like, "{SEARCH_QUERIES}:\n- <query-1>\n- <query-2>\n- <query-3> ...\n- <query-n>"
- Generate 5 search queries when you're less than 50% confident, 4 search queries if greater than 50%, 3 if greater than 60%, 2 if greater than 70%, 1 if greater than 80%. Don't ask search query if you're more than 90% confident.
- On response to search queries you'll receive search responses in the format, "{SEARCH_RESPONSES}:\n- query:<query-1>\n- title:<title-1>\n- body:<body-1>\n- url:<url-1>\n- ...\n- query:<query-n>\n- title:<title-n>\n- body:<body-n>\n- url:<url-n>\n"
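For illustration only (not code from this commit): the rules above define a small text protocol in which every model reply is either plain text or starts with one keyword prefix. An illustrative classifier over made-up sample replies, showing how such prefixes can be told apart:

from prompts.keywords import IMAGE_QUERY, SEARCH_QUERIES, VOICE_RESPONSE

samples = [
    "voice_response: Sure, here is my answer read out loud.",
    "image_query: a watercolor painting of a red fox in the snow",
    "search_queries:\n- python 3.12 release date\n- python 3.12 new features",
    "Just a plain text answer with no prefix.",
]

for reply in samples:
    # The first matching keyword prefix wins; anything else is a normal text reply.
    kind = next((k for k in (IMAGE_QUERY, SEARCH_QUERIES, VOICE_RESPONSE)
                 if reply.startswith(f"{k}:")), "text")
    print(kind, "->", reply.splitlines()[0])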
3 changes: 2 additions & 1 deletion requirements.txt
@@ -7,4 +7,5 @@ pypdfium2~=4.25.0
duckduckgo-search~=4.1.1
sentry_sdk~=1.39.1
re_edge_gpt~=0.0.20
md2tgmd @ git+https://github.com/yym68686/md2tgmd.git
md2tgmd @ git+https://github.com/yym68686/md2tgmd.git
pyttsx3~=2.90
Empty file added temp/.gitkeep
Empty file.
