Skip to content

Commit

Permalink
V1.1: Add speech-to-text (#2)
Browse files Browse the repository at this point in the history
  • Loading branch information
Currie32 authored Aug 21, 2023
1 parent 5ee528d commit afa80b5
Show file tree
Hide file tree
Showing 9 changed files with 294 additions and 69 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ COPY pages pages
# Create a virtual environment and activate it
RUN python -m venv venv
ENV PATH="/app/venv/bin:$PATH"

RUN pip3 install --no-cache-dir -r requirements.txt

# Specify the command to run the app
Expand Down
29 changes: 26 additions & 3 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import base64
import io

import dash_bootstrap_components as dbc
from dash import Dash, html, page_container
from flask import send_from_directory
from flask import Flask, request, send_from_directory

from footer import footer

server = Flask(__name__)
app = Dash(
__name__,
use_pages=True,
pages_folder="pages",
external_stylesheets=[dbc.icons.BOOTSTRAP, dbc.themes.BOOTSTRAP],
server=server,
)
app.config.suppress_callback_exceptions = True
server = app.server


@server.route("/robots.txt")
Expand Down Expand Up @@ -60,9 +64,28 @@ def serve_sitemap():
footer,
],
)
]
],
)


@server.route("/save_audio_recording", methods=["POST"])
def save_audio_recording():
try:
data = request.get_json()
audio_data = data["audio_data"]
# Decode the Base64 audio data
audio_bytes = base64.b64decode(audio_data)

# Save the audio recording
with io.BytesIO(audio_bytes) as f:
with open("audio_recording.wav", "wb") as audio_file:
audio_file.write(f.read())

return "Audio data received successfully", 200

except Exception:
return "An error occurred", 500


if __name__ == "__main__":
app.run_server(debug=True, port=8080)
14 changes: 11 additions & 3 deletions assets/app.css
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ body {
font-size: 22px;
margin-left: 10px;
}
#button-record-audio {
margin-bottom: 5px;
}
#button-start-conversation {
background-color: #0061F3;
color: white;
Expand Down Expand Up @@ -133,13 +136,18 @@ body {
#title {
margin: 0px;
}
#user-input {
#user-response {
display: flex;
margin: 0px 0px 30px;
width: 100%;
}
#user-input.form-control {
#user-response-text {
height: 80px;
margin-right: 10px;
}
#user-response-text.form-control {
border: 1px solid rgba(7, 76, 179, 0.9);
}
#user-input::placeholder {
#user-response-text::placeholder {
color: #aaa;
}
95 changes: 86 additions & 9 deletions assets/chat_request.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,67 @@
import os
import re
import time
from typing import Dict, List, Tuple

import openai
import requests
from dash import Input, Output, callback, no_update
from tenacity import retry, stop_after_attempt, wait_random_exponential

openai.api_key = os.environ.get("OPENAI_KEY")


def get_assistant_message(messages):
@callback(
    Output("user-response-text", "value", allow_duplicate=True),
    Output("loading", "style", allow_duplicate=True),
    Output("check-for-audio-file", "data", allow_duplicate=True),
    Input("check-for-audio-file", "data"),
    prevent_initial_call=True,
)
def convert_audio_recording_to_text(
    check_for_audio_file: bool,
) -> Tuple[str, Dict[str, str], bool]:
    """
    Convert the audio recording from the user into text using OpenAI's
    Whisper-1 model.

    Params:
        check_for_audio_file: Whether to check for the audio recording file.

    Returns:
        The text of the user's audio recording.
        The style of the loading icons.
        Stop checking for the user's audio recording.
    """

    audio_recording = "audio_recording.wav"

    while check_for_audio_file:
        if os.path.exists(audio_recording):
            # Close the file handle before deleting the file. The previous
            # version leaked the handle and removed the file while it was
            # still open, which fails on Windows.
            with open(audio_recording, "rb") as audio_file:
                transcript = openai.Audio.transcribe("whisper-1", audio_file)
            os.remove(audio_recording)
            message_user = transcript.to_dict()["text"]

            return message_user, {"display": "none"}, False

        # Wait 0.1 seconds before looking for the audio file again
        time.sleep(0.1)

    # This callback declares three Outputs, so the fall-through must
    # return one value per Output (a bare no_update would raise).
    return no_update, no_update, no_update


def get_assistant_message(messages: List[Dict[str, str]]) -> str:
"""
Get and process the assistant's (OpenAI's model) message to continue the conversation.
Params:
messages: The conversation history between the user and the chat model.
Returns:
The message from the assistant.
"""

chat_response = _chat_completion_request(messages)
message_assistant = chat_response.json()["choices"][0]["message"]["content"]

# Remove space before "!" or "?"
Expand All @@ -20,12 +71,26 @@ def get_assistant_message(messages):


@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, model="gpt-3.5-turbo-0613"):
def _chat_completion_request(messages: List[Dict[str, str]]) -> Dict:
"""
Request a response to the user's statement from one of OpenAI's chat models.
Params:
messages: The conversation history between the user and the chat model.
Returns:
A response from OpenAI's model to the user's statement.
"""

headers = {
"Content-Type": "application/json",
"Authorization": "Bearer " + openai.api_key,
}
json_data = {"model": model, "messages": messages, "temperature": 1.5}
json_data = {
"model": "gpt-3.5-turbo-0613",
"messages": messages,
"temperature": 1.5,
}
try:
response = requests.post(
"https://api.openai.com/v1/chat/completions",
Expand All @@ -34,16 +99,28 @@ def chat_completion_request(messages, model="gpt-3.5-turbo-0613"):
)
return response
except Exception as e:
print("Unable to generate ChatCompletion response")
print(f"Exception: {e}")
return e


def system_content(
language_learn, language_known, setting, point_in_conversation="Start"
):
conversation_setting: str,
language_learn: str,
language_known: str,
) -> str:
"""
Write the content message for the system as part of a call to OpenAI's chat completion API.
This provides OpenAI's model with some context about the conversation.
Params:
conversation_setting: The setting of the conversation between the user and OpenAI's model.
language_learn: The language that the user wants to learn.
language_known: The language that the user speaks.
Returns:
The content message for the system.
"""

content = f"{point_in_conversation} a conversation about {setting} in {language_learn}. \
content = f"Start a conversation about {conversation_setting} in {language_learn}. \
Provide one statement in {language_learn}, then wait for my response. \
Do not write in {language_known}. \
Always finish your response with a question. \
Expand Down
2 changes: 1 addition & 1 deletion callbacks/conversation_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def start_conversation_button_disabled(

has_setting = (
(conversation_setting == "other") & (conversation_setting_custom is not None)
) | (conversation_setting != "other")
) | ((conversation_setting != "other") & (conversation_setting is not None))

return not (has_two_languages & has_different_languages & has_setting)

Expand Down
62 changes: 52 additions & 10 deletions callbacks/display_components.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Dict, List, Tuple

from dash import Input, Output, State, callback, callback_context
from dash import Input, Output, State, callback, callback_context, html


@callback(
Expand All @@ -27,7 +27,6 @@ def display_conversation_helpers(
{"display": "block"},
{
"margin": "20px 0px 0px",
"float": "right",
},
)

Expand All @@ -47,29 +46,64 @@ def display_user_input(conversation: List) -> Dict[str, str]:
"""

if conversation:
return {"display": "block"}
return {"display": "flex"}

return {"display": "none"}


@callback(
    Output("button-record-audio", "children"),
    Output("check-for-audio-file", "data", allow_duplicate=True),
    Input("button-record-audio", "n_clicks"),
    prevent_initial_call=True,
)
def is_user_recording_audio(button_record_audio_n_clicks: int) -> Tuple[html.I, bool]:
    """
    Toggle the audio-recording button's icon to reflect whether a
    recording is in progress, and start checking for the recording
    file once it has finished.

    Params:
        button_record_audio_n_clicks: Number of times the button to record the user's audio has been clicked.

    Returns:
        The icon of the button.
        Whether to check for a file of the user's audio recording.
    """

    # Odd click counts mean a recording is currently in progress.
    recording_in_progress = button_record_audio_n_clicks % 2 == 1

    if recording_in_progress:
        # Show the headphones icon; the file does not exist yet.
        return html.I(className="bi bi-headphones"), False

    # Recording just stopped: restore the mic icon and begin polling
    # for the finished audio file.
    return html.I(className="bi bi-mic-fill"), True


@callback(
Output("loading", "style", allow_duplicate=True),
Input("button-start-conversation", "n_clicks"),
Input("user-response", "n_submit"),
State("user-response", "value"),
Input("button-submit-response-text", "n_clicks"),
Input("user-response-text", "n_submit"),
Input("button-record-audio", "n_clicks"),
State("user-response-text", "value"),
prevent_initial_call="initial_duplicate",
)
def loading_visible(
button_start_conversation_n_clicks: int,
user_response_n_submits: int,
button_submit_text_n_clicks: int,
user_response_text_n_submits: int,
user_response_audio_n_clicks: int,
user_response_text: str,
) -> Dict[str, str]:
"""
Whether to make the loading icons visible.
Params:
button_start_conversation_clicks: Number of time the start conversation button was clicked
user_response_n_submits: Number of times the user response was submitted.
button_start_conversation_n_clicks: Number of times the start conversation button was clicked.
button_submit_text_n_clicks: Number of times the button to submit the user's text response was clicked.
user_response_text_n_submits: Number of times the user's text response was submitted (by clicking enter/return).
user_response_audio_n_clicks: Number of times the button to record the user's audio was clicked.
user_response_text: The text of the user_response field when it was submitted.
Returns:
Expand All @@ -83,8 +117,16 @@ def loading_visible(
if button_start_conversation_n_clicks:
return {"display": "flex"}

elif triggered_input_id == "user-response":
if user_response_n_submits is not None and user_response_text:
if triggered_input_id == "button-submit-response-text":
if button_submit_text_n_clicks:
return {"display": "flex"}

elif triggered_input_id == "user-response-text":
if user_response_text_n_submits is not None and user_response_text:
return {"display": "flex"}

elif triggered_input_id == "button-record-audio":
if user_response_audio_n_clicks:
return {"display": "flex"}

return {"display": "none"}
2 changes: 1 addition & 1 deletion callbacks/placeholder_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


@callback(
Output("user-response", "placeholder"),
Output("user-response-text", "placeholder"),
Input("language-known", "value"),
Input("language-learn", "value"),
)
Expand Down
Loading

0 comments on commit afa80b5

Please sign in to comment.