"""
Airtime and Messaging Servicea using Africa's Talking API
This script provides a Gradio-based web interface for sending airtime and messages
using the Africa's Talking API. It also tracks the carbon emissions of the operations
using the CodeCarbon library.
The voice command interface allows users to send airtime, send messages, and search for
news articles using voice commands. However, the audio transcription and processing
is required since the model only accepts text inputs & has limited capabilities.
Usage:
1. Set the environment variables `AT_USERNAME`, `GROQ_API_KEY`, and `AT_API_KEY` with
your Africa's Talking credentials.
2. Run the script: `python app.py`
3. Access the Gradio web interface to send airtime or messages or search for news articles.
Example:
Send airtime to a phone number:
- `Send airtime to +254712345678 with an amount of 10 in currency KES`
Send a message to a phone number:
- `Send a message to +254712345678 with the message 'Hello there',
using the username 'username'`
Search for news about a topic:
- `Latest news on climate change`
"""
# ------------------------------------------------------------------------------------
# Import Statements
# ------------------------------------------------------------------------------------
# Standard Library Imports
import os
import io
import json
import logging
from logging.handlers import RotatingFileHandler
import asyncio
from importlib.metadata import version, PackageNotFoundError
from typing import Optional

# Third-Party Library Imports
import gradio as gr
from langtrace_python_sdk import langtrace, with_langtrace_root_span
import groq
import numpy as np
import soundfile as sf
import ollama
import edge_tts

# Local Module Imports
from utils.function_call import send_airtime, send_message, search_news, translate_text
from utils.constants import VISION_SYSTEM_PROMPT, API_SYSTEM_PROMPT
# ------------------------------------------------------------------------------------
# Logging Configuration
# ------------------------------------------------------------------------------------
# Initialize Langtrace
langtrace.init(api_key=os.getenv("LANGTRACE_API_KEY"))
groq_client = groq.Client(api_key=os.getenv("GROQ_API_KEY"))
# Set up the logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG) # Set the logger to handle all levels DEBUG and above
# Prevent logs from being propagated to the root logger
logger.propagate = False
# Define logging format
formatter = logging.Formatter("%(asctime)s:%(name)s:%(levelname)s:%(message)s")
# Set up the file handler for logging to a file
file_handler = RotatingFileHandler(
"voice_stt_mode.log", maxBytes=5 * 1024 * 1024, backupCount=5
)
file_handler.setLevel(logging.INFO) # Capture INFO and above in the file
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
# Set up the stream handler for console output
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.DEBUG) # Capture DEBUG and above in the console
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)
# ------------------------------------------------------------------------------------
# Log the Start of the Script
# ------------------------------------------------------------------------------------
logger.info(
    "Starting the voice & text function-calling script to send airtime and messages "
    "using the Africa's Talking API"
)
logger.info("Review whether Groq Speech-to-Text logs the audio data or not")
logger.info("Let's review the packages and their versions")
# ------------------------------------------------------------------------------------
# Log Versions of the Libraries
# ------------------------------------------------------------------------------------
pkgs = [
"africastalking",
"ollama",
"duckduckgo_search",
"langtrace-python-sdk",
"gradio",
"groq",
"soundfile",
"numpy",
"edge-tts", # Add edge-tts to version checking
]
for pkg in pkgs:
try:
pkg_version = version(pkg)
logger.info("%s version: %s", pkg, pkg_version)
except PackageNotFoundError:
logger.error("Package %s is not installed.", pkg)
except Exception as e:
logger.error("Failed to retrieve version for %s: %s", pkg, str(e))
# ------------------------------------------------------------------------------------
# TTS Configuration
# ------------------------------------------------------------------------------------
VOICE = "sw-TZ-RehemaNeural"  # Swahili (Tanzania) neural voice for spoken replies
OUTPUT_FILE = "tts_output.mp3"  # Saved in current working directory
async def text_to_speech(text: str) -> None:
    """
    Convert text to speech with edge-tts and save the audio to OUTPUT_FILE.

    Parameters
    ----------
    text : str
        The text to synthesize.
    """
    try:
        communicate = edge_tts.Communicate(text, VOICE)
        await communicate.save(OUTPUT_FILE)
        logger.info("Generated speech output: %s", OUTPUT_FILE)
    except Exception as e:
        logger.error("TTS Error: %s", str(e))
        raise
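
# Example (a sketch; edge-tts needs network access to Microsoft's TTS service):
#   asyncio.run(text_to_speech("Ujumbe wako umetumwa."))
# writes tts_output.mp3 to the current working directory.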
# ------------------------------------------------------------------------------------
# Define Tools Schema
# ------------------------------------------------------------------------------------
tools = [
{
"type": "function",
"function": {
"name": "send_airtime",
"description": "Send airtime to a phone number using the Africa's Talking API",
"parameters": {
"type": "object",
"properties": {
"phone_number": {
"type": "string",
"description": "The phone number in international format",
},
"currency_code": {
"type": "string",
"description": "The 3-letter ISO currency code",
},
"amount": {
"type": "string",
"description": "The amount of airtime to send",
},
},
"required": ["phone_number", "currency_code", "amount"],
},
},
},
{
"type": "function",
"function": {
"name": "send_message",
"description": "Send a message to a phone number using the Africa's Talking API",
"parameters": {
"type": "object",
"properties": {
"phone_number": {
"type": "string",
"description": "The phone number in international format",
},
"message": {
"type": "string",
"description": "The message to send",
},
"username": {
"type": "string",
"description": "The username for the Africa's Talking account",
},
},
"required": ["phone_number", "message", "username"],
},
},
},
{
"type": "function",
"function": {
"name": "search_news",
"description": "Search for news articles using DuckDuckGo News API",
"parameters": {
"type": "object",
"properties": {
"query": {
"type": "string",
"description": "The search query for news articles",
},
"max_results": {
"type": "integer",
"description": "The maximum number of news articles to retrieve",
"default": 5,
},
},
"required": ["query"],
},
},
},
{
"type": "function",
"function": {
"name": "translate_text",
"description": "Translate text to a specified language using Ollama",
"parameters": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "The text to translate",
},
"target_language": {
"type": "string",
"description": "The target language for translation",
},
},
"required": ["text", "target_language"],
},
},
},
]
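
# For reference, a tool call from the model arrives roughly in this shape
# (field names follow the Ollama chat API; the argument values are illustrative):
#   {"message": {"role": "assistant",
#                "tool_calls": [{"function": {
#                    "name": "send_airtime",
#                    "arguments": {"phone_number": "+254712345678",
#                                  "currency_code": "KES",
#                                  "amount": "10"}}}]}}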
# ------------------------------------------------------------------------------------
# Function Definitions
# ------------------------------------------------------------------------------------
@with_langtrace_root_span()
async def process_user_message(
message: str,
history: list,
use_vision: bool = False,
image_path: Optional[str] = None,
) -> str:
"""
Handle the conversation with the model asynchronously.
Parameters
----------
message : str
The user's input message.
history : list of list of str
The conversation history up to that point.
use_vision : bool, optional
Whether to use vision model for processing (default is False).
image_path : str, optional
Path to the image file for vision model (default is None).
Returns
-------
str
The model's response or the function execution result.
"""
logger.info("Processing user message: %s", message)
client = ollama.AsyncClient()
messages = []
# Add system prompt
system_prompt = VISION_SYSTEM_PROMPT if use_vision else API_SYSTEM_PROMPT
messages.append({"role": "system", "content": system_prompt})
# Add user message with image if present
if use_vision and image_path:
messages.append({"role": "user", "content": message, "images": [image_path]})
else:
messages.append({"role": "user", "content": message})
try:
model_name = "llama3.2-vision" if use_vision else "qwen2.5:0.5b"
response = await client.chat(
model=model_name,
messages=messages,
tools=None if use_vision else tools,
format="json" if use_vision else None,
options={"temperature": 0},
)
    except Exception:  # pylint: disable=broad-exception-caught
        logger.exception("Failed to get response from Ollama client.")
        return "An unexpected error occurred while communicating with the assistant."
model_message = response.get("message", {})
model_content = model_message.get("content", "")
model_role = model_message.get("role", "assistant")
logger.info("Model response: %s", model_content)
messages.append(
{
"role": model_role,
"content": model_content,
}
)
logger.debug("Model response details: %s", response.get("message"))
if model_message.get("tool_calls"):
for tool in model_message["tool_calls"]:
tool_name = tool["function"]["name"]
arguments = tool["function"]["arguments"]
logger.info("Tool call detected: %s", tool_name)
try:
if tool_name == "send_airtime":
logger.info("Calling send_airtime with arguments: %s", arguments)
function_response = send_airtime(
arguments["phone_number"],
arguments["currency_code"],
arguments["amount"],
)
elif tool_name == "send_message":
logger.info("Calling send_message with arguments: %s", arguments)
function_response = send_message(
arguments["phone_number"],
arguments["message"],
arguments["username"],
)
elif tool_name == "search_news":
logger.info("Calling search_news with arguments: %s", arguments)
function_response = search_news(arguments["query"])
elif tool_name == "translate_text":
logger.info("Calling translate_text with arguments: %s", arguments)
function_response = translate_text(
arguments["text"],
arguments["target_language"],
)
else:
function_response = json.dumps({"error": "Unknown function"})
logger.warning("Unknown function called: %s", tool_name)
logger.debug("Function response: %s", function_response)
messages.append(
{
"role": "tool",
"content": function_response,
}
)
return f"Function `{tool_name}` executed successfully. Response:\n{function_response}"
            except (KeyError, ValueError) as e:
                # Catch anticipated failures such as missing or malformed tool
                # arguments; anything else falls through to the generic handler below.
                logger.error("Handled error in tool `%s`: %s", tool_name, e)
                return f"Error executing `{tool_name}`: {str(e)}"
except Exception as e: # pylint: disable=broad-exception-caught
logger.exception("Unexpected error in tool `%s`: %s", tool_name, e)
return f"An unexpected error occurred while executing `{tool_name}`."
else:
return model_content
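
# Example (a sketch; the reply depends on the locally pulled Ollama models):
#   reply = asyncio.run(process_user_message(
#       "Send airtime to +254712345678 with an amount of 10 in currency KES", []))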
# ------------------------------------------------------------------------------------
# Audio Processing
# ------------------------------------------------------------------------------------
async def process_audio_and_llm(audio):
"""
Process the audio recording and get the transcription using Groq.
Parameters
----------
audio : tuple
The audio recording tuple with the sample rate and audio data.
Returns
-------
str
The transcription and LLM response.
Raises
------
Exception
If there is an error in processing the audio.
"""
if audio is None:
return "Error: No audio recorded. Please try again."
try:
sr, y = audio
if len(y) == 0:
return "Error: Empty audio recording. Please speak and try again."
# Convert to mono if stereo
if y.ndim > 1:
y = y.mean(axis=1)
        # Normalize audio to the [-1, 1] range, guarding against silent input
        y = y.astype(np.float32)
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak
# Write audio to buffer
buffer = io.BytesIO()
sf.write(buffer, y, sr, format="wav")
buffer.seek(0)
try:
# Get transcription from Groq
transcription = groq_client.audio.transcriptions.create(
model="distil-whisper-large-v3-en",
file=("audio.wav", buffer),
response_format="text",
)
# Process transcription with LLM
response = await process_user_message(transcription, [])
return f"Transcription: {transcription}\nLLM Response: {response}"
except Exception as e:
logger.exception("Error during transcription or LLM processing: %s", e)
return f"Error: {str(e)}"
except Exception as e:
logger.exception("Error in audio processing: %s", e)
return f"Error: {str(e)}"
def gradio_interface(message: str, history: list) -> str:
"""
Gradio interface function to process user messages and track emissions.
Parameters
----------
message : str
The user's input message.
history : list of list of str
The conversation history up to that point.
Returns
-------
str
The model's response or the function execution result.
"""
try:
response = asyncio.run(process_user_message(message, history))
return response
except Exception as e: # pylint: disable=broad-exception-caught
logger.exception("Error in gradio_interface: %s", e)
return "An unexpected error occurred while processing your message."
# ------------------------------------------------------------------------------------
# Create Gradio Interface with Both Text and Audio Inputs
# ------------------------------------------------------------------------------------
with gr.Blocks(title="🎙️ Voice & Vision Communication Interface 🌍") as demo:
gr.Markdown("# Voice Command & Text Communication Interface")
# Add tabs for voice and text input
with gr.Tab("Voice Input"):
# How to use
gr.Markdown(
"""
This interface allows you to send airtime, messages, and search
for news articles using voice commands.
You can also type your commands in the text input tab.
Here are some examples of commands you can use:
- Send airtime to +254712345678 with an amount of 10 in currency KES 📞
            - Send a message to +254712345678 with the message 'Hello there' and
              the username 'add your username' 💬
- Search news for 'latest technology trends' 📰
- Translate the text "Hi" to the target language "French"
* Please speak clearly and concisely for accurate transcription. In English only for now.
* You can also edit the transcription before processing. We all make mistakes! 🤗
"""
)
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="numpy",
label="Speak your command",
streaming=False,
)
transcription_preview = gr.Textbox(
label="Preview Transcription (Edit if needed)",
interactive=True,
placeholder="Transcription will appear here first...",
)
audio_output = gr.Textbox(
label="Final Result", placeholder="LLM response will appear here..."
)
tts_button = gr.Button("Play TTS")
tts_audio = gr.Audio(label="TTS Output")
with gr.Row():
transcribe_button = gr.Button("Transcribe")
process_button = gr.Button("Process Edited Text", variant="primary")
def show_transcription(audio):
"""
Transcribe the audio recording and show the preview.
Parameters
----------
audio : tuple
The audio recording tuple with the sample rate and audio data.
Returns
-------
str
The transcription of the audio recording.
"""
try:
if audio is None:
return "Error: No audio recorded. Please try again."
sr, y = audio
if len(y) == 0:
return "Error: Empty audio recording. Please speak and try again."
# Convert to mono if stereo
if y.ndim > 1:
y = y.mean(axis=1)
                # Normalize audio to the [-1, 1] range, guarding against silent input
                y = y.astype(np.float32)
                peak = np.max(np.abs(y))
                if peak > 0:
                    y /= peak
# Write audio to buffer
buffer = io.BytesIO()
sf.write(buffer, y, sr, format="wav")
buffer.seek(0)
# Get transcription from Groq
transcription = groq_client.audio.transcriptions.create(
model="distil-whisper-large-v3-en",
file=("audio.wav", buffer),
response_format="text",
)
logger.info("Audio transcribed successfully: %s", transcription)
return transcription
except Exception as e:
logger.exception("Error during transcription: %s", e)
return f"Error: {str(e)}"
        # Define TTS function
        async def generate_tts(text: str) -> Optional[str]:
            """
            Generate TTS audio with edge-tts and return the output file path,
            or None if generation fails.
            """
            try:
                communicate = edge_tts.Communicate(text, VOICE)
                await communicate.save(OUTPUT_FILE)
                logger.info("TTS audio generated successfully: %s", OUTPUT_FILE)
                return OUTPUT_FILE
            except Exception as e:
                logger.error("TTS Generation Error: %s", str(e))
                return None
# Wire up the components
transcribe_button.click(
fn=show_transcription, inputs=audio_input, outputs=transcription_preview
)
# Process the edited text
process_button.click(
fn=lambda x: asyncio.run(process_user_message(x, [])),
inputs=transcription_preview,
outputs=audio_output,
)
        # Connect the TTS button to the generator
        tts_button.click(
            fn=lambda txt: asyncio.run(generate_tts(txt)),
            inputs=audio_output,  # The component holding the final LLM response text
            outputs=tts_audio,
        )
# Text input tab
with gr.Tab("Text Input"):
chat_interface = gr.ChatInterface(
fn=gradio_interface,
description=(
"Type your commands or use voice input above:\n"
"- Send airtime to +254712345678 with an amount of 10 in currency KES 📞\n"
"- Send a message to +254712345678 with the message 'Hello there' 💬\n"
"- Search news for 'latest technology trends' 📰"
),
type="messages",
)
with gr.Tab("Receipt Scanner"):
image_input = gr.Image(type="filepath", label="Upload Receipt/Invoice")
scan_button = gr.Button("Scan Receipt")
result_text = gr.Textbox(label="Analysis Result")
        async def process_with_speech(image):
            """
            Analyze an uploaded receipt image with the vision model and
            return the text result.
            """
            try:
                text_result = await process_user_message(
                    "Analyze this receipt", [], use_vision=True, image_path=image
                )
                return text_result
            except Exception as e:
                logger.error("Processing error: %s", str(e))
                return str(e)
scan_button.click(
fn=lambda img: asyncio.run(process_with_speech(img)),
inputs=image_input,
outputs=result_text,
)
# ------------------------------------------------------------------------------------
# Launch Gradio Interface
# ------------------------------------------------------------------------------------
if __name__ == "__main__":
try:
logger.info("Launching Gradio interface...")
demo.launch(inbrowser=True, server_name="0.0.0.0", server_port=7860)
logger.info("Gradio interface launched successfully.")
except Exception as e:
logger.exception("Failed to launch Gradio interface: %s", e)
logger.info("Script execution completed")