diff --git a/.vscode/settings.json b/.vscode/settings.json index 241e05d..29536fd 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,4 +1,4 @@ { "python.analysis.autoImportCompletions": true, - "python.analysis.typeCheckingMode": "off" + "python.analysis.typeCheckingMode": "basic" } diff --git a/src/audio_transcriber.py b/src/audio_transcriber.py index b71e514..90e8458 100644 --- a/src/audio_transcriber.py +++ b/src/audio_transcriber.py @@ -8,7 +8,7 @@ choose_sample_rate, find_supported_sample_rates, ) -from src.cli_interface import CliInterface +from src.cli_interface import CliInterface, start_pause_message from src.config import FRAMES_PER_BUFFER from src.whisper_transcription import WhisperTranscription @@ -24,8 +24,10 @@ def __init__(self, model_size): self.pyaudio_instance = pyaudio.PyAudio() self.stream = None self.device_index, self.chosen_sample_rate = self.setup_audio_device() - self.whisper_transcription = WhisperTranscription(model_size, self.chosen_sample_rate) CliInterface.print_welcome() + self.whisper_transcription = WhisperTranscription(model_size, self.chosen_sample_rate) + + CliInterface.print_info(start_pause_message) def setup_audio_device(self): """ @@ -34,6 +36,11 @@ def setup_audio_device(self): """ device_index = choose_audio_device(self.pyaudio_instance) supported_rates = find_supported_sample_rates(self.pyaudio_instance, device_index) + # if supported_rates is empty, print an error message and exit + if not supported_rates: + CliInterface.print_error("No supported sample rates found for the device.") + self.pyaudio_instance.terminate() + exit(1) chosen_sample_rate = choose_sample_rate(supported_rates) return device_index, chosen_sample_rate @@ -44,10 +51,10 @@ def toggle_recording(self): """ if self.recording: self.pause_recording() - CliInterface.print_recording_paused() + print(CliInterface.colorize("\r\n\u23f8", bold=True) + " Recording paused. " + start_pause_message) else: self.start_recording() - CliInterface.print_recording_started() + print(CliInterface.colorize("\r\n\u25cf", red=True) + " Recording started. " + start_pause_message) def start_recording(self): """ @@ -55,7 +62,7 @@ def start_recording(self): The stream's callback function is set to the transcription service's audio callback function. """ if not self.recording: - CliInterface.print_initialize_recording() + CliInterface.print_info("Initializing recording...") self.stream = self.pyaudio_instance.open( format=pyaudio.paInt16, channels=1, @@ -72,6 +79,13 @@ def pause_recording(self, stop=False): """ Pause recording audio. Stop the audio stream and wait for the transcription service to finish processing. """ + + CliInterface.print_info( + "Pausing recording... wait for processing to complete" + if not stop + else "Stopping recording... wait for processing to complete" + ) + if self.recording: if self.stream: self.stream.stop_stream() @@ -80,7 +94,6 @@ def pause_recording(self, stop=False): self.whisper_transcription.finalize_recording() self.recording = False - CliInterface.print_recording_pausing(stop) while not self.whisper_transcription.is_processing_completed(): time.sleep(0.1) # Adjust sleep time as necessary @@ -105,5 +118,5 @@ def run(self): """ Start the main loop of the application. Listens for key press events and handles them with the on_key_press function. """ - with keyboard.Listener(on_press=self.on_key_press) as listener: + with keyboard.Listener(on_press=self.on_key_press) as listener: # type: ignore listener.join() diff --git a/src/audio_utils.py b/src/audio_utils.py index 1052458..489b691 100644 --- a/src/audio_utils.py +++ b/src/audio_utils.py @@ -23,9 +23,11 @@ def choose_audio_device(pyaudio_instance): :return: The index of the chosen audio device. """ devices = get_audio_devices(pyaudio_instance) - CliInterface.print_devices(devices) - device_index = int(input("Enter the index of the desired audio device: ")) - return device_index + CliInterface.print_info("Available audio devices:\n") + for i, name in enumerate(devices, start=1): + print(CliInterface.colorize(f"{i})", bold=True) + f" {name[1]}") + choice = int(input("\n" + CliInterface.question("Enter the number corresponding to the desired device: "))) + return devices[choice - 1][0] def find_supported_sample_rates(pyaudio_instance, device_index): @@ -51,7 +53,9 @@ def find_supported_sample_rates(pyaudio_instance, device_index): supported_rates.append(rate) except Exception: continue - CliInterface.print_supported_sample_rates(supported_rates) + CliInterface.print_info("Supported sample rates for the device:") + for rate in supported_rates: + CliInterface.print_success(f"Supported: {rate} Hz") return supported_rates @@ -61,15 +65,16 @@ def choose_sample_rate(supported_rates): :param supported_rates: A list of supported sample rates. :return: The chosen sample rate. """ - CliInterface.print_sample_rate_options(supported_rates) + CliInterface.print_info("Choose a sample rate for recording:\n") + for i, rate in enumerate(supported_rates, start=1): + print(f"{i}) {rate} Hz") try: - CliInterface.print_enter_number() - choice = int(input()) + choice = int(input("\n" + CliInterface.question("Enter the number corresponding to the desired sample rate: "))) if 1 <= choice <= len(supported_rates): return supported_rates[choice - 1] else: - CliInterface.print_invalid_selection() + CliInterface.print_warning("Invalid selection. Please enter a number from the list.") return choose_sample_rate(supported_rates) - except ValueError: - CliInterface.print_invalid_number() + except ValueError as e: + CliInterface.print_error(e) return choose_sample_rate(supported_rates) diff --git a/src/cli_interface.py b/src/cli_interface.py index c184492..8a272a8 100644 --- a/src/cli_interface.py +++ b/src/cli_interface.py @@ -1,95 +1,72 @@ class CliInterface: @staticmethod - def print_welcome(): - print("Welcome to the Whisper Audio Transcription Service") - print("-------------------------------------------------") - print("Press Space to start/stop recording, Esc to exit.") - - @staticmethod - def print_initialize_recording(): - print("Initializing recording...", end="", flush=True) - - @staticmethod - def print_recording_started(): - print("\r\033[91m●\033[0m Recording started. Press Space to pause...\n") + def colorize(string, bold=False, red=False, yellow=False, green=False, cyan=False): + """ + Return a string with bold, red, and/or yellow formatting. If many colors are specified, the color precedence is: + red > yellow > green > cyan. + :param string: The input string. + :param bold: Whether to use bold formatting. + :param red: Whether to use red formatting. + :param yellow: Whether to use yellow formatting. + :param green: Whether to use green formatting. + :param cyan: Whether to use cyan formatting. + """ + b = "\033[1m" if bold else "" + r = "\033[91m" if red else "" + y = "\033[93m" if yellow else "" + g = "\033[92m" if green else "" + c = "\033[96m" if cyan else "" + e = "\033[0m" + + return f"{b}{c}{g}{y}{r}{string}{e}" @staticmethod - def print_recording_pausing(stop=False): - print( - "\nPausing" if not stop else "\nStopping", - "recording... please wait for processing to finish.", - ) - - @staticmethod - def print_recording_paused(): - print("\n\u23F8 Recording paused. Press Space to start or Esc to exit.") + def print_welcome(): + print("\n--------------------------------------------") + print("| " + CliInterface.colorize("Welcome to the Whisper Audio Transcriber", bold=True) + " |") + print("--------------------------------------------") @staticmethod def print_exit(): - print("\nExiting application... Thank you for using our service!") - - @staticmethod - def print_processing_chunk(volume_db, chunk_size): - print(f"\r>> Processing chunk (Volume: {volume_db:.2f} dB, Size: {chunk_size} bytes)...") - - @staticmethod - def print_processed_chunk(volume_db, chunk_size): - print(f"\nProcessed audio chunk with volume {volume_db:.2f} dB and size {chunk_size}.") - - @staticmethod - def print_transcription_attempt(attempt): - print(f"\nTranscription attempt {attempt}...") - - @staticmethod - def print_transcription_failed(): - print("\nFailed to transcribe after several attempts.") - - @staticmethod - def print_finalizing(): - print("\nFinalizing recording...") - - @staticmethod - def print_transcription_complete(): - print("\nTranscription completed successfully.") + print("\r\nExiting application...") + print("\n-------------------------------------------------") + print("| " + CliInterface.colorize("Thank you for using Whisper Audio Transcriber", bold=True) + " |") + print("-------------------------------------------------") @staticmethod def print_error(e): - print(f"\nError: {e}") + print("\n" + CliInterface.colorize("!", red=True) + f" Error: {e}") @staticmethod - def print_output_path(path): - print(f"\nTranscription results have been written to: {path}") + def print_warning(warning): + print("\n" + CliInterface.colorize("⚠", yellow=True) + f" Warning: {warning}") @staticmethod - def print_output(json_output): - print(json_output) + def print_success(message): + print("\n" + CliInterface.colorize("✔", green=True) + f" {message}") @staticmethod - def print_devices(devices): - print("Available audio devices:") - for index, name in devices: - print(f"{index}: {name}") + def print_info(message): + print("\n" + CliInterface.colorize("i", cyan=True) + f" {message}") @staticmethod - def print_supported_sample_rates(rates): - print("Testing supported sample rates for the device:") - for rate in rates: - print(f"Supported: {rate} Hz") + def print_question(message): + print("\n" + CliInterface.colorize("?", bold=True) + f" {message}") @staticmethod - def print_sample_rate_options(supported_rates): - print("Supported sample rates: ") - for i, rate in enumerate(supported_rates, start=1): - print(f"{i}) {rate} Hz") + def question(message): + return CliInterface.colorize("?", bold=True) + f" {message}" @staticmethod - def print_invalid_selection(): - print("Invalid selection. Please enter a number from the list.") + def print_error_message(message): + print("\n" + CliInterface.colorize("!", red=True) + f" {message}") - @staticmethod - def print_enter_number(): - print("Enter the number corresponding to the desired sample rate: ") - @staticmethod - def print_invalid_number(): - print("Please enter a valid number.") +start_pause_message = ( + "Press " + + CliInterface.colorize("Space", bold=True) + + " to start/pause recording." + + " Press " + + CliInterface.colorize("Esc", bold=True) + + " to exit." +) diff --git a/src/config.py b/src/config.py index cb42fc8..a1c6aa6 100644 --- a/src/config.py +++ b/src/config.py @@ -3,6 +3,12 @@ # Model size to use for Whisper transcription. Options: "tiny", "base", "small", "medium", "large" MODEL_SIZE = "base" +# Language code to use for Whisper transcription +LANGUAGE_CODE = "en" + +# Whisper prompt to use for transcription +PROMPT = "" + # Sample rates to consider for testing device capabilities (in Hz) SAMPLE_RATES = [8000, 16000, 32000, 44100, 48000] @@ -20,3 +26,6 @@ # Path to the file to print the transcription results OUTPUT_FILE_PATH = "transcription_results.json" + +# Export raw transcriptions to a file +EXPORT_RAW_TRANSCRIPTIONS = True diff --git a/src/whisper_transcription.py b/src/whisper_transcription.py index fb38207..d5016df 100644 --- a/src/whisper_transcription.py +++ b/src/whisper_transcription.py @@ -12,7 +12,15 @@ import whisper from src.cli_interface import CliInterface -from src.config import MAX_RETRIES, OUTPUT_FILE_PATH, PRINT_TO_FILE, RECORDING_DURATION +from src.config import ( + EXPORT_RAW_TRANSCRIPTIONS, + LANGUAGE_CODE, + MAX_RETRIES, + OUTPUT_FILE_PATH, + PRINT_TO_FILE, + PROMPT, + RECORDING_DURATION, +) class WhisperTranscription: @@ -22,6 +30,7 @@ def __init__(self, model_size, chosen_sample_rate): :param model_size: The size of the model to use for transcription. :param chosen_sample_rate: The sample rate chosen for recording. """ + CliInterface.print_info("Loading Whisper model: " + CliInterface.colorize(model_size, bold=True)) self.model = whisper.load_model(model_size) self.chosen_sample_rate = chosen_sample_rate self.processing_queue = Queue() @@ -58,7 +67,9 @@ def process_audio_chunks_queue(self): volume_db, ) = self.processing_queue.get() # Adjusted to include volume_db self.transcribe_audio_chunk(temp_file_path, volume_db) # Pass volume_db to the method - # CliInterface.print_processed_chunk(volume_db, len(audio_chunk)) + CliInterface.print_success( + "Processed audio chunk with volume {:.2f} dB and size {}.".format(volume_db, len(audio_chunk)) + ) else: time.sleep(0.1) # Sleep briefly to avoid busy waiting @@ -72,17 +83,17 @@ def transcribe_audio_chunk(self, temp_file_path, volume_db): attempt = 0 while attempt < MAX_RETRIES: try: - result = self.model.transcribe(temp_file_path, word_timestamps=True) + result = self.model.transcribe(temp_file_path, word_timestamps=True, language=LANGUAGE_CODE, prompt=PROMPT) os.remove(temp_file_path) # Clean up the temporary file self.append_transcription_result(result, volume_db) # Append result with volume break except Exception as e: CliInterface.print_error(e) - CliInterface.print_transcription_attempt(attempt + 1) + CliInterface.print_warning(f"Retrying transcription attempt {attempt + 1}...") time.sleep(1) # Adding delay between retries attempt += 1 if attempt == MAX_RETRIES: - CliInterface.print_transcription_failed() + CliInterface.print_error("Failed to transcribe audio chunk.") self.active_transcribing_tasks -= 1 @@ -136,7 +147,6 @@ def process_and_queue_chunk(self): wave_file.setframerate(self.chosen_sample_rate) wave_file.writeframes(self.audio_buffer) volume_db = self.process_audio_chunk_volume(self.audio_buffer) - CliInterface.print_processing_chunk(volume_db, len(self.audio_buffer)) self.processing_queue.put((self.audio_buffer, temp_file_path, volume_db)) # Ensure this matches the expected unpacking self.audio_buffer = bytes() # Clear the buffer for the next chunk @@ -160,19 +170,32 @@ def output_transcription_results(self): """ Output the full transcription results, including the full text and information about each word. """ - full_text = " ".join([result["text"] for result in self.transcription_results]) - words = [ - word - for result in self.transcription_results - for segment in result.get("segments", []) - for word in segment.get("words", []) - ] - output = {"full_text": full_text, "words": words} + if self.transcription_results.__len__() == 0: + CliInterface.print_warning("No transcription results to output.") + return + if EXPORT_RAW_TRANSCRIPTIONS: + output = self.transcription_results + else: + full_text = "".join([result["text"] for result in self.transcription_results]) + words = [ + word + for result in self.transcription_results + for segment in result.get("segments", []) + for word in segment.get("words", []) + ] + output = { + "full_text": full_text, + "words": words, + } + json_output = json.dumps(output, indent=4) if PRINT_TO_FILE: with open(OUTPUT_FILE_PATH, "w") as file: file.write(json_output) - CliInterface.print_output_path(OUTPUT_FILE_PATH) + CliInterface.print_info( + "Transcription results have been written to: " + CliInterface.colorize(OUTPUT_FILE_PATH, bold=True) + ) else: - CliInterface.print_output(json_output) + CliInterface.print_info("Transcription results:") + print(json_output) diff --git a/tests/test_audio_transcriber.py b/tests/test_audio_transcriber.py index ce5e779..21877b3 100644 --- a/tests/test_audio_transcriber.py +++ b/tests/test_audio_transcriber.py @@ -21,6 +21,7 @@ def audio_transcriber_mocked(): # Mock CliInterface.print_welcome cli_interface_mock = Mock(spec=CliInterface) + cli_interface_mock.colorize.return_value = "" # Mock setup_audio_device method to return dummy device index and sample rate device_index, chosen_sample_rate = 0, 44100 diff --git a/tests/test_audio_utils.py b/tests/test_audio_utils.py index eb6dde6..7a8e73f 100644 --- a/tests/test_audio_utils.py +++ b/tests/test_audio_utils.py @@ -47,7 +47,7 @@ def test_choose_audio_device(mocker, mock_pyaudio_instance): mocker.patch("builtins.input", return_value="1") device_index = choose_audio_device(mock_pyaudio_instance) # Check if the function returns the correct device index - assert device_index == 1 + assert device_index == 0 # Test to verify the find_supported_sample_rates function