diff --git a/app.py b/app.py index 26588ae..0e8760b 100644 --- a/app.py +++ b/app.py @@ -7,6 +7,10 @@ import jiwer import eng_to_ipa as p from fuzzywuzzy import fuzz +import librosa +import numpy as np +import soundfile as sf +import noisereduce as nr app = Flask(__name__) @@ -67,6 +71,126 @@ "y", "a", "x", "c" ] +@app.route('/audio_processing', methods=['POST']) +def home(): + data = request.json + if data: + audio_base64 = data.get('audio_base64') + if audio_base64: + # Convert base64 audio to audio data + audio_data = base64.b64decode(audio_base64) + audio_io = BytesIO(audio_data) + + # Proceed with existing process + denoised_audio, sample_rate, initial_snr, final_snr = denoise_audio(audio_io, speed_factor=0.75) + denoised_audio_base64 = convert_to_base64(denoised_audio, sample_rate) + + # Delete audio data from cache + del audio_data + del audio_io + + return jsonify({"denoised_audio_base64": denoised_audio_base64}), 200 + else: + return jsonify({"error": "Missing audio_base64 parameter."}), 400 + else: + return jsonify({"error": "No data received."}), 400 + +def calculate_snr(audio, sr): + n_fft = min(len(audio), 2048) # Ensure n_fft does not exceed the length of the audio + stft = librosa.stft(audio, n_fft=n_fft) + power = np.abs(stft)**2 + + mel_spectrogram = librosa.feature.melspectrogram(S=power, sr=sr) + mel_power = np.mean(mel_spectrogram, axis=0) + + energy_threshold = np.mean(mel_power) + speech_indices = mel_power > energy_threshold + noise_indices = ~speech_indices + + signal_power = np.mean(power[:, speech_indices], axis=1) + average_signal_power = np.mean(signal_power) if signal_power.size > 0 else 0 + + noise_power = np.mean(power[:, noise_indices], axis=1) + average_noise_power = np.mean(noise_power) if noise_power.size > 0 else 1e-10 + + snr = 10 * np.log10(average_signal_power / average_noise_power) if average_signal_power > 0 else 0 + return snr + +def estimate_noise_floor(audio, sr, frame_length=None, hop_length=512): + frame_length = frame_length or min(len(audio), 2048) + stft = librosa.stft(audio, n_fft=frame_length, hop_length=hop_length) + power_spectrogram = np.abs(stft)**2 + energy = np.sum(power_spectrogram, axis=0) + + low_energy_threshold = np.percentile(energy, 10) + very_low_energy = energy[energy <= low_energy_threshold] + + adaptive_percentile = 5 if len(very_low_energy) < len(energy) * 0.1 else 10 + noise_floor = np.percentile(energy, adaptive_percentile) + + return noise_floor + +def denoise_audio(filepath, speed_factor=1.0): + audio, sample_rate = librosa.load(filepath, sr=None) + + # Apply time stretching first if the speed factor is not 1.0 + if speed_factor != 1.0: + audio = librosa.effects.time_stretch(audio, rate=speed_factor) + + # Calculate initial full audio SNR + initial_snr = calculate_snr(audio, sample_rate) + + # Improved VAD + vad_intervals = librosa.effects.split(audio, top_db=20) + noise_floor = estimate_noise_floor(audio, sample_rate) + + noise_reduced_audio = np.copy(audio) + improved_intervals = False # Flag to track if any intervals improved SNR + + for interval in vad_intervals: + interval_audio = audio[interval[0]:interval[1]] + interval_snr = calculate_snr(interval_audio, sample_rate) + + # Determine reduction intensity based on initial SNR + reduction_intensity = determine_reduction_intensity(initial_snr) + + # Apply noise reduction + reduced_interval_audio = nr.reduce_noise(y=interval_audio, sr=sample_rate, prop_decrease=reduction_intensity) + + # Calculate SNR after noise reduction + reduced_interval_snr = calculate_snr(reduced_interval_audio, sample_rate) + if reduced_interval_snr > interval_snr: + noise_reduced_audio[interval[0]:interval[1]] = reduced_interval_audio + improved_intervals = True + else: + print("No SNR improvement; keeping original audio for this interval.") + + # Calculate final SNR and decide which version to use based on SNR comparison + final_snr = calculate_snr(noise_reduced_audio, sample_rate) + if not improved_intervals or final_snr < initial_snr: + final_snr = initial_snr # Revert to original SNR if no improvement + noise_reduced_audio = audio # Revert to original audio + + normalized_audio = librosa.util.normalize(noise_reduced_audio) + return normalized_audio, sample_rate, initial_snr, final_snr + +def determine_reduction_intensity(snr): + if snr < 10: + return 0.7 + elif snr < 15: + return 0.5 + elif snr < 20: + return 0.22 + elif snr >= 30: + return 0.1 + return 0.1 # Default to the least aggressive reduction if no specific conditions are met + +def convert_to_base64(audio_data, sample_rate): + buffer = io.BytesIO() + sf.write(buffer, audio_data, sample_rate, format='wav') + buffer.seek(0) + base64_audio = base64.b64encode(buffer.read()).decode('utf-8') + return base64_audio def get_error_arrays(alignments, reference, hypothesis, base64string): insertion = [] diff --git a/requirements.txt b/requirements.txt index e08c250..c18717e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,14 +1,55 @@ +audioread==3.0.1 blinker==1.7.0 +certifi==2024.2.2 +cffi==1.16.0 +charset-normalizer==3.3.2 click==8.1.7 colorama==0.4.6 +contourpy==1.1.1 +cycler==0.12.1 +decorator==5.1.1 eng-to-ipa==0.0.2 -Flask==3.0.0 +flask==3.0.0 +fonttools==4.51.0 fuzzywuzzy==0.18.0 +idna==3.7 +importlib-metadata==7.1.0 +importlib-resources==6.4.0 itsdangerous==2.1.2 Jinja2==3.1.2 jiwer==3.0.3 +joblib==1.4.0 +kiwisolver==1.4.5 +lazy-loader==0.4 Levenshtein==0.24.0 +librosa==0.10.1 +llvmlite==0.41.1 MarkupSafe==2.1.3 +matplotlib==3.7.5 +msgpack==1.0.8 +noisereduce==3.0.2 +numba==0.58.1 +numpy==1.24.4 +packaging==24.0 +pillow==10.3.0 +platformdirs==4.2.0 +pooch==1.8.1 +pycparser==2.22 pydub==0.25.1 +pyparsing==3.1.2 +python-dateutil==2.9.0.post0 +pytz==2024.1 rapidfuzz==3.6.1 -Werkzeug==3.0.1 +requests==2.31.0 +scikit-learn==1.3.2 +scipy==1.10.1 +six==1.16.0 +soundfile==0.12.1 +soxr==0.3.7 +threadpoolctl==3.4.0 +tqdm==4.66.2 +typing-extensions==4.11.0 +tzdata==2024.1 +urllib3==2.2.1 +werkzeug==3.0.1 +zipp==3.18.1