-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit e029ba2
Showing
12 changed files
with
1,000 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Ignorar a pasta do ambiente virtual | ||
venv/ | ||
|
||
# Ignorar arquivos de configuração do VS Code | ||
.vscode/ | ||
|
||
# Ignorar cache do Python | ||
__pycache__/ | ||
*.py[cod] | ||
|
||
# Ignorar logs, arquivos temporários e de sistema | ||
*.log | ||
*.tmp | ||
.DS_Store | ||
|
||
# Ignore environment-variable files (they may contain secrets such as API keys) | ||
*.env | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# AudioVision | ||
|
||
**AudioVision** is a web app designed to support individuals with hearing impairments by converting audio files into text and visual representations. Using audio processing and AI, the application not only transcribes audio but also generates graphical sound wave visualizations and provides insightful analyses of the transcribed content. | ||
|
||
### Features | ||
|
||
- **Audio Transcription**: Upload audio files in various formats, such as `.ogg` and `.wav`, to receive an accurate text transcription of the spoken content. | ||
- **Waveform Visualization**: View a graphical representation of sound waves, offering a visual way to understand the characteristics of the audio. | ||
- **Content Analysis**: Leverage AI to interpret the transcription, generating summaries and contextual insights tailored for enhanced clarity and accessibility. | ||
|
||
### Technologies Used | ||
|
||
- **Flask** for the application backend | ||
- **Librosa and Matplotlib** for waveform visualizations | ||
- **SpeechRecognition** for audio transcription | ||
- **Google Generative AI** for intelligent summaries and contextual analysis of audio content | ||
|
||
### How to Use | ||
|
||
1. Upload an audio file on the main page. | ||
2. The app converts the audio to text, displays the transcription, and shows a representative waveform. | ||
3. Optionally, view an AI-driven analysis of the content for a broader understanding of the transcribed audio. | ||
|
||
> AudioVision was created to make auditory information more accessible through visual and textual formats, fostering inclusion and accessibility for individuals with hearing loss. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,150 @@ | ||
#!/usr/bin/env python3 | ||
from base64 import b64encode
from io import BytesIO
from os import environ, path, makedirs, remove, listdir
from uuid import uuid4

import google.generativeai as genai
import librosa.display
import matplotlib.pyplot as plt
import speech_recognition as sr
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from librosa import load
from pydub import AudioSegment
from werkzeug.utils import secure_filename
|
||
# Application object; CORS is enabled for every route.
app = Flask(__name__)
CORS(app)

# Uploaded audio is staged in this folder while a request is being processed.
app.config["UPLOAD_FOLDER"] = "uploads/"
# exist_ok=True avoids the check-then-create race when several processes start
# at the same time.
makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# The Gemini key is read from the environment so it never has to be committed.
# When GEMINI_API_KEY is unset this falls back to the empty string that was
# previously hard-coded here.
gemini_api_key = environ.get("GEMINI_API_KEY", "")
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")
|
||
|
||
@app.route("/")
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template("index.html")
    return page
|
||
|
||
def _discard_temp_file(file_path):
    """Delete a per-request temporary file, tolerating one that was never created."""
    try:
        remove(file_path)
    except FileNotFoundError:
        pass  # nothing was written to this path, so there is nothing to clean up
    except OSError as e:
        app.logger.warning("Erro ao tentar remover o arquivo %s: %s", file_path, e)


@app.route("/transcribe", methods=["POST"])
def transcribe():
    """Transcribe an uploaded audio file and render its waveform.

    Expects a multipart upload with the file in the ``audio`` field. ``.ogg``
    uploads (matched case-insensitively) are converted to WAV first; any other
    file is handed to the transcriber unchanged.

    Returns:
        On success, a JSON object with ``transcription``, ``plot`` (a PNG data
        URI) and ``id`` (a fresh UUID string). On failure, a JSON object with
        ``error`` (and ``details`` where available) plus status 400 when no
        file was supplied or 500 when saving, converting or processing failed.
    """
    if "audio" not in request.files:
        return jsonify({"error": "Nenhum arquivo foi enviado."}), 400

    file = request.files["audio"]
    if file.filename == "":
        return jsonify({"error": "Nenhum arquivo selecionado."}), 400

    # Only the (sanitized) extension of the client's name is kept. Each request
    # stores its files under its own random stem, so concurrent uploads with the
    # same file name can no longer overwrite or delete each other's data.
    extension = path.splitext(secure_filename(file.filename))[1].lower()
    upload_dir = app.config["UPLOAD_FOLDER"]
    stem = uuid4().hex
    audio_path = path.join(upload_dir, f"{stem}{extension}")
    temp_paths = [audio_path]

    try:
        try:
            file.save(audio_path)
        except Exception as e:
            return (
                jsonify({"error": "Erro ao salvar o arquivo.", "details": str(e)}),
                500,
            )

        try:
            if extension == ".ogg":
                wav_path = path.join(upload_dir, f"{stem}.wav")
                temp_paths.append(wav_path)
                AudioSegment.from_ogg(audio_path).export(wav_path, format="wav")
                if not path.exists(wav_path):
                    return jsonify({"error": "Erro na conversão do arquivo."}), 500
            else:
                wav_path = audio_path

            transcription = transcribe_audio(wav_path)
            plot_base64 = generate_waveform(wav_path)
        except Exception as e:
            return (
                jsonify({"error": "Erro ao processar o arquivo.", "details": str(e)}),
                500,
            )

        return jsonify(
            {
                "transcription": transcription,
                "plot": plot_base64,
                "id": str(uuid4()),
            }
        )
    finally:
        # Runs on every exit path, so this request's files are removed even when
        # processing fails. Only this request's own files are touched; the
        # folder-wide wipe is no longer needed and would race with other uploads.
        for temp_path in temp_paths:
            _discard_temp_file(temp_path)
|
||
|
||
@app.route("/analyze", methods=["POST"])
def analyze():
    """Return an AI-generated analysis of a transcription.

    Expects a JSON object body with a non-empty string under ``transcription``.

    Returns:
        A JSON object with ``analysis`` on success, or a JSON object with
        ``error`` and status 400 when the body is missing, is not a JSON
        object, or carries no usable ``transcription`` text.
    """
    # silent=True yields None instead of raising on a missing or malformed JSON
    # body, so bad requests get the 400 below rather than an unhandled error.
    data = request.get_json(silent=True)
    transcription = data.get("transcription", "") if isinstance(data, dict) else ""
    if not isinstance(transcription, str) or not transcription:
        return jsonify({"error": "Texto de transcrição ausente."}), 400

    analysis_result = analyze_transcription(transcription)
    return jsonify({"analysis": analysis_result})
|
||
|
||
def clear_uploads():
    """Delete every regular file in the upload folder.

    Entries that are not regular files are skipped. A failure on one entry is
    printed and does not stop the remaining entries from being processed.
    """
    upload_dir = app.config["UPLOAD_FOLDER"]
    for entry in listdir(upload_dir):
        file_path = path.join(upload_dir, entry)
        try:
            if not path.isfile(file_path):
                continue
            remove(file_path)
        except Exception as e:
            print(f"Erro ao tentar remover o arquivo {file_path}: {e}")
|
||
|
||
def transcribe_audio(wav_path: str) -> str:
    """Transcribe the audio file at ``wav_path`` as Brazilian Portuguese.

    The whole file is read and sent to the recognizer's Google backend with
    ``language="pt-BR"``.

    Returns:
        The recognized text. If the recognizer raises ``UnknownValueError`` or
        ``RequestError``, a user-facing Portuguese message is returned instead
        of propagating the exception.
    """
    engine = sr.Recognizer()
    with sr.AudioFile(wav_path) as wav_source:
        captured = engine.record(wav_source)

    try:
        text = engine.recognize_google(captured, language="pt-BR")
    except sr.UnknownValueError:
        return "Não foi possível entender o áudio."
    except sr.RequestError as err:
        return f"Erro ao conectar ao serviço de reconhecimento de fala: {err}"
    return text
|
||
|
||
def analyze_transcription(transcription: str) -> str:
    """Ask the generative model to interpret and summarize a transcription.

    Returns:
        The text parts of the first response candidate joined together, an
        apology message when the response has no candidates, or an error
        message embedding the exception text when the call raises.
    """
    prompt = (
        "Você é um assistente inteligente. O texto a seguir é uma transcrição de áudio "
        f'que pode conter erros. Tente entender e resumir o conteúdo: "{transcription}"'
    )
    try:
        response = model.generate_content(prompt)
        if not (response and response.candidates):
            return "Desculpe, não consegui analisar a transcrição."
        first_candidate = response.candidates[0]
        pieces = [part.text for part in first_candidate.content.parts]
        return "".join(pieces)
    except Exception as e:
        return f"Erro ao tentar analisar a transcrição: {e}"
|
||
|
||
def generate_waveform(audio_path: str) -> str:
    """Render the waveform of an audio file as a base64 PNG data URI.

    The audio is loaded at its native sampling rate (``sr=None``) and drawn in
    white on a transparent background with the axes hidden.

    Args:
        audio_path: Path of the audio file to plot.

    Returns:
        A ``data:image/png;base64,...`` string containing the rendered image.
    """
    samples, sample_rate = load(audio_path, sr=None)
    # An explicit figure/axes pair avoids relying on pyplot's implicit "current
    # figure", and the finally block releases the figure even when plotting or
    # saving raises.
    fig, ax = plt.subplots(figsize=(12, 4))
    try:
        librosa.display.waveshow(samples, sr=sample_rate, color="white", ax=ax)
        ax.axis("off")
        fig.tight_layout()
        buf = BytesIO()
        fig.savefig(
            buf, format="png", bbox_inches="tight", pad_inches=0.1, transparent=True
        )
    finally:
        plt.close(fig)
    return f"data:image/png;base64,{b64encode(buf.getvalue()).decode('utf-8')}"
|
||
|
||
if __name__ == "__main__":
    # Debug mode enables the interactive Werkzeug debugger, which lets anyone
    # who can reach the server execute code. It is therefore opt-in via
    # FLASK_DEBUG=1 (or "true"/"yes") instead of always being on.
    debug_enabled = environ.get("FLASK_DEBUG", "").lower() in {"1", "true", "yes"}
    app.run(debug=debug_enabled, port=5001)
Binary file not shown.
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.