Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
vxncius-dev committed Oct 31, 2024
0 parents commit e029ba2
Show file tree
Hide file tree
Showing 12 changed files with 1,000 additions and 0 deletions.
18 changes: 18 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Ignorar a pasta do ambiente virtual
venv/

# Ignorar arquivos de configuração do VS Code
.vscode/

# Ignorar cache do Python
__pycache__/
*.py[cod]

# Ignorar logs, arquivos temporários e de sistema
*.log
*.tmp
.DS_Store

# Ignorar arquivos de dependências que podem ser gerados automaticamente
*.env
.env
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# AudioVision

**AudioVision** is a web app designed to support individuals with hearing impairments by converting audio files into text and visual representations. Using audio processing and AI, the application not only transcribes audio but also generates graphical sound wave visualizations and provides insightful analyses of the transcribed content.

### Features

- **Audio Transcription**: Upload audio files in various formats, such as `.ogg` and `.wav`, to receive an accurate text transcription of the spoken content.
- **Waveform Visualization**: View a graphical representation of sound waves, offering a visual way to understand the characteristics of the audio.
- **Content Analysis**: Leverage AI to interpret the transcription, generating summaries and contextual insights tailored for enhanced clarity and accessibility.

### Technologies Used

- **Flask** for the application backend
- **Librosa and Matplotlib** for waveform visualizations
- **SpeechRecognition** for audio transcription
- **Google Generative AI** for intelligent summaries and contextual analysis of audio content

### How to Use

1. Upload an audio file on the main page.
2. The app converts the audio to text, displays the transcription, and shows a representative waveform.
3. Optionally, view an AI-driven analysis of the content for broader understanding of the transcribed audio.

> AudioVision was created to make auditory information more accessible through visual and textual formats, fostering inclusion and accessibility for individuals with hearing loss.
150 changes: 150 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from os import path, makedirs, remove, listdir
from uuid import uuid4
import speech_recognition as sr
from pydub import AudioSegment
from base64 import b64encode
from io import BytesIO
import matplotlib.pyplot as plt
from librosa import load
import librosa.display
import google.generativeai as genai
from werkzeug.utils import secure_filename

app = Flask(__name__)
CORS(app)
app.config["UPLOAD_FOLDER"] = "uploads/"
if not path.exists(app.config["UPLOAD_FOLDER"]):
makedirs(app.config["UPLOAD_FOLDER"])

gemini_api_key = ""
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")


@app.route("/")
def index():
return render_template("index.html")


@app.route("/transcribe", methods=["POST"])
def transcribe():
if "audio" not in request.files:
return jsonify({"error": "Nenhum arquivo foi enviado."}), 400

file = request.files["audio"]
if file.filename == "":
return jsonify({"error": "Nenhum arquivo selecionado."}), 400

filename = secure_filename(file.filename)
audio_path = path.join(app.config["UPLOAD_FOLDER"], filename)

try:
file.save(audio_path)
except Exception as e:
return jsonify({"error": "Erro ao salvar o arquivo.", "details": str(e)}), 500

try:
if filename.endswith(".ogg"):
audio = AudioSegment.from_ogg(audio_path)
wav_path = audio_path.replace(".ogg", ".wav")
audio.export(wav_path, format="wav")
if not path.exists(wav_path):
return jsonify({"error": "Erro na conversão do arquivo."}), 500
else:
wav_path = audio_path

transcription = transcribe_audio(wav_path)
plot_base64 = generate_waveform(wav_path)
unique_id = str(uuid4())

if path.exists(audio_path):
remove(audio_path)
if filename.endswith(".ogg") and path.exists(wav_path):
remove(wav_path)

response_data = {
"transcription": transcription,
"plot": plot_base64,
"id": unique_id,
}

clear_uploads()
return jsonify(response_data)

except Exception as e:
return (
jsonify({"error": "Erro ao processar o arquivo.", "details": str(e)}),
500,
)


@app.route("/analyze", methods=["POST"])
def analyze():
data = request.get_json()
transcription = data.get("transcription", "")
if not transcription:
return jsonify({"error": "Texto de transcrição ausente."}), 400

analysis_result = analyze_transcription(transcription)
return jsonify({"analysis": analysis_result})


def clear_uploads():
for filename in listdir(app.config["UPLOAD_FOLDER"]):
file_path = path.join(app.config["UPLOAD_FOLDER"], filename)
try:
if path.isfile(file_path):
remove(file_path)
except Exception as e:
print(f"Erro ao tentar remover o arquivo {file_path}: {e}")


def transcribe_audio(wav_path: str) -> str:
recognizer = sr.Recognizer()
with sr.AudioFile(wav_path) as source:
audio_data = recognizer.record(source)
try:
return recognizer.recognize_google(audio_data, language="pt-BR")
except sr.UnknownValueError:
return "Não foi possível entender o áudio."
except sr.RequestError as e:
return f"Erro ao conectar ao serviço de reconhecimento de fala: {e}"


def analyze_transcription(transcription: str) -> str:
prompt = (
f"Você é um assistente inteligente. O texto a seguir é uma transcrição de áudio "
f'que pode conter erros. Tente entender e resumir o conteúdo: "{transcription}"'
)
try:
response = model.generate_content(prompt)
if response and response.candidates:
candidate = response.candidates[0]
gemini_response = "".join(part.text for part in candidate.content.parts)
return gemini_response
else:
return "Desculpe, não consegui analisar a transcrição."
except Exception as e:
return f"Erro ao tentar analisar a transcrição: {e}"


def generate_waveform(audio_path: str) -> str:
y, sr = load(audio_path, sr=None)
plt.figure(figsize=(12, 4))
librosa.display.waveshow(y, sr=sr, color="white")
plt.axis("off")
buf = BytesIO()
plt.tight_layout()
plt.savefig(
buf, format="png", bbox_inches="tight", pad_inches=0.1, transparent=True
)
plt.close()
buf.seek(0)
return f"data:image/png;base64,{b64encode(buf.read()).decode('utf-8')}"


if __name__ == "__main__":
app.run(debug=True, port=5001)
Binary file added requirements.txt
Binary file not shown.
Binary file added static/assets/Poppins-Regular.ttf
Binary file not shown.
Binary file added static/assets/cursor.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added static/assets/gemini.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added static/assets/github2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added static/assets/icon.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added static/assets/swipe.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit e029ba2

Please sign in to comment.