diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9d022a5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +# Ignorar a pasta do ambiente virtual +venv/ + +# Ignorar arquivos de configuração do VS Code +.vscode/ + +# Ignorar cache do Python +__pycache__/ +*.py[cod] + +# Ignorar logs, arquivos temporários e de sistema +*.log +*.tmp +.DS_Store + +# Ignorar arquivos de dependências que podem ser gerados automaticamente +*.env +.env diff --git a/README.md b/README.md new file mode 100644 index 0000000..7ff9c4b --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# AudioVision + +**AudioVision** is a web app designed to support individuals with hearing impairments by converting audio files into text and visual representations. Using audio processing and AI, the application not only transcribes audio but also generates graphical sound wave visualizations and provides insightful analyses of the transcribed content. + +### Features + +- **Audio Transcription**: Upload audio files in various formats, such as `.ogg` and `.wav`, to receive an accurate text transcription of the spoken content. +- **Waveform Visualization**: View a graphical representation of sound waves, offering a visual way to understand the characteristics of the audio. +- **Content Analysis**: Leverage AI to interpret the transcription, generating summaries and contextual insights tailored for enhanced clarity and accessibility. + +### Technologies Used + +- **Flask** for the application backend +- **Librosa and Matplotlib** for waveform visualizations +- **SpeechRecognition** for audio transcription +- **Google Generative AI** for intelligent summaries and contextual analysis of audio content + +### How to Use + +1. Upload an audio file on the main page. +2. The app converts the audio to text, displays the transcription, and shows a representative waveform. +3. Optionally, view an AI-driven analysis of the content for broader understanding of the transcribed audio. 
#!/usr/bin/env python3
"""AudioVision backend: accepts audio uploads, transcribes them (Google Speech
Recognition via the SpeechRecognition package), renders a waveform image, and
optionally summarizes the transcription with Google Generative AI (Gemini)."""
from flask import Flask, request, jsonify, render_template
from flask_cors import CORS
from os import path, makedirs, remove, listdir, environ
from uuid import uuid4
import speech_recognition as sr
from pydub import AudioSegment
from base64 import b64encode
from io import BytesIO
import matplotlib.pyplot as plt
from librosa import load
import librosa.display
import google.generativeai as genai
from werkzeug.utils import secure_filename

app = Flask(__name__)
CORS(app)
app.config["UPLOAD_FOLDER"] = "uploads/"
# exist_ok avoids the check-then-create race of the original
# `if not path.exists(...): makedirs(...)` pair.
makedirs(app.config["UPLOAD_FOLDER"], exist_ok=True)

# SECURITY FIX: the key was hard-coded as "" in source control. Read it from
# the environment instead; an empty fallback keeps startup behavior identical
# when the variable is unset.
gemini_api_key = environ.get("GEMINI_API_KEY", "")
genai.configure(api_key=gemini_api_key)
model = genai.GenerativeModel("gemini-1.5-flash")


@app.route("/")
def index():
    """Serve the single-page front end."""
    return render_template("index.html")


@app.route("/transcribe", methods=["POST"])
def transcribe():
    """Handle an audio upload: save it, convert OGG to WAV if needed, then
    return JSON with the transcription, a base64 waveform PNG and a UUID.

    Errors are reported as JSON with an ``error`` key and a 4xx/5xx status.
    """
    if "audio" not in request.files:
        return jsonify({"error": "Nenhum arquivo foi enviado."}), 400

    file = request.files["audio"]
    if file.filename == "":
        return jsonify({"error": "Nenhum arquivo selecionado."}), 400

    filename = secure_filename(file.filename)
    # BUGFIX: secure_filename() may strip a hostile name down to "" (e.g.
    # "../.."); the original would then try to save onto the upload folder
    # path itself.
    if not filename:
        return jsonify({"error": "Nenhum arquivo selecionado."}), 400
    audio_path = path.join(app.config["UPLOAD_FOLDER"], filename)

    try:
        file.save(audio_path)
    except Exception as e:
        return jsonify({"error": "Erro ao salvar o arquivo.", "details": str(e)}), 500

    try:
        if filename.endswith(".ogg"):
            # SpeechRecognition cannot read OGG directly, so convert first.
            audio = AudioSegment.from_ogg(audio_path)
            # BUGFIX: was audio_path.replace(".ogg", ".wav"), which replaces
            # EVERY ".ogg" occurrence anywhere in the path; splitext swaps
            # only the final extension.
            wav_path = path.splitext(audio_path)[0] + ".wav"
            audio.export(wav_path, format="wav")
            if not path.exists(wav_path):
                return jsonify({"error": "Erro na conversão do arquivo."}), 500
        else:
            wav_path = audio_path

        transcription = transcribe_audio(wav_path)
        plot_base64 = generate_waveform(wav_path)
        unique_id = str(uuid4())

        return jsonify(
            {
                "transcription": transcription,
                "plot": plot_base64,
                "id": unique_id,
            }
        )

    except Exception as e:
        return (
            jsonify({"error": "Erro ao processar o arquivo.", "details": str(e)}),
            500,
        )
    finally:
        # BUGFIX: the original deleted the upload and called clear_uploads()
        # only on the success path, so any exception leaked files in
        # uploads/. clear_uploads() removes every file there (including this
        # request's audio and converted WAV), so run it unconditionally.
        clear_uploads()


@app.route("/analyze", methods=["POST"])
def analyze():
    """Summarize a transcription with Gemini; expects JSON {"transcription": str}."""
    data = request.get_json()
    transcription = data.get("transcription", "")
    if not transcription:
        return jsonify({"error": "Texto de transcrição ausente."}), 400

    analysis_result = analyze_transcription(transcription)
    return jsonify({"analysis": analysis_result})


def clear_uploads():
    """Best-effort removal of every regular file in the upload folder.

    Failures are logged and ignored so cleanup never breaks a response.
    NOTE(review): this also deletes files belonging to concurrent requests —
    acceptable for a single-user demo, revisit before multi-user deployment.
    """
    for filename in listdir(app.config["UPLOAD_FOLDER"]):
        file_path = path.join(app.config["UPLOAD_FOLDER"], filename)
        try:
            if path.isfile(file_path):
                remove(file_path)
        except Exception as e:
            print(f"Erro ao tentar remover o arquivo {file_path}: {e}")


def transcribe_audio(wav_path: str) -> str:
    """Transcribe a WAV file to Brazilian-Portuguese text.

    Recognition failures are returned as human-readable strings rather than
    raised, so the /transcribe route still responds with a 200.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(wav_path) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data, language="pt-BR")
    except sr.UnknownValueError:
        return "Não foi possível entender o áudio."
    except sr.RequestError as e:
        return f"Erro ao conectar ao serviço de reconhecimento de fala: {e}"


def analyze_transcription(transcription: str) -> str:
    """Ask Gemini to interpret and summarize a (possibly noisy) transcription."""
    prompt = (
        "Você é um assistente inteligente. O texto a seguir é uma transcrição de áudio "
        f'que pode conter erros. Tente entender e resumir o conteúdo: "{transcription}"'
    )
    try:
        response = model.generate_content(prompt)
        if response and response.candidates:
            candidate = response.candidates[0]
            # Concatenate all text parts of the first candidate.
            return "".join(part.text for part in candidate.content.parts)
        return "Desculpe, não consegui analisar a transcrição."
    except Exception as e:
        return f"Erro ao tentar analisar a transcrição: {e}"


def generate_waveform(audio_path: str) -> str:
    """Render the file's waveform as a transparent PNG and return a data: URI."""
    # Renamed the local from `sr` to avoid shadowing the speech_recognition
    # module alias imported at the top of the file.
    samples, sample_rate = load(audio_path, sr=None)
    plt.figure(figsize=(12, 4))
    librosa.display.waveshow(samples, sr=sample_rate, color="white")
    plt.axis("off")
    buf = BytesIO()
    plt.tight_layout()
    plt.savefig(
        buf, format="png", bbox_inches="tight", pad_inches=0.1, transparent=True
    )
    plt.close()
    buf.seek(0)
    return f"data:image/png;base64,{b64encode(buf.read()).decode('utf-8')}"


if __name__ == "__main__":
    app.run(debug=True, port=5001)
b/static/assets/swipe.png new file mode 100644 index 0000000..617d3d8 Binary files /dev/null and b/static/assets/swipe.png differ diff --git a/static/style.css b/static/style.css new file mode 100644 index 0000000..18c9462 --- /dev/null +++ b/static/style.css @@ -0,0 +1,347 @@ +@font-face { + font-family: "Poppins"; + src: url("./assets/Poppins-Regular.ttf"); +} + +:root { + --minSize: 400px; +} + +html { + color-scheme: dark; + font-family: "Malgun Gothic"; + scroll-behavior: smooth; +} + +::selection { + background: #333; +} + +* { + padding: 0; + margin: 0; + box-sizing: border-box; + list-style: none; + border: none; + outline: none; + font-family: "Poppins", "Malgun Gothic"; + cursor: url("./assets/cursor.png"), auto; +} + +body { + min-width: var(--minSize); + min-height: 100vh; + background: #181a1b; + overflow: hidden; + scroll-snap-type: y mandatory; +} + +::-webkit-scrollbar { + display: none; +} + +section { + padding-inline: 40px; + min-width: var(--minSize); + height: 100vh; + scroll-snap-align: start; + overflow: hidden; + position: relative; +} + +.home { + display: flex; + flex-direction: column; + justify-content: space-between; +} + +.content { + max-width: 740px; + margin-inline: auto; + min-height: 500px; +} + +.home .content { + display: grid; + place-content: center; + height: 100%; + padding-bottom: 30px; +} + +.home nav *::selection { + background: none; +} + +.home h1 { + max-width: 700px; + min-width: 300px; + width: 54vw; + font-size: clamp(2em, 3.5em, 4vw); + line-height: 1em; + margin-bottom: 7px; +} + +.home .button_upload { + margin-top: 10px; + max-width: 100px; +} + +small { + text-align: center; + margin-bottom: 40px; +} + +i { + font-size: .8em; + min-width: 300px; +} + +.gradient_container { + width: 70vw; + height: 100vh; + display: flex; + flex-wrap: wrap; + animation: rtt 180s linear infinite; + position: absolute; + left: -190px; + bottom: -480px; + z-index: -1; +} + +@keyframes rtt { + to { + transform: rotate(360deg); + } 
+} + +.box { + width: 50%; + height: 50%; + filter: blur(150px); +} + +.box:nth-child(1) { + background: #4f4954; +} + +.box:nth-child(2) { + background: rgb(79, 78, 84); +} + +.box:nth-child(3) { + background: #3d383e; +} + +.box:nth-child(4) { + background: #60636a; +} + +nav { + min-width: var(--minSize); + padding: 40px 10px 10px; + display: flex; + justify-content: space-between; + align-items: center; + flex-wrap: wrap; + gap: 7px; +} + +.button_upload { + background: #ffffff; + color: #111; + border-radius: 20px; + padding: 5px 14px; + font-size: .9em; + font-weight: bold; + white-space: nowrap; + height: 32px; + text-align: center; +} + +.button_upload:active{ + scale: .98; +} + +.button_upload_loading { + background: + linear-gradient(90deg, #0001 33%, #0005 50%, #0001 66%) #f2f2f2; + background-size: 300% 100%; + animation: l1 1s infinite linear; +} + +@keyframes l1 { + 0% { + background-position: right + } +} + +.home nav { + justify-content: start; +} + +.size { + margin: 10px; + font-size: .8em; + height: 30px; +} + +ul { + min-width: var(--minSize); + display: flex; + flex-direction: column; + overflow-y: auto; + height: 80vh; + margin-top: 15px; +} + +li { + padding: 20px 30px; + display: flex; + flex-direction: column; + gap: 10px; +} + +li:hover { + border: none; + background: #333; + border-radius: 20px; +} + +.top { + width: 100%; + display: flex; + align-items: center; + justify-content: space-between; + gap: 7px; + flex-wrap: wrap; +} + +.top button { + opacity: .4; + height: 18px; + width: 18px; + background: none; +} + +.top button:hover { + opacity: 1; +} + +.audio { + display: flex; + align-items: center; + margin-left: -7px; + background: #212121; + width: 230px; + height: 45px; + border-radius: 30px; + padding: 3px 12px; +} + +.audio button { + background: none; + font-size: 1.2em; + width: 30px; + height: 30px; + margin-top: -3px; +} + +.audioControl { + display: flex; + flex-direction: column; + gap: 7px; +} + +.imgControl { + position: 
relative; + width: 180px; + height: 30px; + border: none; +} + +.imgControl * { + border: none; +} + +.img-comp-img { + position: absolute; + top: 0; + overflow: hidden; +} + +.control { + opacity: .5; +} + +.img-comp-slider { + --size: 10px; + position: absolute; + top: 50%; + transform: translateY(-50%); + z-index: 9; + cursor: url("./assets/swipe.png"), auto; + width: var(--size); + height: var(--size); + background: #53bdeb; + border-radius: 10px; +} + +.audioControl span { + font-size: .8em; + font-weight: bold; +} + +.result::first-letter { + text-transform: uppercase; +} + +.GeminiArea { + display: flex; + flex-direction: column; + text-align: left; + align-items: start; +} + +.GeminiArea span{ + font-weight: normal; + font-style: normal; + display: inline; + font-size: .9em; +} + +.aiContext { + border-left: 4px solid #666; + padding-left: 10px; + margin: 6px 0 10px; + font-size: .9em; +} + +.modelName { + transition: .3s; + font-style: italic; + color: #fff; + text-decoration: none; + font-size: .9em; + display: inline-flex; + align-items: center; + gap: 7px; +} + +.audioNav{ + display: flex; + align-items: center; + gap: 5px; +} + +.gem{ + width: 40px; + height: 40px; + border-radius: 50px; + display: grid; + place-content: center; + background: #212121; +} + +.gem img{ + image-rendering: crisp-edges; +} \ No newline at end of file diff --git a/templates/index.html b/templates/index.html new file mode 100644 index 0000000..ed1e938 --- /dev/null +++ b/templates/index.html @@ -0,0 +1,461 @@ + + + + + + + AudioVision + + + + + +
+ +
+

Convert sound to meaning with one click.

+ — Capture the sound and watch your words come to life. + + +
+ you can drop an audio file anywhere on the page +
+ + + + +
+
+
+
+ + + +
+
+ + + + + \ No newline at end of file