-
Notifications
You must be signed in to change notification settings - Fork 0
[Feat] voice to text by stt #6
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
b3a91f0
6ca42ff
af2290c
04e7f0d
990df49
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,3 @@ | ||
| from dotenv import load_dotenv # type: ignore | ||
| load_dotenv() | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,11 @@ | ||
| import os | ||
|
|
||
| # Default base prefix for uploads (can be overridden via the S3_PREFIX environment variable) | ||
| VOICE_BASE_PREFIX = os.getenv("S3_PREFIX", "voices") | ||
|
|
||
| # Default folder name (used when the request does not specify a folder) | ||
| DEFAULT_UPLOAD_FOLDER = "voiceFile" | ||
|
|
||
| # # Define a set of allowed folders if needed (e.g. for validation) | ||
| # ALLOWED_FOLDERS = {"raw", "processed", "public"} | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,117 @@ | ||
| import io | ||
| import os | ||
| import tempfile | ||
| from typing import Dict, Any | ||
| import librosa | ||
| import torch | ||
| from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor | ||
| import numpy as np | ||
|
|
||
|
|
||
class EmotionAnalyzer:
    """Speech-emotion classifier backed by a pretrained Wav2Vec2 checkpoint.

    The model and its feature extractor are loaded once, at construction
    time. If loading fails, both attributes stay ``None`` and
    ``analyze_emotion`` returns an error payload instead of raising.
    """

    def __init__(self):
        self.model = None
        self.feature_extractor = None
        # Use CUDA when torch reports it as available, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_model()

    def _load_model(self):
        """Load the Hugging Face model and feature extractor onto ``self.device``.

        Any failure is printed and leaves ``self.model`` and
        ``self.feature_extractor`` set to ``None``.
        """
        model_name = "jungjongho/wav2vec2-xlsr-korean-speech-emotion-recognition"

        try:
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            self.model.to(self.device)
            self.model.eval()
        except Exception as e:
            print(f"λͺ¨λΈ λ‘λ μ€ν¨: {e}")
            self.model = None
            self.feature_extractor = None

    def analyze_emotion(self, audio_file) -> Dict[str, Any]:
        """Analyze the emotion expressed in a voice recording.

        Args:
            audio_file: Uploaded audio file (FastAPI ``UploadFile``); its
                ``.file`` attribute is read from the current position.

        Returns:
            On success, a dict with ``emotion``, ``confidence``,
            ``emotion_scores``, ``audio_duration`` and ``sample_rate``.
            On any failure (model not loaded, unreadable upload, decoding or
            inference error), a dict with ``error``, ``emotion="unknown"``
            and ``confidence=0.0``. This method does not raise for those
            failures.
        """
        if self.model is None or self.feature_extractor is None:
            return {
                "error": "λͺ¨λΈμ΄ λ‘λλμ§ μμμ΅λλ€",
                "emotion": "unknown",
                "confidence": 0.0
            }

        # Bound before the ``try`` so the ``finally`` clause can always test
        # it, even when reading the upload fails before a path was recorded.
        tmp_file_path = None
        try:
            # Spool the upload to a temporary file that librosa can open by path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                # Record the path first: with delete=False the file already
                # exists on disk and must be removed even if the read fails.
                tmp_file_path = tmp_file.name
                content = audio_file.file.read()
                tmp_file.write(content)

            # Load the audio (resampled to 16 kHz).
            audio, sr = librosa.load(tmp_file_path, sr=16000)

            # Feature extraction.
            inputs = self.feature_extractor(
                audio,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )

            # Move the input tensors to the model's device.
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Inference.
            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # Emotion labels (may need adjusting to match the model).
            emotion_labels = ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]

            # Emotion with the highest probability.
            predicted_class = torch.argmax(predictions, dim=-1).item()
            confidence = predictions[0][predicted_class].item()
            emotion = emotion_labels[predicted_class] if predicted_class < len(emotion_labels) else "unknown"

            # Probability of every labelled emotion.
            emotion_scores = {
                emotion_labels[i]: predictions[0][i].item()
                for i in range(min(len(emotion_labels), predictions.shape[1]))
            }

            return {
                "emotion": emotion,
                "confidence": confidence,
                "emotion_scores": emotion_scores,
                "audio_duration": len(audio) / sr,
                "sample_rate": sr
            }

        except Exception as e:
            return {
                "error": f"λΆμ μ€ μ€λ₯ λ°μ: {str(e)}",
                "emotion": "unknown",
                "confidence": 0.0
            }
        finally:
            # Clean up the temporary file, if one was created.
            if tmp_file_path is not None:
                try:
                    os.unlink(tmp_file_path)
                except OSError as e:
                    print(f"μμ νμΌ μμ μ€ν¨: {tmp_file_path}, μ€λ₯: {e}")
|
|
||
|
|
||
| # Global instance | ||
| emotion_analyzer = EmotionAnalyzer() | ||
|
|
||
|
|
||
def analyze_voice_emotion(audio_file) -> Dict[str, Any]:
    """Run voice-emotion analysis on ``audio_file``.

    Module-level convenience wrapper that delegates to the shared
    ``emotion_analyzer`` instance and returns its result unchanged.
    """
    result = emotion_analyzer.analyze_emotion(audio_file)
    return result
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,7 +1,144 @@ | ||
| from fastapi import FastAPI | ||
| import os | ||
| from typing import Optional | ||
| from fastapi import FastAPI, UploadFile, File, HTTPException, Form | ||
| from fastapi.responses import JSONResponse | ||
| from typing import List | ||
| from .s3_service import upload_fileobj, list_bucket_objects | ||
| from .constants import VOICE_BASE_PREFIX, DEFAULT_UPLOAD_FOLDER | ||
| from .emotion_service import analyze_voice_emotion | ||
| from .stt_service import transcribe_voice | ||
|
|
||
| app = FastAPI(title="Caring API") | ||
|
|
||
@app.get("/health")
def health():
    """Health-check endpoint; always responds with ``{"status": "ok"}``."""
    status_payload = {"status": "ok"}
    return status_payload
|
|
||
|
|
||
| # POST : upload voice | ||
@app.post("/voices/upload")
async def upload_voice(
    file: UploadFile = File(...),
    folder: Optional[str] = Form(default=None),  # e.g. "raw" or "user123/session1"
):
    """Upload a voice file to S3, then run emotion analysis on it.

    The object key is ``<VOICE_BASE_PREFIX>/<folder>/<basename of filename>``,
    where ``folder`` falls back to ``DEFAULT_UPLOAD_FOLDER`` and a missing
    filename falls back to ``"upload.wav"``.

    Returns:
        A dict with the uploaded key, the keys currently stored under the
        same prefix, and the emotion-analysis result.

    Raises:
        HTTPException: 500 when ``S3_BUCKET_NAME`` is not configured.
    """
    bucket = os.getenv("S3_BUCKET_NAME")
    if not bucket:
        raise HTTPException(status_code=500, detail="S3_BUCKET_NAME not configured")

    # Build the object key from the base prefix, the target folder and the
    # bare file name (any directory part of the client-sent name is dropped).
    target_folder = folder if folder else DEFAULT_UPLOAD_FOLDER
    effective_prefix = "/".join((VOICE_BASE_PREFIX.rstrip("/"), target_folder)).rstrip("/")
    original_name = file.filename or "upload.wav"
    key = effective_prefix + "/" + os.path.basename(original_name)

    # Upload the file to S3, passing the Content-Type along.
    upload_fileobj(bucket=bucket, key=key, fileobj=file.file, content_type=file.content_type)

    # Best-effort rewind so the next consumer reads from the start.
    try:
        file.file.seek(0)
    except Exception:
        pass

    # Run the emotion analysis.
    emotion_result = analyze_voice_emotion(file)

    # There is no database, so report the bucket listing for the prefix.
    names = list_bucket_objects(bucket=bucket, prefix=effective_prefix)

    response = {
        "uploaded": key,
        "files": names,
        "emotion_analysis": emotion_result
    }
    return response
|
|
||
|
|
||
| # GET : query my voice histories | ||
@app.get("/voices")
async def list_voices(skip: int = 0, limit: int = 50, folder: Optional[str] = None):
    """List stored voice object keys under a folder, with offset pagination.

    Args:
        skip: Number of keys to skip from the start of the listing (>= 0).
        limit: Maximum number of keys to return (>= 0).
        folder: Folder under ``VOICE_BASE_PREFIX``; falls back to
            ``DEFAULT_UPLOAD_FOLDER`` when omitted or empty.

    Returns:
        A dict with ``items`` (the selected keys), ``count`` (how many were
        returned) and ``next`` (the offset to pass as ``skip`` for the
        following page).

    Raises:
        HTTPException: 422 when ``skip`` or ``limit`` is negative; 500 when
            ``S3_BUCKET_NAME`` is not configured.
    """
    # Negative values would be interpreted as from-the-end slice indices and
    # silently return the wrong page plus a meaningless ``next`` offset.
    if skip < 0 or limit < 0:
        raise HTTPException(status_code=422, detail="skip and limit must be non-negative")

    bucket = os.getenv("S3_BUCKET_NAME")
    if not bucket:
        raise HTTPException(status_code=500, detail="S3_BUCKET_NAME not configured")
    base_prefix = VOICE_BASE_PREFIX.rstrip("/")
    effective_prefix = f"{base_prefix}/{folder or DEFAULT_UPLOAD_FOLDER}".rstrip("/")

    keys = list_bucket_objects(bucket=bucket, prefix=effective_prefix)
    # Pagination is emulated by slicing the full listing.
    sliced = keys[skip: skip + limit]
    return {"items": sliced, "count": len(sliced), "next": skip + len(sliced)}
|
|
||
|
|
||
| # GET : query specific voice & show result | ||
@app.get("/voices/{voice_id}")
async def get_voice(voice_id: str):
    """Return a detail record for ``voice_id``.

    The real lookup is not implemented: apart from ``voice_id`` and the
    derived ``filename``, every field is a hard-coded dummy value.
    """
    dummy_analysis = {"pitch_mean": 220.5, "energy": 0.82}
    payload = dict(
        voice_id=voice_id,
        filename=f"{voice_id}.wav",
        status="processed",
        duration_sec=12.34,
        analysis=dummy_analysis,
    )
    return JSONResponse(content=payload)
|
|
||
|
|
||
| # POST : analyze emotion from uploaded voice file | ||
@app.post("/voices/analyze-emotion")
async def analyze_emotion(file: UploadFile = File(...)):
    """Analyze the emotion of an uploaded voice file and return the result."""
    return analyze_voice_emotion(file)
|
|
||
|
|
||
| # POST : convert speech to text using Google STT | ||
@app.post("/voices/transcribe")
async def transcribe_speech(
    file: UploadFile = File(...),
    language_code: str = "ko-KR"
):
    """Convert an uploaded voice file to text.

    Delegates to ``transcribe_voice`` with the given ``language_code`` and
    returns its result unchanged.
    """
    return transcribe_voice(file, language_code)
|
|
||
|
|
||
| # POST : upload voice with both emotion analysis and STT | ||
@app.post("/voices/upload-with-analysis")
async def upload_voice_with_analysis(
    file: UploadFile = File(...),
    folder: Optional[str] = Form(default=None),
    language_code: str = Form(default="ko-KR")
):
    """Upload a voice file, then run both emotion analysis and STT on it.

    Returns:
        A dict with the uploaded key, the keys currently stored under the
        same prefix, the emotion-analysis result and the transcription
        result.

    Raises:
        HTTPException: 500 when ``S3_BUCKET_NAME`` is not configured.
    """
    bucket = os.getenv("S3_BUCKET_NAME")
    if not bucket:
        raise HTTPException(status_code=500, detail="S3_BUCKET_NAME not configured")

    def _rewind():
        # Best-effort reset of the stream so the next consumer starts at 0.
        try:
            file.file.seek(0)
        except Exception:
            pass

    # S3 upload.
    target_folder = folder if folder else DEFAULT_UPLOAD_FOLDER
    effective_prefix = "/".join((VOICE_BASE_PREFIX.rstrip("/"), target_folder)).rstrip("/")
    original_name = file.filename or "upload.wav"
    key = effective_prefix + "/" + os.path.basename(original_name)
    upload_fileobj(bucket=bucket, key=key, fileobj=file.file, content_type=file.content_type)
    _rewind()

    # Emotion analysis.
    emotion_result = analyze_voice_emotion(file)
    _rewind()

    # Speech-to-text conversion.
    stt_result = transcribe_voice(file, language_code)

    # Listing of the files under the prefix.
    names = list_bucket_objects(bucket=bucket, prefix=effective_prefix)

    response = {
        "uploaded": key,
        "files": names,
        "emotion_analysis": emotion_result,
        "transcription": stt_result
    }
    return response
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,40 @@ | ||||||||||||||||||||||||
| import os | ||||||||||||||||||||||||
| from typing import List | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
| import boto3 # type: ignore | ||||||||||||||||||||||||
| from botocore.client import Config # type: ignore | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
def get_s3_client():
    """Create a boto3 S3 client configured from environment variables.

    Reads ``AWS_REGION`` (default ``"ap-northeast-2"``) and requests SigV4
    signing. ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` are passed
    explicitly only when both are set, and ``AWS_SESSION_TOKEN`` is passed
    whenever it is set.
    """
    client_options = dict(
        region_name=os.getenv("AWS_REGION", "ap-northeast-2"),
        config=Config(signature_version="s3v4"),
    )

    access_key = os.getenv("AWS_ACCESS_KEY_ID")
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY")
    session_token = os.getenv("AWS_SESSION_TOKEN")

    if access_key and secret_key:
        client_options.update(
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
        )
    if session_token:
        client_options["aws_session_token"] = session_token

    return boto3.client("s3", **client_options)
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
def upload_fileobj(bucket: str, key: str, fileobj, content_type: "str | None" = None) -> str:
    """Upload a file-like object to S3 and return its object key.

    Args:
        bucket: Name of the destination bucket.
        key: Object key to store the data under.
        fileobj: Binary file-like object to upload.
        content_type: Optional MIME type; when given it is stored as the
            object's ``ContentType``. Callers in this project pass
            ``content_type=...``, so the parameter must exist.

    Returns:
        ``key``, unchanged.
    """
    s3 = get_s3_client()
    if content_type:
        s3.upload_fileobj(fileobj, bucket, key, ExtraArgs={"ContentType": content_type})
    else:
        # No metadata to attach: keep the plain call.
        s3.upload_fileobj(fileobj, bucket, key)
    return key
|
Comment on lines
+25
to
+28
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Content-Type 매개변수 누락으로 런타임 오류 발생
다음 diff를 적용하여 Content-Type을 S3 메타데이터로 저장하도록 수정하세요: -def upload_fileobj(bucket: str, key: str, fileobj) -> str:
+def upload_fileobj(bucket: str, key: str, fileobj, content_type: str = None) -> str:
s3 = get_s3_client()
- s3.upload_fileobj(fileobj, bucket, key)
+ extra_args = {}
+ if content_type:
+ extra_args["ContentType"] = content_type
+ s3.upload_fileobj(fileobj, bucket, key, ExtraArgs=extra_args if extra_args else None)
return keyπ Committable suggestion
Suggested change
π€ Prompt for AI Agents |
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
def list_bucket_objects(bucket: str, prefix: str = "") -> List[str]:
    """Return the keys of all objects in ``bucket`` whose key starts with ``prefix``.

    Walks every page of the ``list_objects_v2`` paginator; pages without a
    ``Contents`` entry (or with a falsy one) contribute no keys.
    """
    pages = get_s3_client().get_paginator("list_objects_v2").paginate(
        Bucket=bucket, Prefix=prefix
    )
    return [
        entry["Key"]
        for page in pages
        for entry in (page.get("Contents", []) or [])
    ]
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
π§© Analysis chain
파일 포인터 리셋 필요 검증
audio_file.file.read()를 호출하면 파일 포인터가 소진됩니다. 동일한 UploadFile 객체에 대해 이 함수가 여러 번 호출될 경우 문제가 발생할 수 있습니다. 다음 스크립트를 실행하여 파일이 여러 번 처리되는지 확인하세요:
파일이 여러 번 처리되는 경우, 다음과 같이 파일 포인터를 리셋하는 것을 고려하세요:
try: # μμ νμΌλ‘ μ μ₯ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: content = audio_file.file.read() + audio_file.file.seek(0) # νμΌ ν¬μΈν° 리μ tmp_file.write(content) tmp_file_path = tmp_file.nameπ Script executed:
Length of output: 3131
파일 포인터 소진 문제 확인 - 즉시 수정 필요
μ½λ κ²μ¦ κ²°κ³Ό, λμΌν
UploadFileκ°μ²΄κ° μμ°¨μ μΌλ‘ μ¬λ¬ ν¨μμ μ λ¬λλ©΄μ νμΌ ν¬μΈν°κ° μμ§λλ λ¬Έμ κ° μ€μ λ‘ λ°μν©λλ€:/uploadμλν¬μΈνΈ (app/main.py 32-37):upload_fileobj()μμfile.fileμ μ½μ ν,analyze_voice_emotion(file)μ΄ νΈμΆλλλ° μ΄λ―Έ νμΌ ν¬μΈν°κ° EOF μνμ λλ€./voices/upload-and-analyzeμλν¬μΈνΈ (app/main.py 110-118):upload_fileobj()βanalyze_voice_emotion(file)βtranscribe_voice(file, language_code)μμλ‘ λμΌν νμΌ κ°μ²΄κ° μΈ λ² μ½νλ €κ³ μλν©λλ€.νμ μμ μ¬ν (app/emotion_service.py 50-51):
try: # μμ νμΌλ‘ μ μ₯ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file: content = audio_file.file.read() + audio_file.file.seek(0) # λ€λ₯Έ ν¨μμμλ μ½μ μ μλλ‘ ν¬μΈν° 리μ tmp_file.write(content) tmp_file_path = tmp_file.nameπ€ Prompt for AI Agents