class EmotionAnalyzer:
    """Speech-emotion classifier backed by a Hugging Face wav2vec2 model.

    The model and its feature extractor are loaded once, at construction time.
    When loading fails both attributes stay ``None`` and
    :meth:`analyze_emotion` returns an error payload instead of raising.
    """

    def __init__(self):
        self.model = None
        self.feature_extractor = None
        # Prefer the GPU when one is visible to torch, otherwise run on CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self._load_model()

    def _load_model(self):
        """Load the Hugging Face model and feature extractor onto ``self.device``."""
        model_name = "jungjongho/wav2vec2-xlsr-korean-speech-emotion-recognition"

        try:
            self.model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
            self.model.to(self.device)
            self.model.eval()
        except Exception as e:
            # Best effort: a failed load must not crash the import of this
            # module; callers get an error payload from analyze_emotion().
            print(f"모델 로드 실패: {e}")
            self.model = None
            self.feature_extractor = None

    def analyze_emotion(self, audio_file) -> Dict[str, Any]:
        """Classify the emotion expressed in an uploaded audio file.

        Args:
            audio_file: Uploaded audio; only its ``.file.read()`` is used
                (a FastAPI ``UploadFile`` in the callers of this module).

        Returns:
            On success a dict with ``emotion``, ``confidence``,
            ``emotion_scores``, ``audio_duration`` and ``sample_rate``.
            On any failure a dict with ``error``, ``emotion`` set to
            ``"unknown"`` and ``confidence`` set to ``0.0``; this method
            does not raise for processing errors.
        """
        if self.model is None or self.feature_extractor is None:
            return {
                "error": "모델이 로드되지 않았습니다",
                "emotion": "unknown",
                "confidence": 0.0
            }

        # Tracked outside the try block so the cleanup in ``finally`` never
        # touches an unbound name when reading the upload fails early.
        tmp_file_path = None
        try:
            # Spill the upload to disk so librosa can open it by path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                # Record the path first: the file already exists on disk and
                # must be removed even if read()/write() below raises.
                tmp_file_path = tmp_file.name
                content = audio_file.file.read()
                tmp_file.write(content)

            # Load the audio, resampled to 16 kHz.
            audio, sr = librosa.load(tmp_file_path, sr=16000)

            # Feature extraction.
            inputs = self.feature_extractor(
                audio,
                sampling_rate=16000,
                return_tensors="pt",
                padding=True
            )

            # Move every input tensor to the model's device.
            inputs = {k: v.to(self.device) for k, v in inputs.items()}

            # Inference without gradient tracking.
            with torch.no_grad():
                outputs = self.model(**inputs)
                predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

            # NOTE(review): the label order is hard-coded here; confirm it
            # matches the class order of the loaded model.
            emotion_labels = ["neutral", "happy", "sad", "angry", "fear", "surprise", "disgust"]

            # Highest-probability class for the first (only) batch item.
            predicted_class = torch.argmax(predictions, dim=-1).item()
            confidence = predictions[0][predicted_class].item()
            emotion = emotion_labels[predicted_class] if predicted_class < len(emotion_labels) else "unknown"

            # Probability per label, limited to the labels/classes both sides have.
            emotion_scores = {
                emotion_labels[i]: predictions[0][i].item()
                for i in range(min(len(emotion_labels), predictions.shape[1]))
            }

            return {
                "emotion": emotion,
                "confidence": confidence,
                "emotion_scores": emotion_scores,
                "audio_duration": len(audio) / sr,
                "sample_rate": sr
            }

        except Exception as e:
            return {
                "error": f"분석 중 오류 발생: {str(e)}",
                "emotion": "unknown",
                "confidence": 0.0
            }
        finally:
            # Remove the temporary file, if one was created.
            if tmp_file_path is not None:
                try:
                    os.unlink(tmp_file_path)
                except OSError as e:
                    print(f"임시 파일 삭제 실패: {tmp_file_path}, 오류: {e}")
# GET : query my voice histories
@app.get("/voices")
async def list_voices(skip: int = 0, limit: int = 50, folder: Optional[str] = None):
    """List stored voice object keys under the effective S3 prefix.

    Args:
        skip: Number of keys to skip from the start of the listing.
        limit: Maximum number of keys to return.
        folder: Sub-folder under the base prefix; ``DEFAULT_UPLOAD_FOLDER``
            is used when omitted or empty.

    Returns:
        A dict with ``items`` (the selected keys), ``count`` (how many were
        returned) and ``next`` (the offset to pass as ``skip`` for the
        following page; it is returned even when no further keys exist).

    Raises:
        HTTPException: 500 when ``S3_BUCKET_NAME`` is not configured, 400
            when ``skip`` or ``limit`` is negative.
    """
    bucket = os.getenv("S3_BUCKET_NAME")
    if not bucket:
        raise HTTPException(status_code=500, detail="S3_BUCKET_NAME not configured")
    # Negative values would silently turn the slice below into a
    # "count from the end" window, so reject them explicitly.
    if skip < 0 or limit < 0:
        raise HTTPException(status_code=400, detail="skip and limit must be non-negative")
    base_prefix = VOICE_BASE_PREFIX.rstrip("/")
    effective_prefix = f"{base_prefix}/{folder or DEFAULT_UPLOAD_FOLDER}".rstrip("/")

    keys = list_bucket_objects(bucket=bucket, prefix=effective_prefix)
    # Pagination is emulated by slicing the full key listing.
    sliced = keys[skip: skip + limit]
    return {"items": sliced, "count": len(sliced), "next": skip + len(sliced)}
def upload_fileobj(bucket: str, key: str, fileobj, content_type: "str | None" = None) -> str:
    """Upload a file-like object to S3 and return the key it was stored under.

    Args:
        bucket: Target bucket name.
        key: Object key to write.
        fileobj: Readable binary file-like object to upload.
        content_type: Optional MIME type stored as the object's
            ``ContentType``. The upload routes pass the uploaded file's
            content type here; when empty or ``None`` no extra arguments
            are sent.

    Returns:
        The ``key`` argument, unchanged.
    """
    s3 = get_s3_client()
    # Only send ExtraArgs when there is a content type to record.
    extra_args = {"ContentType": content_type} if content_type else None
    s3.upload_fileobj(fileobj, bucket, key, ExtraArgs=extra_args)
    return key
class GoogleSTTService:
    """Thin wrapper around the Google Cloud Speech-to-Text client.

    The client is created once at construction time. When creation fails,
    ``self.client`` stays ``None`` and :meth:`transcribe_audio` returns an
    error payload instead of raising.
    """

    def __init__(self):
        self.client = None
        self._initialize_client()

    def _initialize_client(self):
        """Create the Speech-to-Text client, leaving ``None`` on failure."""
        try:
            # Path to the service-account key file, if configured.
            credentials_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")

            if credentials_path and os.path.exists(credentials_path):
                # Authenticate explicitly with the service-account key file.
                credentials = service_account.Credentials.from_service_account_file(
                    credentials_path,
                    scopes=["https://www.googleapis.com/auth/cloud-platform"]
                )
                self.client = speech.SpeechClient(credentials=credentials)
            else:
                # The variable is unset or points to a missing file: let the
                # client library resolve credentials on its own.
                self.client = speech.SpeechClient()

        except Exception as e:
            print(f"Google STT 클라이언트 초기화 실패: {e}")
            self.client = None

    def transcribe_audio(self, audio_file, language_code: str = "ko-KR") -> Dict[str, Any]:
        """Transcribe an uploaded audio file to text.

        Args:
            audio_file: Uploaded audio; its ``.file`` is read and then
                rewound with ``seek(0)`` (a FastAPI ``UploadFile`` in the
                callers of this module).
            language_code: Language code sent with the request
                (default ``ko-KR``).

        Returns:
            On success a dict with ``transcript``, ``confidence``,
            ``language_code``, ``audio_duration`` and ``sample_rate``. The
            transcript concatenates the top alternative of every result and
            the confidence is the mean over those alternatives. On failure
            a dict with ``error``, an empty ``transcript`` and
            ``confidence`` of ``0.0``.
        """
        if not self.client:
            return {
                "error": "Google STT 클라이언트가 초기화되지 않았습니다",
                "transcript": "",
                "confidence": 0.0
            }

        # Tracked outside the try block so cleanup can tell whether a
        # temporary file was actually created.
        tmp_file_path = None
        try:
            # Spill the upload to disk so librosa can open it by path.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                # Record the path first: the file already exists on disk and
                # must be removed even if read()/write() below raises.
                tmp_file_path = tmp_file.name
                content = audio_file.file.read()
                # Rewind so later consumers of the upload can read it again.
                audio_file.file.seek(0)
                tmp_file.write(content)

            # Load and resample to 16 kHz.
            audio_data, sample_rate = librosa.load(tmp_file_path, sr=16000)

            # Convert float samples to 16-bit PCM bytes; clip first so the
            # scaling cannot overflow int16.
            audio_data = np.clip(audio_data, -1.0, 1.0)
            audio_bytes = (audio_data * 32767).astype('int16').tobytes()

            # Build the recognition request.
            # NOTE(review): synchronous `recognize` has request size/duration
            # limits — confirm long recordings are handled as intended.
            audio = speech.RecognitionAudio(content=audio_bytes)
            config = speech.RecognitionConfig(
                encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
                sample_rate_hertz=sample_rate,
                language_code=language_code,
                enable_automatic_punctuation=True,
                enable_word_time_offsets=True,
                model="latest_long",
            )

            response = self.client.recognize(config=config, audio=audio)

            # Each result covers a consecutive portion of the audio, so keep
            # the top alternative of every result rather than only the first.
            best = [r.alternatives[0] for r in response.results if r.alternatives]
            if not best:
                return {
                    "error": "음성을 인식할 수 없습니다",
                    "transcript": "",
                    "confidence": 0.0
                }

            transcript = "".join(alt.transcript for alt in best)
            confidence = sum(alt.confidence for alt in best) / len(best)

            return {
                "transcript": transcript,
                "confidence": confidence,
                "language_code": language_code,
                "audio_duration": len(audio_data) / sample_rate,
                "sample_rate": sample_rate
            }

        except Exception as e:
            return {
                "error": f"STT 처리 중 오류 발생: {str(e)}",
                "transcript": "",
                "confidence": 0.0
            }
        finally:
            # Remove the temporary file, if one was created.
            if tmp_file_path is not None:
                try:
                    os.unlink(tmp_file_path)
                except OSError as e:
                    print(f"임시 파일 삭제 실패: {tmp_file_path}, 오류: {e}")
language_code) diff --git a/requirements.txt b/requirements.txt index 21a99f5..297b90c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,10 @@ -# 현재 프로젝트에 특별한 의존성이 없습니다. -# 필요에 따라 패키지를 추가하세요. - -# 예시: -# requests>=2.31.0 -# numpy>=1.24.0 -# pandas>=2.0.0 +fastapi>=0.115.0 +uvicorn[standard]>=0.30.0 +boto3>=1.34.0 +python-dotenv>=1.0.1 +transformers>=4.30.0 +torch>=2.0.0 +librosa>=0.10.0 +scipy>=1.10.0 +google-cloud-speech>=2.21.0 +google-auth>=2.23.0