diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 56b6dc4..1b5ec26 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -1,5 +1,53 @@
 # Business logic / AI inference module
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import time
+from typing import List
+
+# 1. Load the model with 4-bit quantization settings
+model_id = "nlpai-lab/KULLM3"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Decoder-only models must be left-padded for batched generation, and
+# Llama-family tokenizers often ship without a pad token, so fall back to EOS.
+tokenizer.padding_side = "left"
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+    print(f"Initial VRAM usage: {torch.cuda.memory_allocated() / (1024**3):.2f} GB")
+
+start_load_time = time.time()
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+end_load_time = time.time()
+
+print(f"\nModel loaded in {end_load_time - start_load_time:.2f} seconds")
+
+if torch.cuda.is_available():
+    initial_vram_after_load = torch.cuda.memory_allocated()
+    peak_vram_after_load = torch.cuda.max_memory_allocated()
+    print(f"VRAM allocated after model load: {initial_vram_after_load / (1024**3):.2f} GB")
+    print(f"Peak VRAM used during model load: {peak_vram_after_load / (1024**3):.2f} GB")
+
+# 2. Chat prompt format for the LLM
+
+def build_chat_prompt(prompt: str) -> str:
+    return f"[INST] {prompt.strip()} [/INST]"
+
+# 3. Prompt builder functions (kept as-is)
 
 def build_transform_prompt(title: str, content: str, level: str) -> str:
     base = f"다음 뉴스 제목과 본문을 사용자의 이해 수준에 맞게 다시 써줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}\n"
     if level == "상":
@@ -12,3 +60,51 @@
 
 def build_summary_prompt(title: str, content: str) -> str:
     return f"다음 뉴스 제목과 본문을 한문장으로 간단히 요약해줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}"
+
+# 4. Batched inference
+def kullm_batch_generate(prompts: List[str], max_new_tokens=512):
+    chat_prompts = [build_chat_prompt(p) for p in prompts]
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    inputs = tokenizer(chat_prompts, return_tensors="pt", padding=True).to(model.device)
+    input_ids = inputs.input_ids
+    attention_mask = inputs.attention_mask
+    start_infer_time = time.time()
+    output = model.generate(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=0.2,
+        top_p=0.2,
+        pad_token_id=tokenizer.pad_token_id
+    )
+    end_infer_time = time.time()
+    generation_time = end_infer_time - start_infer_time
+    decoded_results = []
+    generated_tokens_list = []
+    for i in range(len(prompts)):
+        # With left padding every prompt ends at index input_ids.shape[1], so
+        # slicing there leaves only the newly generated tokens to count and decode.
+        generated = output[i][input_ids.shape[1]:]
+        generated_tokens = (generated != tokenizer.pad_token_id).sum().item()
+        generated_tokens_list.append(generated_tokens)
+        decoded_results.append(tokenizer.decode(generated, skip_special_tokens=True).strip())
+    current_vram = 0
+    peak_vram = 0
+    if torch.cuda.is_available():
+        current_vram = torch.cuda.memory_allocated()
+        peak_vram = torch.cuda.max_memory_allocated()
+    return decoded_results, generation_time, generated_tokens_list, current_vram, peak_vram
+
+# 5. generate_content wrapper for a single prompt
+def generate_content(prompt: str, max_new_tokens=512) -> str:
+    results, _, _, _, _ = kullm_batch_generate([prompt], max_new_tokens=max_new_tokens)
+    return results[0]
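+
+# Illustrative usage sketch, an assumption rather than part of the service
+# wiring: running the module directly exercises the single-prompt path end
+# to end. The Korean strings are hypothetical placeholders ("sample title" /
+# "sample body"), kept in Korean because KULLM3 is a Korean-language model.
+if __name__ == "__main__":
+    demo_prompt = build_summary_prompt("예시 제목", "예시 본문")
+    print(generate_content(demo_prompt, max_new_tokens=128))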
diff --git a/requirements.txt b/requirements.txt
index e69de29..c8da40e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+fastapi
+uvicorn==0.29.0  # ASGI server for running FastAPI
+pydantic
+python-dotenv
+torch==2.7.1
+transformers==4.51.3  # Core library for loading HF models and running inference
+safetensors  # Fast, safe loading of the .safetensors weight format used by HuggingFace models
+accelerate>=0.20.3  # Device placement and acceleration for HuggingFace models
+huggingface-hub
+sentencepiece  # Tokenizer support for the model
+bitsandbytes==0.42.0  # 4-bit quantization on GPU
\ No newline at end of file