From 3668889a044598473b43389fc999f59cc0f1ae26 Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 01:58:54 +0900
Subject: [PATCH 1/3] chore: add dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index e69de29..c8da40e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+fastapi
+uvicorn==0.29.0  # ASGI server that runs the FastAPI app
+pydantic
+python-dotenv
+torch==2.7.1
+transformers==4.51.3  # core library for loading HF models and running inference
+safetensors  # fast, safe loading of the .safetensors checkpoint format used by Hugging Face models
+accelerate>=0.20.3  # device placement and acceleration for Hugging Face models
+huggingface-hub
+sentencepiece  # tokenizer backend for the model
+bitsandbytes==0.42.0  # 4-bit quantization on GPU
\ No newline at end of file

From dfff4adcfc0c586a9955d4f5a7c188c7f7168ddf Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 02:01:19 +0900
Subject: [PATCH 2/3] feat: add generate_content and hook up the KULLM3 model
 (transformers-based)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/services/summarizer.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 56b6dc4..0d95dbf 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -1,5 +1,20 @@
 # Business logic / AI inference module
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+# Load the model and tokenizer once, globally, at import time
+tokenizer = AutoTokenizer.from_pretrained("nlpai-lab/KULLM3")
+model = AutoModelForCausalLM.from_pretrained("nlpai-lab/KULLM3")
+
+def generate_content(prompt: str) -> str:
+    inputs = tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=256)
+    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Strip the echoed prompt from the output (needed for some models)
+    return result[len(prompt):].strip() if result.startswith(prompt) else result
+
 def build_transform_prompt(title: str, content: str, level: str) -> str:
     base = f"다음 뉴스 제목과 본문을 사용자의 이해 수준에 맞게 다시 써줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}\n"
     if level == "상":

From 58b23c34e62e045c740105f31a5b92cc3cf01ba2 Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 02:04:47 +0900
Subject: [PATCH 3/3] refactor: rework summarizer.py into an LLM module with
 4-bit quantized KULLM3 and batch inference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/services/summarizer.py | 95 ++++++++++++++++++++++++++++++++-----
 1 file changed, 84 insertions(+), 11 deletions(-)

diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 0d95dbf..1b5ec26 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -1,20 +1,51 @@
 # Business logic / AI inference module
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import time
+from typing import List
 
-# Load the model and tokenizer once, globally, at import time
-tokenizer = AutoTokenizer.from_pretrained("nlpai-lab/KULLM3")
-model = AutoModelForCausalLM.from_pretrained("nlpai-lab/KULLM3")
+# 1. Model selection and 4-bit quantization settings
+model_id = "nlpai-lab/KULLM3"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
 
-def generate_content(prompt: str) -> str:
-    inputs = tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=256)
-    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Strip the echoed prompt from the output (needed for some models)
-    return result[len(prompt):].strip() if result.startswith(prompt) else result
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Fall back to EOS if the tokenizer defines no pad token, so padding=True works below
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+    print(f"Initial VRAM usage: {torch.cuda.memory_allocated() / (1024**3):.2f} GB")
+
+start_load_time = time.time()
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+end_load_time = time.time()
+
+print(f"\nModel loaded in {end_load_time - start_load_time:.2f} seconds")
+
+if torch.cuda.is_available():
+    initial_vram_after_load = torch.cuda.memory_allocated()
+    peak_vram_after_load = torch.cuda.max_memory_allocated()
+    print(f"VRAM allocated after model load: {initial_vram_after_load / (1024**3):.2f} GB")
+    print(f"Peak VRAM used during model load: {peak_vram_after_load / (1024**3):.2f} GB")
+
+# 2. Chat prompt format expected by the LLM
+def build_chat_prompt(prompt: str) -> str:
+    return f"[INST] {prompt.strip()} [/INST]"
+
+# 3. Prompt-building functions (unchanged)
 def build_transform_prompt(title: str, content: str, level: str) -> str:
     base = f"다음 뉴스 제목과 본문을 사용자의 이해 수준에 맞게 다시 써줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}\n"
     if level == "상":
@@ -27,3 +58,45 @@ def build_transform_prompt(title: str, content: str, level: str) -> str:
 
 def build_summary_prompt(title: str, content: str) -> str:
     return f"다음 뉴스 제목과 본문을 한문장으로 간단히 요약해줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}"
+
+# 4. Batch inference
+def kullm_batch_generate(prompts: List[str], max_new_tokens=512):
+    chat_prompts = [build_chat_prompt(p) for p in prompts]
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    inputs = tokenizer(chat_prompts, return_tensors="pt", padding=True).to(model.device)
+    input_ids = inputs.input_ids
+    attention_mask = inputs.attention_mask
+    start_infer_time = time.time()
+    output = model.generate(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=0.2,
+        top_p=0.2,
+        pad_token_id=tokenizer.eos_token_id
+    )
+    end_infer_time = time.time()
+    generation_time = end_infer_time - start_infer_time
+    decoded_results = []
+    generated_tokens_list = []
+    for i in range(len(prompts)):
+        # Count non-padding tokens to get each prompt's true input length
+        original_input_len = (input_ids[i] != tokenizer.pad_token_id).sum().item()
+        generated_tokens = output[i].shape[0] - original_input_len
+        generated_tokens_list.append(generated_tokens)
+        result_text = tokenizer.decode(output[i], skip_special_tokens=True)
+        # Keep only the model's answer after the [/INST] marker
+        decoded_results.append(result_text.split('[/INST]')[-1].strip())
+    current_vram = 0
+    peak_vram = 0
+    if torch.cuda.is_available():
+        current_vram = torch.cuda.memory_allocated()
+        peak_vram = torch.cuda.max_memory_allocated()
+    return decoded_results, generation_time, generated_tokens_list, current_vram, peak_vram
+
+# 5. generate_content wrapper for a single prompt
+def generate_content(prompt: str, max_new_tokens=512) -> str:
+    results, _, _, _, _ = kullm_batch_generate([prompt], max_new_tokens=max_new_tokens)
+    return results[0]
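
A minimal usage sketch for the refactored module, assuming a FastAPI entry point
at app/main.py; the module path, route, and request schema are illustrative
assumptions, while build_summary_prompt and generate_content come from the
patches above:

    # app/main.py -- hypothetical entry point, name assumed
    from fastapi import FastAPI
    from pydantic import BaseModel

    # Importing the module loads the quantized KULLM3 model once, at startup
    from app.services.summarizer import build_summary_prompt, generate_content

    app = FastAPI()

    class SummaryRequest(BaseModel):
        title: str
        content: str

    @app.post("/summarize")
    def summarize(req: SummaryRequest):
        # generate_content routes through kullm_batch_generate with a batch of one
        prompt = build_summary_prompt(req.title, req.content)
        return {"summary": generate_content(prompt)}

With the pinned dependencies this would be served as: uvicorn app.main:app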