From 3668889a044598473b43389fc999f59cc0f1ae26 Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 01:58:54 +0900
Subject: [PATCH 1/3] chore: add dependencies
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 requirements.txt | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/requirements.txt b/requirements.txt
index e69de29..c8da40e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,11 @@
+fastapi
+uvicorn==0.29.0  # ASGI server that runs the FastAPI app
+pydantic
+python-dotenv
+torch==2.7.1
+transformers==4.51.3  # core library for loading HF models and running inference
+safetensors  # fast, safe loading of the .safetensors checkpoint format used by Hugging Face models
+accelerate>=0.20.3  # device placement and acceleration for Hugging Face models
+huggingface-hub
+sentencepiece  # tokenizer backend for the model
+bitsandbytes==0.42.0  # 4-bit quantization on GPU
\ No newline at end of file

From dfff4adcfc0c586a9955d4f5a7c188c7f7168ddf Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 02:01:19 +0900
Subject: [PATCH 2/3] feat: add generate_content and hook up the KULLM3 model
 (transformers-based)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/services/summarizer.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 56b6dc4..0d95dbf 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -1,5 +1,20 @@
 # Business logic / AI inference module
+from transformers import AutoModelForCausalLM, AutoTokenizer
+import torch
+
+# Load the model and tokenizer once, globally, at import time
+tokenizer = AutoTokenizer.from_pretrained("nlpai-lab/KULLM3")
+model = AutoModelForCausalLM.from_pretrained("nlpai-lab/KULLM3")
+
+def generate_content(prompt: str) -> str:
+    inputs = tokenizer(prompt, return_tensors="pt")
+    with torch.no_grad():
+        outputs = model.generate(**inputs, max_new_tokens=256)
+    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    # Strip the echoed prompt from the output (needed for some models)
+    return result[len(prompt):].strip() if result.startswith(prompt) else result
+
 def build_transform_prompt(title: str, content: str, level: str) -> str:
     base = f"다음 뉴스 제목과 본문을 사용자의 이해 수준에 맞게 다시 써줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}\n"
     if level == "상":

From 58b23c34e62e045c740105f31a5b92cc3cf01ba2 Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 02:04:47 +0900
Subject: [PATCH 3/3] refactor: rework summarizer.py into an LLM module with
 4-bit quantized KULLM3 and batch inference
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/services/summarizer.py | 95 ++++++++++++++++++++++++++++++++-----
 1 file changed, 84 insertions(+), 11 deletions(-)

diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index 0d95dbf..1b5ec26 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -1,20 +1,51 @@
 # Business logic / AI inference module
-from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+import time
+from typing import List
 
-# Load the model and tokenizer once, globally, at import time
-tokenizer = AutoTokenizer.from_pretrained("nlpai-lab/KULLM3")
-model = AutoModelForCausalLM.from_pretrained("nlpai-lab/KULLM3")
+# 1. Model selection and 4-bit quantization settings
+model_id = "nlpai-lab/KULLM3"
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16
+)
 
-def generate_content(prompt: str) -> str:
-    inputs = tokenizer(prompt, return_tensors="pt")
-    with torch.no_grad():
-        outputs = model.generate(**inputs, max_new_tokens=256)
-    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    # Strip the echoed prompt from the output (needed for some models)
-    return result[len(prompt):].strip() if result.startswith(prompt) else result
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Fall back to EOS if the tokenizer defines no pad token, so padding=True works below
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+if torch.cuda.is_available():
+    torch.cuda.empty_cache()
+    torch.cuda.reset_peak_memory_stats()
+    print(f"Initial VRAM usage: {torch.cuda.memory_allocated() / (1024**3):.2f} GB")
+
+start_load_time = time.time()
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    quantization_config=bnb_config,
+    device_map="auto"
+)
+end_load_time = time.time()
+
+print(f"\nModel loaded in {end_load_time - start_load_time:.2f} seconds")
+
+if torch.cuda.is_available():
+    initial_vram_after_load = torch.cuda.memory_allocated()
+    peak_vram_after_load = torch.cuda.max_memory_allocated()
+    print(f"VRAM allocated after model load: {initial_vram_after_load / (1024**3):.2f} GB")
+    print(f"Peak VRAM used during model load: {peak_vram_after_load / (1024**3):.2f} GB")
+
+# 2. Chat prompt format expected by the LLM
+def build_chat_prompt(prompt: str) -> str:
+    return f"[INST] {prompt.strip()} [/INST]"
+
+# 3. Prompt-building functions (unchanged)
 def build_transform_prompt(title: str, content: str, level: str) -> str:
     base = f"다음 뉴스 제목과 본문을 사용자의 이해 수준에 맞게 다시 써줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}\n"
     if level == "상":
@@ -27,3 +58,45 @@ def build_transform_prompt(title: str, content: str, level: str) -> str:
 
 def build_summary_prompt(title: str, content: str) -> str:
     return f"다음 뉴스 제목과 본문을 한문장으로 간단히 요약해줘.\n\n뉴스 제목: {title}\n뉴스 본문: {content}"
+
+# 4. Batch inference
+def kullm_batch_generate(prompts: List[str], max_new_tokens=512):
+    chat_prompts = [build_chat_prompt(p) for p in prompts]
+    if torch.cuda.is_available():
+        torch.cuda.reset_peak_memory_stats()
+    inputs = tokenizer(chat_prompts, return_tensors="pt", padding=True).to(model.device)
+    input_ids = inputs.input_ids
+    attention_mask = inputs.attention_mask
+    start_infer_time = time.time()
+    output = model.generate(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=0.2,
+        top_p=0.2,
+        pad_token_id=tokenizer.eos_token_id
+    )
+    end_infer_time = time.time()
+    generation_time = end_infer_time - start_infer_time
+    decoded_results = []
+    generated_tokens_list = []
+    for i in range(len(prompts)):
+        # Count non-padding tokens to get each prompt's true input length
+        original_input_len = (input_ids[i] != tokenizer.pad_token_id).sum().item()
+        generated_tokens = output[i].shape[0] - original_input_len
+        generated_tokens_list.append(generated_tokens)
+        result_text = tokenizer.decode(output[i], skip_special_tokens=True)
+        # Keep only the model's answer after the [/INST] marker
+        decoded_results.append(result_text.split('[/INST]')[-1].strip())
+    current_vram = 0
+    peak_vram = 0
+    if torch.cuda.is_available():
+        current_vram = torch.cuda.memory_allocated()
+        peak_vram = torch.cuda.max_memory_allocated()
+    return decoded_results, generation_time, generated_tokens_list, current_vram, peak_vram
+
+# 5. generate_content wrapper for a single prompt
+def generate_content(prompt: str, max_new_tokens=512) -> str:
+    results, _, _, _, _ = kullm_batch_generate([prompt], max_new_tokens=max_new_tokens)
+    return results[0]
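
A minimal usage sketch for the refactored module, assuming a FastAPI entry point
at app/main.py; the module path, route, and request schema are illustrative
assumptions, while build_summary_prompt and generate_content come from the
patches above:

    # app/main.py -- hypothetical entry point, name assumed
    from fastapi import FastAPI
    from pydantic import BaseModel

    # Importing the module loads the quantized KULLM3 model once, at startup
    from app.services.summarizer import build_summary_prompt, generate_content

    app = FastAPI()

    class SummaryRequest(BaseModel):
        title: str
        content: str

    @app.post("/summarize")
    def summarize(req: SummaryRequest):
        # generate_content routes through kullm_batch_generate with a batch of one
        prompt = build_summary_prompt(req.title, req.content)
        return {"summary": generate_content(prompt)}

With the pinned dependencies this would be served as: uvicorn app.main:app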