From 58a050f29d9bf209f532f90bbc784748aad07538 Mon Sep 17 00:00:00 2001
From: sunny
Date: Sun, 13 Jul 2025 23:52:57 +0900
Subject: [PATCH] =?UTF-8?q?feat:=20vLLM=EB=8F=84=EC=9E=85=20(close=20#7)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/services/summarizer.py | 27 +++++++++++++++++++++++++--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/app/services/summarizer.py b/app/services/summarizer.py
index fbab4d4..6827872 100644
--- a/app/services/summarizer.py
+++ b/app/services/summarizer.py
@@ -4,6 +4,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 import time
 from typing import List
+import httpx
 
 # 모델 로딩 및 4bit 양자화 설정
 model_id = "sunnyanna/KULLM3-AWQ"
@@ -43,5 +44,27 @@ def kullm_batch_generate(prompts: List[str], max_new_tokens=512):
         decoded_results.append(result_text.split('[/INST]')[-1].strip())
     return decoded_results
 
-def generate_content(prompt: str, max_new_tokens=512) -> str:
-    return kullm_batch_generate([prompt], max_new_tokens=max_new_tokens)[0]
\ No newline at end of file
+VLLM_API_URL = "http://localhost:8000/v1/completions"
+
+async def vllm_generate_content(prompt: str, max_tokens: int = 512) -> str:
+    """Request a completion for *prompt* from the local vLLM server; return the generated text."""
+    headers = {"Content-Type": "application/json"}
+    payload = {
+        "model": "sunnyanna/KULLM3-AWQ",
+        "prompt": prompt,
+        "max_tokens": max_tokens,
+        "temperature": 0.2,
+        "top_p": 0.2,
+        "stop": None
+    }
+    # httpx defaults to a 5 s timeout; LLM generation routinely takes longer.
+    async with httpx.AsyncClient(timeout=60.0) as client:
+        response = await client.post(VLLM_API_URL, headers=headers, json=payload)
+        response.raise_for_status()
+        result = response.json()
+        return result["choices"][0]["text"].strip()
+
+# NOTE: generate_content is now a coroutine — existing callers must await it.
+async def generate_content(prompt: str, max_new_tokens=512) -> str:
+    """Async drop-in for the former local-inference generate_content; proxies to the vLLM server."""
+    return await vllm_generate_content(prompt, max_tokens=max_new_tokens)
\ No newline at end of file