diff --git a/__pycache__/main.cpython-310.pyc b/__pycache__/main.cpython-310.pyc deleted file mode 100644 index 620f7ea..0000000 Binary files a/__pycache__/main.cpython-310.pyc and /dev/null differ diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/config.py b/app/config.py new file mode 100644 index 0000000..6e79cdb --- /dev/null +++ b/app/config.py @@ -0,0 +1,12 @@ +from dotenv import load_dotenv +import os + +INDEX_PATH = "data/faiss_index" +UPLOAD_DIR = "data/uploads" + +os.makedirs(UPLOAD_DIR, exist_ok=True) +os.makedirs(INDEX_PATH, exist_ok=True) + +load_dotenv() + + diff --git a/app/main.py b/app/main.py new file mode 100644 index 0000000..760d3be --- /dev/null +++ b/app/main.py @@ -0,0 +1,35 @@ +from dotenv import load_dotenv +from fastapi import FastAPI, UploadFile, File, HTTPException +from services.embedding_service import embed_pdf +from services.task_service import generate_task +import os + + +load_dotenv() + +app = FastAPI() + +INDEX_PATH = "data/faiss_index" +UPLOAD_DIR = "data/uploads" + +os.makedirs(UPLOAD_DIR, exist_ok=True) +os.makedirs(INDEX_PATH, exist_ok=True) + +@app.post("/upload") +async def upload_pdf(file: UploadFile = File(...)): + if not file.filename.endswith(".pdf"): + raise HTTPException(status_code=400, detail="PDF 파일만 업로드 가능합니다.") + + file_path = os.path.join(UPLOAD_DIR, file.filename) + + with open(file_path, "wb") as f: + content = await file.read() + f.write(content) + + chunk_count = embed_pdf(file_path, INDEX_PATH) + return {"message": f"{chunk_count}개의 청크가 임베딩되어 저장되었습니다."} + +@app.get("/generate-task") +async def get_task(prompt: str): + result = generate_task(prompt, INDEX_PATH) + return {"task": result} diff --git a/app/routes.py b/app/routes.py new file mode 100644 index 0000000..45b04fe --- /dev/null +++ b/app/routes.py @@ -0,0 +1,35 @@ +from dotenv import load_dotenv +from fastapi import APIRouter, UploadFile, File, HTTPException +from services.embedding_service import embed_file +from services.task_service import generate_task +from app.config import UPLOAD_DIR, INDEX_PATH +import os + +router = APIRouter() +ALLOWED_EXTENSIONS = {".pdf", ".txt",".html", ".json", ".docx", ".xlsx", ".pptx"} + +@router.post("/upload") +async def upload_file(file: UploadFile = File(...)): + ext = os.path.splitext(file.filename)[1].lower() + if ext not in ALLOWED_EXTENSIONS: + raise HTTPException( + status_code=400, + detail="지원하지 않는 파일 형식입니다. PDF, TXT, HTML, JSON, DOCX, XLSX, PPTX만 업로드 가능합니다." + ) + + os.makedirs(UPLOAD_DIR, exist_ok=True) + file_path = os.path.join(UPLOAD_DIR, file.filename) + + with open(file_path, "wb") as f: + content = await file.read() + f.write(content) + + # TODO: 확장자별 다른 임베딩 함수로 변경 가능 + chunk_count = embed_file(file_path, INDEX_PATH) + + return {"message": f"{chunk_count}개의 청크가 임베딩되어 저장되었습니다."} + +@router.get("/generate-task") +async def get_task(prompt: str): + result = generate_task(prompt, INDEX_PATH) + return {"task": result} diff --git a/data/faiss_index/index.faiss b/data/faiss_index/index.faiss deleted file mode 100644 index d84c32d..0000000 Binary files a/data/faiss_index/index.faiss and /dev/null differ diff --git a/data/faiss_index/index.pkl b/data/faiss_index/index.pkl deleted file mode 100644 index 464e817..0000000 Binary files a/data/faiss_index/index.pkl and /dev/null differ diff --git a/main.py b/main.py index 760d3be..207b61b 100644 --- a/main.py +++ b/main.py @@ -1,35 +1,7 @@ -from dotenv import load_dotenv -from fastapi import FastAPI, UploadFile, File, HTTPException -from services.embedding_service import embed_pdf -from services.task_service import generate_task -import os - - -load_dotenv() +from fastapi import FastAPI +from app.routes import router app = FastAPI() +app.include_router(router) -INDEX_PATH = "data/faiss_index" -UPLOAD_DIR = "data/uploads" - -os.makedirs(UPLOAD_DIR, exist_ok=True) -os.makedirs(INDEX_PATH, exist_ok=True) - -@app.post("/upload") -async def upload_pdf(file: UploadFile = File(...)): - if not file.filename.endswith(".pdf"): - raise HTTPException(status_code=400, detail="PDF 파일만 업로드 가능합니다.") - - file_path = os.path.join(UPLOAD_DIR, file.filename) - - with open(file_path, "wb") as f: - content = await file.read() - f.write(content) - - chunk_count = embed_pdf(file_path, INDEX_PATH) - return {"message": f"{chunk_count}개의 청크가 임베딩되어 저장되었습니다."} - -@app.get("/generate-task") -async def get_task(prompt: str): - result = generate_task(prompt, INDEX_PATH) - return {"task": result} +#uvicorn main:app --reload : 실행 명령어 \ No newline at end of file diff --git a/services/__pycache__/__init__.cpython-310.pyc b/services/__pycache__/__init__.cpython-310.pyc deleted file mode 100644 index 100329e..0000000 Binary files a/services/__pycache__/__init__.cpython-310.pyc and /dev/null differ diff --git a/services/__pycache__/embedding_service.cpython-310.pyc b/services/__pycache__/embedding_service.cpython-310.pyc deleted file mode 100644 index 9891b22..0000000 Binary files a/services/__pycache__/embedding_service.cpython-310.pyc and /dev/null differ diff --git a/services/__pycache__/task_service.cpython-310.pyc b/services/__pycache__/task_service.cpython-310.pyc deleted file mode 100644 index 267aebd..0000000 Binary files a/services/__pycache__/task_service.cpython-310.pyc and /dev/null differ diff --git a/services/embedding_service.py b/services/embedding_service.py index 5edfd25..272035e 100644 --- a/services/embedding_service.py +++ b/services/embedding_service.py @@ -1,10 +1,31 @@ -from langchain_community.document_loaders import PyPDFLoader +from langchain.document_loaders import ( + PyPDFLoader, + TextLoader, + UnstructuredHTMLLoader, + JSONLoader, + UnstructuredFileLoader, +) + from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain_openai import OpenAIEmbeddings from langchain_community.vectorstores import FAISS -def embed_pdf(file_path: str, index_path: str) -> int: - loader = PyPDFLoader(file_path) +def embed_file(file_path: str, index_path: str) -> int: + ext = file_path.split(".")[-1].lower() + + if ext == "pdf": + loader = PyPDFLoader(file_path) + elif ext == "txt": + loader = TextLoader(file_path) + elif ext == "html": + loader = UnstructuredHTMLLoader(file_path) + elif ext == "json": + loader = JSONLoader(file_path) + elif ext in ["docx", "xlsx", "pptx"]: + loader = UnstructuredFileLoader(file_path) + else: + raise ValueError(f"지원하지 않는 파일 형식입니다: {ext}") + docs = loader.load() splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50) @@ -12,6 +33,6 @@ def embed_pdf(file_path: str, index_path: str) -> int: embeddings = OpenAIEmbeddings(model="text-embedding-3-small") vectorstore = FAISS.from_documents(chunks, embeddings) - vectorstore.save_local(index_path) + return len(chunks)