diff --git a/.gitignore b/.gitignore index 97b2fd0..d45b2ae 100644 --- a/.gitignore +++ b/.gitignore @@ -57,5 +57,5 @@ Thumbs.db # Vector database persistence data/ -# Knowledge Documents -documents/ +# Knowledge Documents (root folder only) +/documents/ diff --git a/src/config.py b/src/config.py index 9adc396..b047175 100644 --- a/src/config.py +++ b/src/config.py @@ -20,8 +20,8 @@ class Settings(BaseSettings): app_version: str = "0.1.0" debug: bool = False - # Server - host: str = "0.0.0.0" + # Server - bind to all interfaces for container deployment + host: str = "0.0.0.0" # nosec B104 port: int = 8000 # LLM Provider (OpenRouter) @@ -53,6 +53,7 @@ class Settings(BaseSettings): rag_chunk_overlap: int = 800 # Characters rag_top_k: int = 5 # Number of chunks to retrieve documents_path: str = "./documents" + uploads_path: str = "./uploads" # Directory for user-uploaded documents # Semantic Cache cache_enabled: bool = True # Enable semantic caching for faster responses diff --git a/src/infrastructure/database/connection.py b/src/infrastructure/database/connection.py index e9f8801..4f11b57 100644 --- a/src/infrastructure/database/connection.py +++ b/src/infrastructure/database/connection.py @@ -35,6 +35,24 @@ ); CREATE INDEX IF NOT EXISTS idx_messages_user_created ON messages(user_id, created_at); + +CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + filename TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + description TEXT, + file_path TEXT NOT NULL, + file_type TEXT NOT NULL, + file_size_bytes INTEGER NOT NULL, + uploaded_by TEXT, + is_indexed INTEGER DEFAULT 0, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + FOREIGN KEY (uploaded_by) REFERENCES users(id) ON DELETE SET NULL +); + +CREATE INDEX IF NOT EXISTS idx_documents_filename ON documents(filename); +CREATE INDEX IF NOT EXISTS idx_documents_created ON documents(created_at); """ diff --git a/src/modules/auth/service.py b/src/modules/auth/service.py index ae7d27b..ee37eb4 100644 --- 
a/src/modules/auth/service.py +++ b/src/modules/auth/service.py @@ -133,7 +133,7 @@ def create_token(self, user: User) -> LoginResponse: return LoginResponse( access_token=token, - token_type="bearer", + token_type="bearer", # nosec B106 - OAuth2 token type, not a password expires_in=self._jwt_expire_hours * 3600, user=UserResponse( id=user.id, diff --git a/src/modules/documents/__init__.py b/src/modules/documents/__init__.py new file mode 100644 index 0000000..853ef0a --- /dev/null +++ b/src/modules/documents/__init__.py @@ -0,0 +1,35 @@ +"""Document management module. + +Provides functionality for uploading, managing, and indexing documents +for the RAG pipeline. +""" + +from src.modules.documents.exceptions import ( + DocumentAlreadyExistsError, + DocumentError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentValidationError, +) +from src.modules.documents.models import Document +from src.modules.documents.repository import DocumentRepository +from src.modules.documents.schemas import ( + DocumentListResponse, + DocumentResponse, + DocumentUploadRequest, +) +from src.modules.documents.service import DocumentService + +__all__ = [ + "Document", + "DocumentAlreadyExistsError", + "DocumentError", + "DocumentIndexingError", + "DocumentListResponse", + "DocumentNotFoundError", + "DocumentRepository", + "DocumentResponse", + "DocumentService", + "DocumentUploadRequest", + "DocumentValidationError", +] diff --git a/src/modules/documents/exceptions.py b/src/modules/documents/exceptions.py new file mode 100644 index 0000000..845cb62 --- /dev/null +++ b/src/modules/documents/exceptions.py @@ -0,0 +1,39 @@ +"""Document management exceptions.""" + + +class DocumentError(Exception): + """Base exception for document operations.""" + + pass + + +class DocumentNotFoundError(DocumentError): + """Raised when a document is not found.""" + + def __init__(self, identifier: str) -> None: + self.identifier = identifier + super().__init__(f"Document not found: 
{identifier}") + + +class DocumentAlreadyExistsError(DocumentError): + """Raised when a document with the same filename already exists.""" + + def __init__(self, filename: str) -> None: + self.filename = filename + super().__init__(f"Document already exists: {filename}") + + +class DocumentValidationError(DocumentError): + """Raised when document validation fails.""" + + def __init__(self, message: str) -> None: + super().__init__(message) + + +class DocumentIndexingError(DocumentError): + """Raised when document indexing fails.""" + + def __init__(self, filename: str, reason: str) -> None: + self.filename = filename + self.reason = reason + super().__init__(f"Failed to index document '{filename}': {reason}") diff --git a/src/modules/documents/models.py b/src/modules/documents/models.py new file mode 100644 index 0000000..25b2806 --- /dev/null +++ b/src/modules/documents/models.py @@ -0,0 +1,48 @@ +"""Document domain models.""" + +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any +from uuid import UUID + + +@dataclass +class Document: + """Document domain model representing an uploaded document.""" + + id: UUID + filename: str + title: str + description: str | None + file_path: Path + file_type: str # 'markdown' or 'text' + file_size_bytes: int + uploaded_by: UUID | None + is_indexed: bool + created_at: datetime + updated_at: datetime + + @classmethod + def from_row(cls, row: dict[str, Any]) -> "Document": + """Create a Document from a database row. + + Args: + row: Dictionary containing database row data. + + Returns: + Document instance. 
+ """ + return cls( + id=UUID(str(row["id"])), + filename=str(row["filename"]), + title=str(row["title"]), + description=str(row["description"]) if row["description"] else None, + file_path=Path(str(row["file_path"])), + file_type=str(row["file_type"]), + file_size_bytes=int(row["file_size_bytes"]), + uploaded_by=UUID(str(row["uploaded_by"])) if row["uploaded_by"] else None, + is_indexed=bool(row["is_indexed"]), + created_at=datetime.fromisoformat(str(row["created_at"])), + updated_at=datetime.fromisoformat(str(row["updated_at"])), + ) diff --git a/src/modules/documents/repository.py b/src/modules/documents/repository.py new file mode 100644 index 0000000..0bde968 --- /dev/null +++ b/src/modules/documents/repository.py @@ -0,0 +1,218 @@ +"""Document repository for database operations.""" + +from datetime import UTC, datetime +from pathlib import Path +from uuid import UUID, uuid4 + +import structlog + +from src.infrastructure.database import Database +from src.modules.documents.exceptions import ( + DocumentAlreadyExistsError, + DocumentNotFoundError, +) +from src.modules.documents.models import Document + +logger = structlog.get_logger() + + +class DocumentRepository: + """Repository for document CRUD operations. + + Handles all database interactions for document metadata. + File storage is handled by the service layer. + """ + + def __init__(self, database: Database) -> None: + """Initialize the repository. + + Args: + database: Database connection instance. + """ + self._db = database + + async def create( + self, + filename: str, + title: str, + file_path: Path, + file_type: str, + file_size_bytes: int, + *, + description: str | None = None, + uploaded_by: UUID | None = None, + ) -> Document: + """Create a new document record. + + Args: + filename: Original filename. + title: Document title (extracted or provided). + file_path: Path where file is stored. + file_type: Type of document ('markdown' or 'text'). + file_size_bytes: Size of the file in bytes. 
+ description: Optional document description. + uploaded_by: ID of the user who uploaded the document. + + Returns: + Created Document instance. + + Raises: + DocumentAlreadyExistsError: If a document with the same filename exists. + """ + doc_id = uuid4() + now = datetime.now(UTC).isoformat() + + try: + await self._db.execute( + """ + INSERT INTO documents ( + id, filename, title, description, file_path, file_type, + file_size_bytes, uploaded_by, is_indexed, created_at, updated_at + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, 0, ?, ?) + """, + ( + str(doc_id), + filename, + title, + description, + str(file_path), + file_type, + file_size_bytes, + str(uploaded_by) if uploaded_by else None, + now, + now, + ), + ) + except Exception as e: + if "UNIQUE constraint failed" in str(e): + raise DocumentAlreadyExistsError(filename) from e + raise + + logger.info( + "document_created", + doc_id=str(doc_id), + filename=filename, + file_type=file_type, + ) + + return Document( + id=doc_id, + filename=filename, + title=title, + description=description, + file_path=file_path, + file_type=file_type, + file_size_bytes=file_size_bytes, + uploaded_by=uploaded_by, + is_indexed=False, + created_at=datetime.fromisoformat(now), + updated_at=datetime.fromisoformat(now), + ) + + async def get_by_id(self, doc_id: UUID) -> Document | None: + """Get a document by its ID. + + Args: + doc_id: Document UUID. + + Returns: + Document if found, None otherwise. + """ + row = await self._db.fetch_one( + "SELECT * FROM documents WHERE id = ?", + (str(doc_id),), + ) + return Document.from_row(dict(row)) if row else None + + async def get_by_filename(self, filename: str) -> Document | None: + """Get a document by its filename. + + Args: + filename: Document filename. + + Returns: + Document if found, None otherwise. 
+ """ + row = await self._db.fetch_one( + "SELECT * FROM documents WHERE filename = ?", + (filename,), + ) + return Document.from_row(dict(row)) if row else None + + async def list_all(self, *, include_unindexed: bool = True) -> list[Document]: + """List all documents. + + Args: + include_unindexed: Whether to include documents that are not indexed. + + Returns: + List of documents ordered by creation date (newest first). + """ + if include_unindexed: + rows = await self._db.fetch_all( + "SELECT * FROM documents ORDER BY created_at DESC" + ) + else: + rows = await self._db.fetch_all( + "SELECT * FROM documents WHERE is_indexed = 1 ORDER BY created_at DESC" + ) + return [Document.from_row(dict(row)) for row in rows] + + async def update_index_status(self, doc_id: UUID, is_indexed: bool) -> None: + """Update the indexing status of a document. + + Args: + doc_id: Document UUID. + is_indexed: New indexing status. + + Raises: + DocumentNotFoundError: If document doesn't exist. + """ + now = datetime.now(UTC).isoformat() + cursor = await self._db.execute( + "UPDATE documents SET is_indexed = ?, updated_at = ? WHERE id = ?", + (1 if is_indexed else 0, now, str(doc_id)), + ) + if cursor.rowcount == 0: + raise DocumentNotFoundError(str(doc_id)) + + logger.debug( + "document_index_status_updated", + doc_id=str(doc_id), + is_indexed=is_indexed, + ) + + async def delete(self, doc_id: UUID) -> Document: + """Delete a document record. + + Args: + doc_id: Document UUID. + + Returns: + The deleted Document. + + Raises: + DocumentNotFoundError: If document doesn't exist. + """ + # First get the document + doc = await self.get_by_id(doc_id) + if doc is None: + raise DocumentNotFoundError(str(doc_id)) + + await self._db.execute( + "DELETE FROM documents WHERE id = ?", + (str(doc_id),), + ) + + logger.info("document_deleted", doc_id=str(doc_id), filename=doc.filename) + return doc + + async def count(self) -> int: + """Get the total number of documents. 
+ + Returns: + Number of documents. + """ + row = await self._db.fetch_one("SELECT COUNT(*) as count FROM documents") + return int(row["count"]) if row else 0 diff --git a/src/modules/documents/schemas.py b/src/modules/documents/schemas.py new file mode 100644 index 0000000..5e3fb21 --- /dev/null +++ b/src/modules/documents/schemas.py @@ -0,0 +1,47 @@ +"""Document Pydantic schemas for API validation.""" + +from datetime import datetime +from uuid import UUID + +from pydantic import BaseModel, Field + + +class DocumentUploadRequest(BaseModel): + """Request schema for document upload. + + Note: The file itself is handled via FastAPI's UploadFile, + not through this schema. + """ + + description: str | None = Field( + default=None, + max_length=500, + description="Optional description of the document", + ) + + +class DocumentResponse(BaseModel): + """Response schema for a single document.""" + + id: UUID + filename: str + title: str + description: str | None + file_type: str + file_size_bytes: int + uploaded_by: UUID | None + is_indexed: bool + created_at: datetime + updated_at: datetime + + class Config: + """Pydantic configuration.""" + + from_attributes = True + + +class DocumentListResponse(BaseModel): + """Response schema for document list.""" + + documents: list[DocumentResponse] + total_count: int diff --git a/src/modules/documents/service.py b/src/modules/documents/service.py new file mode 100644 index 0000000..65a9a57 --- /dev/null +++ b/src/modules/documents/service.py @@ -0,0 +1,349 @@ +"""Document service for business logic.""" + +from pathlib import Path +from uuid import UUID + +import structlog + +from src.modules.documents.exceptions import ( + DocumentAlreadyExistsError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentValidationError, +) +from src.modules.documents.models import Document +from src.modules.documents.repository import DocumentRepository +from src.modules.rag import RAGService +from src.modules.rag.loader import 
DocumentLoadError, load_document + +logger = structlog.get_logger() + +# Supported file extensions +SUPPORTED_EXTENSIONS = {".md", ".txt"} + +# Maximum file size in bytes (1 MB) +MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 + +# Maximum number of uploaded documents +MAX_DOCUMENTS = 20 + +# Characters not allowed in filenames +FORBIDDEN_FILENAME_CHARS = {"/", "\\", "\x00", "\n", "\r"} + + +class DocumentService: + """Service for document management business logic. + + Orchestrates document upload, deletion, and indexing operations. + Handles the coordination between file storage, database, and vector store. + """ + + def __init__( + self, + repository: DocumentRepository, + rag_service: RAGService, + uploads_path: Path, + documents_path: Path, + ) -> None: + """Initialize the document service. + + Args: + repository: Document repository for database operations. + rag_service: RAG service for document indexing. + uploads_path: Path to the uploads directory. + documents_path: Path to the static documents directory. + """ + self._repo = repository + self._rag = rag_service + self._uploads_path = uploads_path + self._documents_path = documents_path + + # Ensure uploads directory exists + self._uploads_path.mkdir(parents=True, exist_ok=True) + + async def upload_document( + self, + file_content: bytes, + filename: str, + *, + description: str | None = None, + uploaded_by: UUID | None = None, + ) -> Document: + """Upload and index a new document. + + Steps: + 1. Validate filename, size, extension + 2. Check for duplicates + 3. Save file to uploads directory + 4. Extract title from content + 5. Create database record + 6. Index via RAG service + 7. Update index status + 8. Return Document + + Args: + file_content: Raw file content as bytes. + filename: Original filename. + description: Optional document description. + uploaded_by: ID of the user uploading the document. + + Returns: + Created and indexed Document. + + Raises: + DocumentValidationError: Invalid file format or size. 
+ DocumentAlreadyExistsError: Document with same filename exists. + DocumentIndexingError: Indexing failed. + """ + # Validate filename + if not filename: + raise DocumentValidationError("Filename is required") + + # Sanitize filename - only use the base name and strip whitespace + safe_filename = Path(filename).name.strip() + + # Robust filename validation + if not safe_filename or safe_filename in (".", ".."): + raise DocumentValidationError("Invalid filename") + + # Reject hidden files (starting with .) + if safe_filename.startswith("."): + raise DocumentValidationError("Hidden files are not allowed") + + # Check for forbidden characters + if any(char in safe_filename for char in FORBIDDEN_FILENAME_CHARS): + raise DocumentValidationError("Filename contains invalid characters") + + # Validate extension + suffix = Path(safe_filename).suffix.lower() + if suffix not in SUPPORTED_EXTENSIONS: + raise DocumentValidationError( + f"Unsupported file type: {suffix}. Supported: {', '.join(SUPPORTED_EXTENSIONS)}" + ) + + # Validate size + file_size = len(file_content) + if file_size > MAX_FILE_SIZE_BYTES: + max_mb = MAX_FILE_SIZE_BYTES / (1024 * 1024) + raise DocumentValidationError( + f"File too large. Maximum size: {max_mb:.0f} MB" + ) + + if file_size == 0: + raise DocumentValidationError("File is empty") + + # Check for duplicates + existing = await self._repo.get_by_filename(safe_filename) + if existing: + raise DocumentAlreadyExistsError(safe_filename) + + # Check document count limit + current_count = await self._repo.count() + if current_count >= MAX_DOCUMENTS: + raise DocumentValidationError( + f"Maximum document limit reached ({MAX_DOCUMENTS}). " + "Please delete existing documents before uploading new ones." 
+ ) + + # Determine file path + file_path = self._uploads_path / safe_filename + + try: + # Save file + file_path.write_bytes(file_content) + logger.debug( + "document_file_saved", filename=safe_filename, path=str(file_path) + ) + + # Extract title using existing loader + loaded_doc = load_document(file_path) + title = loaded_doc.title + file_type = loaded_doc.document_type + + # Create database record + doc = await self._repo.create( + filename=safe_filename, + title=title, + file_path=file_path, + file_type=file_type, + file_size_bytes=file_size, + description=description, + uploaded_by=uploaded_by, + ) + + # Index the document + result = await self._rag.index_document(file_path) + + if not result.success: + # Indexing failed - clean up + await self._repo.delete(doc.id) + file_path.unlink(missing_ok=True) + raise DocumentIndexingError( + safe_filename, result.error_message or "Unknown error" + ) + + # Update index status + await self._repo.update_index_status(doc.id, is_indexed=True) + + logger.info( + "document_uploaded", + doc_id=str(doc.id), + filename=safe_filename, + title=title, + chunks_created=result.chunks_created, + uploaded_by=str(uploaded_by) if uploaded_by else None, + ) + + # Return updated document + return await self._repo.get_by_id(doc.id) or doc + + except ( + DocumentAlreadyExistsError, + DocumentValidationError, + DocumentIndexingError, + ): + # Re-raise domain exceptions + raise + except DocumentLoadError as e: + # Clean up on load error + file_path.unlink(missing_ok=True) + raise DocumentValidationError(f"Failed to parse document: {e}") from e + except Exception as e: + # Clean up on any other error + file_path.unlink(missing_ok=True) + logger.error( + "document_upload_failed", + filename=safe_filename, + error=str(e), + error_type=type(e).__name__, + ) + raise DocumentValidationError(f"Upload failed: {e}") from e + + async def delete_document(self, doc_id: UUID) -> None: + """Delete a document and reindex the vector store. 
+ + Due to ChromaDB limitations, we cannot delete individual document + chunks. Instead, we: + 1. Delete the database record + 2. Delete the file from filesystem + 3. Clear the entire vector store + 4. Reindex all remaining documents + + Args: + doc_id: UUID of the document to delete. + + Raises: + DocumentNotFoundError: Document doesn't exist. + """ + # Get document + doc = await self._repo.get_by_id(doc_id) + if doc is None: + raise DocumentNotFoundError(str(doc_id)) + + filename = doc.filename + file_path = doc.file_path + + # Delete from database first + await self._repo.delete(doc_id) + + # Delete file + if file_path.exists(): + file_path.unlink() + logger.debug("document_file_deleted", filename=filename) + + # Clear and reindex + await self._reindex_all() + + logger.info( + "document_deleted_and_reindexed", + doc_id=str(doc_id), + filename=filename, + ) + + async def _reindex_all(self) -> None: + """Clear vector store and reindex all documents. + + Reindexes documents from both: + - Uploaded documents (./uploads/) + - Static documents (./documents/) + + Updates database records to reflect actual indexing status. 
+ """ + # Clear existing index + await self._rag.clear_index() + + # Track successfully indexed filenames + successfully_indexed: set[str] = set() + + # Reindex static documents + if self._documents_path.exists(): + results = await self._rag.index_all_documents(self._documents_path) + for r in results: + if r.success: + successfully_indexed.add(r.source) + success_count = sum(1 for r in results if r.success) + logger.debug( + "static_documents_reindexed", + path=str(self._documents_path), + success_count=success_count, + total=len(results), + ) + + # Reindex uploaded documents + if self._uploads_path.exists(): + results = await self._rag.index_all_documents(self._uploads_path) + for r in results: + if r.success: + successfully_indexed.add(r.source) + success_count = sum(1 for r in results if r.success) + logger.debug( + "uploaded_documents_reindexed", + path=str(self._uploads_path), + success_count=success_count, + total=len(results), + ) + + # Update index status based on actual indexing results + docs = await self._repo.list_all() + for doc in docs: + is_indexed = doc.filename in successfully_indexed + await self._repo.update_index_status(doc.id, is_indexed=is_indexed) + if not is_indexed: + logger.warning( + "document_reindex_failed", + doc_id=str(doc.id), + filename=doc.filename, + ) + + async def list_documents(self) -> list[Document]: + """List all uploaded documents. + + Returns: + List of documents ordered by creation date (newest first). + """ + return await self._repo.list_all() + + async def get_document(self, doc_id: UUID) -> Document: + """Get a document by ID. + + Args: + doc_id: Document UUID. + + Returns: + Document instance. + + Raises: + DocumentNotFoundError: Document doesn't exist. + """ + doc = await self._repo.get_by_id(doc_id) + if doc is None: + raise DocumentNotFoundError(str(doc_id)) + return doc + + async def get_document_count(self) -> int: + """Get the total number of uploaded documents. + + Returns: + Number of documents. 
+ """ + return await self._repo.count() diff --git a/src/web/admin_routes.py b/src/web/admin_routes.py index a585d3d..5247d76 100644 --- a/src/web/admin_routes.py +++ b/src/web/admin_routes.py @@ -3,19 +3,29 @@ from dataclasses import dataclass from pathlib import Path from typing import Annotated, Any +from uuid import UUID import structlog -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, File, Form, Request, UploadFile from fastapi.responses import HTMLResponse, Response from src.config import Settings, get_settings from src.infrastructure.cache import ChromaSemanticCache +from src.infrastructure.database import get_database from src.infrastructure.embeddings import ( EmbeddingProviderError, OpenAIEmbeddingProvider, ) from src.infrastructure.llm import OpenRouterProvider from src.infrastructure.vectordb import ChromaVectorStore +from src.modules.documents import ( + DocumentAlreadyExistsError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentRepository, + DocumentService, + DocumentValidationError, +) from src.modules.rag import HybridRetriever, RAGService, list_documents, load_document from src.modules.rag.loader import DocumentLoadError from src.web.dependencies import require_admin @@ -80,6 +90,28 @@ def get_admin_rag_service( ) +def get_document_service( + settings: Annotated[Settings, Depends(get_settings)], + rag_service: Annotated[RAGService | None, Depends(get_admin_rag_service)], +) -> DocumentService | None: + """Get the document service for document management operations. + + Returns None if RAG is not configured. 
+ """ + if rag_service is None: + return None + + database = get_database() + repository = DocumentRepository(database) + + return DocumentService( + repository=repository, + rag_service=rag_service, + uploads_path=Path(settings.uploads_path), + documents_path=Path(settings.documents_path), + ) + + def _load_document_info(doc_path: Path) -> DocumentInfo: """Load document metadata for admin display. @@ -112,17 +144,22 @@ async def admin_index( vector_store: Annotated[ChromaVectorStore, Depends(get_vector_store)], semantic_cache: Annotated[ChromaSemanticCache | None, Depends(get_semantic_cache)], hybrid_retriever: Annotated[HybridRetriever | None, Depends(get_hybrid_retriever)], + doc_service: Annotated[DocumentService | None, Depends(get_document_service)], _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)], ) -> Response: """Render the admin dashboard. Requires admin authentication when enabled. """ + # Static documents from ./documents/ documents_path = Path(settings.documents_path) document_paths = list_documents(documents_path) + static_documents = [_load_document_info(doc_path) for doc_path in document_paths] - # Load document metadata for each document - documents = [_load_document_info(doc_path) for doc_path in document_paths] + # Uploaded documents from database + uploaded_documents = [] + if doc_service is not None: + uploaded_documents = await doc_service.list_documents() # Check configuration status llm_configured = settings.openrouter_api_key is not None @@ -142,9 +179,11 @@ async def admin_index( request=request, name="admin/index.html", context={ - "documents": documents, + "static_documents": static_documents, + "uploaded_documents": uploaded_documents, "chunk_count": vector_store.count(), "documents_path": str(documents_path), + "uploads_path": str(settings.uploads_path), "llm_configured": llm_configured, "embeddings_configured": embeddings_configured, "cache_enabled": cache_enabled, @@ -162,7 +201,7 @@ async def index_documents( 
rag_service: Annotated[RAGService | None, Depends(get_admin_rag_service)], _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)], ) -> Response: - """Index all documents in the documents folder. + """Index all documents from both static and uploads directories. Requires admin authentication when enabled. """ @@ -176,13 +215,22 @@ async def index_documents( ) try: - documents_path = Path(settings.documents_path) - # Clear existing index await rag_service.clear_index() - # Index all documents - results = await rag_service.index_all_documents(documents_path) + results = [] + + # Index static documents + documents_path = Path(settings.documents_path) + if documents_path.exists(): + static_results = await rag_service.index_all_documents(documents_path) + results.extend(static_results) + + # Index uploaded documents + uploads_path = Path(settings.uploads_path) + if uploads_path.exists(): + upload_results = await rag_service.index_all_documents(uploads_path) + results.extend(upload_results) total_chunks = sum(r.chunks_created for r in results if r.success) success_count = sum(1 for r in results if r.success) @@ -271,3 +319,207 @@ async def clear_cache( "message": f"Failed to clear cache: {e}", }, ) + + +# Document Management Routes + + +@router.post("/documents/upload", response_class=HTMLResponse) +async def upload_document( + request: Request, + file: Annotated[UploadFile, File()], + description: Annotated[str | None, Form()] = None, + doc_service: Annotated[ + DocumentService | None, Depends(get_document_service) + ] = None, + admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)] = None, +) -> Response: + """Upload and index a new document. + + Requires admin authentication when enabled. + """ + if doc_service is None: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={ + "error_message": "RAG not configured. Please set OPENROUTER_API_KEY and EMBEDDING_API_KEY." 
+ }, + ) + + if not file.filename: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": "No file selected."}, + ) + + # Sanitize filename early for safe logging + safe_filename = Path(file.filename).name.strip() if file.filename else "unknown" + + # Validate file size before reading into memory (defense against memory exhaustion) + max_size_bytes = 1 * 1024 * 1024 # 1 MB + if file.size is not None and file.size > max_size_bytes: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": "File too large. Maximum size: 1 MB"}, + ) + + try: + # Read file content + file_content = await file.read() + + # Double-check size after reading (in case file.size was unavailable) + if len(file_content) > max_size_bytes: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": "File too large. 
Maximum size: 1 MB"}, + ) + + # Get user ID if authenticated + uploaded_by = None + if admin_user: + uploaded_by = UUID(admin_user["user_id"]) + + # Upload and index document + doc = await doc_service.upload_document( + file_content=file_content, + filename=file.filename, + description=description, + uploaded_by=uploaded_by, + ) + + logger.info( + "admin_document_uploaded", + filename=doc.filename, + title=doc.title, + uploaded_by=str(uploaded_by) if uploaded_by else "anonymous", + ) + + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_success.html", + context={ + "document": doc, + }, + ) + + except DocumentAlreadyExistsError as e: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": f"Document '{e.filename}' already exists."}, + ) + + except DocumentValidationError as e: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": str(e)}, + ) + + except DocumentIndexingError as e: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": f"Indexing failed: {e.reason}"}, + ) + + except Exception as e: + logger.error( + "admin_document_upload_error", + error=str(e), + error_type=type(e).__name__, + filename=safe_filename, # Use sanitized filename for safe logging + ) + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": f"Upload failed: {e}"}, + ) + + +@router.delete("/documents/{doc_id}", response_class=HTMLResponse) +async def delete_document( + request: Request, + doc_id: UUID, + doc_service: Annotated[ + DocumentService | None, Depends(get_document_service) + ] = None, + _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)] = None, +) -> Response: + """Delete a document and rebuild the index. 
+ + Due to ChromaDB limitations, this clears and rebuilds the entire index. + Requires admin authentication when enabled. + """ + if doc_service is None: + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_error.html", + context={ + "error_message": "RAG not configured. Please set OPENROUTER_API_KEY and EMBEDDING_API_KEY." + }, + ) + + try: + # Get document info before deletion for logging + doc = await doc_service.get_document(doc_id) + filename = doc.filename + + # Delete document (includes reindexing) + await doc_service.delete_document(doc_id) + + logger.info("admin_document_deleted", doc_id=str(doc_id), filename=filename) + + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_success.html", + context={"filename": filename}, + ) + + except DocumentNotFoundError: + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_error.html", + context={"error_message": "Document not found."}, + ) + + except Exception as e: + logger.error( + "admin_document_delete_error", + error=str(e), + error_type=type(e).__name__, + doc_id=str(doc_id), + ) + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_error.html", + context={"error_message": f"Delete failed: {e}"}, + ) + + +@router.get("/documents", response_class=HTMLResponse) +async def list_uploaded_documents( + request: Request, + doc_service: Annotated[ + DocumentService | None, Depends(get_document_service) + ] = None, + _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)] = None, +) -> Response: + """List all uploaded documents. + + Requires admin authentication when enabled. + Returns an HTMX partial for the documents list. 
+ """ + documents = [] + if doc_service is not None: + documents = await doc_service.list_documents() + + return templates.TemplateResponse( + request=request, + name="admin/partials/documents_list.html", + context={"documents": documents}, + ) diff --git a/src/web/templates.py b/src/web/templates.py index 6f7a445..17831c0 100644 --- a/src/web/templates.py +++ b/src/web/templates.py @@ -94,7 +94,7 @@ def markdown_filter(text: str) -> Markup: strip=True, # Strip disallowed tags instead of escaping them ) - return Markup(clean_html) + return Markup(clean_html) # nosec B704 - sanitized by bleach.clean() # Register the markdown filter diff --git a/src/web/templates/admin/index.html b/src/web/templates/admin/index.html index 473b4ec..5ebaf26 100644 --- a/src/web/templates/admin/index.html +++ b/src/web/templates/admin/index.html @@ -52,7 +52,7 @@
+ Location: {{ uploads_path }}
+
Loading...
+
Location: {{ documents_path }}
+ (read-only, managed via filesystem)
- No documents found. Add .md or .txt files to the documents folder. + No static documents found. Add .md or .txt files to the documents folder.
{% endif %}{{ error_message }}
++ "{{ filename }}" has been deleted and the index has been rebuilt. +
++ No uploaded documents yet. Use the form above to upload your first document. +
+{% endif %} diff --git a/src/web/templates/admin/partials/upload_error.html b/src/web/templates/admin/partials/upload_error.html new file mode 100644 index 0000000..5bd54ef --- /dev/null +++ b/src/web/templates/admin/partials/upload_error.html @@ -0,0 +1,11 @@ +{{ error_message }}
++ {{ document.title }} ({{ document.filename }}) +
++ The document has been indexed and is now available for Q&A. +
+