From 3fe3bc92d89d2b9348ee7db090ea82ebb83fbf3f Mon Sep 17 00:00:00 2001 From: Chris Krough <461869+ckrough@users.noreply.github.com> Date: Sat, 27 Dec 2025 16:27:27 -0500 Subject: [PATCH 1/3] feat: add document management via admin web interface Add ability for admins to upload, delete, and list documents through the web UI. Documents are stored on filesystem with metadata in SQLite. Key features: - Upload .md/.txt files (max 1 MB, limit 20 documents) - Auto-extract title from content - Delete with confirmation (triggers full reindex due to ChromaDB limitation) - List uploaded documents with metadata (title, type, size, upload date) - Separate sections for uploaded vs static documents Security: - Path traversal prevention with robust filename validation - File size validation before reading into memory - Document count limit to prevent abuse - Sanitized logging to prevent log injection Also fixes pre-existing bandit findings: - B104: Add nosec for intentional 0.0.0.0 binding - B106: Add nosec for OAuth2 "bearer" token type - B704: Add nosec for bleach-sanitized Markup --- src/config.py | 5 +- src/infrastructure/database/connection.py | 18 ++ src/modules/auth/service.py | 2 +- src/web/admin_routes.py | 270 +++++++++++++++++- src/web/templates.py | 2 +- src/web/templates/admin/index.html | 86 +++++- .../admin/partials/delete_error.html | 11 + .../admin/partials/delete_success.html | 18 ++ .../admin/partials/documents_list.html | 60 ++++ .../admin/partials/upload_error.html | 11 + .../admin/partials/upload_success.html | 21 ++ uv.lock | 2 +- 12 files changed, 486 insertions(+), 20 deletions(-) create mode 100644 src/web/templates/admin/partials/delete_error.html create mode 100644 src/web/templates/admin/partials/delete_success.html create mode 100644 src/web/templates/admin/partials/documents_list.html create mode 100644 src/web/templates/admin/partials/upload_error.html create mode 100644 src/web/templates/admin/partials/upload_success.html diff --git a/src/config.py 
b/src/config.py index 9adc396..b047175 100644 --- a/src/config.py +++ b/src/config.py @@ -20,8 +20,8 @@ class Settings(BaseSettings): app_version: str = "0.1.0" debug: bool = False - # Server - host: str = "0.0.0.0" + # Server - bind to all interfaces for container deployment + host: str = "0.0.0.0" # nosec B104 port: int = 8000 # LLM Provider (OpenRouter) @@ -53,6 +53,7 @@ class Settings(BaseSettings): rag_chunk_overlap: int = 800 # Characters rag_top_k: int = 5 # Number of chunks to retrieve documents_path: str = "./documents" + uploads_path: str = "./uploads" # Directory for user-uploaded documents # Semantic Cache cache_enabled: bool = True # Enable semantic caching for faster responses diff --git a/src/infrastructure/database/connection.py b/src/infrastructure/database/connection.py index e9f8801..4f11b57 100644 --- a/src/infrastructure/database/connection.py +++ b/src/infrastructure/database/connection.py @@ -35,6 +35,24 @@ ); CREATE INDEX IF NOT EXISTS idx_messages_user_created ON messages(user_id, created_at); + +CREATE TABLE IF NOT EXISTS documents ( + id TEXT PRIMARY KEY, + filename TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, + description TEXT, + file_path TEXT NOT NULL, + file_type TEXT NOT NULL, + file_size_bytes INTEGER NOT NULL, + uploaded_by TEXT, + is_indexed INTEGER DEFAULT 0, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL, + FOREIGN KEY (uploaded_by) REFERENCES users(id) ON DELETE SET NULL +); + +CREATE INDEX IF NOT EXISTS idx_documents_filename ON documents(filename); +CREATE INDEX IF NOT EXISTS idx_documents_created ON documents(created_at); """ diff --git a/src/modules/auth/service.py b/src/modules/auth/service.py index ae7d27b..ee37eb4 100644 --- a/src/modules/auth/service.py +++ b/src/modules/auth/service.py @@ -133,7 +133,7 @@ def create_token(self, user: User) -> LoginResponse: return LoginResponse( access_token=token, - token_type="bearer", + token_type="bearer", # nosec B106 - OAuth2 token type, not a password 
expires_in=self._jwt_expire_hours * 3600, user=UserResponse( id=user.id, diff --git a/src/web/admin_routes.py b/src/web/admin_routes.py index a585d3d..5247d76 100644 --- a/src/web/admin_routes.py +++ b/src/web/admin_routes.py @@ -3,19 +3,29 @@ from dataclasses import dataclass from pathlib import Path from typing import Annotated, Any +from uuid import UUID import structlog -from fastapi import APIRouter, Depends, Request +from fastapi import APIRouter, Depends, File, Form, Request, UploadFile from fastapi.responses import HTMLResponse, Response from src.config import Settings, get_settings from src.infrastructure.cache import ChromaSemanticCache +from src.infrastructure.database import get_database from src.infrastructure.embeddings import ( EmbeddingProviderError, OpenAIEmbeddingProvider, ) from src.infrastructure.llm import OpenRouterProvider from src.infrastructure.vectordb import ChromaVectorStore +from src.modules.documents import ( + DocumentAlreadyExistsError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentRepository, + DocumentService, + DocumentValidationError, +) from src.modules.rag import HybridRetriever, RAGService, list_documents, load_document from src.modules.rag.loader import DocumentLoadError from src.web.dependencies import require_admin @@ -80,6 +90,28 @@ def get_admin_rag_service( ) +def get_document_service( + settings: Annotated[Settings, Depends(get_settings)], + rag_service: Annotated[RAGService | None, Depends(get_admin_rag_service)], +) -> DocumentService | None: + """Get the document service for document management operations. + + Returns None if RAG is not configured. 
+ """ + if rag_service is None: + return None + + database = get_database() + repository = DocumentRepository(database) + + return DocumentService( + repository=repository, + rag_service=rag_service, + uploads_path=Path(settings.uploads_path), + documents_path=Path(settings.documents_path), + ) + + def _load_document_info(doc_path: Path) -> DocumentInfo: """Load document metadata for admin display. @@ -112,17 +144,22 @@ async def admin_index( vector_store: Annotated[ChromaVectorStore, Depends(get_vector_store)], semantic_cache: Annotated[ChromaSemanticCache | None, Depends(get_semantic_cache)], hybrid_retriever: Annotated[HybridRetriever | None, Depends(get_hybrid_retriever)], + doc_service: Annotated[DocumentService | None, Depends(get_document_service)], _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)], ) -> Response: """Render the admin dashboard. Requires admin authentication when enabled. """ + # Static documents from ./documents/ documents_path = Path(settings.documents_path) document_paths = list_documents(documents_path) + static_documents = [_load_document_info(doc_path) for doc_path in document_paths] - # Load document metadata for each document - documents = [_load_document_info(doc_path) for doc_path in document_paths] + # Uploaded documents from database + uploaded_documents = [] + if doc_service is not None: + uploaded_documents = await doc_service.list_documents() # Check configuration status llm_configured = settings.openrouter_api_key is not None @@ -142,9 +179,11 @@ async def admin_index( request=request, name="admin/index.html", context={ - "documents": documents, + "static_documents": static_documents, + "uploaded_documents": uploaded_documents, "chunk_count": vector_store.count(), "documents_path": str(documents_path), + "uploads_path": str(settings.uploads_path), "llm_configured": llm_configured, "embeddings_configured": embeddings_configured, "cache_enabled": cache_enabled, @@ -162,7 +201,7 @@ async def index_documents( 
rag_service: Annotated[RAGService | None, Depends(get_admin_rag_service)], _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)], ) -> Response: - """Index all documents in the documents folder. + """Index all documents from both static and uploads directories. Requires admin authentication when enabled. """ @@ -176,13 +215,22 @@ async def index_documents( ) try: - documents_path = Path(settings.documents_path) - # Clear existing index await rag_service.clear_index() - # Index all documents - results = await rag_service.index_all_documents(documents_path) + results = [] + + # Index static documents + documents_path = Path(settings.documents_path) + if documents_path.exists(): + static_results = await rag_service.index_all_documents(documents_path) + results.extend(static_results) + + # Index uploaded documents + uploads_path = Path(settings.uploads_path) + if uploads_path.exists(): + upload_results = await rag_service.index_all_documents(uploads_path) + results.extend(upload_results) total_chunks = sum(r.chunks_created for r in results if r.success) success_count = sum(1 for r in results if r.success) @@ -271,3 +319,207 @@ async def clear_cache( "message": f"Failed to clear cache: {e}", }, ) + + +# Document Management Routes + + +@router.post("/documents/upload", response_class=HTMLResponse) +async def upload_document( + request: Request, + file: Annotated[UploadFile, File()], + description: Annotated[str | None, Form()] = None, + doc_service: Annotated[ + DocumentService | None, Depends(get_document_service) + ] = None, + admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)] = None, +) -> Response: + """Upload and index a new document. + + Requires admin authentication when enabled. + """ + if doc_service is None: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={ + "error_message": "RAG not configured. Please set OPENROUTER_API_KEY and EMBEDDING_API_KEY." 
+ }, + ) + + if not file.filename: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": "No file selected."}, + ) + + # Sanitize filename early for safe logging + safe_filename = Path(file.filename).name.strip() if file.filename else "unknown" + + # Validate file size before reading into memory (defense against memory exhaustion) + max_size_bytes = 1 * 1024 * 1024 # 1 MB + if file.size is not None and file.size > max_size_bytes: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": "File too large. Maximum size: 1 MB"}, + ) + + try: + # Read file content + file_content = await file.read() + + # Double-check size after reading (in case file.size was unavailable) + if len(file_content) > max_size_bytes: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": "File too large. 
Maximum size: 1 MB"}, + ) + + # Get user ID if authenticated + uploaded_by = None + if admin_user: + uploaded_by = UUID(admin_user["user_id"]) + + # Upload and index document + doc = await doc_service.upload_document( + file_content=file_content, + filename=file.filename, + description=description, + uploaded_by=uploaded_by, + ) + + logger.info( + "admin_document_uploaded", + filename=doc.filename, + title=doc.title, + uploaded_by=str(uploaded_by) if uploaded_by else "anonymous", + ) + + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_success.html", + context={ + "document": doc, + }, + ) + + except DocumentAlreadyExistsError as e: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": f"Document '{e.filename}' already exists."}, + ) + + except DocumentValidationError as e: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": str(e)}, + ) + + except DocumentIndexingError as e: + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": f"Indexing failed: {e.reason}"}, + ) + + except Exception as e: + logger.error( + "admin_document_upload_error", + error=str(e), + error_type=type(e).__name__, + filename=safe_filename, # Use sanitized filename for safe logging + ) + return templates.TemplateResponse( + request=request, + name="admin/partials/upload_error.html", + context={"error_message": f"Upload failed: {e}"}, + ) + + +@router.delete("/documents/{doc_id}", response_class=HTMLResponse) +async def delete_document( + request: Request, + doc_id: UUID, + doc_service: Annotated[ + DocumentService | None, Depends(get_document_service) + ] = None, + _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)] = None, +) -> Response: + """Delete a document and rebuild the index. 
+ + Due to ChromaDB limitations, this clears and rebuilds the entire index. + Requires admin authentication when enabled. + """ + if doc_service is None: + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_error.html", + context={ + "error_message": "RAG not configured. Please set OPENROUTER_API_KEY and EMBEDDING_API_KEY." + }, + ) + + try: + # Get document info before deletion for logging + doc = await doc_service.get_document(doc_id) + filename = doc.filename + + # Delete document (includes reindexing) + await doc_service.delete_document(doc_id) + + logger.info("admin_document_deleted", doc_id=str(doc_id), filename=filename) + + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_success.html", + context={"filename": filename}, + ) + + except DocumentNotFoundError: + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_error.html", + context={"error_message": "Document not found."}, + ) + + except Exception as e: + logger.error( + "admin_document_delete_error", + error=str(e), + error_type=type(e).__name__, + doc_id=str(doc_id), + ) + return templates.TemplateResponse( + request=request, + name="admin/partials/delete_error.html", + context={"error_message": f"Delete failed: {e}"}, + ) + + +@router.get("/documents", response_class=HTMLResponse) +async def list_uploaded_documents( + request: Request, + doc_service: Annotated[ + DocumentService | None, Depends(get_document_service) + ] = None, + _admin_user: Annotated[dict[str, Any] | None, Depends(require_admin)] = None, +) -> Response: + """List all uploaded documents. + + Requires admin authentication when enabled. + Returns an HTMX partial for the documents list. 
+ """ + documents = [] + if doc_service is not None: + documents = await doc_service.list_documents() + + return templates.TemplateResponse( + request=request, + name="admin/partials/documents_list.html", + context={"documents": documents}, + ) diff --git a/src/web/templates.py b/src/web/templates.py index 6f7a445..17831c0 100644 --- a/src/web/templates.py +++ b/src/web/templates.py @@ -94,7 +94,7 @@ def markdown_filter(text: str) -> Markup: strip=True, # Strip disallowed tags instead of escaping them ) - return Markup(clean_html) + return Markup(clean_html) # nosec B704 - sanitized by bleach.clean() # Register the markdown filter diff --git a/src/web/templates/admin/index.html b/src/web/templates/admin/index.html index 473b4ec..5ebaf26 100644 --- a/src/web/templates/admin/index.html +++ b/src/web/templates/admin/index.html @@ -52,7 +52,7 @@

Document Index

Chunks Indexed
-
{{ documents|length }}
+
{{ static_documents|length + uploaded_documents|length }}
Documents Available
@@ -115,16 +115,90 @@

Document Index

- + +
+

Upload Document

+
+
+ + +

Maximum 20 documents total

+
+
+ + +
+
+ + + Uploading and indexing... + +
+
+
+
+ + +
+

Uploaded Documents

+

+ Location: {{ uploads_path }} +

+ +
+

Loading...

+
+
+ + Deleting and rebuilding index... + +
+ +
-

Available Documents

+

Static Documents

Location: {{ documents_path }} + (read-only, managed via filesystem)

- {% if documents %} + {% if static_documents %} {% else %}

- No documents found. Add .md or .txt files to the documents folder. + No static documents found. Add .md or .txt files to the documents folder.

{% endif %}
diff --git a/src/web/templates/admin/partials/delete_error.html b/src/web/templates/admin/partials/delete_error.html new file mode 100644 index 0000000..bdf8023 --- /dev/null +++ b/src/web/templates/admin/partials/delete_error.html @@ -0,0 +1,11 @@ +
+
+ + + +
+

Delete failed

+

{{ error_message }}

+
+
+
diff --git a/src/web/templates/admin/partials/delete_success.html b/src/web/templates/admin/partials/delete_success.html new file mode 100644 index 0000000..8c9e52e --- /dev/null +++ b/src/web/templates/admin/partials/delete_success.html @@ -0,0 +1,18 @@ +
+
+ + + +
+

Document deleted

+

+ "{{ filename }}" has been deleted and the index has been rebuilt. +

+
+
+
+ + diff --git a/src/web/templates/admin/partials/documents_list.html b/src/web/templates/admin/partials/documents_list.html new file mode 100644 index 0000000..49295bc --- /dev/null +++ b/src/web/templates/admin/partials/documents_list.html @@ -0,0 +1,60 @@ +{% if documents %} + +{% else %} +

+ No uploaded documents yet. Use the form above to upload your first document. +

+{% endif %} diff --git a/src/web/templates/admin/partials/upload_error.html b/src/web/templates/admin/partials/upload_error.html new file mode 100644 index 0000000..5bd54ef --- /dev/null +++ b/src/web/templates/admin/partials/upload_error.html @@ -0,0 +1,11 @@ +
+
+ + + +
+

Upload failed

+

{{ error_message }}

+
+
+
diff --git a/src/web/templates/admin/partials/upload_success.html b/src/web/templates/admin/partials/upload_success.html new file mode 100644 index 0000000..69c114a --- /dev/null +++ b/src/web/templates/admin/partials/upload_success.html @@ -0,0 +1,21 @@ +
+
+ + + +
+

Document uploaded successfully!

+

+ {{ document.title }} ({{ document.filename }}) +

+

+ The document has been indexed and is now available for Q&A. +

+
+
+
+ + diff --git a/uv.lock b/uv.lock index d7eb58b..5eba9dc 100644 --- a/uv.lock +++ b/uv.lock @@ -1793,7 +1793,7 @@ wheels = [ [[package]] name = "retriever" -version = "0.1.0" +version = "1.0.0" source = { editable = "." } dependencies = [ { name = "aiobreaker" }, From 5c9ea3d8e2b6ce2f4477f04565b36454eaae3650 Mon Sep 17 00:00:00 2001 From: Chris Krough <461869+ckrough@users.noreply.github.com> Date: Sat, 27 Dec 2025 16:33:24 -0500 Subject: [PATCH 2/3] fix: add missing documents module to git The .gitignore pattern 'documents/' was matching src/modules/documents/. Changed to '/documents/' to only match root documents folder. --- .gitignore | 4 +- src/modules/documents/__init__.py | 35 +++ src/modules/documents/exceptions.py | 39 ++++ src/modules/documents/models.py | 48 ++++ src/modules/documents/repository.py | 218 +++++++++++++++++ src/modules/documents/schemas.py | 47 ++++ src/modules/documents/service.py | 349 ++++++++++++++++++++++++++++ 7 files changed, 738 insertions(+), 2 deletions(-) create mode 100644 src/modules/documents/__init__.py create mode 100644 src/modules/documents/exceptions.py create mode 100644 src/modules/documents/models.py create mode 100644 src/modules/documents/repository.py create mode 100644 src/modules/documents/schemas.py create mode 100644 src/modules/documents/service.py diff --git a/.gitignore b/.gitignore index 97b2fd0..d45b2ae 100644 --- a/.gitignore +++ b/.gitignore @@ -57,5 +57,5 @@ Thumbs.db # Vector database persistence data/ -# Knowledge Documents -documents/ +# Knowledge Documents (root folder only) +/documents/ diff --git a/src/modules/documents/__init__.py b/src/modules/documents/__init__.py new file mode 100644 index 0000000..853ef0a --- /dev/null +++ b/src/modules/documents/__init__.py @@ -0,0 +1,35 @@ +"""Document management module. + +Provides functionality for uploading, managing, and indexing documents +for the RAG pipeline. 
+""" + +from src.modules.documents.exceptions import ( + DocumentAlreadyExistsError, + DocumentError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentValidationError, +) +from src.modules.documents.models import Document +from src.modules.documents.repository import DocumentRepository +from src.modules.documents.schemas import ( + DocumentListResponse, + DocumentResponse, + DocumentUploadRequest, +) +from src.modules.documents.service import DocumentService + +__all__ = [ + "Document", + "DocumentAlreadyExistsError", + "DocumentError", + "DocumentIndexingError", + "DocumentListResponse", + "DocumentNotFoundError", + "DocumentRepository", + "DocumentResponse", + "DocumentService", + "DocumentUploadRequest", + "DocumentValidationError", +] diff --git a/src/modules/documents/exceptions.py b/src/modules/documents/exceptions.py new file mode 100644 index 0000000..845cb62 --- /dev/null +++ b/src/modules/documents/exceptions.py @@ -0,0 +1,39 @@ +"""Document management exceptions.""" + + +class DocumentError(Exception): + """Base exception for document operations.""" + + pass + + +class DocumentNotFoundError(DocumentError): + """Raised when a document is not found.""" + + def __init__(self, identifier: str) -> None: + self.identifier = identifier + super().__init__(f"Document not found: {identifier}") + + +class DocumentAlreadyExistsError(DocumentError): + """Raised when a document with the same filename already exists.""" + + def __init__(self, filename: str) -> None: + self.filename = filename + super().__init__(f"Document already exists: {filename}") + + +class DocumentValidationError(DocumentError): + """Raised when document validation fails.""" + + def __init__(self, message: str) -> None: + super().__init__(message) + + +class DocumentIndexingError(DocumentError): + """Raised when document indexing fails.""" + + def __init__(self, filename: str, reason: str) -> None: + self.filename = filename + self.reason = reason + super().__init__(f"Failed to index 
document '{filename}': {reason}") diff --git a/src/modules/documents/models.py b/src/modules/documents/models.py new file mode 100644 index 0000000..25b2806 --- /dev/null +++ b/src/modules/documents/models.py @@ -0,0 +1,48 @@ +"""Document domain models.""" + +from dataclasses import dataclass +from datetime import datetime +from pathlib import Path +from typing import Any +from uuid import UUID + + +@dataclass +class Document: + """Document domain model representing an uploaded document.""" + + id: UUID + filename: str + title: str + description: str | None + file_path: Path + file_type: str # 'markdown' or 'text' + file_size_bytes: int + uploaded_by: UUID | None + is_indexed: bool + created_at: datetime + updated_at: datetime + + @classmethod + def from_row(cls, row: dict[str, Any]) -> "Document": + """Create a Document from a database row. + + Args: + row: Dictionary containing database row data. + + Returns: + Document instance. + """ + return cls( + id=UUID(str(row["id"])), + filename=str(row["filename"]), + title=str(row["title"]), + description=str(row["description"]) if row["description"] else None, + file_path=Path(str(row["file_path"])), + file_type=str(row["file_type"]), + file_size_bytes=int(row["file_size_bytes"]), + uploaded_by=UUID(str(row["uploaded_by"])) if row["uploaded_by"] else None, + is_indexed=bool(row["is_indexed"]), + created_at=datetime.fromisoformat(str(row["created_at"])), + updated_at=datetime.fromisoformat(str(row["updated_at"])), + ) diff --git a/src/modules/documents/repository.py b/src/modules/documents/repository.py new file mode 100644 index 0000000..0bde968 --- /dev/null +++ b/src/modules/documents/repository.py @@ -0,0 +1,218 @@ +"""Document repository for database operations.""" + +from datetime import UTC, datetime +from pathlib import Path +from uuid import UUID, uuid4 + +import structlog + +from src.infrastructure.database import Database +from src.modules.documents.exceptions import ( + DocumentAlreadyExistsError, + 
DocumentNotFoundError, +) +from src.modules.documents.models import Document + +logger = structlog.get_logger() + + +class DocumentRepository: + """Repository for document CRUD operations. + + Handles all database interactions for document metadata. + File storage is handled by the service layer. + """ + + def __init__(self, database: Database) -> None: + """Initialize the repository. + + Args: + database: Database connection instance. + """ + self._db = database + + async def create( + self, + filename: str, + title: str, + file_path: Path, + file_type: str, + file_size_bytes: int, + *, + description: str | None = None, + uploaded_by: UUID | None = None, + ) -> Document: + """Create a new document record. + + Args: + filename: Original filename. + title: Document title (extracted or provided). + file_path: Path where file is stored. + file_type: Type of document ('markdown' or 'text'). + file_size_bytes: Size of the file in bytes. + description: Optional document description. + uploaded_by: ID of the user who uploaded the document. + + Returns: + Created Document instance. + + Raises: + DocumentAlreadyExistsError: If a document with the same filename exists. + """ + doc_id = uuid4() + now = datetime.now(UTC).isoformat() + + try: + await self._db.execute( + """ + INSERT INTO documents ( + id, filename, title, description, file_path, file_type, + file_size_bytes, uploaded_by, is_indexed, created_at, updated_at + ) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, 0, ?, ?) 
+ """, + ( + str(doc_id), + filename, + title, + description, + str(file_path), + file_type, + file_size_bytes, + str(uploaded_by) if uploaded_by else None, + now, + now, + ), + ) + except Exception as e: + if "UNIQUE constraint failed" in str(e): + raise DocumentAlreadyExistsError(filename) from e + raise + + logger.info( + "document_created", + doc_id=str(doc_id), + filename=filename, + file_type=file_type, + ) + + return Document( + id=doc_id, + filename=filename, + title=title, + description=description, + file_path=file_path, + file_type=file_type, + file_size_bytes=file_size_bytes, + uploaded_by=uploaded_by, + is_indexed=False, + created_at=datetime.fromisoformat(now), + updated_at=datetime.fromisoformat(now), + ) + + async def get_by_id(self, doc_id: UUID) -> Document | None: + """Get a document by its ID. + + Args: + doc_id: Document UUID. + + Returns: + Document if found, None otherwise. + """ + row = await self._db.fetch_one( + "SELECT * FROM documents WHERE id = ?", + (str(doc_id),), + ) + return Document.from_row(dict(row)) if row else None + + async def get_by_filename(self, filename: str) -> Document | None: + """Get a document by its filename. + + Args: + filename: Document filename. + + Returns: + Document if found, None otherwise. + """ + row = await self._db.fetch_one( + "SELECT * FROM documents WHERE filename = ?", + (filename,), + ) + return Document.from_row(dict(row)) if row else None + + async def list_all(self, *, include_unindexed: bool = True) -> list[Document]: + """List all documents. + + Args: + include_unindexed: Whether to include documents that are not indexed. + + Returns: + List of documents ordered by creation date (newest first). 
+ """ + if include_unindexed: + rows = await self._db.fetch_all( + "SELECT * FROM documents ORDER BY created_at DESC" + ) + else: + rows = await self._db.fetch_all( + "SELECT * FROM documents WHERE is_indexed = 1 ORDER BY created_at DESC" + ) + return [Document.from_row(dict(row)) for row in rows] + + async def update_index_status(self, doc_id: UUID, is_indexed: bool) -> None: + """Update the indexing status of a document. + + Args: + doc_id: Document UUID. + is_indexed: New indexing status. + + Raises: + DocumentNotFoundError: If document doesn't exist. + """ + now = datetime.now(UTC).isoformat() + cursor = await self._db.execute( + "UPDATE documents SET is_indexed = ?, updated_at = ? WHERE id = ?", + (1 if is_indexed else 0, now, str(doc_id)), + ) + if cursor.rowcount == 0: + raise DocumentNotFoundError(str(doc_id)) + + logger.debug( + "document_index_status_updated", + doc_id=str(doc_id), + is_indexed=is_indexed, + ) + + async def delete(self, doc_id: UUID) -> Document: + """Delete a document record. + + Args: + doc_id: Document UUID. + + Returns: + The deleted Document. + + Raises: + DocumentNotFoundError: If document doesn't exist. + """ + # First get the document + doc = await self.get_by_id(doc_id) + if doc is None: + raise DocumentNotFoundError(str(doc_id)) + + await self._db.execute( + "DELETE FROM documents WHERE id = ?", + (str(doc_id),), + ) + + logger.info("document_deleted", doc_id=str(doc_id), filename=doc.filename) + return doc + + async def count(self) -> int: + """Get the total number of documents. + + Returns: + Number of documents. 
+ """ + row = await self._db.fetch_one("SELECT COUNT(*) as count FROM documents") + return int(row["count"]) if row else 0 diff --git a/src/modules/documents/schemas.py b/src/modules/documents/schemas.py new file mode 100644 index 0000000..5e3fb21 --- /dev/null +++ b/src/modules/documents/schemas.py @@ -0,0 +1,47 @@ +"""Document Pydantic schemas for API validation.""" + +from datetime import datetime +from uuid import UUID + +from pydantic import BaseModel, Field + + +class DocumentUploadRequest(BaseModel): + """Request schema for document upload. + + Note: The file itself is handled via FastAPI's UploadFile, + not through this schema. + """ + + description: str | None = Field( + default=None, + max_length=500, + description="Optional description of the document", + ) + + +class DocumentResponse(BaseModel): + """Response schema for a single document.""" + + id: UUID + filename: str + title: str + description: str | None + file_type: str + file_size_bytes: int + uploaded_by: UUID | None + is_indexed: bool + created_at: datetime + updated_at: datetime + + class Config: + """Pydantic configuration.""" + + from_attributes = True + + +class DocumentListResponse(BaseModel): + """Response schema for document list.""" + + documents: list[DocumentResponse] + total_count: int diff --git a/src/modules/documents/service.py b/src/modules/documents/service.py new file mode 100644 index 0000000..65a9a57 --- /dev/null +++ b/src/modules/documents/service.py @@ -0,0 +1,349 @@ +"""Document service for business logic.""" + +from pathlib import Path +from uuid import UUID + +import structlog + +from src.modules.documents.exceptions import ( + DocumentAlreadyExistsError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentValidationError, +) +from src.modules.documents.models import Document +from src.modules.documents.repository import DocumentRepository +from src.modules.rag import RAGService +from src.modules.rag.loader import DocumentLoadError, load_document + +logger = 
"""Document service for business logic."""

from pathlib import Path
from uuid import UUID

import structlog

from src.modules.documents.exceptions import (
    DocumentAlreadyExistsError,
    DocumentIndexingError,
    DocumentNotFoundError,
    DocumentValidationError,
)
from src.modules.documents.models import Document
from src.modules.documents.repository import DocumentRepository
from src.modules.rag import RAGService
from src.modules.rag.loader import DocumentLoadError, load_document

logger = structlog.get_logger()

# Supported file extensions for upload.
SUPPORTED_EXTENSIONS = {".md", ".txt"}

# Maximum file size in bytes (1 MB).
MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024

# Maximum number of uploaded documents (abuse guard).
MAX_DOCUMENTS = 20

# Characters never allowed in filenames: path separators, NUL, and newlines
# (newlines in a filename would also enable log injection).
FORBIDDEN_FILENAME_CHARS = {"/", "\\", "\x00", "\n", "\r"}


class DocumentService:
    """Service for document management business logic.

    Orchestrates document upload, deletion, and indexing operations.
    Handles the coordination between file storage, database, and vector store.
    """

    def __init__(
        self,
        repository: DocumentRepository,
        rag_service: RAGService,
        uploads_path: Path,
        documents_path: Path,
    ) -> None:
        """Initialize the document service.

        Args:
            repository: Document repository for database operations.
            rag_service: RAG service for document indexing.
            uploads_path: Path to the uploads directory.
            documents_path: Path to the static documents directory.
        """
        self._repo = repository
        self._rag = rag_service
        self._uploads_path = uploads_path
        self._documents_path = documents_path

        # Ensure uploads directory exists.
        self._uploads_path.mkdir(parents=True, exist_ok=True)

    async def upload_document(
        self,
        file_content: bytes,
        filename: str,
        *,
        description: str | None = None,
        uploaded_by: UUID | None = None,
    ) -> Document:
        """Upload and index a new document.

        Steps:
        1. Validate filename, size, extension
        2. Check for duplicates and document-count limit
        3. Save file to uploads directory
        4. Extract title from content
        5. Create database record
        6. Index via RAG service
        7. Update index status
        8. Return Document

        Args:
            file_content: Raw file content as bytes.
            filename: Original filename.
            description: Optional document description.
            uploaded_by: ID of the user uploading the document.

        Returns:
            Created and indexed Document.

        Raises:
            DocumentValidationError: Invalid file format or size.
            DocumentAlreadyExistsError: Document with same filename exists.
            DocumentIndexingError: Indexing failed.
        """
        # Validate filename
        if not filename:
            raise DocumentValidationError("Filename is required")

        # Sanitize filename - only use the base name and strip whitespace
        safe_filename = Path(filename).name.strip()

        # Robust filename validation
        if not safe_filename or safe_filename in (".", ".."):
            raise DocumentValidationError("Invalid filename")

        # Reject hidden files (starting with .)
        if safe_filename.startswith("."):
            raise DocumentValidationError("Hidden files are not allowed")

        # Check for forbidden characters. Note: on POSIX, Path(...).name does
        # not split on backslashes, so this check is what rejects Windows-style
        # separators rather than silently keeping them.
        if any(char in safe_filename for char in FORBIDDEN_FILENAME_CHARS):
            raise DocumentValidationError("Filename contains invalid characters")

        # Validate extension. sorted() makes the error message deterministic;
        # joining the raw set produced arbitrary ordering between runs.
        suffix = Path(safe_filename).suffix.lower()
        if suffix not in SUPPORTED_EXTENSIONS:
            raise DocumentValidationError(
                f"Unsupported file type: {suffix}. "
                f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
            )

        # Validate size (before any filesystem or DB work).
        file_size = len(file_content)
        if file_size > MAX_FILE_SIZE_BYTES:
            max_mb = MAX_FILE_SIZE_BYTES / (1024 * 1024)
            raise DocumentValidationError(
                f"File too large. Maximum size: {max_mb:.0f} MB"
            )

        if file_size == 0:
            raise DocumentValidationError("File is empty")

        # Check for duplicates
        existing = await self._repo.get_by_filename(safe_filename)
        if existing:
            raise DocumentAlreadyExistsError(safe_filename)

        # Check document count limit
        current_count = await self._repo.count()
        if current_count >= MAX_DOCUMENTS:
            raise DocumentValidationError(
                f"Maximum document limit reached ({MAX_DOCUMENTS}). "
                "Please delete existing documents before uploading new ones."
            )

        # Determine file path
        file_path = self._uploads_path / safe_filename

        # Track the DB record so the generic error handler below can undo it.
        doc: Document | None = None

        try:
            # Save file
            file_path.write_bytes(file_content)
            logger.debug(
                "document_file_saved", filename=safe_filename, path=str(file_path)
            )

            # Extract title using existing loader
            loaded_doc = load_document(file_path)
            title = loaded_doc.title
            file_type = loaded_doc.document_type

            # Create database record
            doc = await self._repo.create(
                filename=safe_filename,
                title=title,
                file_path=file_path,
                file_type=file_type,
                file_size_bytes=file_size,
                description=description,
                uploaded_by=uploaded_by,
            )

            # Index the document
            result = await self._rag.index_document(file_path)

            if not result.success:
                # Indexing failed - clean up record and file before raising
                await self._repo.delete(doc.id)
                file_path.unlink(missing_ok=True)
                raise DocumentIndexingError(
                    safe_filename, result.error_message or "Unknown error"
                )

            # Update index status
            await self._repo.update_index_status(doc.id, is_indexed=True)

            logger.info(
                "document_uploaded",
                doc_id=str(doc.id),
                filename=safe_filename,
                title=title,
                chunks_created=result.chunks_created,
                uploaded_by=str(uploaded_by) if uploaded_by else None,
            )

            # Return updated document (re-read to pick up is_indexed=True)
            return await self._repo.get_by_id(doc.id) or doc

        except (
            DocumentAlreadyExistsError,
            DocumentValidationError,
            DocumentIndexingError,
        ):
            # Re-raise domain exceptions (their branches already cleaned up)
            raise
        except DocumentLoadError as e:
            # Clean up on load error; no DB record exists yet at this point
            file_path.unlink(missing_ok=True)
            raise DocumentValidationError(f"Failed to parse document: {e}") from e
        except Exception as e:
            # FIX: the original only unlinked the file here, leaving an
            # orphaned DB row if the failure happened after repo.create()
            # (e.g. index_document raised instead of returning a failure
            # result, or update_index_status raised). Remove both.
            if doc is not None:
                await self._repo.delete(doc.id)
            file_path.unlink(missing_ok=True)
            logger.error(
                "document_upload_failed",
                filename=safe_filename,
                error=str(e),
                error_type=type(e).__name__,
            )
            raise DocumentValidationError(f"Upload failed: {e}") from e

    async def delete_document(self, doc_id: UUID) -> None:
        """Delete a document and reindex the vector store.

        Due to ChromaDB limitations, we cannot delete individual document
        chunks. Instead, we:
        1. Delete the database record
        2. Delete the file from filesystem
        3. Clear the entire vector store
        4. Reindex all remaining documents

        Args:
            doc_id: UUID of the document to delete.

        Raises:
            DocumentNotFoundError: Document doesn't exist.
        """
        # Get document
        doc = await self._repo.get_by_id(doc_id)
        if doc is None:
            raise DocumentNotFoundError(str(doc_id))

        filename = doc.filename
        file_path = doc.file_path

        # Delete from database first
        await self._repo.delete(doc_id)

        # Delete file
        if file_path.exists():
            file_path.unlink()
            logger.debug("document_file_deleted", filename=filename)

        # Clear and reindex
        await self._reindex_all()

        logger.info(
            "document_deleted_and_reindexed",
            doc_id=str(doc_id),
            filename=filename,
        )

    async def _reindex_all(self) -> None:
        """Clear vector store and reindex all documents.

        Reindexes documents from both:
        - Uploaded documents (./uploads/)
        - Static documents (./documents/)

        Updates database records to reflect actual indexing status.
        """
        # Clear existing index
        await self._rag.clear_index()

        # Track successfully indexed filenames.
        # NOTE(review): this assumes IndexingResult.source is the bare
        # filename (matching Document.filename) — confirm against the RAG
        # service; if source were a full path, no row would ever match.
        successfully_indexed: set[str] = set()

        # Reindex static documents
        if self._documents_path.exists():
            results = await self._rag.index_all_documents(self._documents_path)
            for r in results:
                if r.success:
                    successfully_indexed.add(r.source)
            success_count = sum(1 for r in results if r.success)
            logger.debug(
                "static_documents_reindexed",
                path=str(self._documents_path),
                success_count=success_count,
                total=len(results),
            )

        # Reindex uploaded documents
        if self._uploads_path.exists():
            results = await self._rag.index_all_documents(self._uploads_path)
            for r in results:
                if r.success:
                    successfully_indexed.add(r.source)
            success_count = sum(1 for r in results if r.success)
            logger.debug(
                "uploaded_documents_reindexed",
                path=str(self._uploads_path),
                success_count=success_count,
                total=len(results),
            )

        # Update index status based on actual indexing results
        docs = await self._repo.list_all()
        for doc in docs:
            is_indexed = doc.filename in successfully_indexed
            await self._repo.update_index_status(doc.id, is_indexed=is_indexed)
            if not is_indexed:
                logger.warning(
                    "document_reindex_failed",
                    doc_id=str(doc.id),
                    filename=doc.filename,
                )

    async def list_documents(self) -> list[Document]:
        """List all uploaded documents.

        Returns:
            List of documents ordered by creation date (newest first).
        """
        return await self._repo.list_all()

    async def get_document(self, doc_id: UUID) -> Document:
        """Get a document by ID.

        Args:
            doc_id: Document UUID.

        Returns:
            Document instance.

        Raises:
            DocumentNotFoundError: Document doesn't exist.
        """
        doc = await self._repo.get_by_id(doc_id)
        if doc is None:
            raise DocumentNotFoundError(str(doc_id))
        return doc

    async def get_document_count(self) -> int:
        """Get the total number of uploaded documents.

        Returns:
            Number of documents.
        """
        return await self._repo.count()
+ """ + return await self._repo.count() From e134e197f20f05c32d44c1179a547591a64957ab Mon Sep 17 00:00:00 2001 From: Chris Krough <461869+ckrough@users.noreply.github.com> Date: Sat, 27 Dec 2025 16:38:08 -0500 Subject: [PATCH 3/3] test: add comprehensive tests for documents module Add 31 tests covering: - Document model and from_row deserialization - DocumentRepository CRUD operations - DocumentService upload, delete, list operations - Validation (size, extension, filename, document limit) - Exception handling and error cases Restores coverage to 80%+ (was 75% without tests). --- tests/test_documents.py | 671 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 671 insertions(+) create mode 100644 tests/test_documents.py diff --git a/tests/test_documents.py b/tests/test_documents.py new file mode 100644 index 0000000..999439d --- /dev/null +++ b/tests/test_documents.py @@ -0,0 +1,671 @@ +"""Tests for document management module.""" + +import tempfile +from datetime import UTC, datetime +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock +from uuid import uuid4 + +import pytest + +from src.infrastructure.database import Database +from src.modules.documents import ( + Document, + DocumentAlreadyExistsError, + DocumentIndexingError, + DocumentNotFoundError, + DocumentRepository, + DocumentService, + DocumentValidationError, +) +from src.modules.rag import RAGService +from src.modules.rag.schemas import IndexingResult + + +@pytest.fixture +async def database() -> Database: + """Create a temporary test database.""" + with tempfile.TemporaryDirectory() as tmpdir: + db_path = Path(tmpdir) / "test.db" + db = Database(db_path) + await db.connect() + yield db + await db.disconnect() + + +@pytest.fixture +async def repository(database: Database) -> DocumentRepository: + """Create a document repository with test database.""" + return DocumentRepository(database) + + +@pytest.fixture +def mock_rag_service() -> MagicMock: + """Create a mock RAG 
service.""" + mock = MagicMock(spec=RAGService) + mock.index_document = AsyncMock( + return_value=IndexingResult( + source="test.md", + success=True, + chunks_created=5, + error_message=None, + ) + ) + mock.clear_index = AsyncMock() + mock.index_all_documents = AsyncMock(return_value=[]) + return mock + + +@pytest.fixture +def temp_dirs() -> tuple[Path, Path]: + """Create temporary upload and documents directories.""" + with tempfile.TemporaryDirectory() as tmpdir: + uploads = Path(tmpdir) / "uploads" + documents = Path(tmpdir) / "documents" + uploads.mkdir() + documents.mkdir() + yield uploads, documents + + +class TestDocumentModel: + """Tests for Document model.""" + + def test_document_creation(self) -> None: + """Should create a document with all fields.""" + now = datetime.now(UTC) + doc = Document( + id=uuid4(), + filename="test.md", + title="Test Document", + description="A test document", + file_path=Path("/tmp/test.md"), + file_type="markdown", + file_size_bytes=1024, + uploaded_by=uuid4(), + is_indexed=True, + created_at=now, + updated_at=now, + ) + assert doc.filename == "test.md" + assert doc.title == "Test Document" + assert doc.is_indexed + + def test_document_from_row(self) -> None: + """Should create document from database row.""" + now = datetime.now(UTC).isoformat() + doc_id = str(uuid4()) + user_id = str(uuid4()) + row = { + "id": doc_id, + "filename": "test.md", + "title": "Test Document", + "description": "A test", + "file_path": "/tmp/test.md", + "file_type": "markdown", + "file_size_bytes": 1024, + "uploaded_by": user_id, + "is_indexed": 1, + "created_at": now, + "updated_at": now, + } + doc = Document.from_row(row) + assert doc.filename == "test.md" + assert doc.is_indexed + assert doc.uploaded_by is not None + + def test_document_from_row_without_uploaded_by(self) -> None: + """Should handle null uploaded_by.""" + now = datetime.now(UTC).isoformat() + row = { + "id": str(uuid4()), + "filename": "test.md", + "title": "Test", + 
"description": None, + "file_path": "/tmp/test.md", + "file_type": "text", + "file_size_bytes": 512, + "uploaded_by": None, + "is_indexed": 0, + "created_at": now, + "updated_at": now, + } + doc = Document.from_row(row) + assert doc.uploaded_by is None + assert not doc.is_indexed + + +class TestDocumentRepository: + """Tests for DocumentRepository.""" + + @pytest.mark.asyncio + async def test_create_document(self, repository: DocumentRepository) -> None: + """Should create a document in the database.""" + doc = await repository.create( + filename="test.md", + title="Test Document", + file_path=Path("/tmp/test.md"), + file_type="markdown", + file_size_bytes=1024, + ) + assert doc.filename == "test.md" + assert doc.title == "Test Document" + assert doc.id is not None + + @pytest.mark.asyncio + async def test_create_document_with_description( + self, repository: DocumentRepository + ) -> None: + """Should create document with optional description.""" + doc = await repository.create( + filename="test.txt", + title="Full Doc", + file_path=Path("/tmp/test.txt"), + file_type="text", + file_size_bytes=512, + description="A full document", + ) + assert doc.description == "A full document" + assert doc.uploaded_by is None + + @pytest.mark.asyncio + async def test_get_by_id(self, repository: DocumentRepository) -> None: + """Should retrieve document by ID.""" + doc = await repository.create( + filename="test.md", + title="Test", + file_path=Path("/tmp/test.md"), + file_type="markdown", + file_size_bytes=100, + ) + retrieved = await repository.get_by_id(doc.id) + assert retrieved is not None + assert retrieved.id == doc.id + assert retrieved.filename == "test.md" + + @pytest.mark.asyncio + async def test_get_by_id_not_found(self, repository: DocumentRepository) -> None: + """Should return None for non-existent ID.""" + result = await repository.get_by_id(uuid4()) + assert result is None + + @pytest.mark.asyncio + async def test_get_by_filename(self, repository: 
DocumentRepository) -> None: + """Should retrieve document by filename.""" + await repository.create( + filename="unique.md", + title="Unique", + file_path=Path("/tmp/unique.md"), + file_type="markdown", + file_size_bytes=200, + ) + result = await repository.get_by_filename("unique.md") + assert result is not None + assert result.filename == "unique.md" + + @pytest.mark.asyncio + async def test_get_by_filename_not_found( + self, repository: DocumentRepository + ) -> None: + """Should return None for non-existent filename.""" + result = await repository.get_by_filename("nonexistent.md") + assert result is None + + @pytest.mark.asyncio + async def test_list_all(self, repository: DocumentRepository) -> None: + """Should list all documents ordered by created_at desc.""" + await repository.create( + filename="first.md", + title="First", + file_path=Path("/tmp/first.md"), + file_type="markdown", + file_size_bytes=100, + ) + await repository.create( + filename="second.md", + title="Second", + file_path=Path("/tmp/second.md"), + file_type="markdown", + file_size_bytes=200, + ) + docs = await repository.list_all() + assert len(docs) == 2 + # Newest first + assert docs[0].filename == "second.md" + assert docs[1].filename == "first.md" + + @pytest.mark.asyncio + async def test_update_index_status(self, repository: DocumentRepository) -> None: + """Should update document index status.""" + doc = await repository.create( + filename="test.md", + title="Test", + file_path=Path("/tmp/test.md"), + file_type="markdown", + file_size_bytes=100, + ) + assert not doc.is_indexed + + await repository.update_index_status(doc.id, is_indexed=True) + updated = await repository.get_by_id(doc.id) + assert updated is not None + assert updated.is_indexed + + @pytest.mark.asyncio + async def test_delete(self, repository: DocumentRepository) -> None: + """Should delete a document.""" + doc = await repository.create( + filename="delete-me.md", + title="Delete Me", + 
file_path=Path("/tmp/delete-me.md"), + file_type="markdown", + file_size_bytes=50, + ) + await repository.delete(doc.id) + result = await repository.get_by_id(doc.id) + assert result is None + + @pytest.mark.asyncio + async def test_count(self, repository: DocumentRepository) -> None: + """Should count documents.""" + assert await repository.count() == 0 + + await repository.create( + filename="one.md", + title="One", + file_path=Path("/tmp/one.md"), + file_type="markdown", + file_size_bytes=100, + ) + assert await repository.count() == 1 + + await repository.create( + filename="two.md", + title="Two", + file_path=Path("/tmp/two.md"), + file_type="markdown", + file_size_bytes=100, + ) + assert await repository.count() == 2 + + +class TestDocumentService: + """Tests for DocumentService.""" + + @pytest.mark.asyncio + async def test_upload_document_success( + self, + database: Database, + mock_rag_service: MagicMock, + temp_dirs: tuple[Path, Path], + ) -> None: + """Should upload and index a document.""" + uploads, documents = temp_dirs + repo = DocumentRepository(database) + service = DocumentService( + repository=repo, + rag_service=mock_rag_service, + uploads_path=uploads, + documents_path=documents, + ) + + content = b"# Test Document\n\nThis is a test." 
def _make_service(
    database: Database,
    rag_service: MagicMock,
    dirs: tuple[Path, Path],
) -> tuple[DocumentService, DocumentRepository, Path]:
    """Wire a DocumentService to a fresh repository and temp directories.

    Returns (service, repository, uploads_path) so tests can assert against
    both the service API and the underlying storage. Extracting this helper
    removes the identical construction boilerplate repeated in every test
    below (behavior is unchanged).
    """
    uploads, documents = dirs
    repo = DocumentRepository(database)
    service = DocumentService(
        repository=repo,
        rag_service=rag_service,
        uploads_path=uploads,
        documents_path=documents,
    )
    return service, repo, uploads


class TestDocumentService:
    """Tests for DocumentService."""

    @pytest.mark.asyncio
    async def test_upload_document_success(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should upload and index a document."""
        service, _repo, uploads = _make_service(database, mock_rag_service, temp_dirs)

        content = b"# Test Document\n\nThis is a test."
        doc = await service.upload_document(
            file_content=content,
            filename="test.md",
            description="A test document",
        )

        assert doc.filename == "test.md"
        assert doc.title == "Test Document"
        assert (uploads / "test.md").exists()
        mock_rag_service.index_document.assert_called_once()

    @pytest.mark.asyncio
    async def test_upload_document_validates_extension(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should reject unsupported file types."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        with pytest.raises(DocumentValidationError, match="Unsupported file type"):
            await service.upload_document(
                file_content=b"test",
                filename="test.pdf",
            )

    @pytest.mark.asyncio
    async def test_upload_document_validates_size(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should reject files that are too large."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        # 2 MB content (exceeds 1 MB limit)
        large_content = b"x" * (2 * 1024 * 1024)
        with pytest.raises(DocumentValidationError, match="File too large"):
            await service.upload_document(
                file_content=large_content,
                filename="large.md",
            )

    @pytest.mark.asyncio
    async def test_upload_document_validates_empty(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should reject empty files."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        with pytest.raises(DocumentValidationError, match="File is empty"):
            await service.upload_document(
                file_content=b"",
                filename="empty.md",
            )

    @pytest.mark.asyncio
    async def test_upload_document_rejects_duplicate(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should reject duplicate filenames."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        content = b"# Test\n\nContent"
        await service.upload_document(file_content=content, filename="dup.md")

        with pytest.raises(DocumentAlreadyExistsError):
            await service.upload_document(file_content=content, filename="dup.md")

    @pytest.mark.asyncio
    async def test_upload_document_validates_filename(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should reject invalid filenames."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        with pytest.raises(DocumentValidationError, match="Hidden files"):
            await service.upload_document(
                file_content=b"test",
                filename=".hidden.md",
            )

    @pytest.mark.asyncio
    async def test_upload_document_enforces_limit(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should reject uploads when at document limit."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        # Create 20 documents to hit limit
        for i in range(20):
            content = f"# Doc {i}\n\nContent".encode()
            await service.upload_document(file_content=content, filename=f"doc{i}.md")

        with pytest.raises(DocumentValidationError, match="Maximum document limit"):
            await service.upload_document(
                file_content=b"# Too many\n\nContent",
                filename="toomany.md",
            )

    @pytest.mark.asyncio
    async def test_upload_document_handles_indexing_failure(
        self,
        database: Database,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should clean up on indexing failure."""
        # Custom mock whose indexing reports failure (not the shared fixture).
        mock_rag = MagicMock(spec=RAGService)
        mock_rag.index_document = AsyncMock(
            return_value=IndexingResult(
                source="fail.md",
                success=False,
                chunks_created=0,
                error_message="Embedding failed",
            )
        )

        service, repo, uploads = _make_service(database, mock_rag, temp_dirs)

        with pytest.raises(DocumentIndexingError):
            await service.upload_document(
                file_content=b"# Fail\n\nContent",
                filename="fail.md",
            )

        # File should be cleaned up
        assert not (uploads / "fail.md").exists()
        # DB record should be cleaned up
        assert await repo.count() == 0

    @pytest.mark.asyncio
    async def test_delete_document(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should delete document and trigger reindex."""
        service, repo, uploads = _make_service(database, mock_rag_service, temp_dirs)

        content = b"# Delete Me\n\nContent"
        doc = await service.upload_document(file_content=content, filename="delete.md")

        await service.delete_document(doc.id)

        assert not (uploads / "delete.md").exists()
        assert await repo.get_by_id(doc.id) is None
        mock_rag_service.clear_index.assert_called()

    @pytest.mark.asyncio
    async def test_delete_document_not_found(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should raise error for non-existent document."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        with pytest.raises(DocumentNotFoundError):
            await service.delete_document(uuid4())

    @pytest.mark.asyncio
    async def test_list_documents(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should list all documents."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        await service.upload_document(
            file_content=b"# One\n\nContent", filename="one.md"
        )
        await service.upload_document(
            file_content=b"# Two\n\nContent", filename="two.md"
        )

        docs = await service.list_documents()
        assert len(docs) == 2

    @pytest.mark.asyncio
    async def test_get_document(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should get document by ID."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        created = await service.upload_document(
            file_content=b"# Get Me\n\nContent", filename="get.md"
        )
        doc = await service.get_document(created.id)
        assert doc.filename == "get.md"

    @pytest.mark.asyncio
    async def test_get_document_not_found(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should raise error for non-existent document."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        with pytest.raises(DocumentNotFoundError):
            await service.get_document(uuid4())

    @pytest.mark.asyncio
    async def test_get_document_count(
        self,
        database: Database,
        mock_rag_service: MagicMock,
        temp_dirs: tuple[Path, Path],
    ) -> None:
        """Should count documents."""
        service, _repo, _uploads = _make_service(database, mock_rag_service, temp_dirs)

        assert await service.get_document_count() == 0

        await service.upload_document(
            file_content=b"# Count\n\nContent", filename="count.md"
        )
        assert await service.get_document_count() == 1


class TestDocumentExceptions:
    """Tests for document exceptions."""

    def test_document_not_found_error(self) -> None:
        """Should create DocumentNotFoundError with identifier."""
        error = DocumentNotFoundError("abc123")
        assert "abc123" in str(error)

    def test_document_already_exists_error(self) -> None:
        """Should create DocumentAlreadyExistsError with filename."""
        error = DocumentAlreadyExistsError("test.md")
        assert error.filename == "test.md"
        assert "test.md" in str(error)

    def test_document_validation_error(self) -> None:
        """Should create DocumentValidationError with message."""
        error = DocumentValidationError("Invalid format")
        assert "Invalid format" in str(error)

    def test_document_indexing_error(self) -> None:
        """Should create DocumentIndexingError with details."""
        error = DocumentIndexingError("test.md", "Embedding failed")
        assert error.filename == "test.md"
        assert error.reason == "Embedding failed"
        assert "test.md" in str(error)