diff --git a/data/test_sync_folder/test.md b/data/test_sync_folder/test.md
new file mode 100644
index 00000000..46a4dfa0
--- /dev/null
+++ b/data/test_sync_folder/test.md
@@ -0,0 +1 @@
+Test Image
diff --git a/data/test_sync_folder/test_doc.txt b/data/test_sync_folder/test_doc.txt
new file mode 100644
index 00000000..557db03d
--- /dev/null
+++ b/data/test_sync_folder/test_doc.txt
@@ -0,0 +1 @@
+Hello World
diff --git a/data/test_sync_folder_2/doc2.txt b/data/test_sync_folder_2/doc2.txt
new file mode 100644
index 00000000..eae065bc
--- /dev/null
+++ b/data/test_sync_folder_2/doc2.txt
@@ -0,0 +1 @@
+Content 2
diff --git a/src/api/routers/knowledge.py b/src/api/routers/knowledge.py
index 5982a8f9..3b03bde8 100644
--- a/src/api/routers/knowledge.py
+++ b/src/api/routers/knowledge.py
@@ -142,6 +142,7 @@ async def run_upload_processing_task(
     base_url: str,
     uploaded_file_paths: list[str],
     rag_provider: str = None,
+    folder_id: str | None = None,  # For folder sync state tracking
 ):
     """Background task for processing uploaded files"""
     task_manager = TaskIDManager.get_instance()
@@ -179,18 +180,37 @@ async def run_upload_processing_task(
             current=0,
             total=len(processed_files),
         )
+
+        # Update sync state with SOURCE paths (not destination) for proper change detection
+        if folder_id:
+            try:
+                manager = get_kb_manager()
+                # Map processed files back to their source paths by matching filenames
+                processed_basenames = {p.name for p in processed_files or []}
+                synced_source_paths = [
+                    src for src in uploaded_file_paths if Path(src).name in processed_basenames
+                ]
+                manager.update_folder_sync_state(kb_name, folder_id, synced_source_paths)
+                logger.info(
+                    f"Updated sync state for {len(synced_source_paths)} source files in folder {folder_id}"
+                )
+            except Exception as e:
+                logger.error(f"Failed to update folder sync state: {e}")
+
         adder.extract_numbered_items_for_new_docs(processed_files, batch_size=20)
         adder.update_metadata(len(new_files))
+        # process_new_documents() returns None when nothing was processed
+        processed_count = len(processed_files) if processed_files else 0
         progress_tracker.update(
             ProgressStage.COMPLETED,
-            f"Successfully processed {len(processed_files)} files!",
-            current=len(processed_files),
-            total=len(processed_files),
+            f"Successfully processed {processed_count} files!",
+            current=processed_count,
+            total=processed_count or len(new_files),
         )
-        logger.success(f"[{task_id}] Processed {len(processed_files)} files to KB '{kb_name}'")
+        logger.success(f"[{task_id}] Processed {processed_count} files to KB '{kb_name}'")
         task_manager.update_task_status(task_id, "completed")
     except Exception as e:
         error_msg = f"Upload processing failed (KB '{kb_name}'): {e}"
diff --git a/src/knowledge/add_documents.py b/src/knowledge/add_documents.py
index 7e8cdfeb..b49ef735 100644
--- a/src/knowledge/add_documents.py
+++ b/src/knowledge/add_documents.py
@@ -207,22 +207,25 @@ async def process_new_documents(self, new_files: List[Path]):
         Uses FileTypeRouter to classify files and route them appropriately:
         - PDF/DOCX/images -> MinerU parser (full document analysis)
         - Text/Markdown -> Direct read + LightRAG insert (fast)
+
+        Falls back to LightRAG pipeline if RAGAnything is unavailable.
""" if not new_files: return None - if raganything_cls is None: - raise ImportError("RAGAnything module not found.") - - from src.services.rag.components.routing import FileTypeRouter - # Pre-import progress stage if needed to avoid overhead in loop ProgressStage: Any = None if self.progress_tracker: from src.knowledge.progress_tracker import ProgressStage + # Fall back to LightRAG if RAGAnything is not available + if raganything_cls is None: + logger.warning("RAGAnything not available, using LightRAG fallback for text processing") + return await self._process_with_lightrag_fallback(new_files, ProgressStage) + # Use unified LLM client from src/services/llm from src.services.llm import get_llm_client + from src.services.rag.components.routing import FileTypeRouter llm_client = get_llm_client() self.llm_cfg = llm_client.config @@ -349,6 +352,136 @@ async def unified_embed_func(texts): await self.fix_structure() return processed_files + async def _process_with_lightrag_fallback(self, new_files: List[Path], ProgressStage): + """Fallback processing using LightRAG when RAGAnything is not available. + + This is a simpler pipeline that: + 1. Reads text content from supported files + 2. Inserts into LightRAG for indexing + """ + from lightrag import LightRAG + from lightrag.llm.openai import openai_complete_if_cache + + self.llm_cfg = get_llm_config() + model = self.llm_cfg.model + api_key = self.api_key or self.llm_cfg.api_key + base_url = self.base_url or self.llm_cfg.base_url + + logger.info(f"Using LightRAG fallback for {len(new_files)} files") + + # Initialize LightRAG with async LLM function + async def llm_model_func(prompt, system_prompt=None, history_messages=[], **kwargs): + return await openai_complete_if_cache( + model, + prompt, + system_prompt=system_prompt, + history_messages=history_messages, + api_key=api_key, + base_url=base_url, + **kwargs, + ) + + # Use embedding config + reset_embedding_client() + embedding_cfg = get_embedding_config() + embedding_client = get_embedding_client() + + async def embedding_func(texts): + logger.info(f"Embedding {len(texts)} texts...") + result = await embedding_client.embed(texts) + return result + + try: + rag = LightRAG( + working_dir=str(self.rag_storage_dir), + llm_model_func=llm_model_func, + embedding_func=EmbeddingFunc( + embedding_dim=embedding_cfg.dim, + max_token_size=embedding_cfg.max_tokens, + func=embedding_func, + ), + ) + logger.info("LightRAG instance created successfully") + except Exception as e: + logger.error(f"Failed to create LightRAG instance: {e}") + raise + + processed_files = [] + for idx, doc_file in enumerate(new_files, 1): + try: + if self.progress_tracker and ProgressStage: + self.progress_tracker.update( + ProgressStage.PROCESSING_FILE, + f"Processing {doc_file.name} (fallback mode)", + current=idx, + total=len(new_files), + ) + + if not doc_file.exists(): + logger.error(f" ✗ Failed: File missing {doc_file.name}") + continue + + # Read text content based on file type + text_content = await self._extract_text_content(doc_file) + + if text_content and text_content.strip(): + # Insert into LightRAG + await rag.ainsert(text_content) + logger.info(f" ✓ Indexed (fallback): {doc_file.name}") + processed_files.append(doc_file) + self._record_successful_hash(doc_file) + else: + logger.warning(f" ⚠ No text content extracted: {doc_file.name}") + processed_files.append(doc_file) + self._record_successful_hash(doc_file) + + except Exception as e: + logger.exception(f" ✗ Failed {doc_file.name}: {e}") + + return processed_files + + 
+    async def _extract_text_content(self, file_path: Path) -> str:
+        """Extract text content from a file based on its extension."""
+        suffix = file_path.suffix.lower()
+
+        try:
+            if suffix in [".txt", ".md", ".markdown"]:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+                    return f.read()
+
+            elif suffix == ".pdf":
+                try:
+                    import fitz  # PyMuPDF
+
+                    doc = fitz.open(file_path)
+                    text_parts = []
+                    for page in doc:
+                        text_parts.append(page.get_text())
+                    doc.close()
+                    return "\n".join(text_parts)
+                except ImportError:
+                    logger.warning("PyMuPDF not available for PDF extraction")
+                    return ""
+
+            elif suffix == ".docx":
+                try:
+                    from docx import Document
+
+                    doc = Document(file_path)
+                    text_parts = [para.text for para in doc.paragraphs]
+                    return "\n".join(text_parts)
+                except ImportError:
+                    logger.warning("python-docx not available for Word extraction")
+                    return ""
+
+            elif suffix == ".doc":
+                # python-docx cannot parse legacy binary .doc files
+                logger.warning("Legacy .doc format not supported for text extraction")
+                return ""
+
+            else:
+                logger.warning(f"Unsupported file type for text extraction: {suffix}")
+                return ""
+
+        except Exception as e:
+            logger.error(f"Error extracting text from {file_path}: {e}")
+            return ""
+
     def _record_successful_hash(self, file_path: Path):
         """Update metadata with the hash of a successfully processed file."""
         file_hash = self._get_file_hash(file_path)
diff --git a/web/components/knowledge/ContentViewModal.tsx b/web/components/knowledge/ContentViewModal.tsx
new file mode 100644
index 00000000..3a4c09d9
--- /dev/null
+++ b/web/components/knowledge/ContentViewModal.tsx
@@ -0,0 +1,229 @@
+"use client";
+import { useState } from "react";
+import {
+  Database,
+  Search,
+  FileText,
+  Image as ImageIcon,
+  X,
+  Loader2,
+} from "lucide-react";
+import { apiUrl } from "@/lib/api";
+
+interface ContentViewModalProps {
+  isOpen: boolean;
+  onClose: () => void;
+  kbName: string;
+  content: {
+    documents: any[];
+    images: any[];
+  } | null;
+  loading: boolean;
+}
+
+export default function ContentViewModal({
+  isOpen,
+  onClose,
+  kbName,
+  content,
+  loading,
+}: ContentViewModalProps) {
+  const [activeContentTab, setActiveContentTab] = useState<
+    "documents" | "images"
+  >("documents");
+  const [searchTerm, setSearchTerm] = useState("");
+  const [deepSearchResults, setDeepSearchResults] = useState([]);
+  const [searchingDeep, setSearchingDeep] = useState(false);
+
+  // Helper functions
+  const formatFileSize = (bytes: number) => {
+    if (bytes === 0) return "0 B";
+    const k = 1024;
+    const sizes = ["B", "KB", "MB", "GB", "TB"];
+    const i = Math.floor(Math.log(bytes) / Math.log(k));
+    return parseFloat((bytes / Math.pow(k, i)).toFixed(1)) + " " + sizes[i];
+  };
+
+  const formatDate = (dateString: string) => {
+    return new Date(dateString).toLocaleDateString(undefined, {
+      year: "numeric",
+      month: "short",
+      day: "numeric",
+      hour: "2-digit",
+      minute: "2-digit",
+    });
+  };
+
+  if (!isOpen) return null;
+
+  return (
+    <div className="fixed inset-0 z-50 flex items-center justify-center bg-black/50 p-4">
+      <div className="w-full max-w-4xl max-h-[85vh] flex flex-col rounded-2xl bg-white dark:bg-slate-800 shadow-xl">
+        {/* Header */}
+        <div className="flex items-start justify-between p-5 border-b border-slate-200 dark:border-slate-700">
+          <div>
+            <h2 className="flex items-center gap-2 text-lg font-semibold">
+              <Database className="w-5 h-5 text-blue-500" />
+              Knowledge Base Content
+            </h2>
+            <p className="mt-1 text-sm text-slate-500 dark:text-slate-400">
+              Viewing content for{" "}
+              <span className="font-medium text-slate-700 dark:text-slate-200">
+                {kbName}
+              </span>
+            </p>
+          </div>
+          <button
+            onClick={onClose}
+            className="p-2 rounded-lg hover:bg-slate-100 dark:hover:bg-slate-700"
+          >
+            <X className="w-5 h-5" />
+          </button>
+        </div>
+
+        {/* Tabs */}
+        <div className="flex gap-2 px-5 pt-4">
+          <button
+            onClick={() => setActiveContentTab("documents")}
+            className={
+              activeContentTab === "documents"
+                ? "px-4 py-2 rounded-xl text-sm font-medium bg-blue-500 text-white"
+                : "px-4 py-2 rounded-xl text-sm font-medium bg-slate-100 dark:bg-slate-700 text-slate-600 dark:text-slate-300"
+            }
+          >
+            Documents
+          </button>
+          <button
+            onClick={() => setActiveContentTab("images")}
+            className={
+              activeContentTab === "images"
+                ? "px-4 py-2 rounded-xl text-sm font-medium bg-blue-500 text-white"
+                : "px-4 py-2 rounded-xl text-sm font-medium bg-slate-100 dark:bg-slate-700 text-slate-600 dark:text-slate-300"
+            }
+          >
+            Images
+          </button>
+        </div>
+
+        {/* Search */}
+        <div className="px-5 pt-4">
+          <div className="relative">
+            <Search className="absolute left-3 top-1/2 -translate-y-1/2 w-4 h-4 text-slate-400" />
+            <input
+              type="text"
+              placeholder="Search..."
+              value={searchTerm}
+              onChange={(e) => setSearchTerm(e.target.value)}
+              className="w-full pl-9 pr-4 py-2 border border-slate-200 dark:border-slate-600 rounded-xl bg-slate-50 dark:bg-slate-700 text-sm focus:outline-none focus:ring-2 focus:ring-blue-500 focus:border-transparent"
+            />
+          </div>
+        </div>
+
+        {/* Content Area */}
+        <div className="flex-1 overflow-y-auto p-5">
+          {loading ? (
+            <div className="flex items-center justify-center py-16">
+              <Loader2 className="w-6 h-6 animate-spin text-blue-500" />
+            </div>
+          ) : !content ||
+            (activeContentTab === "documents" &&
+              content.documents.length === 0) ||
+            (activeContentTab === "images" && content.images.length === 0) ? (
+            <div className="flex flex-col items-center justify-center py-16 text-slate-400">
+              {activeContentTab === "documents" ? (
+                <FileText className="w-8 h-8 mb-2" />
+              ) : (
+                <ImageIcon className="w-8 h-8 mb-2" />
+              )}
+              <p className="text-sm">No {activeContentTab} found</p>
+            </div>
+          ) : (
+            <>
+              {/* Documents View (Table) */}
+              {activeContentTab === "documents" && (
+                <table className="w-full text-sm">
+                  <thead>
+                    <tr className="text-left text-xs uppercase text-slate-500 border-b border-slate-200 dark:border-slate-700">
+                      <th className="py-2 pr-4 font-medium">Name</th>
+                      <th className="py-2 pr-4 font-medium">Size</th>
+                      <th className="py-2 font-medium">Modified</th>
+                    </tr>
+                  </thead>
+                  <tbody>
+                    {content.documents
+                      .filter((item) =>
+                        item.name
+                          .toLowerCase()
+                          .includes(searchTerm.toLowerCase()),
+                      )
+                      .map((item, i) => (
+                        <tr
+                          key={i}
+                          className="border-b border-slate-100 dark:border-slate-700/50"
+                        >
+                          <td className="py-2 pr-4">
+                            <div className="flex items-center gap-2">
+                              <FileText className="w-4 h-4 text-slate-400" />
+                              <span className="truncate" title={item.name}>
+                                {item.name}
+                              </span>
+                            </div>
+                          </td>
+                          <td className="py-2 pr-4 text-slate-500">
+                            {formatFileSize(item.size)}
+                          </td>
+                          <td className="py-2 text-slate-500">
+                            {formatDate(item.last_modified)}
+                          </td>
+                        </tr>
+                      ))}
+                  </tbody>
+                </table>
+              )}
+
+              {/* Images View (Grid) */}
+              {activeContentTab === "images" && (
+                <div className="grid grid-cols-2 sm:grid-cols-3 md:grid-cols-4 gap-4">
+                  {content.images
+                    .filter((item) =>
+                      item.name
+                        .toLowerCase()
+                        .includes(searchTerm.toLowerCase()),
+                    )
+                    .map((item, i) => (
+                      <div
+                        key={i}
+                        className="rounded-xl border border-slate-200 dark:border-slate-700 overflow-hidden"
+                      >
+                        <div className="aspect-square bg-slate-100 dark:bg-slate-700">
+                          <img
+                            src={apiUrl(item.url)}
+                            alt={item.name}
+                            className="w-full h-full object-cover"
+                          />
+                        </div>
+                        <div className="p-2">
+                          <div className="flex items-center gap-1">
+                            <ImageIcon className="w-3 h-3 shrink-0 text-slate-400" />
+                            <span className="text-xs truncate" title={item.name}>
+                              {item.name}
+                            </span>
+                          </div>
+                          <div className="flex justify-between mt-1 text-[10px] text-slate-400">
+                            <span>{formatFileSize(item.size || 0)}</span>
+                            <span>{formatDate(item.last_modified)}</span>
+                          </div>
+                        </div>
+                      </div>
+                    ))}
+                </div>
+              )}
+            </>
+          )}
+        </div>
+      </div>
+    </div>
+  );
+}
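For context on how `ContentViewModal` is meant to be driven, a minimal parent-page sketch; the fetch endpoint, the `apiUrl` call shape, and the state wiring here are assumptions for illustration, not part of this diff:

```tsx
"use client";
import { useState } from "react";
import ContentViewModal from "@/components/knowledge/ContentViewModal";
import { apiUrl } from "@/lib/api";

export default function KnowledgeBasePage({ kbName }: { kbName: string }) {
  const [open, setOpen] = useState(false);
  const [loading, setLoading] = useState(false);
  const [content, setContent] = useState<{
    documents: any[];
    images: any[];
  } | null>(null);

  const openModal = async () => {
    setOpen(true);
    setLoading(true);
    try {
      // Hypothetical endpoint; the real content route is not shown in this diff.
      const res = await fetch(apiUrl(`/knowledge/${kbName}/content`));
      setContent(await res.json());
    } finally {
      setLoading(false);
    }
  };

  return (
    <>
      <button onClick={openModal}>View content</button>
      <ContentViewModal
        isOpen={open}
        onClose={() => setOpen(false)}
        kbName={kbName}
        content={content}
        loading={loading}
      />
    </>
  );
}
```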