diff --git a/CHANGELOG.md b/CHANGELOG.md index 530e63f..290d9c1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,13 @@ All notable changes to Library Manager will be documented in this file. ### Added +- **Issue #110 Part 2: Folder triage** - New `library_manager/folder_triage.py` module that + categorizes folder names as clean/messy/garbage before processing. Clean folders use path hints + normally. Messy folders (scene release tags, torrent markers, quality indicators) skip path + parsing and rely on audio/metadata only. Garbage folders (hash names, numbers-only, generic + placeholders) also skip path hints and get a confidence penalty. Triage results stored in DB + and logged during scans. Integrated into Whisper transcription hints, AI identification + prompts, and the processing pipeline queue. - **Issue #103: In-app hints and tooltips** - New `library_manager/hints.py` module with contextual documentation for all features and settings. Hover over the (?) icon next to any setting to see a plain-language explanation of what it does. Tooltips added to: all identification layers, AI diff --git a/app.py b/app.py index cec2b49..b6bcac7 100644 --- a/app.py +++ b/app.py @@ -111,6 +111,7 @@ get_instance_data, save_instance_data, ) +from library_manager.folder_triage import triage_folder, triage_book_path, should_use_path_hints, confidence_modifier from library_manager.hints import get_all_hints # Try to import P2P cache (optional - gracefully degrades if not available) @@ -3197,31 +3198,39 @@ def search_book_searxng(query, duration_hours=None): return [] -def calculate_input_quality(folder_name, filenames, info): +def calculate_input_quality(folder_name, filenames, info, folder_triage='clean'): """ Score the quality of input data for AI identification. Returns a score 0-100 and list of usable clues found. Low quality inputs (random numbers, 'unknown', no words) should not be trusted to AI as it will hallucinate famous books. 
+ + Issue #110: folder_triage controls whether folder name is trusted as input. """ score = 0 clues = [] - # Check folder name for useful info - folder_clean = re.sub(r'[_\-\d\.\[\]\(\)]', ' ', folder_name or '').strip() - words = [w for w in folder_clean.split() if len(w) > 2 and w.lower() not in ('unknown', 'audiobook', 'audio', 'book', 'mp3', 'the', 'and', 'part')] + # Issue #110: Only trust folder name for clean folders + use_folder = should_use_path_hints(folder_triage) + + if use_folder: + # Check folder name for useful info + folder_clean = re.sub(r'[_\-\d\.\[\]\(\)]', ' ', folder_name or '').strip() + words = [w for w in folder_clean.split() if len(w) > 2 and w.lower() not in ('unknown', 'audiobook', 'audio', 'book', 'mp3', 'the', 'and', 'part')] - if words: - score += min(40, len(words) * 10) # Up to 40 points for meaningful words - clues.append(f"folder_words: {words[:5]}") + if words: + score += min(40, len(words) * 10) # Up to 40 points for meaningful words + clues.append(f"folder_words: {words[:5]}") - # Check for author-title pattern (e.g., "Author - Title") - if ' - ' in (folder_name or ''): - score += 20 - clues.append("has_author_title_separator") + # Check for author-title pattern (e.g., "Author - Title") + if ' - ' in (folder_name or ''): + score += 20 + clues.append("has_author_title_separator") + else: + clues.append(f"folder_skipped: triage={folder_triage}") - # Check metadata tags + # Check metadata tags (always trusted regardless of folder triage) if info.get('title') and info.get('title') not in ('none', 'Unknown', ''): score += 25 clues.append(f"has_title_tag: {info.get('title')[:30]}") @@ -3240,6 +3249,12 @@ def calculate_input_quality(folder_name, filenames, info): score = max(0, score - 50) # Heavy penalty for "unknown_123" type names clues.append("PENALTY: numeric_garbage_name") + # Issue #110: Apply confidence modifier for garbage folders + modifier = confidence_modifier(folder_triage) + if modifier: + score = max(0, score + modifier) 
+ clues.append(f"triage_modifier: {modifier}") + return min(100, score), clues @@ -3349,11 +3364,14 @@ def identify_book_with_ai(file_group, config): info = file_group.get('detected_info', {}) folder_name = file_group.get('folder_name', '') + # Issue #110: Determine folder triage for this book + ft = file_group.get('folder_triage') or triage_folder(folder_name) + # Build context for AI filenames = [Path(f).name if isinstance(f, str) else f.name for f in files[:20]] # === HALLUCINATION PREVENTION: Input quality check === - input_quality, clues = calculate_input_quality(folder_name, filenames, info) + input_quality, clues = calculate_input_quality(folder_name, filenames, info, folder_triage=ft) if input_quality < 25: # Input is garbage - don't even try AI, it will hallucinate @@ -3387,7 +3405,7 @@ def identify_book_with_ai(file_group, config): - Or are you GUESSING based on a generic title? (If guessing, return null!) Input information: -- Folder name: {folder_name} +- Folder name: {folder_name if should_use_path_hints(ft) else '[UNRELIABLE - ignore folder name]'} - Files ({len(files)} total): {', '.join(filenames[:10])}{'...' 
if len(filenames) > 10 else ''} - Duration: {info.get('duration_hours', 'unknown')} hours - Album tag: {info.get('title', 'none')} @@ -4816,6 +4834,7 @@ def deep_scan_library(config): scanned = 0 # New books added to tracking queued = 0 # Books added to fix queue issues_found = {} # path -> list of issues + triage_counts = {'clean': 0, 'messy': 0, 'garbage': 0} # Issue #110: Folder triage stats # Track files for duplicate detection file_signatures = {} # signature -> list of paths @@ -4973,6 +4992,8 @@ def deep_scan_library(config): flat_author, flat_title = extract_author_title(author) # Issue #132: Resolve path to prevent duplicates flat_path = str(author_dir.resolve()) + # Issue #110: Triage folder name quality + flat_triage = triage_folder(author) checked += 1 @@ -4988,13 +5009,15 @@ def deep_scan_library(config): if has_profile: continue flat_book_id = existing_flat['id'] + c.execute('UPDATE books SET folder_triage = ? WHERE id = ?', + (flat_triage, flat_book_id)) else: - c.execute('''INSERT INTO books (path, current_author, current_title, status) - VALUES (?, ?, ?, 'pending')''', (flat_path, flat_author, flat_title)) + c.execute('''INSERT INTO books (path, current_author, current_title, status, folder_triage) + VALUES (?, ?, ?, 'pending', ?)''', (flat_path, flat_author, flat_title, flat_triage)) conn.commit() flat_book_id = c.lastrowid scanned += 1 - logger.info(f"Added flat book: {flat_author} - {flat_title}") + logger.info(f"Added flat book: {flat_author} - {flat_title} (triage: {flat_triage})") # Queue for processing c.execute('SELECT id FROM queue WHERE book_id = ?', (flat_book_id,)) @@ -5137,6 +5160,8 @@ def deep_scan_library(config): continue # No audio files, skip checked += 1 + # Issue #110: Triage folder name quality + series_book_triage = triage_folder(book_title) # Check if already tracked c.execute('SELECT id, status, profile, user_locked FROM books WHERE path = ?', (book_path,)) @@ -5150,9 +5175,11 @@ def deep_scan_library(config): if 
has_profile: continue book_id = existing_book['id'] + c.execute('UPDATE books SET folder_triage = ? WHERE id = ?', + (series_book_triage, book_id)) else: - c.execute('''INSERT INTO books (path, current_author, current_title, status) - VALUES (?, ?, ?, 'pending')''', (book_path, author, book_title)) + c.execute('''INSERT INTO books (path, current_author, current_title, status, folder_triage) + VALUES (?, ?, ?, 'pending', ?)''', (book_path, author, book_title, series_book_triage)) conn.commit() book_id = c.lastrowid scanned += 1 @@ -5196,6 +5223,12 @@ def deep_scan_library(config): # This is a valid book folder - count it checked += 1 + # Issue #110: Triage folder name quality + folder_triage_result = triage_folder(title) + triage_counts[folder_triage_result] = triage_counts.get(folder_triage_result, 0) + 1 + if folder_triage_result != 'clean': + logger.info(f"Folder triage: {folder_triage_result} - {title[:60]}") + # Analyze title title_issues = analyze_title(title, author) cleaned_title, clean_issues = clean_title(title) @@ -5258,9 +5291,12 @@ def deep_scan_library(config): queued += 1 continue book_id = existing['id'] + # Update triage for existing books (backfill) + c.execute('UPDATE books SET folder_triage = ? 
WHERE id = ?', + (folder_triage_result, book_id)) else: - c.execute('''INSERT INTO books (path, current_author, current_title, status) - VALUES (?, ?, ?, 'pending')''', (path, author, title)) + c.execute('''INSERT INTO books (path, current_author, current_title, status, folder_triage) + VALUES (?, ?, ?, 'pending', ?)''', (path, author, title, folder_triage_result)) conn.commit() book_id = c.lastrowid scanned += 1 @@ -5320,6 +5356,7 @@ def deep_scan_library(config): logger.info(f"Scanned: {scanned} new books added to tracking") logger.info(f"Queued: {queued} books need fixing") logger.info(f"Already correct: {checked - queued} books") + logger.info(f"Folder triage: {triage_counts['clean']} clean, {triage_counts['messy']} messy, {triage_counts['garbage']} garbage") return checked, scanned, queued @@ -5513,19 +5550,26 @@ def transcribe_audio_intro(file_path, duration_seconds=45): initial_prompt = "This is an audiobook introduction. The narrator typically announces the book title, author name, and narrator." # Add folder hints to the prompt if available + # Issue #110: Only use folder hints for clean triage folders folder_path = Path(file_path).parent folder_name = folder_path.name parent_name = folder_path.parent.name if folder_path.parent else "" - # Extract potential author/title from folder structure for spelling hints - hints = [] - if parent_name and parent_name not in ['audiobooks', 'Unknown', '']: - hints.append(parent_name) - if folder_name and folder_name not in ['audiobooks', 'Unknown', '']: - hints.append(folder_name) + # Check folder triage before trusting folder names as hints + folder_triage_result = triage_folder(folder_name) - if hints: - initial_prompt += f" Possible names: {', '.join(hints)}." 
+ if should_use_path_hints(folder_triage_result): + # Extract potential author/title from folder structure for spelling hints + hints = [] + if parent_name and parent_name not in ['audiobooks', 'Unknown', '']: + hints.append(parent_name) + if folder_name and folder_name not in ['audiobooks', 'Unknown', '']: + hints.append(folder_name) + + if hints: + initial_prompt += f" Possible names: {', '.join(hints)}." + else: + logger.info(f"[LAYER 1/AUDIO] Skipping folder hints (triage: {folder_triage_result}): {folder_name[:40]}") # Transcribe with better settings for accuracy segments, info = whisper_model.transcribe( @@ -9118,7 +9162,8 @@ def build_order_by(sort_cols, default_order): # Issue #36: Filter out series_folder and multi_book_files - they should never appear in queue order = build_order_by(QUEUE_SORT_COLS, 'q.priority, q.added_at') c.execute('''SELECT q.id as queue_id, q.reason, q.added_at, q.priority, - b.id as book_id, b.path, b.current_author, b.current_title, b.status + b.id as book_id, b.path, b.current_author, b.current_title, b.status, + b.folder_triage FROM queue q JOIN books b ON q.book_id = b.id WHERE b.status NOT IN ('series_folder', 'multi_book_files', 'verified', 'fixed') @@ -9135,7 +9180,8 @@ def build_order_by(sort_cols, default_order): 'status': 'in_queue', 'reason': row['reason'], 'priority': row['priority'], - 'added_at': row['added_at'] + 'added_at': row['added_at'], + 'folder_triage': row['folder_triage'] }) elif status_filter == 'fixed': diff --git a/library_manager/database.py b/library_manager/database.py index 7a176be..eb12340 100644 --- a/library_manager/database.py +++ b/library_manager/database.py @@ -126,6 +126,13 @@ def init_db(db_path=None): except: pass # Column already exists + # Add folder_triage column - categorizes folder name quality (clean/messy/garbage) + # Issue #110: Used to decide whether to trust path-derived hints + try: + c.execute("ALTER TABLE books ADD COLUMN folder_triage TEXT DEFAULT 'clean'") + except: + pass # 
Column already exists + # Stats table - daily stats c.execute('''CREATE TABLE IF NOT EXISTS stats ( id INTEGER PRIMARY KEY,
diff --git a/library_manager/folder_triage.py b/library_manager/folder_triage.py
new file mode 100644
index 0000000..ecd7528
--- /dev/null
+++ b/library_manager/folder_triage.py
@@ -0,0 +1,121 @@
+"""
+Folder triage - categorize folder names by cleanliness.
+
+Determines processing strategy per-folder:
+- CLEAN: Use folder name as hints for author/title parsing
+- MESSY: Skip path parsing, rely on audio/metadata only
+- GARBAGE: Skip path parsing, expect harder match, lower confidence
+
+Issue #110 Part 2
+"""
+import os
+import re
+import logging
+from typing import List, Optional, Tuple
+
+logger = logging.getLogger(__name__)
+
+# Scene release tags, torrent markers, quality indicators.
+# Every pattern is compiled with re.IGNORECASE; a pattern that must stay
+# case-sensitive opts out locally with an inline (?-i:...) group.
+MESSY_PATTERNS: List[str] = [
+    r'\{[a-z]+\}',  # {mb}, {cbt}
+    r'\[[A-Z0-9]+\]',  # [FLAC], [MP3]
+    r'\([^)]*(?:narrator|read by|unabridged|abridged|rip|scene|kbps)\b[^)]*\)',  # (narrator: Thorne), (Unabridged)
+    r'^\d{4}\s*-',  # 2023 -
+    r'\d{2}\.\d{2}\.\d{2}',  # 01.10.42
+    r'\d+k\b',  # 62k, 128k
+    r'\d+kbps',  # 64kbps
+    r'\bHQ\b|\bLQ\b',  # Quality markers
+    # -TEAM suffix (scene release). Upper-case only: under IGNORECASE this
+    # also matched ordinary hyphenated titles such as "Spider-Man".
+    r'(?-i:-[A-Z]{2,4})$',
+    r'\.com\b',  # Website in name
+    r'\bwww\.',  # Website prefix
+    r'\b(rip|ripped|scene)\b',  # Rip indicators
+    r'\b(x264|aac|mp3|flac|ogg|m4b)\b',  # Codec in name
+]
+
+# Completely useless folder names
+GARBAGE_PATTERNS: List[str] = [
+    r'^[a-f0-9]{12,}$',  # Hash-only names (12+ hex chars)
+    r'^[\d\s\-\.]+$',  # Numbers only
+    r'^(New Folder|tmp|downloads?|torrents?|audiobooks?|untitled)$',
+    r'^(CD|Disc|Track)\s*\d+$',  # Disc/track folders
+    r'^Unknown\s*(Artist|Author|Album)?$',  # Generic unknowns
+]
+
+# Compiled patterns for performance (compiled once at import time)
+_MESSY_COMPILED = [re.compile(p, re.IGNORECASE) for p in MESSY_PATTERNS]
+_GARBAGE_COMPILED = [re.compile(p, re.IGNORECASE) for p in GARBAGE_PATTERNS]
+
+
+def triage_folder(folder_name: Optional[str]) -> str:
+    """
+    Categorize a folder name by cleanliness.
+
+    Surrounding whitespace is ignored. Garbage patterns are checked before
+    messy ones, so a name that matches both is reported as 'garbage'.
+
+    Args:
+        folder_name: Bare folder name, not a full path. None, empty and
+            whitespace-only names are treated as garbage.
+
+    Returns:
+        'clean' - Folder name looks like a real author/title
+        'messy' - Has scene tags or markers but might have useful info
+        'garbage' - Completely useless (hash, numbers, generic placeholder)
+    """
+    name = (folder_name or '').strip()
+    if not name:
+        return 'garbage'
+
+    # Check garbage first (most restrictive)
+    if any(pattern.match(name) for pattern in _GARBAGE_COMPILED):
+        return 'garbage'
+
+    # Check messy patterns
+    if any(pattern.search(name) for pattern in _MESSY_COMPILED):
+        return 'messy'
+
+    return 'clean'
+
+
+def triage_book_path(book_path: Optional[str]) -> Tuple[str, str]:
+    """
+    Triage the book folder from a full book path.
+
+    For a path like /audiobooks/Author Name/Book Title, triages the final
+    path component (Book Title). Trailing separators are ignored, so
+    /audiobooks/Author Name/Book Title/ gives the same result.
+
+    Returns:
+        tuple: (triage_result, folder_name). For an empty or None path the
+        folder name is '' and the result is 'garbage'.
+    """
+    # normpath drops trailing separators; basename() of "a/b/" alone is ''
+    # and would get a perfectly good folder triaged as garbage.
+    folder_name = os.path.basename(os.path.normpath(book_path)) if book_path else ''
+    return triage_folder(folder_name), folder_name
+
+
+def should_use_path_hints(triage_result: str) -> bool:
+    """
+    Whether path-derived hints should be trusted for this triage category.
+
+    Only 'clean' is trusted; 'messy', 'garbage' and any other value
+    (including None) return False.
+    """
+    return triage_result == 'clean'
+
+
+def confidence_modifier(triage_result: str) -> int:
+    """
+    Confidence adjustment based on folder triage category.
+
+    Returns -10 for 'garbage' and 0 for every other value; the result is
+    meant to be added to a confidence/quality score.
+    """
+    if triage_result == 'garbage':
+        return -10
+    return 0
diff --git a/library_manager/pipeline/layer_ai_queue.py b/library_manager/pipeline/layer_ai_queue.py index cea2803..bae9c70 100644 --- a/library_manager/pipeline/layer_ai_queue.py +++ b/library_manager/pipeline/layer_ai_queue.py @@ -125,7 +125,7 @@ def process_queue( # Process items at specified layer (or layer 4 for folder fallback) c.execute('''SELECT q.id as queue_id, q.book_id, q.reason, b.path, b.current_author, b.current_title, - b.confidence, b.profile + b.confidence, b.profile, b.folder_triage FROM queue q JOIN books b ON q.book_id = b.id WHERE b.verification_layer = ? 
@@ -137,7 +137,7 @@ def process_queue( # API disabled - process all queue items directly with AI c.execute('''SELECT q.id as queue_id, q.book_id, q.reason, b.path, b.current_author, b.current_title, - b.confidence, b.profile + b.confidence, b.profile, b.folder_triage FROM queue q JOIN books b ON q.book_id = b.id WHERE b.status NOT IN ('verified', 'fixed', 'series_folder', 'multi_book_files', 'needs_attention') @@ -205,7 +205,18 @@ def process_queue( return len(garbage_batch), 0 # (processed, fixed) # Build messy names for AI - messy_names = [f"{row['current_author']} - {row['current_title']}" for row in batch] + # Issue #110: For messy/garbage triage folders, mark the folder name as unreliable + messy_names = [] + for row in batch: + triage = row.get('folder_triage') or 'clean' + name = f"{row['current_author']} - {row['current_title']}" + if triage == 'garbage': + name += " [FOLDER NAME UNRELIABLE - use audio/metadata only]" + logger.info(f"[{layer_name}] Garbage triage folder, suppressing path hints: {row['current_title'][:40]}") + elif triage == 'messy': + name += " [FOLDER NAME MAY BE UNRELIABLE]" + logger.info(f"[{layer_name}] Messy triage folder: {row['current_title'][:40]}") + messy_names.append(name) logger.info(f"[DEBUG] Processing batch of {len(batch)} items:") for i, name in enumerate(messy_names):