From 3e085cacce1ef50bd30177b04be9c6d1005497e5 Mon Sep 17 00:00:00 2001 From: deucebucket Date: Mon, 16 Feb 2026 00:06:38 -0600 Subject: [PATCH 1/4] Fix #127: Use path info to complete partial Skaldleita results When Skaldleita returns a truncated name (e.g., "James S. A" instead of "James S. A. Corey"), the file path often contains the full name. This adds _complete_result_from_path() which: - Completes truncated author names when path has a longer version that starts with the SL result - Completes truncated titles using the same starts-with logic - Extracts series info from path components when SL returned none - Gives a small confidence boost when path corroborates SL results - Never replaces a longer SL result with a shorter path fragment - Requires minimum 4 characters to avoid false matches on trivial prefixes Applied at three points in the pipeline: 1. SL requeue with partial ID 2. SL full identification after sanity check 3. AI fallback after sanity check --- library_manager/pipeline/layer_audio_id.py | 132 ++++++++++++++++++++- 1 file changed, 131 insertions(+), 1 deletion(-) diff --git a/library_manager/pipeline/layer_audio_id.py b/library_manager/pipeline/layer_audio_id.py index afd2641..36474b0 100644 --- a/library_manager/pipeline/layer_audio_id.py +++ b/library_manager/pipeline/layer_audio_id.py @@ -14,6 +14,7 @@ import json import logging import os +import re import time from datetime import datetime, timedelta from pathlib import Path @@ -146,7 +147,6 @@ def _validate_ai_result_against_path(result: Dict, folder_hint: str, book_path: # Clean up - remove common noise def clean_text(text): - import re # Remove brackets, hashes, special chars text = re.sub(r'\[[^\]]*\]', ' ', text) text = re.sub(r'[^a-z0-9\s]', ' ', text) @@ -188,6 +188,129 @@ def clean_text(text): return result +def _complete_result_from_path(result: Dict, folder_hint: str, book_path: str) -> Dict: + """ + Use path info to complete partial/truncated Skaldleita results. + + When SL returns a truncated name (e.g., "James S. A" instead of "James S. A. Corey"), + the file path often contains the full name. If the path version starts with the SL + version, use the longer path version. + + Only completes - never replaces a longer SL result with a shorter path fragment. + + Args: + result: The identification result dict (author, title, series, etc.) + folder_hint: "current_author - current_title" string from path parsing + book_path: Full filesystem path to the book folder + + Returns: + The result dict with potentially completed fields. + """ + if not result or not folder_hint: + return result + + # Parse the folder hint into author and title components + # folder_hint format: "current_author - current_title" + hint_parts = folder_hint.split(' - ', 1) + path_author = hint_parts[0].strip() if len(hint_parts) >= 1 else '' + path_title = hint_parts[1].strip() if len(hint_parts) >= 2 else '' + + def _is_truncated_version(shorter: str, longer: str) -> bool: + """Check if 'shorter' is a truncated prefix of 'longer'. + + Returns True if the shorter string is a prefix of the longer string + (case-insensitive), and the longer string has meaningful additional content. + Requires the shorter version to be at least 4 characters to avoid false + matches on trivial prefixes. + """ + if not shorter or not longer: + return False + s = shorter.strip() + l = longer.strip() + if len(s) < 4 or len(s) >= len(l): + return False + return l.lower().startswith(s.lower()) + + completed_any = False + + # Complete author if truncated + sl_author = result.get('author', '') or '' + if sl_author and path_author and _is_truncated_version(sl_author, path_author): + logger.info(f"[PATH COMPLETE] Author: '{sl_author}' -> '{path_author}' (path has full name)") + result['author'] = path_author + result['path_completed_author'] = True + completed_any = True + + # Complete title if truncated + sl_title = result.get('title', '') or '' + if sl_title and path_title and _is_truncated_version(sl_title, path_title): + logger.info(f"[PATH COMPLETE] Title: '{sl_title}' -> '{path_title}' (path has full name)") + result['title'] = path_title + result['path_completed_title'] = True + completed_any = True + + # Try to extract series info from path if SL returned none + # Path often has patterns like "The Stormlight Archive 01" or "Series Name/Book 01" + sl_series = result.get('series', '') or '' + if not sl_series and path_title: + # Check for series number patterns in the path title + # e.g., "The Way of Kings (Stormlight Archive 01)" or "Book Title - Series Name 01" + # Look for series patterns in the full path (parent directories) + path_obj = Path(book_path) if book_path else None + if path_obj: + # Check parent directory names for series info not captured by SL + # The folder structure is typically: library/Author/SeriesOrTitle/BookTitle + # or: library/Author/Title + parts = path_obj.parts + # Look at path components between author and book for series folders + for part in parts: + # Match patterns like "Series Name 01" or "Series Name - Book 01" + series_match = re.match( + r'^(.+?)\s*[-–]\s*(?:Book\s+)?(\d+(?:\.\d+)?)\s*$', + part, re.IGNORECASE + ) + if not series_match: + series_match = re.match( + r'^(.+?)\s+(\d+(?:\.\d+)?)\s*$', + part, re.IGNORECASE + ) + if series_match: + potential_series = series_match.group(1).strip() + potential_num = series_match.group(2).strip() + # Only use if the series name is meaningful (not just the title) + if (len(potential_series) >= 3 + and potential_series.lower() != sl_title.lower() + and potential_series.lower() != (result.get('author') or '').lower()): + logger.info(f"[PATH COMPLETE] Series: '{potential_series}' #{potential_num} (extracted from path)") + result['series'] = potential_series + result['series_num'] = potential_num + result['path_completed_series'] = True + completed_any = True + break + + # If we completed anything, give a small confidence boost since path corroborates SL + if completed_any: + raw_conf = result.get('confidence', 0.7) + try: + if isinstance(raw_conf, str): + # String confidence levels - bump up one tier + if raw_conf == 'low': + result['confidence'] = 'medium' + elif raw_conf == 'medium': + result['confidence'] = 'high' + # 'high' stays high + elif isinstance(raw_conf, (int, float)): + # Numeric confidence - small boost (5%) for path agreement, cap at 0.95 + if raw_conf <= 1: + result['confidence'] = min(0.95, raw_conf + 0.05) + else: + result['confidence'] = min(95, raw_conf + 5) + except (ValueError, TypeError): + pass # Leave confidence unchanged if we can't parse it + + return result + + def process_layer_1_audio( config: Dict, get_db: Callable, @@ -444,6 +567,8 @@ def process_layer_1_audio( 'sl_source': sl_source, 'requeue_suggested': True } + # Issue #127: Complete truncated SL results using path info + result = _complete_result_from_path(result, folder_hint, book_path) # Continue processing - let the normal flow create pending_fix # The requeue flag will be used to schedule a future recheck else: @@ -475,6 +600,8 @@ def process_layer_1_audio( transcript = bookdb_result.get('transcript') # Keep transcript for AI result = None # Clear to trigger AI fallback else: + # Issue #127: Complete truncated SL results using path info + bookdb_result = _complete_result_from_path(bookdb_result, folder_hint, book_path) result = bookdb_result # Passed sanity check else: # Skaldleita didn't get a full match - might have a transcript though @@ -513,6 +640,9 @@ def process_layer_1_audio( # This catches cases where AI completely misparses (e.g., narrator name as author) if result: result = _validate_ai_result_against_path(result, folder_hint, book_path) + # Issue #127: Complete truncated AI results using path info + if result and not result.get('sanity_failed'): + result = _complete_result_from_path(result, folder_hint, book_path) if result and result.get('author') and result.get('title') and result.get('confidence') != 'none': # Got identification from audio! From ddd1db0b0257e178ef438f120cb8810d6891d904 Mon Sep 17 00:00:00 2001 From: deucebucket Date: Mon, 16 Feb 2026 01:21:33 -0600 Subject: [PATCH 2/4] Address vibe-check review: limit path iteration scope, add CHANGELOG --- CHANGELOG.md | 11 +++++++++++ library_manager/pipeline/layer_audio_id.py | 7 +++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1a7093..bf38403 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ All notable changes to Library Manager will be documented in this file. +## [Unreleased] + +### Added + +- **Issue #127: Path-based completion for partial results** - When Skaldleita returns truncated + names (e.g., "James S. A" instead of "James S. A. Corey"), the system now uses folder path + information to complete the full name. Also extracts series information from path structure + when missing from audio identification. Requires minimum 4-char prefix match for safety. + +--- + ## [0.9.0-beta.125] - 2026-02-14 ### Fixed diff --git a/library_manager/pipeline/layer_audio_id.py b/library_manager/pipeline/layer_audio_id.py index 36474b0..99976f1 100644 --- a/library_manager/pipeline/layer_audio_id.py +++ b/library_manager/pipeline/layer_audio_id.py @@ -259,11 +259,10 @@ def _is_truncated_version(shorter: str, longer: str) -> bool: path_obj = Path(book_path) if book_path else None if path_obj: # Check parent directory names for series info not captured by SL - # The folder structure is typically: library/Author/SeriesOrTitle/BookTitle - # or: library/Author/Title + # Typical structure: library/Author/SeriesOrTitle/BookTitle + # Only check the last 3-4 relevant dirs, not filesystem root parts = path_obj.parts - # Look at path components between author and book for series folders - for part in parts: + for part in parts[-4:-1]: # Match patterns like "Series Name 01" or "Series Name - Book 01" series_match = re.match( r'^(.+?)\s*[-–]\s*(?:Book\s+)?(\d+(?:\.\d+)?)\s*$', From 2565924c95b426a070ce253d2c1ad425bd9bcb62 Mon Sep 17 00:00:00 2001 From: deucebucket Date: Mon, 16 Feb 2026 01:24:55 -0600 Subject: [PATCH 3/4] Bump version to beta.127 --- CHANGELOG.md | 2 +- README.md | 2 +- app.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bf38403..9c4ed71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to Library Manager will be documented in this file. -## [Unreleased] +## [0.9.0-beta.127] - 2026-02-16 ### Added diff --git a/README.md b/README.md index 5891303..9719546 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ **Smart Audiobook Library Organizer with Multi-Source Metadata & AI Verification** -[![Version](https://img.shields.io/badge/version-0.9.0--beta.125-blue.svg)](CHANGELOG.md) +[![Version](https://img.shields.io/badge/version-0.9.0--beta.127-blue.svg)](CHANGELOG.md) [![Docker](https://img.shields.io/badge/docker-ghcr.io-blue.svg)](https://ghcr.io/deucebucket/library-manager) [![License](https://img.shields.io/badge/license-AGPL--3.0-blue.svg)](LICENSE) diff --git a/app.py b/app.py index f215e6c..f023fee 100644 --- a/app.py +++ b/app.py @@ -11,7 +11,7 @@ - Multi-provider AI (Gemini, OpenRouter, Ollama) """ -APP_VERSION = "0.9.0-beta.125" +APP_VERSION = "0.9.0-beta.127" GITHUB_REPO = "deucebucket/library-manager" # Your GitHub repo # Versioning Guide: From d9380a6034dcf3c430d209ceb41accc29594dcfe Mon Sep 17 00:00:00 2001 From: deucebucket Date: Mon, 16 Feb 2026 01:27:11 -0600 Subject: [PATCH 4/4] Address vibe-check review: handle numeric string confidence, add comments --- library_manager/pipeline/layer_audio_id.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/library_manager/pipeline/layer_audio_id.py b/library_manager/pipeline/layer_audio_id.py index 99976f1..ab16f71 100644 --- a/library_manager/pipeline/layer_audio_id.py +++ b/library_manager/pipeline/layer_audio_id.py @@ -276,7 +276,7 @@ def _is_truncated_version(shorter: str, longer: str) -> bool: if series_match: potential_series = series_match.group(1).strip() potential_num = series_match.group(2).strip() - # Only use if the series name is meaningful (not just the title) + # Min length 3 to avoid false matches like "The" or "No" if (len(potential_series) >= 3 and potential_series.lower() != sl_title.lower() and potential_series.lower() != (result.get('author') or '').lower()): @@ -292,12 +292,20 @@ def _is_truncated_version(shorter: str, longer: str) -> bool: raw_conf = result.get('confidence', 0.7) try: if isinstance(raw_conf, str): - # String confidence levels - bump up one tier - if raw_conf == 'low': - result['confidence'] = 'medium' - elif raw_conf == 'medium': - result['confidence'] = 'high' - # 'high' stays high + # SL can return string levels or numeric strings like "0.7" + try: + numeric = float(raw_conf) + if numeric <= 1: + result['confidence'] = min(0.95, numeric + 0.05) + else: + result['confidence'] = min(95, numeric + 5) + except ValueError: + # Named confidence levels - bump up one tier + if raw_conf == 'low': + result['confidence'] = 'medium' + elif raw_conf == 'medium': + result['confidence'] = 'high' + # 'high' stays high elif isinstance(raw_conf, (int, float)): # Numeric confidence - small boost (5%) for path agreement, cap at 0.95 if raw_conf <= 1: