From 36f695ed38b6ac7c278e9628a318d373a7db3a88 Mon Sep 17 00:00:00 2001 From: deucebucket Date: Tue, 10 Feb 2026 06:51:17 -0600 Subject: [PATCH 1/3] fix: Author folder dedup and series-as-author rejection (#142, #143) - Deduplicate author folders using fuzzy matching (difflib.SequenceMatcher >= 0.85) and standardized initials comparison to prevent duplicate folders like "James S.A. Corey" vs "James S. A. Corey" - Reject BookDB results where author equals series name (Skaldleita #90 corrupt data, e.g. author "Laundry Files" instead of "Charles Stross") - Defense-in-depth in BookProfile.finalize() to catch series-as-author from any source, with fallback to next-best author candidate - Enable standardize_author_initials by default to normalize initials at the config level --- library_manager/config.py | 2 +- library_manager/models/book_profile.py | 58 +++++++++++++++++ library_manager/providers/bookdb.py | 7 ++ library_manager/utils/path_safety.py | 89 +++++++++++++++++++++++++- 4 files changed, 154 insertions(+), 2 deletions(-) diff --git a/library_manager/config.py b/library_manager/config.py index 986be72..ac4fda7 100644 --- a/library_manager/config.py +++ b/library_manager/config.py @@ -79,7 +79,7 @@ def _detect_data_dir(): "update_channel": "beta", # "stable", "beta", or "nightly" "naming_format": "author/title", # "author/title", "author - title", "custom" "custom_naming_template": "{author}/{title}", # Custom template with {author}, {title}, {series}, etc. - "standardize_author_initials": False, # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54) + "standardize_author_initials": True, # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54) # Metadata embedding settings "metadata_embedding_enabled": False, # Embed tags into audio files when fixes are applied "metadata_embedding_overwrite_managed": True, # Overwrite managed fields (title/author/series/etc) diff --git a/library_manager/models/book_profile.py b/library_manager/models/book_profile.py index 6d4210c..54829ee 100644 --- a/library_manager/models/book_profile.py +++ b/library_manager/models/book_profile.py @@ -301,9 +301,67 @@ def finalize(self): fv.value = value fv.confidence = confidence + # Defense-in-depth: reject author that equals series name (Skaldleita #90) + # Corrupt data can arrive from any source, not just BookDB + if (self.author.value and self.series.value and + str(self.author.value).lower().strip() == str(self.series.value).lower().strip()): + bad_author = self.author.value + # Try to find an alternative author from the raw values + alternative = self._find_alternative_author(bad_author) + if alternative: + self.author.value = alternative[0] + self.author.confidence = alternative[1] + else: + self.author.value = None + self.author.confidence = 0 + if 'series_as_author' not in self.issues: + self.issues.append('series_as_author') + self.needs_attention = True + self.calculate_overall_confidence() self.last_updated = datetime.now().isoformat() + def _find_alternative_author(self, bad_value: str): + """Find the next-best author candidate, excluding the bad value. + Returns (value, confidence) or None.""" + if not self.author.raw_values: + return None + + def normalize(val): + return str(val).lower().strip() if val else None + + bad_normalized = normalize(bad_value) + + # Group by normalized value (same logic as calculate_field_confidence) + value_groups = {} + for source, value in self.author.raw_values.items(): + if value is None: + continue + normalized = normalize(value) + if normalized == bad_normalized: + continue # Skip the corrupt value + if normalized not in value_groups: + value_groups[normalized] = [] + weight = SOURCE_WEIGHTS.get(source, 30) + value_groups[normalized].append((source, value, weight)) + + if not value_groups: + return None + + # Pick the best remaining candidate + best_value = None + best_weight = 0 + for normalized, sources in value_groups.items(): + total_weight = sum(w for _, _, w in sources) + if total_weight > best_weight: + best_weight = total_weight + best_source = max(sources, key=lambda x: x[2]) + best_value = best_source[1] + + if best_value: + return (best_value, min(best_weight, 100)) + return None + def calculate_overall_confidence(self) -> int: """Calculate weighted overall confidence from field confidences.""" total_weight = 0 diff --git a/library_manager/providers/bookdb.py b/library_manager/providers/bookdb.py index 1206c26..1335214 100644 --- a/library_manager/providers/bookdb.py +++ b/library_manager/providers/bookdb.py @@ -222,6 +222,13 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No 'confidence': data.get('confidence', 0) } + # Defensive: Skaldleita bug #90 - series name imported as author entity + # e.g. author "Laundry Files" when it should be "Charles Stross" + if result.get('author') and result.get('series'): + if result['author'].lower().strip() == result['series'].lower().strip(): + logger.warning(f"[BOOKDB] Corrupt data: author '{result['author']}' equals series name, discarding") + result['author'] = None + if result['title'] and result['author']: logger.info(f"Skaldleita found: {result['author']} - {result['title']}" + (f" ({result['series']} #{result['series_num']})" if result['series'] else "") + diff --git a/library_manager/utils/path_safety.py b/library_manager/utils/path_safety.py index c07ec44..1c81b9c 100644 --- a/library_manager/utils/path_safety.py +++ b/library_manager/utils/path_safety.py @@ -1,9 +1,11 @@ """Path sanitization and building utilities.""" +import os import re import logging +from difflib import SequenceMatcher from pathlib import Path from typing import Optional, Tuple -from library_manager.utils.naming import strip_encoding_junk +from library_manager.utils.naming import strip_encoding_junk, standardize_initials logger = logging.getLogger(__name__) @@ -301,6 +303,80 @@ def sanitize_path_component(name): return name +def _normalize_author_for_matching(name): + """Normalize an author name for fuzzy comparison. + Lowercases, collapses whitespace, strips punctuation except periods in initials.""" + if not name: + return '' + n = name.lower().strip() + # Collapse whitespace + n = re.sub(r'\s+', ' ', n) + # Remove non-alphanumeric except spaces and periods (keep periods for initials) + n = re.sub(r"[^\w\s.]", '', n) + return n.strip() + + +def find_existing_author_folder(lib_path, target_author): + """Find an existing author folder that matches target_author (Issue #142). + + Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey" + or "Alistair MacLean" vs "Alistair Maclean". + + Matching strategies (in order): + 1. Exact normalized match (case-insensitive, whitespace-collapsed) + 2. Standardized initials match (both through standardize_initials()) + 3. difflib.SequenceMatcher fuzzy match (ratio >= 0.85) + + Returns the existing folder name if found, None otherwise. + """ + if not target_author or not lib_path: + return None + + try: + lib = Path(lib_path) + if not lib.is_dir(): + return None + + # List only top-level directories + existing_dirs = [d for d in os.listdir(lib) if os.path.isdir(lib / d)] + except OSError: + return None + + if not existing_dirs: + return None + + target_normalized = _normalize_author_for_matching(target_author) + target_initials = _normalize_author_for_matching(standardize_initials(target_author)) + + best_match = None + best_ratio = 0.0 + + for dirname in existing_dirs: + dir_normalized = _normalize_author_for_matching(dirname) + + # Strategy 1: Exact normalized match + if target_normalized == dir_normalized: + return dirname + + # Strategy 2: Standardized initials match + dir_initials = _normalize_author_for_matching(standardize_initials(dirname)) + if target_initials == dir_initials: + return dirname + + # Strategy 3: Fuzzy match with SequenceMatcher + ratio = SequenceMatcher(None, target_normalized, dir_normalized).ratio() + if ratio >= 0.85 and ratio > best_ratio: + best_ratio = ratio + best_match = dirname + + if best_match: + logger.info(f"[DEDUP] Reusing existing folder '{best_match}' for author '{target_author}' " + f"(similarity: {best_ratio:.2f})") + return best_match + + return None + + def build_new_path(lib_path, author, title, series=None, series_num=None, narrator=None, year=None, edition=None, variant=None, language=None, language_code=None, config=None): """Build a new path based on the naming format configuration. @@ -346,6 +422,12 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat logger.error(f"BLOCKED: Invalid author '{author}' or title '{title}' - would create dangerous path") return None + # Issue #142: Check for existing author folder with similar name + # Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey" + existing_folder = find_existing_author_folder(lib_path, safe_author) + if existing_folder: + safe_author = existing_folder + # Issue #92: Strip "Unabridged"/"Abridged" markers if enabled if config and config.get('strip_unabridged', False): safe_title = strip_unabridged_markers(safe_title) @@ -531,6 +613,10 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat # Issue #96: Library-style format: "LastName, FirstName/Title" author_lf = format_author_lf(author) safe_author_lf = sanitize_path_component(author_lf) if author_lf else safe_author + # Issue #142: Dedup for LF format too + existing_lf = find_existing_author_folder(lib_path, safe_author_lf) + if existing_lf: + safe_author_lf = existing_lf if series_grouping and safe_series: result_path = lib_path / safe_author_lf / safe_series / title_folder else: @@ -573,6 +659,7 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat __all__ = [ 'sanitize_path_component', 'build_new_path', + 'find_existing_author_folder', 'format_language_tag', 'apply_language_tag', 'strip_unabridged_markers', From 07b55167b47ab323ae04ba4c5d576926b0922801 Mon Sep 17 00:00:00 2001 From: deucebucket Date: Tue, 10 Feb 2026 06:56:06 -0600 Subject: [PATCH 2/3] docs: Add CHANGELOG, type hints, logging per vibe-check review (#142) - Add CHANGELOG entry for beta.121 documenting both fixes - Bump APP_VERSION to 0.9.0-beta.121 - Add return type hints to find_existing_author_folder() and _find_alternative_author() - Add logger + warning when series-as-author detected in finalize() --- CHANGELOG.md | 17 +++++++++++++++++ app.py | 2 +- library_manager/models/book_profile.py | 8 ++++++-- library_manager/utils/path_safety.py | 2 +- 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2c5c758..b921c7a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,23 @@ All notable changes to Library Manager will be documented in this file. +## [0.9.0-beta.121] - 2026-02-10 + +### Fixed + +- **Issue #142: Duplicate author folders from name variants** - New `find_existing_author_folder()` + deduplicates author folders using 3-tier matching: exact normalized, standardized initials, and + fuzzy match (SequenceMatcher >= 85%). Prevents separate folders like "James S.A. Corey" vs + "James S. A. Corey" or "Alistair MacLean" vs "Alistair Maclean". Applied to both standard and + `author_lf/title` naming formats. +- **Issue #143: Series name used as author folder** - Defensive filter in BookDB provider discards + results where author equals series name (corrupt Skaldleita data per skaldleita#90, e.g. author + "Laundry Files" instead of "Charles Stross"). Defense-in-depth check in BookProfile.finalize() + catches this from any source, with automatic fallback to next-best author candidate. +- `standardize_author_initials` now defaults to `True` to reduce author folder fragmentation. + +--- + ## [0.9.0-beta.120] - 2026-02-09 ### Fixed diff --git a/app.py b/app.py index 0acc0fa..289577d 100644 --- a/app.py +++ b/app.py @@ -11,7 +11,7 @@ - Multi-provider AI (Gemini, OpenRouter, Ollama) """ -APP_VERSION = "0.9.0-beta.120" +APP_VERSION = "0.9.0-beta.121" GITHUB_REPO = "deucebucket/library-manager" # Your GitHub repo # Versioning Guide: diff --git a/library_manager/models/book_profile.py b/library_manager/models/book_profile.py index 54829ee..47dba5a 100644 --- a/library_manager/models/book_profile.py +++ b/library_manager/models/book_profile.py @@ -1,12 +1,15 @@ """Book Profile system - confidence-scored metadata profiles for comprehensive book identification.""" import json +import logging import re from datetime import datetime from dataclasses import dataclass, field -from typing import Optional, List, Dict, Any +from typing import Optional, List, Dict, Any, Tuple from pathlib import Path +logger = logging.getLogger(__name__) + # Source weights for confidence calculation (higher = more trusted) SOURCE_WEIGHTS = { @@ -306,6 +309,7 @@ def finalize(self): if (self.author.value and self.series.value and str(self.author.value).lower().strip() == str(self.series.value).lower().strip()): bad_author = self.author.value + logger.warning(f"[PROFILE] Series-as-author detected: '{bad_author}' == '{self.series.value}', finding alternative") # Try to find an alternative author from the raw values alternative = self._find_alternative_author(bad_author) if alternative: @@ -321,7 +325,7 @@ def finalize(self): self.calculate_overall_confidence() self.last_updated = datetime.now().isoformat() - def _find_alternative_author(self, bad_value: str): + def _find_alternative_author(self, bad_value: str) -> Optional[Tuple[str, int]]: """Find the next-best author candidate, excluding the bad value. Returns (value, confidence) or None.""" if not self.author.raw_values: diff --git a/library_manager/utils/path_safety.py b/library_manager/utils/path_safety.py index 1c81b9c..c9fc829 100644 --- a/library_manager/utils/path_safety.py +++ b/library_manager/utils/path_safety.py @@ -316,7 +316,7 @@ def _normalize_author_for_matching(name): return n.strip() -def find_existing_author_folder(lib_path, target_author): +def find_existing_author_folder(lib_path, target_author) -> Optional[str]: """Find an existing author folder that matches target_author (Issue #142). Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey" From f2638acd8e9bbf7f87c7b22243194c85516b6dd6 Mon Sep 17 00:00:00 2001 From: deucebucket Date: Tue, 10 Feb 2026 06:59:36 -0600 Subject: [PATCH 3/3] fix: Address vibe-check review round 2 (#142) - Replace os.listdir() with pathlib iterdir() for consistency - Log OSError when listing library directory fails - Pre-compute normalized strings in finalize() series-as-author check - Clarify defense-in-depth comment in bookdb.py - Remove unused os import --- library_manager/models/book_profile.py | 5 +++-- library_manager/providers/bookdb.py | 5 +++-- library_manager/utils/path_safety.py | 6 +++--- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/library_manager/models/book_profile.py b/library_manager/models/book_profile.py index 47dba5a..2b7f99e 100644 --- a/library_manager/models/book_profile.py +++ b/library_manager/models/book_profile.py @@ -306,8 +306,9 @@ def finalize(self): # Defense-in-depth: reject author that equals series name (Skaldleita #90) # Corrupt data can arrive from any source, not just BookDB - if (self.author.value and self.series.value and - str(self.author.value).lower().strip() == str(self.series.value).lower().strip()): + author_normalized = str(self.author.value).lower().strip() if self.author.value else None + series_normalized = str(self.series.value).lower().strip() if self.series.value else None + if author_normalized and series_normalized and author_normalized == series_normalized: bad_author = self.author.value logger.warning(f"[PROFILE] Series-as-author detected: '{bad_author}' == '{self.series.value}', finding alternative") # Try to find an alternative author from the raw values diff --git a/library_manager/providers/bookdb.py b/library_manager/providers/bookdb.py index 1335214..7d593f6 100644 --- a/library_manager/providers/bookdb.py +++ b/library_manager/providers/bookdb.py @@ -222,8 +222,9 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No 'confidence': data.get('confidence', 0) } - # Defensive: Skaldleita bug #90 - series name imported as author entity - # e.g. author "Laundry Files" when it should be "Charles Stross" + # Defense-in-depth: also checked in BookProfile.finalize(), but catching + # here prevents bad data propagation through cache and downstream layers + # Skaldleita bug #90 - series name imported as author entity if result.get('author') and result.get('series'): if result['author'].lower().strip() == result['series'].lower().strip(): logger.warning(f"[BOOKDB] Corrupt data: author '{result['author']}' equals series name, discarding") diff --git a/library_manager/utils/path_safety.py b/library_manager/utils/path_safety.py index c9fc829..c61624e 100644 --- a/library_manager/utils/path_safety.py +++ b/library_manager/utils/path_safety.py @@ -1,5 +1,4 @@ """Path sanitization and building utilities.""" -import os import re import logging from difflib import SequenceMatcher @@ -338,8 +337,9 @@ def find_existing_author_folder(lib_path, target_author) -> Optional[str]: return None # List only top-level directories - existing_dirs = [d for d in os.listdir(lib) if os.path.isdir(lib / d)] - except OSError: + existing_dirs = [d.name for d in lib.iterdir() if d.is_dir()] + except OSError as e: + logger.debug(f"Error listing library directory {lib_path}: {e}") return None if not existing_dirs: