17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,23 @@

All notable changes to Library Manager will be documented in this file.

## [0.9.0-beta.121] - 2026-02-10

### Fixed

- **Issue #142: Duplicate author folders from name variants** - New `find_existing_author_folder()`
deduplicates author folders using 3-tier matching: exact normalized match, standardized initials, and
fuzzy match (SequenceMatcher ratio >= 0.85; see the sketch below). Prevents separate folders like
"James S.A. Corey" vs "James S. A. Corey" or "Alistair MacLean" vs "Alistair Maclean". Applied to
both the standard and `author_lf/title` naming formats.
- **Issue #143: Series name used as author folder** - A defensive filter in the BookDB provider
discards results where the author equals the series name (corrupt Skaldleita data per skaldleita#90,
e.g. author "Laundry Files" instead of "Charles Stross"). A defense-in-depth check in
BookProfile.finalize() catches the same corruption from any source, with automatic fallback to the
next-best author candidate.
- `standardize_author_initials` now defaults to `True` to reduce author folder fragmentation.
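As a quick illustration of the fuzzy tier (a minimal sketch; the helper tries the exact and initials tiers first, as shown in `path_safety.py` below):

```python
# Tier 3 of find_existing_author_folder(): normalized variants of the same
# author compare well above the 0.85 reuse threshold.
from difflib import SequenceMatcher

ratio = SequenceMatcher(None, "james s.a. corey", "james s. a. corey").ratio()
print(f"{ratio:.2f}")  # 0.97, so the existing folder is reused
```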

---

## [0.9.0-beta.120] - 2026-02-09

### Fixed
2 changes: 1 addition & 1 deletion app.py
@@ -11,7 +11,7 @@
- Multi-provider AI (Gemini, OpenRouter, Ollama)
"""

APP_VERSION = "0.9.0-beta.120"
APP_VERSION = "0.9.0-beta.121"
GITHUB_REPO = "deucebucket/library-manager" # Your GitHub repo

# Versioning Guide:
@@ -698,7 +698,7 @@
try:
with open(ERROR_REPORTS_PATH, 'r') as f:
reports = json.load(f)
except:

Check failure (GitHub Actions / lint): app.py:701:13: E722 Do not use bare `except`
reports = []

# Add new report (keep last 100 reports to avoid file bloat)
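The bare `except` blocks flagged in this hunk and the next sit on unchanged context lines. A minimal sketch of one way to satisfy E722 for the first one, assuming only file-read and JSON-decode failures are expected here:

```python
import json

ERROR_REPORTS_PATH = "error_reports.json"  # stand-in for the app's constant

try:
    with open(ERROR_REPORTS_PATH, "r") as f:
        reports = json.load(f)
except (OSError, json.JSONDecodeError):
    # Missing/unreadable file or corrupt JSON: start with an empty list
    reports = []
```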
@@ -722,7 +722,7 @@
try:
with open(ERROR_REPORTS_PATH, 'r') as f:
return json.load(f)
except:

Check failure (GitHub Actions / lint): app.py:725:9: E722 Do not use bare `except`
return []
return []

@@ -1677,7 +1677,7 @@
continue
result = call_gemini(prompt, merged_config)
if result:
logger.info(f"[PROVIDER CHAIN] Success with gemini")

Check failure (GitHub Actions / lint): app.py:1680:33: F541 f-string without any placeholders
return result

elif provider == 'openrouter':
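Each F541 finding above and below has the same one-character fix: drop the redundant `f` prefix from the placeholder-free log call, e.g. `logger.info("[PROVIDER CHAIN] Success with gemini")`.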
@@ -1686,13 +1686,13 @@
continue
result = call_openrouter(prompt, merged_config)
if result:
logger.info(f"[PROVIDER CHAIN] Success with openrouter")

Check failure (GitHub Actions / lint): app.py:1689:33: F541 f-string without any placeholders
return result

elif provider == 'ollama':
result = call_ollama(prompt, merged_config)
if result:
logger.info(f"[PROVIDER CHAIN] Success with ollama")

Check failure (GitHub Actions / lint): app.py:1695:33: F541 f-string without any placeholders
return result

else:
@@ -1794,7 +1794,7 @@
return result
elif result and result.get('transcript'):
# Got transcript but no match - still useful, return for potential AI fallback
logger.info(f"[AUDIO CHAIN] BookDB returned transcript only")

Check failure (GitHub Actions / lint): app.py:1797:37: F541 f-string without any placeholders
return result
elif result is None and attempt < max_retries - 1:
# Connection might be down, wait and retry
@@ -2126,11 +2126,11 @@
device = "cuda"
# int8 works on all CUDA devices including GTX 1080 (compute 6.1)
# float16 only works on newer GPUs (compute 7.0+)
logger.info(f"[WHISPER] Using CUDA GPU acceleration (10x faster)")

Check failure (GitHub Actions / lint): app.py:2129:29: F541 f-string without any placeholders
else:
logger.info(f"[WHISPER] Using CPU (no CUDA GPU detected)")

Check failure (GitHub Actions / lint): app.py:2131:29: F541 f-string without any placeholders
except ImportError:
logger.info(f"[WHISPER] Using CPU (ctranslate2 not available)")

Check failure (GitHub Actions / lint): app.py:2133:25: F541 f-string without any placeholders

_whisper_model = WhisperModel(model_name, device=device, compute_type=compute_type)
_whisper_model_name = model_name
@@ -2337,7 +2337,7 @@
if sample_path and os.path.exists(sample_path):
try:
os.unlink(sample_path)
except:

Check failure (GitHub Actions / lint): app.py:2340:13: E722 Do not use bare `except`
pass

return result
2 changes: 1 addition & 1 deletion library_manager/config.py
@@ -79,7 +79,7 @@ def _detect_data_dir():
"update_channel": "beta", # "stable", "beta", or "nightly"
"naming_format": "author/title", # "author/title", "author - title", "custom"
"custom_naming_template": "{author}/{title}", # Custom template with {author}, {title}, {series}, etc.
"standardize_author_initials": False, # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54)
"standardize_author_initials": True, # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54)
# Metadata embedding settings
"metadata_embedding_enabled": False, # Embed tags into audio files when fixes are applied
"metadata_embedding_overwrite_managed": True, # Overwrite managed fields (title/author/series/etc)
65 changes: 64 additions & 1 deletion library_manager/models/book_profile.py
@@ -1,12 +1,15 @@
"""Book Profile system - confidence-scored metadata profiles for comprehensive book identification."""

import json
import logging
import re
from datetime import datetime
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)


# Source weights for confidence calculation (higher = more trusted)
SOURCE_WEIGHTS = {
@@ -301,9 +304,69 @@ def finalize(self):
fv.value = value
fv.confidence = confidence

# Defense-in-depth: reject author that equals series name (Skaldleita #90)
# Corrupt data can arrive from any source, not just BookDB
author_normalized = str(self.author.value).lower().strip() if self.author.value else None
series_normalized = str(self.series.value).lower().strip() if self.series.value else None
if author_normalized and series_normalized and author_normalized == series_normalized:
bad_author = self.author.value
logger.warning(f"[PROFILE] Series-as-author detected: '{bad_author}' == '{self.series.value}', finding alternative")
# Try to find an alternative author from the raw values
alternative = self._find_alternative_author(bad_author)
if alternative:
self.author.value = alternative[0]
self.author.confidence = alternative[1]
else:
self.author.value = None
self.author.confidence = 0
if 'series_as_author' not in self.issues:
self.issues.append('series_as_author')
self.needs_attention = True

self.calculate_overall_confidence()
self.last_updated = datetime.now().isoformat()

def _find_alternative_author(self, bad_value: str) -> Optional[Tuple[str, int]]:
"""Find the next-best author candidate, excluding the bad value.
Returns (value, confidence) or None."""
if not self.author.raw_values:
return None

def normalize(val):
return str(val).lower().strip() if val else None

bad_normalized = normalize(bad_value)

# Group by normalized value (same logic as calculate_field_confidence)
value_groups = {}
for source, value in self.author.raw_values.items():
if value is None:
continue
normalized = normalize(value)
if normalized == bad_normalized:
continue # Skip the corrupt value
if normalized not in value_groups:
value_groups[normalized] = []
weight = SOURCE_WEIGHTS.get(source, 30)
value_groups[normalized].append((source, value, weight))

if not value_groups:
return None

# Pick the best remaining candidate
best_value = None
best_weight = 0
for normalized, sources in value_groups.items():
total_weight = sum(w for _, _, w in sources)
if total_weight > best_weight:
best_weight = total_weight
best_source = max(sources, key=lambda x: x[2])
best_value = best_source[1]

if best_value:
return (best_value, min(best_weight, 100))
return None
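To make the fallback concrete, here is a standalone sketch of the candidate selection, with hypothetical source names and weights (the real `SOURCE_WEIGHTS` table is elided above):

```python
# Hypothetical raw values for Skaldleita #90: the highest-weight source
# returned the series name ("Laundry Files") as the author.
raw_values = {
    "bookdb": "Laundry Files",
    "google_books": "Charles Stross",
    "openlibrary": "charles stross",
}
weights = {"bookdb": 90, "google_books": 60, "openlibrary": 40}  # assumed weights
bad_normalized = "laundry files"  # the normalized series name

# Group surviving candidates by normalized value, as _find_alternative_author does
groups = {}
for source, value in raw_values.items():
    norm = value.lower().strip()
    if norm == bad_normalized:
        continue  # skip the corrupt series-as-author value
    groups.setdefault(norm, []).append((source, value, weights.get(source, 30)))

# Highest summed weight wins; the group's highest-weight spelling is kept
best_group = max(groups.values(), key=lambda g: sum(w for _, _, w in g))
_, best_value, _ = max(best_group, key=lambda c: c[2])
print(best_value)  # "Charles Stross" with confidence min(60 + 40, 100) = 100
```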

def calculate_overall_confidence(self) -> int:
"""Calculate weighted overall confidence from field confidences."""
total_weight = 0
8 changes: 8 additions & 0 deletions library_manager/providers/bookdb.py
@@ -222,6 +222,14 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No
'confidence': data.get('confidence', 0)
}

# Defense-in-depth: also checked in BookProfile.finalize(), but catching
# here prevents bad data propagation through cache and downstream layers
# Skaldleita bug #90 - series name imported as author entity
if result.get('author') and result.get('series'):
if result['author'].lower().strip() == result['series'].lower().strip():
logger.warning(f"[BOOKDB] Corrupt data: author '{result['author']}' equals series name, discarding")
result['author'] = None

if result['title'] and result['author']:
logger.info(f"Skaldleita found: {result['author']} - {result['title']}" +
(f" ({result['series']} #{result['series_num']})" if result['series'] else "") +
89 changes: 88 additions & 1 deletion library_manager/utils/path_safety.py
@@ -1,9 +1,10 @@
"""Path sanitization and building utilities."""
import re
import logging
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional, Tuple
from library_manager.utils.naming import strip_encoding_junk
from library_manager.utils.naming import strip_encoding_junk, standardize_initials

logger = logging.getLogger(__name__)

@@ -301,6 +302,81 @@ def sanitize_path_component(name):
return name


def _normalize_author_for_matching(name):
"""Normalize an author name for fuzzy comparison.
Lowercases, collapses whitespace, strips punctuation except periods in initials."""
if not name:
return ''
n = name.lower().strip()
# Collapse whitespace
n = re.sub(r'\s+', ' ', n)
# Remove non-alphanumeric except spaces and periods (keep periods for initials)
n = re.sub(r"[^\w\s.]", '', n)
return n.strip()


def find_existing_author_folder(lib_path, target_author) -> Optional[str]:
"""Find an existing author folder that matches target_author (Issue #142).

Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
or "Alistair MacLean" vs "Alistair Maclean".

Matching strategies (in order):
1. Exact normalized match (case-insensitive, whitespace-collapsed)
2. Standardized initials match (both through standardize_initials())
3. difflib.SequenceMatcher fuzzy match (ratio >= 0.85)

Returns the existing folder name if found, None otherwise.
"""
if not target_author or not lib_path:
return None

try:
lib = Path(lib_path)
if not lib.is_dir():
return None

# List only top-level directories
existing_dirs = [d.name for d in lib.iterdir() if d.is_dir()]
except OSError as e:
logger.debug(f"Error listing library directory {lib_path}: {e}")
return None

if not existing_dirs:
return None

target_normalized = _normalize_author_for_matching(target_author)
target_initials = _normalize_author_for_matching(standardize_initials(target_author))

best_match = None
best_ratio = 0.0

for dirname in existing_dirs:
dir_normalized = _normalize_author_for_matching(dirname)

# Strategy 1: Exact normalized match
if target_normalized == dir_normalized:
return dirname

# Strategy 2: Standardized initials match
dir_initials = _normalize_author_for_matching(standardize_initials(dirname))
if target_initials == dir_initials:
return dirname

# Strategy 3: Fuzzy match with SequenceMatcher
ratio = SequenceMatcher(None, target_normalized, dir_normalized).ratio()
if ratio >= 0.85 and ratio > best_ratio:
best_ratio = ratio
best_match = dirname

if best_match:
logger.info(f"[DEDUP] Reusing existing folder '{best_match}' for author '{target_author}' "
f"(similarity: {best_ratio:.2f})")
return best_match

return None
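A usage sketch for the dedup helper, assuming a library at `/audiobooks` (hypothetical path) that already contains a folder named `James S. A. Corey`:

```python
# The incoming metadata omits the space in the initials. Strategy 2 should
# match once both spellings pass through standardize_initials(); even if it
# did not, the 0.97 SequenceMatcher ratio clears the 0.85 bar in strategy 3.
existing = find_existing_author_folder("/audiobooks", "James S.A. Corey")
print(existing)  # "James S. A. Corey", so build_new_path() reuses that folder
```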


def build_new_path(lib_path, author, title, series=None, series_num=None, narrator=None, year=None,
edition=None, variant=None, language=None, language_code=None, config=None):
"""Build a new path based on the naming format configuration.
@@ -346,6 +422,12 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
logger.error(f"BLOCKED: Invalid author '{author}' or title '{title}' - would create dangerous path")
return None

# Issue #142: Check for existing author folder with similar name
# Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
existing_folder = find_existing_author_folder(lib_path, safe_author)
if existing_folder:
safe_author = existing_folder

# Issue #92: Strip "Unabridged"/"Abridged" markers if enabled
if config and config.get('strip_unabridged', False):
safe_title = strip_unabridged_markers(safe_title)
@@ -531,6 +613,10 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
# Issue #96: Library-style format: "LastName, FirstName/Title"
author_lf = format_author_lf(author)
safe_author_lf = sanitize_path_component(author_lf) if author_lf else safe_author
# Issue #142: Dedup for LF format too
existing_lf = find_existing_author_folder(lib_path, safe_author_lf)
if existing_lf:
safe_author_lf = existing_lf
if series_grouping and safe_series:
result_path = lib_path / safe_author_lf / safe_series / title_folder
else:
@@ -573,6 +659,7 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
__all__ = [
'sanitize_path_component',
'build_new_path',
'find_existing_author_folder',
'format_language_tag',
'apply_language_tag',
'strip_unabridged_markers',