17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -2,6 +2,23 @@

All notable changes to Library Manager will be documented in this file.

## [0.9.0-beta.121] - 2026-02-10

### Fixed

- **Issue #142: Duplicate author folders from name variants** - New `find_existing_author_folder()`
deduplicates author folders using 3-tier matching: exact normalized match, standardized initials, and
fuzzy match (SequenceMatcher ratio >= 0.85; see the sketch below). Prevents separate folders like
"James S.A. Corey" vs "James S. A. Corey" or "Alistair MacLean" vs "Alistair Maclean". Applied to
both the standard and `author_lf/title` naming formats.
- **Issue #143: Series name used as author folder** - A defensive filter in the BookDB provider
discards results where the author equals the series name (corrupt Skaldleita data per skaldleita#90,
e.g. author "Laundry Files" instead of "Charles Stross"). A defense-in-depth check in
BookProfile.finalize() catches the same corruption from any source, with automatic fallback to the
next-best author candidate.
- `standardize_author_initials` now defaults to `True` to reduce author folder fragmentation.
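As a quick illustration of the fuzzy tier (a minimal sketch; the helper tries the exact and initials tiers first, as shown in `path_safety.py` below):

```python
# Tier 3 of find_existing_author_folder(): normalized variants of the same
# author compare well above the 0.85 reuse threshold.
from difflib import SequenceMatcher

ratio = SequenceMatcher(None, "james s.a. corey", "james s. a. corey").ratio()
print(f"{ratio:.2f}")  # 0.97, so the existing folder is reused
```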

---

## [0.9.0-beta.120] - 2026-02-09

### Fixed
2 changes: 1 addition & 1 deletion app.py
@@ -11,7 +11,7 @@
- Multi-provider AI (Gemini, OpenRouter, Ollama)
"""

APP_VERSION = "0.9.0-beta.120"
APP_VERSION = "0.9.0-beta.121"
GITHUB_REPO = "deucebucket/library-manager" # Your GitHub repo

# Versioning Guide:
@@ -698,7 +698,7 @@
try:
with open(ERROR_REPORTS_PATH, 'r') as f:
reports = json.load(f)
except:

Check failure (GitHub Actions / lint): app.py:701:13: E722 Do not use bare `except`
reports = []

# Add new report (keep last 100 reports to avoid file bloat)
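The bare `except` blocks flagged in this hunk and the next sit on unchanged context lines. A minimal sketch of one way to satisfy E722 for the first one, assuming only file-read and JSON-decode failures are expected here:

```python
import json

ERROR_REPORTS_PATH = "error_reports.json"  # stand-in for the app's constant

try:
    with open(ERROR_REPORTS_PATH, "r") as f:
        reports = json.load(f)
except (OSError, json.JSONDecodeError):
    # Missing/unreadable file or corrupt JSON: start with an empty list
    reports = []
```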
@@ -722,7 +722,7 @@
try:
with open(ERROR_REPORTS_PATH, 'r') as f:
return json.load(f)
except:

Check failure (GitHub Actions / lint): app.py:725:9: E722 Do not use bare `except`
return []
return []

@@ -1677,7 +1677,7 @@
continue
result = call_gemini(prompt, merged_config)
if result:
logger.info(f"[PROVIDER CHAIN] Success with gemini")

Check failure (GitHub Actions / lint): app.py:1680:33: F541 f-string without any placeholders
return result

elif provider == 'openrouter':
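Each F541 finding above and below has the same one-character fix: drop the redundant `f` prefix from the placeholder-free log call, e.g. `logger.info("[PROVIDER CHAIN] Success with gemini")`.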
@@ -1686,13 +1686,13 @@
continue
result = call_openrouter(prompt, merged_config)
if result:
logger.info(f"[PROVIDER CHAIN] Success with openrouter")

Check failure (GitHub Actions / lint): app.py:1689:33: F541 f-string without any placeholders
return result

elif provider == 'ollama':
result = call_ollama(prompt, merged_config)
if result:
logger.info(f"[PROVIDER CHAIN] Success with ollama")

Check failure (GitHub Actions / lint): app.py:1695:33: F541 f-string without any placeholders
return result

else:
@@ -1794,7 +1794,7 @@
return result
elif result and result.get('transcript'):
# Got transcript but no match - still useful, return for potential AI fallback
logger.info(f"[AUDIO CHAIN] BookDB returned transcript only")

Check failure (GitHub Actions / lint): app.py:1797:37: F541 f-string without any placeholders
return result
elif result is None and attempt < max_retries - 1:
# Connection might be down, wait and retry
@@ -2126,11 +2126,11 @@
device = "cuda"
# int8 works on all CUDA devices including GTX 1080 (compute 6.1)
# float16 only works on newer GPUs (compute 7.0+)
logger.info(f"[WHISPER] Using CUDA GPU acceleration (10x faster)")

Check failure (GitHub Actions / lint): app.py:2129:29: F541 f-string without any placeholders
else:
logger.info(f"[WHISPER] Using CPU (no CUDA GPU detected)")

Check failure (GitHub Actions / lint): app.py:2131:29: F541 f-string without any placeholders
except ImportError:
logger.info(f"[WHISPER] Using CPU (ctranslate2 not available)")

Check failure (GitHub Actions / lint): app.py:2133:25: F541 f-string without any placeholders

_whisper_model = WhisperModel(model_name, device=device, compute_type=compute_type)
_whisper_model_name = model_name
@@ -2337,7 +2337,7 @@
if sample_path and os.path.exists(sample_path):
try:
os.unlink(sample_path)
except:

Check failure (GitHub Actions / lint): app.py:2340:13: E722 Do not use bare `except`
pass

return result
2 changes: 1 addition & 1 deletion library_manager/config.py
@@ -79,7 +79,7 @@ def _detect_data_dir():
"update_channel": "beta", # "stable", "beta", or "nightly"
"naming_format": "author/title", # "author/title", "author - title", "custom"
"custom_naming_template": "{author}/{title}", # Custom template with {author}, {title}, {series}, etc.
"standardize_author_initials": False, # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54)
"standardize_author_initials": True, # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54)
# Metadata embedding settings
"metadata_embedding_enabled": False, # Embed tags into audio files when fixes are applied
"metadata_embedding_overwrite_managed": True, # Overwrite managed fields (title/author/series/etc)
65 changes: 64 additions & 1 deletion library_manager/models/book_profile.py
@@ -1,12 +1,15 @@
"""Book Profile system - confidence-scored metadata profiles for comprehensive book identification."""

import json
import logging
import re
from datetime import datetime
from dataclasses import dataclass, field
from typing import Optional, List, Dict, Any
from typing import Optional, List, Dict, Any, Tuple
from pathlib import Path

logger = logging.getLogger(__name__)


# Source weights for confidence calculation (higher = more trusted)
SOURCE_WEIGHTS = {
@@ -301,9 +304,69 @@ def finalize(self):
fv.value = value
fv.confidence = confidence

# Defense-in-depth: reject author that equals series name (Skaldleita #90)
# Corrupt data can arrive from any source, not just BookDB
author_normalized = str(self.author.value).lower().strip() if self.author.value else None
series_normalized = str(self.series.value).lower().strip() if self.series.value else None
if author_normalized and series_normalized and author_normalized == series_normalized:
bad_author = self.author.value
logger.warning(f"[PROFILE] Series-as-author detected: '{bad_author}' == '{self.series.value}', finding alternative")
# Try to find an alternative author from the raw values
alternative = self._find_alternative_author(bad_author)
if alternative:
self.author.value = alternative[0]
self.author.confidence = alternative[1]
else:
self.author.value = None
self.author.confidence = 0
if 'series_as_author' not in self.issues:
self.issues.append('series_as_author')
self.needs_attention = True

self.calculate_overall_confidence()
self.last_updated = datetime.now().isoformat()

def _find_alternative_author(self, bad_value: str) -> Optional[Tuple[str, int]]:
"""Find the next-best author candidate, excluding the bad value.
Returns (value, confidence) or None."""
if not self.author.raw_values:
return None

def normalize(val):
return str(val).lower().strip() if val else None

bad_normalized = normalize(bad_value)

# Group by normalized value (same logic as calculate_field_confidence)
value_groups = {}
for source, value in self.author.raw_values.items():
if value is None:
continue
normalized = normalize(value)
if normalized == bad_normalized:
continue # Skip the corrupt value
if normalized not in value_groups:
value_groups[normalized] = []
weight = SOURCE_WEIGHTS.get(source, 30)
value_groups[normalized].append((source, value, weight))

if not value_groups:
return None

# Pick the best remaining candidate
best_value = None
best_weight = 0
for normalized, sources in value_groups.items():
total_weight = sum(w for _, _, w in sources)
if total_weight > best_weight:
best_weight = total_weight
best_source = max(sources, key=lambda x: x[2])
best_value = best_source[1]

if best_value:
return (best_value, min(best_weight, 100))
return None
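To make the fallback concrete, here is a standalone sketch of the candidate selection, with hypothetical source names and weights (the real `SOURCE_WEIGHTS` table is elided above):

```python
# Hypothetical raw values for Skaldleita #90: the highest-weight source
# returned the series name ("Laundry Files") as the author.
raw_values = {
    "bookdb": "Laundry Files",
    "google_books": "Charles Stross",
    "openlibrary": "charles stross",
}
weights = {"bookdb": 90, "google_books": 60, "openlibrary": 40}  # assumed weights
bad_normalized = "laundry files"  # the normalized series name

# Group surviving candidates by normalized value, as _find_alternative_author does
groups = {}
for source, value in raw_values.items():
    norm = value.lower().strip()
    if norm == bad_normalized:
        continue  # skip the corrupt series-as-author value
    groups.setdefault(norm, []).append((source, value, weights.get(source, 30)))

# Highest summed weight wins; the group's highest-weight spelling is kept
best_group = max(groups.values(), key=lambda g: sum(w for _, _, w in g))
_, best_value, _ = max(best_group, key=lambda c: c[2])
print(best_value)  # "Charles Stross" with confidence min(60 + 40, 100) = 100
```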

def calculate_overall_confidence(self) -> int:
"""Calculate weighted overall confidence from field confidences."""
total_weight = 0
8 changes: 8 additions & 0 deletions library_manager/providers/bookdb.py
@@ -222,6 +222,14 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No
'confidence': data.get('confidence', 0)
}

# Defense-in-depth: also checked in BookProfile.finalize(), but catching
# here prevents bad data propagation through cache and downstream layers
# Skaldleita bug #90 - series name imported as author entity
if result.get('author') and result.get('series'):
if result['author'].lower().strip() == result['series'].lower().strip():
logger.warning(f"[BOOKDB] Corrupt data: author '{result['author']}' equals series name, discarding")
result['author'] = None

if result['title'] and result['author']:
logger.info(f"Skaldleita found: {result['author']} - {result['title']}" +
(f" ({result['series']} #{result['series_num']})" if result['series'] else "") +
89 changes: 88 additions & 1 deletion library_manager/utils/path_safety.py
@@ -1,9 +1,10 @@
"""Path sanitization and building utilities."""
import re
import logging
from difflib import SequenceMatcher
from pathlib import Path
from typing import Optional, Tuple
from library_manager.utils.naming import strip_encoding_junk
from library_manager.utils.naming import strip_encoding_junk, standardize_initials

logger = logging.getLogger(__name__)

@@ -301,6 +302,81 @@ def sanitize_path_component(name):
return name


def _normalize_author_for_matching(name):
"""Normalize an author name for fuzzy comparison.
Lowercases, collapses whitespace, strips punctuation except periods in initials."""
if not name:
return ''
n = name.lower().strip()
# Collapse whitespace
n = re.sub(r'\s+', ' ', n)
# Remove non-alphanumeric except spaces and periods (keep periods for initials)
n = re.sub(r"[^\w\s.]", '', n)
return n.strip()


def find_existing_author_folder(lib_path, target_author) -> Optional[str]:
"""Find an existing author folder that matches target_author (Issue #142).

Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
or "Alistair MacLean" vs "Alistair Maclean".

Matching strategies (in order):
1. Exact normalized match (case-insensitive, whitespace-collapsed)
2. Standardized initials match (both through standardize_initials())
3. difflib.SequenceMatcher fuzzy match (ratio >= 0.85)

Returns the existing folder name if found, None otherwise.
"""
if not target_author or not lib_path:
return None

try:
lib = Path(lib_path)
if not lib.is_dir():
return None

# List only top-level directories
existing_dirs = [d.name for d in lib.iterdir() if d.is_dir()]
except OSError as e:
logger.debug(f"Error listing library directory {lib_path}: {e}")
return None

if not existing_dirs:
return None

target_normalized = _normalize_author_for_matching(target_author)
target_initials = _normalize_author_for_matching(standardize_initials(target_author))

best_match = None
best_ratio = 0.0

for dirname in existing_dirs:
dir_normalized = _normalize_author_for_matching(dirname)

# Strategy 1: Exact normalized match
if target_normalized == dir_normalized:
return dirname

# Strategy 2: Standardized initials match
dir_initials = _normalize_author_for_matching(standardize_initials(dirname))
if target_initials == dir_initials:
return dirname

# Strategy 3: Fuzzy match with SequenceMatcher
ratio = SequenceMatcher(None, target_normalized, dir_normalized).ratio()
if ratio >= 0.85 and ratio > best_ratio:
best_ratio = ratio
best_match = dirname

if best_match:
logger.info(f"[DEDUP] Reusing existing folder '{best_match}' for author '{target_author}' "
f"(similarity: {best_ratio:.2f})")
return best_match

return None
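A usage sketch for the dedup helper, assuming a library at `/audiobooks` (hypothetical path) that already contains a folder named `James S. A. Corey`:

```python
# The incoming metadata omits the space in the initials. Strategy 2 should
# match once both spellings pass through standardize_initials(); even if it
# did not, the 0.97 SequenceMatcher ratio clears the 0.85 bar in strategy 3.
existing = find_existing_author_folder("/audiobooks", "James S.A. Corey")
print(existing)  # "James S. A. Corey", so build_new_path() reuses that folder
```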


def build_new_path(lib_path, author, title, series=None, series_num=None, narrator=None, year=None,
edition=None, variant=None, language=None, language_code=None, config=None):
"""Build a new path based on the naming format configuration.
@@ -346,6 +422,12 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
logger.error(f"BLOCKED: Invalid author '{author}' or title '{title}' - would create dangerous path")
return None

# Issue #142: Check for existing author folder with similar name
# Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
existing_folder = find_existing_author_folder(lib_path, safe_author)
if existing_folder:
safe_author = existing_folder

# Issue #92: Strip "Unabridged"/"Abridged" markers if enabled
if config and config.get('strip_unabridged', False):
safe_title = strip_unabridged_markers(safe_title)
@@ -531,6 +613,10 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
# Issue #96: Library-style format: "LastName, FirstName/Title"
author_lf = format_author_lf(author)
safe_author_lf = sanitize_path_component(author_lf) if author_lf else safe_author
# Issue #142: Dedup for LF format too
existing_lf = find_existing_author_folder(lib_path, safe_author_lf)
if existing_lf:
safe_author_lf = existing_lf
if series_grouping and safe_series:
result_path = lib_path / safe_author_lf / safe_series / title_folder
else:
@@ -573,6 +659,7 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
__all__ = [
'sanitize_path_component',
'build_new_path',
'find_existing_author_folder',
'format_language_tag',
'apply_language_tag',
'strip_unabridged_markers',