From 36f695ed38b6ac7c278e9628a318d373a7db3a88 Mon Sep 17 00:00:00 2001
From: deucebucket <deucebucket@users.noreply.github.com>
Date: Tue, 10 Feb 2026 06:51:17 -0600
Subject: [PATCH 1/3] fix: Author folder dedup and series-as-author rejection
 (#142, #143)

- Deduplicate author folders via exact normalized, standardized-initials,
  and fuzzy (difflib.SequenceMatcher ratio >= 0.85) matching to prevent
  duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
- Reject BookDB results where author equals series name (Skaldleita #90
  corrupt data, e.g. author "Laundry Files" instead of "Charles Stross")
- Defense-in-depth in BookProfile.finalize() to catch series-as-author
  from any source, with fallback to next-best author candidate
- Enable standardize_author_initials by default to normalize initials
  at the config level
---
 library_manager/config.py              |  2 +-
 library_manager/models/book_profile.py | 58 +++++++++++++++++
 library_manager/providers/bookdb.py    |  7 ++
 library_manager/utils/path_safety.py   | 89 +++++++++++++++++++++++++-
 4 files changed, 154 insertions(+), 2 deletions(-)

diff --git a/library_manager/config.py b/library_manager/config.py
index 986be72..ac4fda7 100644
--- a/library_manager/config.py
+++ b/library_manager/config.py
@@ -79,7 +79,7 @@ def _detect_data_dir():
     "update_channel": "beta",  # "stable", "beta", or "nightly"
     "naming_format": "author/title",  # "author/title", "author - title", "custom"
     "custom_naming_template": "{author}/{title}",  # Custom template with {author}, {title}, {series}, etc.
-    "standardize_author_initials": False,  # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54)
+    "standardize_author_initials": True,  # Normalize initials: "James S A Corey" -> "James S. A. Corey" (Issue #54)
     # Metadata embedding settings
     "metadata_embedding_enabled": False,  # Embed tags into audio files when fixes are applied
     "metadata_embedding_overwrite_managed": True,  # Overwrite managed fields (title/author/series/etc)
diff --git a/library_manager/models/book_profile.py b/library_manager/models/book_profile.py
index 6d4210c..54829ee 100644
--- a/library_manager/models/book_profile.py
+++ b/library_manager/models/book_profile.py
@@ -301,9 +301,67 @@ def finalize(self):
                 fv.value = value
                 fv.confidence = confidence
 
+        # Defense-in-depth: reject author that equals series name (Skaldleita #90)
+        # Corrupt data can arrive from any source, not just BookDB
+        if (self.author.value and self.series.value and
+                str(self.author.value).lower().strip() == str(self.series.value).lower().strip()):
+            bad_author = self.author.value
+            # Try to find an alternative author from the raw values
+            alternative = self._find_alternative_author(bad_author)
+            if alternative:
+                self.author.value = alternative[0]
+                self.author.confidence = alternative[1]
+            else:
+                self.author.value = None
+                self.author.confidence = 0
+            if 'series_as_author' not in self.issues:
+                self.issues.append('series_as_author')
+            self.needs_attention = True
+
         self.calculate_overall_confidence()
         self.last_updated = datetime.now().isoformat()
 
+    def _find_alternative_author(self, bad_value: str):
+        """Find the next-best author candidate, excluding the bad value.
+        Returns (value, confidence) or None."""
+        if not self.author.raw_values:
+            return None
+
+        def normalize(val):
+            return str(val).lower().strip() if val else None
+
+        bad_normalized = normalize(bad_value)
+
+        # Group by normalized value (same logic as calculate_field_confidence)
+        value_groups = {}
+        for source, value in self.author.raw_values.items():
+            if value is None:
+                continue
+            normalized = normalize(value)
+            if normalized == bad_normalized:
+                continue  # Skip the corrupt value
+            if normalized not in value_groups:
+                value_groups[normalized] = []
+            weight = SOURCE_WEIGHTS.get(source, 30)
+            value_groups[normalized].append((source, value, weight))
+
+        if not value_groups:
+            return None
+
+        # Pick the best remaining candidate
+        best_value = None
+        best_weight = 0
+        for normalized, sources in value_groups.items():
+            total_weight = sum(w for _, _, w in sources)
+            if total_weight > best_weight:
+                best_weight = total_weight
+                best_source = max(sources, key=lambda x: x[2])
+                best_value = best_source[1]
+
+        if best_value:
+            return (best_value, min(best_weight, 100))
+        return None
+
     def calculate_overall_confidence(self) -> int:
         """Calculate weighted overall confidence from field confidences."""
         total_weight = 0
diff --git a/library_manager/providers/bookdb.py b/library_manager/providers/bookdb.py
index 1206c26..1335214 100644
--- a/library_manager/providers/bookdb.py
+++ b/library_manager/providers/bookdb.py
@@ -222,6 +222,13 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No
             'confidence': data.get('confidence', 0)
         }
 
+        # Defensive: Skaldleita bug #90 - series name imported as author entity
+        # e.g. author "Laundry Files" when it should be "Charles Stross"
+        if result.get('author') and result.get('series'):
+            if result['author'].lower().strip() == result['series'].lower().strip():
+                logger.warning(f"[BOOKDB] Corrupt data: author '{result['author']}' equals series name, discarding")
+                result['author'] = None
+
         if result['title'] and result['author']:
             logger.info(f"Skaldleita found: {result['author']} - {result['title']}" +
                        (f" ({result['series']} #{result['series_num']})" if result['series'] else "") +
diff --git a/library_manager/utils/path_safety.py b/library_manager/utils/path_safety.py
index c07ec44..1c81b9c 100644
--- a/library_manager/utils/path_safety.py
+++ b/library_manager/utils/path_safety.py
@@ -1,9 +1,11 @@
 """Path sanitization and building utilities."""
+import os
 import re
 import logging
+from difflib import SequenceMatcher
 from pathlib import Path
 from typing import Optional, Tuple
-from library_manager.utils.naming import strip_encoding_junk
+from library_manager.utils.naming import strip_encoding_junk, standardize_initials
 
 logger = logging.getLogger(__name__)
 
@@ -301,6 +303,80 @@ def sanitize_path_component(name):
     return name
 
 
+def _normalize_author_for_matching(name):
+    """Normalize an author name for fuzzy comparison.
+    Lowercases, collapses whitespace, and drops everything except word characters, spaces, and periods."""
+    if not name:
+        return ''
+    n = name.lower().strip()
+    # Collapse whitespace
+    n = re.sub(r'\s+', ' ', n)
+    # Remove non-alphanumeric except spaces and periods (keep periods for initials)
+    n = re.sub(r"[^\w\s.]", '', n)
+    return n.strip()
+
+
+def find_existing_author_folder(lib_path, target_author):
+    """Find an existing author folder that matches target_author (Issue #142).
+
+    Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
+    or "Alistair MacLean" vs "Alistair Maclean".
+
+    Matching strategies (in order):
+    1. Exact normalized match (case-insensitive, whitespace-collapsed)
+    2. Standardized initials match (both through standardize_initials())
+    3. difflib.SequenceMatcher fuzzy match (ratio >= 0.85)
+
+    Returns the existing folder name if found, None otherwise.
+    """
+    if not target_author or not lib_path:
+        return None
+
+    try:
+        lib = Path(lib_path)
+        if not lib.is_dir():
+            return None
+
+        # List only top-level directories
+        existing_dirs = [d for d in os.listdir(lib) if os.path.isdir(lib / d)]
+    except OSError:
+        return None
+
+    if not existing_dirs:
+        return None
+
+    target_normalized = _normalize_author_for_matching(target_author)
+    target_initials = _normalize_author_for_matching(standardize_initials(target_author))
+
+    best_match = None
+    best_ratio = 0.0
+
+    for dirname in existing_dirs:
+        dir_normalized = _normalize_author_for_matching(dirname)
+
+        # Strategy 1: Exact normalized match
+        if target_normalized == dir_normalized:
+            return dirname
+
+        # Strategy 2: Standardized initials match
+        dir_initials = _normalize_author_for_matching(standardize_initials(dirname))
+        if target_initials == dir_initials:
+            return dirname
+
+        # Strategy 3: Fuzzy match with SequenceMatcher
+        ratio = SequenceMatcher(None, target_normalized, dir_normalized).ratio()
+        if ratio >= 0.85 and ratio > best_ratio:
+            best_ratio = ratio
+            best_match = dirname
+
+    if best_match:
+        logger.info(f"[DEDUP] Reusing existing folder '{best_match}' for author '{target_author}' "
+                    f"(similarity: {best_ratio:.2f})")
+        return best_match
+
+    return None
+
+
 def build_new_path(lib_path, author, title, series=None, series_num=None, narrator=None, year=None,
                    edition=None, variant=None, language=None, language_code=None, config=None):
     """Build a new path based on the naming format configuration.
@@ -346,6 +422,12 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
         logger.error(f"BLOCKED: Invalid author '{author}' or title '{title}' - would create dangerous path")
         return None
 
+    # Issue #142: Check for existing author folder with similar name
+    # Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"
+    existing_folder = find_existing_author_folder(lib_path, safe_author)
+    if existing_folder:
+        safe_author = existing_folder
+
     # Issue #92: Strip "Unabridged"/"Abridged" markers if enabled
     if config and config.get('strip_unabridged', False):
         safe_title = strip_unabridged_markers(safe_title)
@@ -531,6 +613,10 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
         # Issue #96: Library-style format: "LastName, FirstName/Title"
         author_lf = format_author_lf(author)
         safe_author_lf = sanitize_path_component(author_lf) if author_lf else safe_author
+        # Issue #142: Dedup for LF format too
+        existing_lf = find_existing_author_folder(lib_path, safe_author_lf)
+        if existing_lf:
+            safe_author_lf = existing_lf
         if series_grouping and safe_series:
             result_path = lib_path / safe_author_lf / safe_series / title_folder
         else:
@@ -573,6 +659,7 @@ def build_new_path(lib_path, author, title, series=None, series_num=None, narrat
 __all__ = [
     'sanitize_path_component',
     'build_new_path',
+    'find_existing_author_folder',
     'format_language_tag',
     'apply_language_tag',
     'strip_unabridged_markers',

From 07b55167b47ab323ae04ba4c5d576926b0922801 Mon Sep 17 00:00:00 2001
From: deucebucket <deucebucket@users.noreply.github.com>
Date: Tue, 10 Feb 2026 06:56:06 -0600
Subject: [PATCH 2/3] docs: Add CHANGELOG, type hints, logging per vibe-check
 review (#142)

- Add CHANGELOG entry for beta.121 documenting both fixes
- Bump APP_VERSION to 0.9.0-beta.121
- Add return type hints to find_existing_author_folder() and
  _find_alternative_author()
- Add logger + warning when series-as-author detected in finalize()
---
 CHANGELOG.md                           | 17 +++++++++++++++++
 app.py                                 |  2 +-
 library_manager/models/book_profile.py |  8 ++++++--
 library_manager/utils/path_safety.py   |  2 +-
 4 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2c5c758..b921c7a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,23 @@
 
 All notable changes to Library Manager will be documented in this file.
 
+## [0.9.0-beta.121] - 2026-02-10
+
+### Fixed
+
+- **Issue #142: Duplicate author folders from name variants** - New `find_existing_author_folder()`
+  deduplicates author folders using 3-tier matching: exact normalized, standardized initials, and
+  fuzzy match (SequenceMatcher >= 85%). Prevents separate folders like "James S.A. Corey" vs
+  "James S. A. Corey" or "Alistair MacLean" vs "Alistair Maclean". Applied to both standard and
+  `author_lf/title` naming formats.
+- **Issue #143: Series name used as author folder** - Defensive filter in BookDB provider discards
+  results where author equals series name (corrupt Skaldleita data per skaldleita#90, e.g. author
+  "Laundry Files" instead of "Charles Stross"). Defense-in-depth check in BookProfile.finalize()
+  catches this from any source, with automatic fallback to next-best author candidate.
+- `standardize_author_initials` now defaults to `True` to reduce author folder fragmentation.
+
+---
+
 ## [0.9.0-beta.120] - 2026-02-09
 
 ### Fixed
diff --git a/app.py b/app.py
index 0acc0fa..289577d 100644
--- a/app.py
+++ b/app.py
@@ -11,7 +11,7 @@
 - Multi-provider AI (Gemini, OpenRouter, Ollama)
 """
 
-APP_VERSION = "0.9.0-beta.120"
+APP_VERSION = "0.9.0-beta.121"
 GITHUB_REPO = "deucebucket/library-manager"  # Your GitHub repo
 
 # Versioning Guide:
diff --git a/library_manager/models/book_profile.py b/library_manager/models/book_profile.py
index 54829ee..47dba5a 100644
--- a/library_manager/models/book_profile.py
+++ b/library_manager/models/book_profile.py
@@ -1,12 +1,15 @@
 """Book Profile system - confidence-scored metadata profiles for comprehensive book identification."""
 
 import json
+import logging
 import re
 from datetime import datetime
 from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any
+from typing import Optional, List, Dict, Any, Tuple
 from pathlib import Path
 
+logger = logging.getLogger(__name__)
+
 
 # Source weights for confidence calculation (higher = more trusted)
 SOURCE_WEIGHTS = {
@@ -306,6 +309,7 @@ def finalize(self):
         if (self.author.value and self.series.value and
                 str(self.author.value).lower().strip() == str(self.series.value).lower().strip()):
             bad_author = self.author.value
+            logger.warning(f"[PROFILE] Series-as-author detected: '{bad_author}' == '{self.series.value}', finding alternative")
             # Try to find an alternative author from the raw values
             alternative = self._find_alternative_author(bad_author)
             if alternative:
@@ -321,7 +325,7 @@ def finalize(self):
         self.calculate_overall_confidence()
         self.last_updated = datetime.now().isoformat()
 
-    def _find_alternative_author(self, bad_value: str):
+    def _find_alternative_author(self, bad_value: str) -> Optional[Tuple[str, int]]:
         """Find the next-best author candidate, excluding the bad value.
         Returns (value, confidence) or None."""
         if not self.author.raw_values:
diff --git a/library_manager/utils/path_safety.py b/library_manager/utils/path_safety.py
index 1c81b9c..c9fc829 100644
--- a/library_manager/utils/path_safety.py
+++ b/library_manager/utils/path_safety.py
@@ -316,7 +316,7 @@ def _normalize_author_for_matching(name):
     return n.strip()
 
 
-def find_existing_author_folder(lib_path, target_author):
+def find_existing_author_folder(lib_path, target_author) -> Optional[str]:
     """Find an existing author folder that matches target_author (Issue #142).
 
     Prevents duplicate folders like "James S.A. Corey" vs "James S. A. Corey"

From f2638acd8e9bbf7f87c7b22243194c85516b6dd6 Mon Sep 17 00:00:00 2001
From: deucebucket <deucebucket@users.noreply.github.com>
Date: Tue, 10 Feb 2026 06:59:36 -0600
Subject: [PATCH 3/3] fix: Address vibe-check review round 2 (#142)

- Replace os.listdir() with pathlib iterdir() for consistency
- Log OSError (at debug level) when listing the library directory fails
- Pre-compute normalized strings in finalize() series-as-author check
- Clarify defense-in-depth comment in bookdb.py
- Remove unused os import
---
 library_manager/models/book_profile.py | 5 +++--
 library_manager/providers/bookdb.py    | 5 +++--
 library_manager/utils/path_safety.py   | 6 +++---
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/library_manager/models/book_profile.py b/library_manager/models/book_profile.py
index 47dba5a..2b7f99e 100644
--- a/library_manager/models/book_profile.py
+++ b/library_manager/models/book_profile.py
@@ -306,8 +306,9 @@ def finalize(self):
 
         # Defense-in-depth: reject author that equals series name (Skaldleita #90)
         # Corrupt data can arrive from any source, not just BookDB
-        if (self.author.value and self.series.value and
-                str(self.author.value).lower().strip() == str(self.series.value).lower().strip()):
+        author_normalized = str(self.author.value).lower().strip() if self.author.value else None
+        series_normalized = str(self.series.value).lower().strip() if self.series.value else None
+        if author_normalized and series_normalized and author_normalized == series_normalized:
             bad_author = self.author.value
             logger.warning(f"[PROFILE] Series-as-author detected: '{bad_author}' == '{self.series.value}', finding alternative")
             # Try to find an alternative author from the raw values
diff --git a/library_manager/providers/bookdb.py b/library_manager/providers/bookdb.py
index 1335214..7d593f6 100644
--- a/library_manager/providers/bookdb.py
+++ b/library_manager/providers/bookdb.py
@@ -222,8 +222,9 @@ def search_bookdb(title, author=None, api_key=None, retry_count=0, bookdb_url=No
             'confidence': data.get('confidence', 0)
         }
 
-        # Defensive: Skaldleita bug #90 - series name imported as author entity
-        # e.g. author "Laundry Files" when it should be "Charles Stross"
+        # Defense-in-depth: also checked in BookProfile.finalize(), but catching
+        # here prevents bad data propagation through cache and downstream layers
+        # Skaldleita bug #90 - series name imported as author entity
         if result.get('author') and result.get('series'):
             if result['author'].lower().strip() == result['series'].lower().strip():
                 logger.warning(f"[BOOKDB] Corrupt data: author '{result['author']}' equals series name, discarding")
diff --git a/library_manager/utils/path_safety.py b/library_manager/utils/path_safety.py
index c9fc829..c61624e 100644
--- a/library_manager/utils/path_safety.py
+++ b/library_manager/utils/path_safety.py
@@ -1,5 +1,4 @@
 """Path sanitization and building utilities."""
-import os
 import re
 import logging
 from difflib import SequenceMatcher
@@ -338,8 +337,9 @@ def find_existing_author_folder(lib_path, target_author) -> Optional[str]:
             return None
 
         # List only top-level directories
-        existing_dirs = [d for d in os.listdir(lib) if os.path.isdir(lib / d)]
-    except OSError:
+        existing_dirs = [d.name for d in lib.iterdir() if d.is_dir()]
+    except OSError as e:
+        logger.debug(f"Error listing library directory {lib_path}: {e}")
         return None
 
     if not existing_dirs: