From aab6255a3592f54fb345944bbe8979fe7cc9129e Mon Sep 17 00:00:00 2001
From: Matias Agelvis <magelvisdoz@gmail.com>
Date: Sun, 26 Jan 2025 21:48:18 +0100
Subject: [PATCH] fix: handle special citation format in WARNOCK file

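- Force pattern_C (simple page number matching) for the WARNOCK file
- Prevent false-positive matches from the more complex citation patterns
- Address the file's unique text formatting, which causes regex conflicts
- For all other files, return pattern_B whenever pattern_A does not match
  (pattern_C is no longer the fallback)
- Print the page-number matches and the chosen pattern for folders whose
  name contains 'Warnock'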

Issue: pattern_A and pattern_B were incorrectly matching page numbers in
this file
Solution: Force the simpler pattern_C when 'WARNOCK' (case-insensitive)
is detected in the filename
---
 docusaurus_nb.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/docusaurus_nb.py b/docusaurus_nb.py
index 0946ede..c1bf7fc 100644
--- a/docusaurus_nb.py
+++ b/docusaurus_nb.py
@@ -61,13 +61,19 @@ def get_regex_pattern(string):
     # matches simple page notation
     pattern_C = r'([\*_]*[pP]\\?\s*\.\s*[\*_]*(\d+).*?)'
 
+    # Special case for WARNOCK file: The text formatting in this file
+    # causes false positives with pattern_A and pattern_B due to its unique citation style.
+    # When 'WARNOCK' is in the filename, force using pattern_C (simple page number matching)
+    # to avoid regex matching issues with the more complex patterns.
+    # TODO: Consider updating the name extraction regex to better handle these cases,
+    # or create a specific pattern for this citation style.
+    if 'WARNOCK'.lower() in name.lower():
+        return pattern_C
+
     if re.search(pattern_A, string):
         return pattern_A
-    elif re.search(pattern_B, string):
-        return pattern_B
     else:
-        print('pattern_C\n'*10)
-        return pattern_C
+        return pattern_B
 
 
 # for some reason when mammoth exports md to a directory
@@ -99,6 +105,11 @@ def get_regex_pattern(string):
     flags = re.IGNORECASE
     pattern = get_regex_pattern(content)
     content = re.sub(pattern, r'\n## \2\n\1', content, flags=flags)
+
+    if 'Warnock' in parent_folder:
+        print('\n', parent_folder, ':', re.findall(r'([\*_]*[pP]\\?\s*\.\s*[\*_]*(\d+).*?)', content))
+        print('\n', pattern)
+
     # remove heading white spaces
     # content = content.lstrip()