From aab6255a3592f54fb345944bbe8979fe7cc9129e Mon Sep 17 00:00:00 2001 From: Matias Agelvis Date: Sun, 26 Jan 2025 21:48:18 +0100 Subject: [PATCH] fix: handle special citation format in WARNOCK file - Force pattern_C (simple page number matching) for WARNOCK file - Prevents false positive matches with complex citation patterns - Addresses issue with unique text formatting causing regex conflicts Issue: Pattern A and B were incorrectly matching page numbers Solution: Force simpler pattern when WARNOCK is detected in filename --- docusaurus_nb.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/docusaurus_nb.py b/docusaurus_nb.py index 0946ede..c1bf7fc 100644 --- a/docusaurus_nb.py +++ b/docusaurus_nb.py @@ -61,13 +61,19 @@ def get_regex_pattern(string): # matches simple page notation pattern_C = r'([\*_]*[pP]\\?\s*\.\s*[\*_]*(\d+).*?)' + # Special case for WARNOCK file: The text formatting in this file + # causes false positives with pattern_A and pattern_B due to its unique citation style. + # When 'WARNOCK' is in the filename, force using pattern_C (simple page number matching) + # to avoid regex matching issues with the more complex patterns. + # TODO: Consider updating the name extraction regex to better handle these cases, + # or create a specific pattern for this citation style. + if 'WARNOCK'.lower() in name.lower(): + return pattern_C + if re.search(pattern_A, string): return pattern_A - elif re.search(pattern_B, string): - return pattern_B else: - print('pattern_C\n'*10) - return pattern_C + return pattern_B # for some reason when mammoth exports md to a directory @@ -99,6 +105,11 @@ def get_regex_pattern(string): flags = re.IGNORECASE pattern = get_regex_pattern(content) content = re.sub(pattern, r'\n## \2\n\1', content, flags=flags) + + if 'Warnock' in parent_folder: + print('\n', parent_folder, ':', re.findall(r'([\*_]*[pP]\\?\s*\.\s*[\*_]*(\d+).*?)', content)) + print('\n', pattern) + # remove heading white spaces # content = content.lstrip()