Fix incorrect PGN parsing of move number tokens like "11... c5??" (WIP)

This commit adds a small patch to detect tokens that represent a move number, such as "11." or "11...", and store them temporarily. The next real move token (e.g. "c5??") is then combined with the pending move number to form "11...c5??", ensuring the move is not treated as a separate root-level node. This prevents parsing errors in PGN trees where moves like "11... c5??" were incorrectly inserted alongside sibling moves, instead of as a child in the correct place.
lichess-org · Feb 3, 2025 · 1c1fa0d · 1c1fa0d
1 parent fdefa32
commit 1c1fa0d
Show file tree

Hide file tree

Showing 7 changed files with 102 additions and 6 deletions.
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/dartchess.iml b/.idea/dartchess.iml
diff --git a/.idea/material_theme_project_new.xml b/.idea/material_theme_project_new.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/lib/src/pgn.dart b/lib/src/pgn.dart
@@ -786,7 +786,7 @@ class _PgnParser {
               if (_isWhitespace(line) || _isCommentLine(line)) return;
             }
             final tokenRegex = RegExp(
-                r'(?:[NBKRQ]?[a-h]?[1-8]?[-x]?[a-h][1-8](?:=?[nbrqkNBRQK])?|[pnbrqkPNBRQK]?@[a-h][1-8]|O-O-O|0-0-0|O-O|0-0)[+#]?|--|Z0|0000|@@@@|{|;|\$\d{1,4}|[?!]{1,2}|\(|\)|\*|1-0|0-1|1\/2-1\/2/');
+                r'(?:[NBKRQ]?[a-h]?[1-8]?[-x]?[a-h][1-8](?:=?[nbrqkNBRQK])?|[pnbrqkPNBRQK]?@[a-h][1-8]|O-O-O|0-0-0|O-O|0-0)[+#]?|--|Z0|0000|@@@@|{|;|\$\d{1,4}|[?!]{1,2}|\(|\)|\*|1-0|0-1|1\/2-1\/2|\d+\.+\S*');
             final matches = tokenRegex.allMatches(line);
             for (final match in matches) {
               final frame = _stack[_stack.length - 1];
@@ -831,16 +831,39 @@ class _PgnParser {
                   }
                   continue continuedLine;
                 } else {
-                  if (token == 'Z0' || token == '0000' || token == '@@@@') {
+                  // If token includes something like "1.e4" or "2...Nc3", strip the leading digits and dots.
+                  // "2...Nc3" => strip "2...", left with "Nc3"
+                  // "1.e4"    => strip "1.", left with "e4"
+                  // If the remainder is empty or purely dots, skip it.
+                  final moveToken = _stripMoveNumberPrefix(token);
+                  if (moveToken == null || moveToken.isEmpty) {
+                    // skip this token
+                    continue;
+                  }
+                  // Also handle weird placeholders like "Z0" => convert to "--"
+                  if (moveToken == 'Z0' ||
+                      moveToken == '0000' ||
+                      moveToken == '@@@@') {
                     token = '--';
-                  } else if (token.startsWith('0')) {
-                    token = token.replaceAll('0', 'O');
+                  } else if (moveToken.startsWith('0')) {
+                    // "0-0" => "O-O" or "0-0-0" => "O-O-O" is handled above in the pattern
+                    // but "0blabla"? This code historically replaced leading '0' with 'O'
+                    // We'll do the same logic:
+                    token = moveToken.replaceAll('0', 'O');
+                  } else {
+                    token = moveToken;
                   }
+
+                  // Create new node with this SAN
                   if (frame.node != null) {
                     frame.parent = frame.node!;
                   }
-                  frame.node = PgnChildNode(PgnNodeData(
-                      san: token, startingComments: frame.startingComments));
+                  frame.node = PgnChildNode(
+                    PgnNodeData(
+                      san: token,
+                      startingComments: frame.startingComments,
+                    ),
+                  );
                   frame.startingComments = null;
                   frame.root = false;
                   frame.parent.children.add(frame.node!);
@@ -871,6 +894,33 @@ class _PgnParser {
     }
   }
 
+  /// Strips a leading move-number prefix from a PGN movetext token.
+  ///
+  /// The prefix is a run of digits followed by one or more dots, as in
+  /// "1.e4" or "2...Nc3"; only the text after it is returned.
+  ///
+  /// Example:
+  ///   "1.e4" => "e4"
+  ///   "2...Nc3" => "Nc3"
+  ///   "11...a1??" => "a1??"
+  ///   "11..." => null
+  ///   "e4" => "e4"
+  ///
+  /// Returns [token] unchanged when it has no such prefix, and `null` when
+  /// nothing but dots and whitespace follows the prefix, so that the caller
+  /// can skip a bare move number.
+  String? _stripMoveNumberPrefix(String token) {
+    // The trailing [\s.]* swallows any stray dots or blanks after the prefix,
+    // so a remainder made only of those is treated as empty rather than
+    // being handed back as a bogus SAN.
+    final prefix = RegExp(r'^\d+\.+[\s.]*').firstMatch(token);
+    if (prefix == null) {
+      return token;
+    }
+    final remainder = token.substring(prefix.end).trim();
+    return remainder.isEmpty ? null : remainder;
+  }
+
   void _handleNag(int nag) {
     final frame = _stack[_stack.length - 1];
     if (frame.node != null) {