refactor: Make the grammar stricter for unicode code point escapes (#358)
ohmjs · Jan 29, 2022 · 164b642 · 164b642
1 parent 684bab4
commit 164b642
Show file tree

Hide file tree

Showing 7 changed files with 44 additions and 23 deletions.
diff --git a/packages/ohm-js/dist/ohm-grammar.js b/packages/ohm-js/dist/ohm-grammar.js
diff --git a/packages/ohm-js/src/common.js b/packages/ohm-js/src/common.js
@@ -40,7 +40,7 @@ exports.abstract = function(optMethodName) {
 
 exports.assert = function(cond, message) {
+  // Throws an Error when `cond` is falsy. If `message` is omitted (or otherwise falsy),
+  // the generic text 'Assertion failed' is used so the error is never blank.
   if (!cond) {
-    throw new Error(message);
+    throw new Error(message || 'Assertion failed');
   }
 };
 
@@ -134,24 +134,9 @@ exports.StringBuffer.prototype.contents = function() {
   return this.strings.join('');
 };
 
-// Character escaping and unescaping
-
-exports.escapeChar = function(c, optDelim) {
-  const charCode = c.charCodeAt(0);
-  if ((c === '"' || c === "'") && optDelim && c !== optDelim) {
-    return c;
-  } else if (charCode < 128) {
-    return escapeStringFor[charCode];
-  } else if (128 <= charCode && charCode < 256) {
-    return '\\x' + exports.padLeft(charCode.toString(16), 2, '0');
-  } else {
-    return '\\u' + exports.padLeft(charCode.toString(16), 4, '0');
-  }
-};
-
 const escapeUnicode = str => String.fromCodePoint(parseInt(str, 16));
 
-exports.unescapeChar = function(s) {
+exports.unescapeCodePoint = function(s) {
   if (s.charAt(0) === '\\') {
     switch (s.charAt(1)) {
       case 'b':

diff --git a/packages/ohm-js/src/errors.js b/packages/ohm-js/src/errors.js
@@ -4,9 +4,9 @@
 // Imports
 // --------------------------------------------------------------------
 
-const pexprs = require('./pexprs-main');
-
+const {assert} = require('./common');
 const Namespace = require('./Namespace');
+const pexprs = require('./pexprs-main');
 
 // --------------------------------------------------------------------
 // Private stuff
@@ -209,6 +209,18 @@ function multipleSuperSplices(expr) {
   return createError("'...' can appear at most once in a rule body", expr.source);
 }
 
+// Unicode code point escapes
+
+// Builds the error reported for a `\u{...}` escape whose hex digits do not name a valid
+// Unicode code point. `applyWrapper` must wrap an `escapeChar_unicodeCodePoint` node
+// (asserted below). The error object is returned, not thrown; the caller decides to throw.
+function invalidCodePoint(applyWrapper) {
+  const node = applyWrapper._node;
+  assert(node && node.isNonterminal() && node.ctorName === 'escapeChar_unicodeCodePoint');
+
+  // Get an interval that covers all of the hex digits.
+  // slice(1, -1) drops the first and last children, keeping only the ones in between.
+  const digitIntervals = applyWrapper.children.slice(1, -1).map(d => d.source);
+  const fullInterval = digitIntervals[0].coverageWith(...digitIntervals.slice(1));
+  // The message embeds the combined interval's text; the interval itself is handed to
+  // createError as its second argument.
+  return createError(`U+${fullInterval.contents} is not a valid Unicode code point`, fullInterval);
+}
+
 // ----------------- Kleene operators -----------------
 
 function kleeneExprHasNullableOperand(kleeneExpr, applicationStack) {
@@ -314,6 +326,7 @@ module.exports = {
   inconsistentArity,
   incorrectArgumentType,
   intervalSourcesDontMatch,
+  invalidCodePoint,
   invalidConstructorCall,
   invalidParameter,
   grammarSyntaxError,

diff --git a/packages/ohm-js/src/main.js b/packages/ohm-js/src/main.js
@@ -226,8 +226,15 @@ function buildGrammar(match, namespace, optOhmGrammarForTesting) {
       return c.visit();
     },
 
-    terminalChar(_) {
-      return common.unescapeChar(this.sourceString);
+    escapeChar(c) {
+      // Delegates to common.unescapeCodePoint, passing this node's source text.
+      try {
+        return common.unescapeCodePoint(this.sourceString);
+      } catch (err) {
+        // Replace the engine's RangeError for an out-of-range code point with the
+        // grammar-level error built by errors.invalidCodePoint for the child node `c`.
+        // NOTE(review): this matches on the RangeError's message text, which looks
+        // engine-specific -- confirm the wording on every JS engine that must be supported.
+        if (err instanceof RangeError && err.message.startsWith('Invalid code point ')) {
+          throw errors.invalidCodePoint(c);
+        }
+        throw err; // Rethrow
+      }
     },
 
     NonemptyListOf(x, _, xs) {

diff --git a/packages/ohm-js/src/ohm-grammar.ohm b/packages/ohm-js/src/ohm-grammar.ohm
@@ -102,7 +102,8 @@ Ohm {
     | "\\n"                                      -- lineFeed
     | "\\r"                                      -- carriageReturn
     | "\\t"                                      -- tab
-    | "\\u{" hexDigit+ "}"                       -- unicodeCodePoint
+    | "\\u{" hexDigit hexDigit? hexDigit?
+             hexDigit? hexDigit? hexDigit? "}"   -- unicodeCodePoint
     | "\\u" hexDigit hexDigit hexDigit hexDigit  -- unicodeEscape
     | "\\x" hexDigit hexDigit                    -- hexEscape
 

diff --git a/packages/ohm-js/test/test-ohm-syntax.js b/packages/ohm-js/test/test-ohm-syntax.js
@@ -117,6 +117,15 @@ test('unicode code point escapes', t => {
   assertSucceeds(t, ohm.grammar(String.raw`G { start = "\u{1F920}" }`).match('🤠'));
   assertSucceeds(t, ohm.grammar(String.raw`G { start = "🤠" }`).match('🤠'));
   assertSucceeds(t, ohm.grammar(String.raw`G { a = "😬" b="🤠" }`).match('🤠', 'b'));
+
+  // More than 6 hex digits is just a parse error. (We'd like to make this nicer.)
+  t.throws(() => ohm.grammar(String.raw`G { start = "\u{0000000} }`), {
+    message: /Expected "\\"" or not "\\\\"/,
+  });
+
+  t.throws(() => ohm.grammar('G { start = "\\u{FFFFFF}" }'), {
+    message: /U\+FFFFFF is not a valid Unicode code point/,
+  });
 });
 
 describe('unicode', test => {

diff --git a/packages/ohm-js/test/test-recipes.js b/packages/ohm-js/test/test-recipes.js
@@ -292,3 +292,9 @@ test('semantics recipes w/ method shorthand', t => {
   });
   t.truthy(makeRecipe(s2.toRecipe()), 'recipe with an unusual unicode char');
 });
+
+test('recipes with astral plane code units', t => {
+  // Round-trip check: build a grammar whose terminal is written as a `\u{...}` escape,
+  // serialize it with toRecipe(), rebuild it with makeRecipe(), and verify that the
+  // rebuilt grammar still matches the literal emoji.
+  const g = ohm.grammar(String.raw`G { start = "\u{1F920}" }`);
+  t.truthy(
+      ohm.makeRecipe(g.toRecipe()).match('🤠').succeeded());
+});