Skip to content

Commit

Permalink
refactor: Make the grammar stricter for unicode code point escapes (#358)
Browse files Browse the repository at this point in the history
  • Loading branch information
pdubroy authored Jan 29, 2022
1 parent 684bab4 commit 164b642
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 23 deletions.
2 changes: 1 addition & 1 deletion packages/ohm-js/dist/ohm-grammar.js

Large diffs are not rendered by default.

19 changes: 2 additions & 17 deletions packages/ohm-js/src/common.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ exports.abstract = function(optMethodName) {

exports.assert = function(cond, message) {
if (!cond) {
throw new Error(message);
throw new Error(message || 'Assertion failed');
}
};

Expand Down Expand Up @@ -134,24 +134,9 @@ exports.StringBuffer.prototype.contents = function() {
return this.strings.join('');
};

// Character escaping and unescaping

exports.escapeChar = function(c, optDelim) {
const charCode = c.charCodeAt(0);
if ((c === '"' || c === "'") && optDelim && c !== optDelim) {
return c;
} else if (charCode < 128) {
return escapeStringFor[charCode];
} else if (128 <= charCode && charCode < 256) {
return '\\x' + exports.padLeft(charCode.toString(16), 2, '0');
} else {
return '\\u' + exports.padLeft(charCode.toString(16), 4, '0');
}
};

// Interprets `str` as a hexadecimal number and returns the string for that code point.
// String.fromCodePoint throws a RangeError when the value is not a valid code point
// (including NaN, which is what parsing a non-hex string yields).
const escapeUnicode = str => {
  const codePoint = Number.parseInt(str, 16);
  return String.fromCodePoint(codePoint);
};

exports.unescapeChar = function(s) {
exports.unescapeCodePoint = function(s) {
if (s.charAt(0) === '\\') {
switch (s.charAt(1)) {
case 'b':
Expand Down
17 changes: 15 additions & 2 deletions packages/ohm-js/src/errors.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
// Imports
// --------------------------------------------------------------------

const pexprs = require('./pexprs-main');

const {assert} = require('./common');
const Namespace = require('./Namespace');
const pexprs = require('./pexprs-main');

// --------------------------------------------------------------------
// Private stuff
Expand Down Expand Up @@ -209,6 +209,18 @@ function multipleSuperSplices(expr) {
return createError("'...' can appear at most once in a rule body", expr.source);
}

// Unicode code point escapes

/**
 * Builds the error reported for a `\u{...}` escape whose hex digits do not
 * form a valid Unicode code point.
 *
 * @param {object} applyWrapper - Wrapper whose `_node` must be a nonterminal with
 *     ctorName 'escapeChar_unicodeCodePoint'; its `children` supply the source intervals.
 * @returns {Error} The value returned by `createError`, with an interval spanning the digits.
 */
function invalidCodePoint(applyWrapper) {
  const cstNode = applyWrapper._node;
  assert(
      cstNode &&
      cstNode.isNonterminal() &&
      cstNode.ctorName === 'escapeChar_unicodeCodePoint',
  );

  // Drop the first and last children (everything in between is treated as the
  // hex digits), then merge the remaining source intervals into one.
  const innerChildren = applyWrapper.children.slice(1, -1);
  const [firstInterval, ...remainingIntervals] = innerChildren.map(child => child.source);
  const digitsInterval = firstInterval.coverageWith(...remainingIntervals);

  const message = `U+${digitsInterval.contents} is not a valid Unicode code point`;
  return createError(message, digitsInterval);
}

// ----------------- Kleene operators -----------------

function kleeneExprHasNullableOperand(kleeneExpr, applicationStack) {
Expand Down Expand Up @@ -314,6 +326,7 @@ module.exports = {
inconsistentArity,
incorrectArgumentType,
intervalSourcesDontMatch,
invalidCodePoint,
invalidConstructorCall,
invalidParameter,
grammarSyntaxError,
Expand Down
11 changes: 9 additions & 2 deletions packages/ohm-js/src/main.js
Original file line number Diff line number Diff line change
Expand Up @@ -226,8 +226,15 @@ function buildGrammar(match, namespace, optOhmGrammarForTesting) {
return c.visit();
},

terminalChar(_) {
return common.unescapeChar(this.sourceString);
escapeChar(c) {
try {
return common.unescapeCodePoint(this.sourceString);
} catch (err) {
if (err instanceof RangeError && err.message.startsWith('Invalid code point ')) {
throw errors.invalidCodePoint(c);
}
throw err; // Rethrow
}
},

NonemptyListOf(x, _, xs) {
Expand Down
3 changes: 2 additions & 1 deletion packages/ohm-js/src/ohm-grammar.ohm
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ Ohm {
| "\\n" -- lineFeed
| "\\r" -- carriageReturn
| "\\t" -- tab
| "\\u{" hexDigit+ "}" -- unicodeCodePoint
| "\\u{" hexDigit hexDigit? hexDigit?
hexDigit? hexDigit? hexDigit? "}" -- unicodeCodePoint
| "\\u" hexDigit hexDigit hexDigit hexDigit -- unicodeEscape
| "\\x" hexDigit hexDigit -- hexEscape

Expand Down
9 changes: 9 additions & 0 deletions packages/ohm-js/test/test-ohm-syntax.js
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ test('unicode code point escapes', t => {
assertSucceeds(t, ohm.grammar(String.raw`G { start = "\u{1F920}" }`).match('🤠'));
assertSucceeds(t, ohm.grammar(String.raw`G { start = "🤠" }`).match('🤠'));
assertSucceeds(t, ohm.grammar(String.raw`G { a = "😬" b="🤠" }`).match('🤠', 'b'));

// More than 6 hex digits is just a parse error. (We'd like to make this nicer.)
t.throws(() => ohm.grammar(String.raw`G { start = "\u{0000000} }`), {
message: /Expected "\\"" or not "\\\\"/,
});

t.throws(() => ohm.grammar('G { start = "\\u{FFFFFF}" }'), {
message: /U\+FFFFFF is not a valid Unicode code point/,
});
});

describe('unicode', test => {
Expand Down
6 changes: 6 additions & 0 deletions packages/ohm-js/test/test-recipes.js
Original file line number Diff line number Diff line change
Expand Up @@ -292,3 +292,9 @@ test('semantics recipes w/ method shorthand', t => {
});
t.truthy(makeRecipe(s2.toRecipe()), 'recipe with an unusual unicode char');
});

// A grammar whose terminal is written as a \u{...} escape must still match
// the corresponding character after being rebuilt from its recipe.
test('recipes with astral plane code units', t => {
  const grammar = ohm.grammar(String.raw`G { start = "\u{1F920}" }`);
  const rebuilt = ohm.makeRecipe(grammar.toRecipe());
  const matchResult = rebuilt.match('🤠');
  t.truthy(matchResult.succeeded());
});

0 comments on commit 164b642

Please sign in to comment.