Skip to content

Commit 7ef25b2

Browse files
committed
feat: update grammar to use full utf-8 range
1 parent 29a49d3 commit 7ef25b2

File tree

2 files changed

+54
-2
lines changed

2 files changed

+54
-2
lines changed

common/define-grammar.js

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ module.exports = function defineGrammar(dialect) {
109109
extras: $ => {
110110
const extras = [
111111
$.comment,
112-
/[\s\uFEFF\u2060\u200B\u00A0]/,
112+
/[\s\u00A0\u200B\u2060\uFEFF]/,
113113
];
114114

115115
if (dialect === 'php') {
@@ -1517,7 +1517,7 @@ module.exports = function defineGrammar(dialect) {
15171517
$._expression,
15181518
),
15191519

1520-
name: _ => /[_a-zA-Z\u00A1-\u00ff][_a-zA-Z\u00A1-\u00ff\d]*/,
1520+
name: _ => name(),
15211521

15221522
_reserved_identifier: _ => choice(
15231523
'self',
@@ -1605,3 +1605,21 @@ function pipeSep1(rule) {
16051605
/**
 * Builds a rule that matches `rule` one or more times, with an `&` token
 * between consecutive occurrences.
 *
 * NOTE(review): `seq`, `repeat` and `token` are not defined in this view;
 * presumably they are the tree-sitter grammar DSL helpers. Confirm.
 *
 * @param {*} rule The rule to repeat; it is passed to `seq` unchanged.
 * @return {*} Whatever `seq(rule, repeat(seq(token('&'), rule)))` produces.
 */
function ampSep1(rule) {
16061606
return seq(rule, repeat(seq(token('&'), rule)));
16071607
}
1608+
1609+
/**
1610+
* Creates a regex that matches PHP identifiers.
1611+
*
1612+
* Note that PHP officially only supports the following character regex
1613+
* for identifiers: ^[a-zA-Z_\x80-\xff][a-zA-Z0-9_\x80-\xff]*$
1614+
* However, there is a "bug" in how PHP parses multi-byte characters that allows
1615+
* for a much larger range of characters to be used in identifiers.
1616+
*
1617+
* @see https://www.php.net/manual/en/language.variables.basics.php
1618+
*
1619+
* @return {RegExp} Regex for one identifier: a first character that is `_`, an ASCII letter or in the extended range, followed by any number of those characters or digits.
1620+
*/
1621+
function name() {
1622+
// The range covers \u0080-\uffff except \u00a0, \u200b, \u2060 and \ufeff: we need to side step around the whitespace characters in the extras array.
1623+
const range = String.raw`\u0080-\u009f\u00a1-\u200a\u200c-\u205f\u2061-\ufefe\uff00-\uffff`;
1624+
return new RegExp(`[_a-zA-Z${range}][_a-zA-Z${range}\\d]*`);
1625+
}

common/test/corpus/expressions.txt

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,3 +1558,37 @@ Foo::{$bar};
15581558
(class_constant_access_expression
15591559
(name)
15601560
(name (variable_name (name))))))
1561+
1562+
===============================================
1563+
UTF-8 identifiers
1564+
===============================================
1565+
1566+
<?php
1567+
$漢字;
1568+
1569+
<<<漢漢字
1570+
This is a heredoc
1571+
漢漢字;
1572+
1573+
<<<'字漢'
1574+
This is a nowdoc.
1575+
字漢;
1576+
1577+
---
1578+
1579+
(program
1580+
(php_tag)
1581+
(expression_statement
1582+
(variable_name (name)))
1583+
(expression_statement
1584+
(heredoc
1585+
(heredoc_start)
1586+
(heredoc_body
1587+
(string_value))
1588+
(heredoc_end)))
1589+
(expression_statement
1590+
(nowdoc
1591+
(heredoc_start)
1592+
(nowdoc_body
1593+
(nowdoc_string))
1594+
(heredoc_end))))

0 commit comments

Comments
 (0)