Skip to content

Commit 114ca4e

Browse files
committed
59: add support for \uXXXX escapes within string literals
This is in response to edn-format/edn#65. This is an extension, because string literals as currently documented do not specify support for \uXXXX escapes. https://github.com/edn-format/edn/tree/a51127aecd318096667ae0dafa25353ecb07c9c3 Syntax Notes: - A Unicode escape must begin with "\u". This is case sensitive: "\U" will be rejected. - "\u" must be followed by exactly four hex digits taken from this set: 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - The digits are not case sensitive. - Each such Unicode escape encodes a single 16-bit Java char. Since Java uses UTF-16 internally (for historical reasons), code points beyond the basic multilingual plane must be written as a pair of Unicode escapes (see also "surrogate pairs"). Disabling: By default \uXXXX escapes are now supported in String literals. Parser.Config (and Parser.Config.Builder) now support a flag which can be set to false to disable support for \uXXXX in string literals. This restores the old behavior of throwing an EdnSyntaxException when such escapes are encountered.
1 parent a489fb1 commit 114ca4e

File tree

4 files changed

+101
-3
lines changed

4 files changed

+101
-3
lines changed

src/main/java/us/bpsm/edn/parser/Parser.java

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,25 @@ public interface Config {
248248
*/
249249
public TagHandler getTagHandler(Tag tag);
250250

251+
/**
252+
* When true, the parser will accept ∖uXXXX escape sequences in string
253+
* literals and replace them with the corresponding java char in the
254+
* parsed string. When false, such escape sequences will cause an
255+
* {@link EdnSyntaxException} to be thrown.
256+
* <p>
257+
* The default is true, which is not in strict accordance with the
258+
* letter of edn-format/README, but:
259+
* <ul>
260+
* <li>Clojure's own edn reader behaves in this way.</li>
261+
* <li>Character literals do allow this syntax according to
262+
* edn-format/README</li>
263+
* </ul>
264+
* @return true if unicode escapes are accepted in string literals
265+
*/
266+
public default boolean unicodeEscapesInStringLiteralsAreAccepted() {
267+
return true;
268+
}
269+
251270
/**
252271
* This Builder is used to create a {@link Parser.Config}.
253272
* Fresh Builder instances are provided by
@@ -326,6 +345,16 @@ public interface Builder {
326345
*/
327346
public Builder putTagHandler(Tag tag, TagHandler handler);
328347

348+
/**
349+
* Toggle the Parser's willingness to accept unicode escapes
350+
* in string literals. By default unicode escapes will be
351+
* accepted.
352+
* @see Config#unicodeEscapesInStringLiteralsAreAccepted()
353+
*/
354+
public Builder acceptUnicodeEscapesInStringLiterals(
355+
boolean acceptUnicodeEscapes
356+
);
357+
329358
/**
330359
* Build and return the {@link Config} described by the
331360
* sequence of calls made on this Builder. Calling

src/main/java/us/bpsm/edn/parser/Parsers.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ public void unread(int ch) throws IOException {
196196
public static Builder newParserConfigBuilder() {
197197
return new Builder() {
198198
boolean used = false;
199+
boolean acceptUnicodeEscapes = true;
199200
CollectionBuilder.Factory listFactory = DEFAULT_LIST_FACTORY;
200201
CollectionBuilder.Factory vectorFactory = DEFAULT_VECTOR_FACTORY;
201202
CollectionBuilder.Factory setFactory = DEFAULT_SET_FACTORY;
@@ -232,6 +233,13 @@ public Builder putTagHandler(Tag tag, TagHandler handler) {
232233
return this;
233234
}
234235

236+
@Override
237+
public Builder acceptUnicodeEscapesInStringLiterals(boolean acceptUnicodeEscapes) {
238+
checkState();
239+
this.acceptUnicodeEscapes = acceptUnicodeEscapes;
240+
return this;
241+
}
242+
235243
public Config build() {
236244
checkState();
237245
used = true;
@@ -255,6 +263,11 @@ public Factory getMapFactory() {
255263
public TagHandler getTagHandler(Tag tag) {
256264
return tagHandlers.get(tag);
257265
}
266+
267+
@Override
268+
public boolean unicodeEscapesInStringLiteralsAreAccepted() {
269+
return acceptUnicodeEscapes;
270+
}
258271
};
259272
}
260273

src/main/java/us/bpsm/edn/parser/ScannerImpl.java

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,7 @@
77
import static us.bpsm.edn.parser.Parser.Config.BIG_INTEGER_TAG;
88
import static us.bpsm.edn.parser.Parser.Config.DOUBLE_TAG;
99
import static us.bpsm.edn.parser.Parser.Config.LONG_TAG;
10-
import static us.bpsm.edn.util.CharClassify.isDigit;
11-
import static us.bpsm.edn.util.CharClassify.isWhitespace;
12-
import static us.bpsm.edn.util.CharClassify.separatesTokens;
10+
import static us.bpsm.edn.util.CharClassify.*;
1311

1412
import java.io.IOException;
1513
import java.math.BigDecimal;
@@ -35,6 +33,7 @@ class ScannerImpl implements Scanner {
3533
private final TagHandler bigDecimalHandler;
3634
private final TagHandler bigIntegerHandler;
3735
private final TagHandler doubleHandler;
36+
private final boolean unicodeEscapesInStringLiteralsAreAccepted;
3837

3938
/**
4039
* Scanner may throw an IOException during construction, in which case
@@ -50,6 +49,8 @@ class ScannerImpl implements Scanner {
5049
this.bigIntegerHandler = cfg.getTagHandler(BIG_INTEGER_TAG);
5150
this.doubleHandler = cfg.getTagHandler(DOUBLE_TAG);
5251
this.bigDecimalHandler = cfg.getTagHandler(BIG_DECIMAL_TAG);
52+
this.unicodeEscapesInStringLiteralsAreAccepted =
53+
cfg.unicodeEscapesInStringLiteralsAreAccepted();
5354
}
5455

5556
/* (non-Javadoc)
@@ -377,6 +378,31 @@ private String readStringLiteral(Parseable pbr) throws IOException {
377378
case '\\':
378379
b.append('\\');
379380
break;
381+
case 'u':
382+
if (!unicodeEscapesInStringLiteralsAreAccepted) {
383+
throw new EdnSyntaxException(
384+
"Unsupported '" + ((char) curr)
385+
+ "' escape in string. "
386+
+ "(Unicode escapes disabled by Parser.Config)"
387+
);
388+
}
389+
/*
390+
2020-05-01 Support for reading unicode escapes within
391+
string literals is an extension to EDN. It is not part of
392+
the spec described here: https://github.com/edn-format/edn
393+
*/
394+
int v = 0;
395+
for (int i = 0; i < 4; i++) {
396+
curr = pbr.read();
397+
int d = Character.digit(curr, 16);
398+
if (d == -1) {
399+
throw new EdnSyntaxException(
400+
"Invalid \\u Unicode escape in string.");
401+
}
402+
v = v * 16 + d;
403+
}
404+
b.append((char)v);
405+
break;
380406
default:
381407
throw new EdnSyntaxException("Unsupported '"+ ((char)curr)
382408
+"' escape in string");

src/test/java/us/bpsm/edn/parser/ScannerTest.java

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import org.junit.Test;
1414

1515
import us.bpsm.edn.EdnException;
16+
import us.bpsm.edn.EdnSyntaxException;
1617
import us.bpsm.edn.Keyword;
1718
import us.bpsm.edn.Symbol;
1819
import us.bpsm.edn.parser.Parseable;
@@ -366,6 +367,35 @@ public void unicodeEscapeCharacterLiterals() {
366367
assertEquals(c, s.nextToken(pbr));
367368
}
368369

370+
@Test
371+
public void unicodeEscapesInStringLiterals() {
372+
String txt = "\"" +
373+
"\\" + "u0000" +
374+
"\\" + "u1234" +
375+
"\\" + "u0Ff0" +
376+
"\"";
377+
String expected = "\u0000\u1234\u0Ff0";
378+
assertEquals(3, expected.length());
379+
Parseable pbr = Parsers.newParseable(txt);
380+
Scanner s = scanner();
381+
assertEquals(expected, s.nextToken(pbr));
382+
}
383+
384+
@Test(expected = EdnSyntaxException.class)
385+
public void truncatedUnicodeEscapeInStringLiteral() {
386+
scanner().nextToken(Parsers.newParseable("\"\\" + "u123\""));
387+
}
388+
389+
@Test(expected = EdnSyntaxException.class)
390+
public void truncatedInputInUnicodeEscapeInStringLiteral() {
391+
scanner().nextToken(Parsers.newParseable("\"\\" + "u123"));
392+
}
393+
394+
@Test(expected = EdnSyntaxException.class)
395+
public void nonDigitInUnicodeEscapeInStringLiteral() {
396+
scanner().nextToken(Parsers.newParseable("\"\\" + "u123?\""));
397+
}
398+
369399
@Test
370400
public void simpleStringWithLinebreak() {
371401
assertEquals("\n", scan("\"\n\""));

0 commit comments

Comments
 (0)