59: add support for \uXXXX escapes within string literals

bpsm · bpsm · commit 114ca4e29880 · 2020-05-01T18:18:24.000+02:00
This is in response to edn-format/edn#65. This is an extension, as string literals as currently documented do not specify support for \uXXXX escapes. https://github.com/edn-format/edn/tree/a51127aecd318096667ae0dafa25353ecb07c9c3 Syntax Notes: - Unicode escape must begin with "\u". This is case sensitive: "\U" will be rejected. - "\u" must be followed by exactly four hex digits taken from this set: 0 1 2 3 4 5 6 7 8 9 a b c d e f A B C D E F - The digits are not case sensitive. - Each such Unicode escape encodes a single 16-bit Java char. Since Java uses UTF-16 internally (for historical reasons), code points beyond the basic multilingual plane must be written as a pair of unicode escapes (see also "surrogate pairs"). Disabling: By default \uXXXX escapes are now supported in String literals. Parser.Config (and Parser.Config.Builder) now support a flag which can be set to false to disable support for \uXXXX in string literals. This restores the old behavior of throwing an EdnSyntaxException when such escapes are encountered.
diff --git a/src/main/java/us/bpsm/edn/parser/Parser.java b/src/main/java/us/bpsm/edn/parser/Parser.java
@@ -248,6 +248,25 @@ public interface Config {
          */
         public TagHandler getTagHandler(Tag tag);
 
+        /**
+         * When true, the parser will accept ∖uXXXX escape sequences in string
+         * literals and replace them with the corresponding java char in the
+         * parsed string. When false, such escape sequences cause an
+         * {@link EdnSyntaxException} to be thrown.
+         * <p>
+         * The default is true, which is not in strict accordance with the
+         * letter of edn-format/README, but:
+         * <ul>
+         * <li>Clojure's own edn reader behaves in this way.</li>
+         * <li>Character literals do allow this syntax according to
+         *     edn-format/README</li>
+         * </ul>
+         * @return true if unicode escapes in string literals are accepted
+         */
+        public default boolean unicodeEscapesInStringLiteralsAreAccepted() {
+            return true;
+        }
+
         /**
          * This Builder is used to create a {@link Parser.Config}.
          * Fresh Builder instances are provided by
@@ -326,6 +345,16 @@ public interface Builder {
              */
             public Builder putTagHandler(Tag tag, TagHandler handler);
 
+            /**
+             * Toggle the Parser's willingness to accept unicode escapes
+             * in string literals. By default unicode escapes are accepted.
+             * @param acceptUnicodeEscapes true to accept, false to reject
+             * @see Config#unicodeEscapesInStringLiteralsAreAccepted()
+             */
+            public Builder acceptUnicodeEscapesInStringLiterals(
+              boolean acceptUnicodeEscapes
+            );
+
             /**
              * Build and return the {@link Config} described by the
              * sequence of calls made on this Builder. Calling
diff --git a/src/main/java/us/bpsm/edn/parser/Parsers.java b/src/main/java/us/bpsm/edn/parser/Parsers.java
@@ -196,6 +196,7 @@ public void unread(int ch) throws IOException {
     public static Builder newParserConfigBuilder() {
         return new Builder() {
             boolean used = false;
+            boolean acceptUnicodeEscapes = true;
             CollectionBuilder.Factory listFactory = DEFAULT_LIST_FACTORY;
             CollectionBuilder.Factory vectorFactory = DEFAULT_VECTOR_FACTORY;
             CollectionBuilder.Factory setFactory = DEFAULT_SET_FACTORY;
@@ -232,6 +233,13 @@ public Builder putTagHandler(Tag tag, TagHandler handler) {
                 return this;
             }
 
+            @Override
+            public Builder acceptUnicodeEscapesInStringLiterals(boolean acceptUnicodeEscapes) {
+                checkState(); // NOTE(review): presumably rejects use after build() has set 'used' - confirm
+                this.acceptUnicodeEscapes = acceptUnicodeEscapes; // returned by the built Config's getter
+                return this; // same Builder, so calls can be chained
+            }
+
             public Config build() {
                 checkState();
                 used = true;
@@ -255,6 +263,11 @@ public Factory getMapFactory() {
                     public TagHandler getTagHandler(Tag tag) {
                         return tagHandlers.get(tag);
                     }
+
+                    @Override
+                    public boolean unicodeEscapesInStringLiteralsAreAccepted() {
+                        return acceptUnicodeEscapes; // the enclosing Builder's flag, initialised to true
+                    }
                 };
             }
 
diff --git a/src/main/java/us/bpsm/edn/parser/ScannerImpl.java b/src/main/java/us/bpsm/edn/parser/ScannerImpl.java
@@ -7,9 +7,7 @@
 import static us.bpsm.edn.parser.Parser.Config.BIG_INTEGER_TAG;
 import static us.bpsm.edn.parser.Parser.Config.DOUBLE_TAG;
 import static us.bpsm.edn.parser.Parser.Config.LONG_TAG;
-import static us.bpsm.edn.util.CharClassify.isDigit;
-import static us.bpsm.edn.util.CharClassify.isWhitespace;
-import static us.bpsm.edn.util.CharClassify.separatesTokens;
+import static us.bpsm.edn.util.CharClassify.*;
 
 import java.io.IOException;
 import java.math.BigDecimal;
@@ -35,6 +33,7 @@ class ScannerImpl implements Scanner {
     private final TagHandler bigDecimalHandler;
     private final TagHandler bigIntegerHandler;
     private final TagHandler doubleHandler;
+    private final boolean unicodeEscapesInStringLiteralsAreAccepted;
 
     /**
      * Scanner may throw an IOException during construction, in which case
@@ -50,6 +49,8 @@ class ScannerImpl implements Scanner {
         this.bigIntegerHandler = cfg.getTagHandler(BIG_INTEGER_TAG);
         this.doubleHandler = cfg.getTagHandler(DOUBLE_TAG);
         this.bigDecimalHandler = cfg.getTagHandler(BIG_DECIMAL_TAG);
+        this.unicodeEscapesInStringLiteralsAreAccepted =
+          cfg.unicodeEscapesInStringLiteralsAreAccepted();
     }
 
     /* (non-Javadoc)
@@ -377,6 +378,31 @@ private String readStringLiteral(Parseable pbr) throws IOException {
                 case '\\':
                     b.append('\\');
                     break;
+                case 'u':
+                    if (!unicodeEscapesInStringLiteralsAreAccepted) {
+                        throw new EdnSyntaxException(
+                          "Unsupported '" + ((char) curr)
+                            + "' escape in string. "
+                            + "(Unicode escapes disabled by Parser.Config)"
+                        );
+                    }
+                    /*
+                    2020-05-01 Support for reading unicode escapes within
+                    string literals is an extension to EDN. It is not part of
+                    the spec described here: https://github.com/edn-format/edn
+                    */
+                    int v = 0;
+                    for (int i = 0; i < 4; i++) {
+                        curr = pbr.read();
+                        int d = Character.digit(curr, 16);
+                        if (d == -1) {
+                            throw new EdnSyntaxException(
+                              "Invalid \\u Unicode escape in string.");
+                        }
+                        v = v * 16 + d;
+                    }
+                    b.append((char)v);
+                    break;
                 default:
                     throw new EdnSyntaxException("Unsupported '"+ ((char)curr)
                             +"' escape in string");
diff --git a/src/test/java/us/bpsm/edn/parser/ScannerTest.java b/src/test/java/us/bpsm/edn/parser/ScannerTest.java
@@ -13,6 +13,7 @@
 import org.junit.Test;
 
 import us.bpsm.edn.EdnException;
+import us.bpsm.edn.EdnSyntaxException;
 import us.bpsm.edn.Keyword;
 import us.bpsm.edn.Symbol;
 import us.bpsm.edn.parser.Parseable;
@@ -366,6 +367,35 @@ public void unicodeEscapeCharacterLiterals() {
             assertEquals(c, s.nextToken(pbr));
     }
 
+    @Test
+    public void unicodeEscapesInStringLiterals() {
+        String txt = "\"" + // opening quote of the EDN string literal under test
+          "\\" + "u0000" +
+          "\\" + "u1234" +
+          "\\" + "u0Ff0" + // mixed-case hex digits
+          "\"";
+        String expected = "\u0000\u1234\u0Ff0"; // the same three chars, written with Java's own escapes
+        assertEquals(3, expected.length()); // one char per escape
+        Parseable pbr = Parsers.newParseable(txt);
+        Scanner s = scanner();
+        assertEquals(expected, s.nextToken(pbr));
+    }
+
+    @Test(expected = EdnSyntaxException.class)
+    public void truncatedUnicodeEscapeInStringLiteral() {
+        scanner().nextToken(Parsers.newParseable("\"\\" + "u123\"")); // only 3 hex digits before the closing quote
+    }
+
+    @Test(expected = EdnSyntaxException.class)
+    public void truncatedInputInUnicodeEscapeInStringLiteral() {
+        scanner().nextToken(Parsers.newParseable("\"\\" + "u123")); // input ends after 3 hex digits, no closing quote
+    }
+
+    @Test(expected = EdnSyntaxException.class)
+    public void nonDigitInUnicodeEscapeInStringLiteral() {
+        scanner().nextToken(Parsers.newParseable("\"\\" + "u123?\"")); // 4th character is '?', not a hex digit
+    }
+
     @Test
     public void simpleStringWithLinebreak() {
         assertEquals("\n", scan("\"\n\""));