Skip to content

Commit

Permalink
Better handling of Java escaped Strings, remove commons-text dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
mikera committed Oct 5, 2024
1 parent 1231dba commit 3c8477b
Show file tree
Hide file tree
Showing 7 changed files with 134 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
import convex.core.lang.reader.antlr.ConvexParser.TagContext;
import convex.core.lang.reader.antlr.ConvexParser.TaggedFormContext;
import convex.core.lang.reader.antlr.ConvexParser.VectorContext;
import convex.core.text.Text;
import convex.core.util.Utils;

public class AntlrReader {
Expand Down Expand Up @@ -446,7 +447,7 @@ public void exitString(StringContext ctx) {
String s=ctx.getStop().getText();
int n=s.length();
s=s.substring(1, n-1); // skip surrounding double quotes
s=ReaderUtils.unescapeString(s);
s=Text.unescapeJava(s);
push(Strings.create(s));
}

Expand Down
13 changes: 0 additions & 13 deletions convex-core/src/main/java/convex/core/lang/reader/ReaderUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

import java.util.HashMap;

import org.apache.commons.text.StringEscapeUtils; // TODO: Kill dependency?

import convex.core.data.ACell;
import convex.core.data.AHashMap;
import convex.core.data.AMap;
Expand Down Expand Up @@ -58,18 +56,7 @@ public static Symbol getQuotingSymbol(String s) {
return quotingSymbols.get(s);
}

/**
* Unescapes a string according to Java rules.
*
* Thin wrapper: the work is delegated entirely to
* StringEscapeUtils.unescapeJava from Apache Commons Text, so the exact set of
* escapes handled is whatever that library implements.
*
* @param s String to unescape
* @return Unescaped string, exactly as returned by StringEscapeUtils.unescapeJava
*/
public static String unescapeString(String s) {
return StringEscapeUtils.unescapeJava(s);
}

/**
* Escapes a string according to Java rules.
*
* Thin wrapper: the work is delegated entirely to
* StringEscapeUtils.escapeJava from Apache Commons Text.
*
* @param s String to escape
* @return Escaped string, exactly as returned by StringEscapeUtils.escapeJava
*/
public static String escapeString(String s) {
return StringEscapeUtils.escapeJava(s);
}

private static final HashMap<String,ACell> specialLiterals=Maps.hashMapOf(
"##NaN",CVMDouble.NaN,
Expand Down
85 changes: 85 additions & 0 deletions convex-core/src/main/java/convex/core/text/Text.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
import convex.core.Coin;
import convex.core.data.prim.CVMDouble;
import convex.core.data.util.BlobBuilder;
import convex.core.exceptions.TODOException;
import convex.core.util.Utils;

public class Text {
private static final int WHITESPACE_LENGTH = 32;
Expand Down Expand Up @@ -191,7 +193,90 @@ public static String repeat(char c, int count) {
Arrays.fill(cs,c);
return new String(cs);
}

/**
 * Escapes a String using Java string-literal escape sequences.
 *
 * The following characters are replaced:
 * - backslash and double quote are prefixed with a backslash
 * - backspace, form feed, newline, carriage return and tab become their
 *   two-character escapes (b, f, n, r, t after a backslash)
 * - any other char below 32 or above 0x7f becomes a Unicode escape: a backslash,
 *   the letter u, and exactly four lowercase hex digits. Chars are processed
 *   individually, so a supplementary code point becomes two such escapes
 *   (one per surrogate).
 *
 * Single quotes and all other printable ASCII characters are left untouched.
 * Every sequence produced here is one that unescapeJava recognises, so
 * unescapeJava(escapeJava(s)) reproduces s.
 *
 * @param s String to escape (must not be null)
 * @return Escaped string; the same instance as s if nothing needed escaping
 */
public static String escapeJava(String s) {
	int n=s.length();
	StringBuilder sb=null; // allocated lazily, only once something needs escaping

	for (int i=0; i<n; i++) {
		char c=s.charAt(i);

		// Short (two character) escape for c, if one exists
		String shortEscape;
		switch (c) {
		case '\\': shortEscape="\\\\"; break;
		case '"':  shortEscape="\\\""; break;
		case '\b': shortEscape="\\b"; break;
		case '\f': shortEscape="\\f"; break;
		case '\n': shortEscape="\\n"; break;
		case '\r': shortEscape="\\r"; break;
		case '\t': shortEscape="\\t"; break;
		default:   shortEscape=null;
		}

		// Remaining control chars and everything beyond 0x7f need a hex escape
		boolean needsHex=(shortEscape==null)&&((c<32)||(c>0x7f));

		if ((shortEscape==null)&&(!needsHex)) {
			// Plain char: only copy it if we have already started building output
			if (sb!=null) sb.append(c);
			continue;
		}

		// First escaped char: copy the untouched prefix
		if (sb==null) sb=new StringBuilder(n+16).append(s, 0, i);

		if (needsHex) {
			sb.append('\\').append('u');
			for (int shift=12; shift>=0; shift-=4) {
				sb.append(Character.forDigit((c>>shift)&0xf, 16));
			}
		} else {
			sb.append(shortEscape);
		}
	}
	return (sb==null)?s:sb.toString();
}

/**
 * Unescapes a string containing Java-style escape sequences.
 *
 * Recognised sequences (each introduced by a backslash):
 * - single character escapes: backslash, b, f, n, r, t, double quote, single quote
 * - octal escapes of one to three octal digits. As in the JLS the maximum value
 *   is octal 377, so a third digit is only consumed when the first digit is 0-3
 *   (e.g. backslash-4-1-0 is the char with octal value 41 followed by a literal '0')
 * - Unicode escapes: the letter u followed by exactly four hex digits
 *
 * A lone backslash at the very end of the input is kept as a literal backslash.
 *
 * @param st String to unescape (must not be null)
 * @return Unescaped string, or null if not a valid Java String, i.e. if it contains
 *         a malformed Unicode escape or a backslash followed by a character that
 *         does not start any recognised escape
 */
public static String unescapeJava(String st) {
	StringBuilder sb = null; // allocated lazily on the first escape, so escape-free input is returned as-is
	int n=st.length();

	for (int i = 0; i < n; i++) {
		char ch = st.charAt(i);
		if ((ch != '\\')||(i+1>=n)) {
			// Ordinary character, or a lone trailing backslash which is kept literally
			if (sb!=null) sb.append(ch);
			continue;
		}

		// An escape starts at i: copy the untouched prefix the first time this happens
		if (sb==null) sb=new StringBuilder(n).append(st, 0, i);
		char nextChar = st.charAt(i + 1);

		// Octal escape: 1-3 digits, consumed greedily
		int code=Utils.octalVal(nextChar);
		if (code>=0) {
			// Fixed upper bound computed before any index is advanced, so that at most
			// two further digits (positions i+2 and i+3) can ever be consumed
			int end=Math.min(n, i+4);
			int j=i+2;
			// code>=32 means another digit would exceed the JLS maximum of octal 377
			while ((j<end)&&(code<32)) {
				int v=Utils.octalVal(st.charAt(j));
				if (v<0) break; // no more octal digits
				code=code*8+v;
				j++;
			}
			sb.append((char) code);
			i=j-1; // loop increment moves on to the first unconsumed char
			continue;
		}

		// Hex Unicode escape: u followed by exactly four hex digits
		if (nextChar=='u') {
			if (i+6 > n) {
				return null; // insufficient chars for unicode
			}
			int cp=0;
			for (int j=i+2; j<i+6; j++) {
				int v=Utils.hexVal(st.charAt(j));
				if (v<0) { // not a hex value
					return null;
				}
				cp=cp*16+v;
			}
			sb.append((char) cp); // four hex digits always fit in a single char
			i += 5; // skip extra 5 chars on top of loop increment
			continue;
		}

		// Single character escapes
		switch (nextChar) {
		case '\\':
			ch = '\\';
			break;
		case 'b':
			ch = '\b';
			break;
		case 'f':
			ch = '\f';
			break;
		case 'n':
			ch = '\n';
			break;
		case 'r':
			ch = '\r';
			break;
		case 't':
			ch = '\t';
			break;
		case '\"':
			ch = '\"';
			break;
		case '\'':
			ch = '\'';
			break;
		default:
			// Unknown escape: report as invalid rather than silently dropping nextChar
			return null;
		}
		sb.append(ch);
		i++; // skip a char, since we consumed nextChar
	}
	return (sb==null)?st:sb.toString();
}

}
16 changes: 15 additions & 1 deletion convex-core/src/main/java/convex/core/util/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ public static BigInteger hexToBigInt(String hex) {
/**
* Gets the value of a single hex character e.g. hexVal('c') => 12
*
* @param c Character representing a hex digit
* @param c Character possibly representing a hex digit
* @return int in the range 0..15 inclusive, or -1 if not a hex char
*/
public static int hexVal(char c) {
Expand All @@ -303,6 +303,20 @@ public static int hexVal(char c) {

return -1;
}

/**
 * Converts a single character to its value as an octal digit, e.g. octalVal('6') => 6
 *
 * @param c Character that may or may not be an octal digit
 * @return Digit value in the range 0..7 inclusive, or -1 if c is not one of '0'..'7'
 */
public static int octalVal(char c) {
	boolean isOctalDigit = (c >= '0') && (c <= '7');
	return isOctalDigit ? (c - '0') : -1;
}

/**
* Converts a byte array of length N to a hex string of length 2N
Expand Down
1 change: 0 additions & 1 deletion convex-core/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
exports convex.dlfs;

requires transitive org.antlr.antlr4.runtime;
requires org.apache.commons.text;
requires org.bouncycastle.pkix;
requires transitive org.bouncycastle.provider;
requires org.bouncycastle.util;
Expand Down
13 changes: 12 additions & 1 deletion convex-core/src/test/java/convex/core/lang/ReaderTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import convex.core.data.prim.CVMDouble;
import convex.core.data.prim.CVMLong;
import convex.core.exceptions.ParseException;
import convex.core.text.Text;
import convex.test.Samples;

/**
Expand Down Expand Up @@ -243,7 +244,6 @@ public void testStrings() {

// Multi-line String
assertEquals(Strings.create("\n"), Reader.read("\"\n\""));

}

@Test
Expand Down Expand Up @@ -412,6 +412,17 @@ public void doIdempotencyTest(ACell cell) {
doReadPrintTest("^{} 0xa89e59cc8ab9fc6a13785a37938c85b306b24663415effc01063a6e25ef52ebcd3647d3a77e0a33908a372146fdccab6");
}

/**
* Test cases for strings with Java escapes.
*
* The single case here passes the five characters backslash, 4, 1, 0, backslash
* to Text.unescapeJava (via doEscapeTest) and expects the three characters
* '!', '0', backslash. That covers two edge cases at once: an octal escape that
* must stop after two digits (octal 41 is '!', the '0' stays literal) and a
* trailing lone backslash that is expected to be kept as-is.
*/
@Test public void testJavaEscapes() {
doEscapeTest("!0\\","\\410\\");
}

/**
 * Asserts that unescaping the given text with Text.unescapeJava yields the expected String
 * @param raw Expected unescaped result
 * @param escaped Escaped input passed to Text.unescapeJava
 */
private void doEscapeTest(String raw, String escaped) {
	String unescaped=Text.unescapeJava(escaped);
	assertEquals(raw,unescaped);
}

/**
* Test cases that should read and print identically
*/
Expand Down
20 changes: 20 additions & 0 deletions convex-core/src/test/java/convex/core/text/TextTest.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package convex.core.text;

import static org.junit.Assert.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.text.ParseException;

Expand All @@ -21,6 +22,25 @@ public void testWhiteSpace() {
checkWhiteSpace(97);
checkWhiteSpace(100);
}

/**
 * Checks Text.unescapeJava against plain text, Unicode escapes (valid and
 * malformed) and octal escapes. Each table row is { expected result, escaped input }.
 */
@Test
public void testUnescapeJava() {
	// Plain text, a lone backslash, and well-formed Unicode escapes
	String[][] plainAndUnicode = {
		{"foo", "foo"},
		{"\\", "\\"},
		{"zzAzzB", "zz\\u0041zz\\u0042"},
	};
	for (String[] row : plainAndUnicode) {
		assertEquals(row[0], Text.unescapeJava(row[1]));
	}

	// Malformed Unicode escapes must be rejected with null
	String[] badUnicode = {
		"\\u",      // no hex digits at all
		"\\u0x0x",  // not valid unicode
		"\\u012",   // not valid unicode (only 3 chars)
	};
	for (String bad : badUnicode) {
		assertNull(Text.unescapeJava(bad));
	}

	// octal escapes
	String[][] octal = {
		{"a%b", "a\\45b"},
		{"\19", "\\19"},
		{"\0\0", "\\0\\0"},
		{"\1\11\111", "\\1\\11\\111"},
		{"!0", "\\410"}, // Yeah, JLS has max octal value of 377. Don't ask why...
	};
	for (String[] row : octal) {
		assertEquals(row[0], Text.unescapeJava(row[1]));
	}
}

private void checkWhiteSpace(int len) {
String s = Text.whiteSpace(len);
Expand Down

0 comments on commit 3c8477b

Please sign in to comment.