billhails · billhails · Jun 30, 2024 · Jun 29, 2024 · Jun 29, 2024 · Jun 30, 2024
diff --git a/docs/TODO.md b/docs/TODO.md
@@ -31,7 +31,6 @@ More of a wish-list than a hard and fast plan.
    * Combining characters need special handling.
 * Much better error reporting.
 * Error recovery.
-* Fail on non-exhaustive pattern match (optional).
 * User definable operators.
    * With precedence and associativity.
    * `infix 55 left >>= fn(l, r) { ... }`
@@ -42,4 +41,3 @@ More of a wish-list than a hard and fast plan.
    * `alias some = maybe.some;`
    * `alias string = list(char);`
 * (internal) have a NEWZ variant of NEW that bzero's its result.
-* (internal) replace the CEKF support code with generated code.
diff --git a/docs/UNICODE.md b/docs/UNICODE.md
@@ -0,0 +1,56 @@
+# Plan for Unicode Support
+
+Keep it simple, UTF8 only
+
+The Value type has a type Character but this can be amended to a
+`wchar_t`.
+Although we stipulate UTF8 we should store characters in Values
+as Unicode code points.
+However the C strings used in e.g. symbol names by the compiler
+will remain UTF8 and so do not require a `wchar_t` modification.
+
+Any future character input / reader implementation must be able
+to decode UTF8.
+
+Strings and characters should support a `\uxxxx;` escape allowing
+unicode (hex) code points, as well as directly embedded UTF8.
+
+We should not try too hard to categorize unicode characters, though
+perhaps it might be possible to segregate punctuation from other
+classes.
+
+## Notes on UTF-8
+
+| First   | Last    | Byte 1   | Byte 2   | Byte 3   | Byte 4   | Bits |
+|---------|---------|----------|----------|----------|----------|------|
+| 0x00    | 0x7F    | 0xxxxxxx |          |          |          | 7    |
+| 0x80    | 0x7FF   | 110xxxxx | 10xxxxxx |          |          | 11   |
+| 0x800   | 0xFFFF  | 1110xxxx | 10xxxxxx | 10xxxxxx |          | 16   |
+| 0x10000 | 0x10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx | 21   |
+
+## UTF-8 Regexes
+
+Generated using [Unicode.hs](https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html)
+
+```
+0x80 - 0x9F Control characters
+\xC2[\x80-\x9F]
+
+0xA0 - 0xD7FF Normal characters
+\xC2[\xA0-\xBF]|[\xC3-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|\xED[\x80-\x9F]|[\xE1-\xEC][\x80-\xBF])[\x80-\xBF]
+
+0xD800 - 0xDFFF Reserved
+\xED[\xA0-\xBF][\x80-\xBF]
+
+0xE000 - 0xFFFD Normal Characters
+\xEF\xBF[\x80-\xBD]|(\xEF[\x80-\xBE]|\xEE[\x80-\xBF])[\x80-\xBF]
+
+0xFFFE 0xFFFF Reserved
+\xEF\xBF[\xBE-\xBF]
+
+0x10000 - 0x10FFFF Normal Characters
+(\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]
+
+0x80 - 0x10FFFF All high-bit Unicode
+[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]|(\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]
+```
diff --git a/fn/print.fn b/fn/print.fn
@@ -1 +1 @@
-print("Hello, World!")
+print("Hello, WorldΨ")
diff --git a/src/anf_normalize.c b/src/anf_normalize.c
@@ -41,7 +41,7 @@ static Exp *normalizeNamespaces(LamNamespaceArray *nsArray, Exp *tail);
 static Exp *normalizeVar(HashSymbol *var, Exp *tail);
 static Exp *normalizeMaybeBigInteger(MaybeBigInt *integer, Exp *tail);
 static Exp *normalizeStdInteger(int integer, Exp *tail);
-static Exp *normalizeCharacter(char character, Exp *tail);
+static Exp *normalizeCharacter(Character character, Exp *tail);
 static Exp *normalizeUnary(LamUnaryApp *app, Exp *tail);
 static Exp *normalizeAnd(LamAnd *app, Exp *tail);
 static Exp *normalizeOr(LamOr *app, Exp *tail);
@@ -60,7 +60,7 @@ static AexpPrimOp mapPrimOp(LamPrimOp op);
 static Aexp *aexpNormalizeVar(HashSymbol *var);
 static Aexp *aexpNormalizeMaybeBigInteger(MaybeBigInt *integer);
 static Aexp *aexpNormalizeStdInteger(int integer);
-static Aexp *aexpNormalizeCharacter(char character);
+static Aexp *aexpNormalizeCharacter(Character character);
 static Aexp *aexpNormalizeLam(LamLam *lamLam);
 static AexpNamespaceArray *aexpNormalizeNamespaces(LamNamespaceArray *nsArray);
 static AexpVarList *convertVarList(LamVarList *args);
@@ -648,7 +648,7 @@ static Exp *normalizeVar(HashSymbol *var, Exp *tail) {
     return exp;
 }
 
-static Exp *normalizeCharacter(char character, Exp *tail) {
+static Exp *normalizeCharacter(Character character, Exp *tail) {
     ENTER(normalizeCharacter);
     if (tail != NULL) {
         LEAVE(normalizeCharacter);
@@ -871,7 +871,7 @@ static Aexp *aexpNormalizeStdInteger(int integer) {
     return newAexp_Littleinteger(integer);
 }
 
-static Aexp *aexpNormalizeCharacter(char character) {
+static Aexp *aexpNormalizeCharacter(Character character) {
     return newAexp_Character(character);
 }
 

diff --git a/src/bytecode.c b/src/bytecode.c
@@ -26,6 +26,7 @@
 #include "bytecode.h"
 #include "debug.h"
 #include "common.h"
+#include "utf8.h"
 
 #ifdef DEBUG_BYTECODE
 #  include "debugging_on.h"
@@ -68,17 +69,23 @@ char *charRep(Character c) {
         case '\0':
             return "\\0";
         default: {
-            static char buf[8];
+            static unsigned char buf[8];
+#ifdef CHARACTER_IS_CHAR
             sprintf(buf, "%c", c);
-            return buf;
+#else
+            unsigned char *ptr = writeChar(buf, c);
+            *ptr = 0;
+#endif
+            return (char *) buf;
         }
     }
 }
 
 static void addCharacter(ByteCodeArray *b, Character code) {
     DEBUG("%04lx addCharacter %02x", b->size, code);
     reserve(b, sizeof(Character));
-    b->entries[b->size++] = code;
+    memcpy(&b->entries[b->size], &code, sizeof(Character));
+    b->size += sizeof(Character);
 }
 
 static void addByte(ByteCodeArray *b, int code) {

diff --git a/src/bytecode.h b/src/bytecode.h
@@ -53,7 +53,10 @@ static inline Byte readByte(ByteCodeArray *b, Control *i) {
 }
 
 static inline Character readCharacter(ByteCodeArray *b, Control *i) {
-    return b->entries[(*i)++];
+    Character c;
+    memcpy(&c, &b->entries[*i], sizeof(Character));
+    (*i) += sizeof(Character);
+    return c;
 }
 
 static inline void _readWord(ByteCodeArray *b, Control *i, Word *a) {

diff --git a/src/debug.c b/src/debug.c
@@ -265,7 +265,7 @@ void dumpByteCode(ByteCodeArray *bca) {
             }
             break;
             case BYTECODES_TYPE_CHAR:{
-                    char c = readByte(bca, &i);
+                    Character c = readCharacter(bca, &i);
                     eprintf("CHAR [%s]\n", charRep(c));
                 }
                 break;

diff --git a/src/lexer.l b/src/lexer.l
@@ -1,6 +1,7 @@
 %{
 #include "parser.h"
 #include "module.h"
+#include "utf8.h"
 
 void parseCS(char *text, char delimeter);
 %}
@@ -12,7 +13,37 @@ void parseCS(char *text, char delimeter);
 %option noyywrap
 %option extra-type="struct PmModule *"
 
-ID [a-zA-Z_][a-zA-Z_0-9]*
+/* 0x80 - 0x9F Control characters */
+UNICODE_CONTROL \xC2[\x80-\x9F]
+
+/* 0xA0 - 0xD7FF Normal characters */
+UNICODE_CHARACTERS_1 \xC2[\xA0-\xBF]|[\xC3-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|\xED[\x80-\x9F]|[\xE1-\xEC][\x80-\xBF])[\x80-\xBF]
+
+/* 0xD800 - 0xDFFF Reserved */
+UNICODE_RESERVED_1 \xED[\xA0-\xBF][\x80-\xBF]
+
+/* 0xE000 - 0xFFFD Normal Characters */
+UNICODE_CHARACTERS_2 \xEF\xBF[\x80-\xBD]|(\xEF[\x80-\xBE]|\xEE[\x80-\xBF])[\x80-\xBF]
+
+/* 0xFFFE 0xFFFF Reserved */
+UNICODE_RESERVED_2 \xEF\xBF[\xBE-\xBF]
+
+/* 0x10000 - 0x10FFFF Normal Characters */
+UNICODE_CHARACTERS_3 (\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]
+
+/* 0x80 - 0x10FFFF All high-bit Unicode */
+UNICODE_ALL [\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]|(\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]
+
+UNICODE_RESERVED {UNICODE_RESERVED_1}|{UNICODE_RESERVED_2}
+
+UNICODE_CHARACTERS {UNICODE_CHARACTERS_1}|{UNICODE_CHARACTERS_2}|{UNICODE_CHARACTERS_3}
+
+ALPHA [a-zA-Z_]
+ALNUM [a-zA-Z_0-9]
+
+ID ({ALPHA}|{UNICODE_CHARACTERS})({ALNUM}|{UNICODE_CHARACTERS})*
+
+CODEPOINT {UNICODE_ALL}|.
 
 %%
 
@@ -21,9 +52,12 @@ struct PmModule *mod = yyextra;
 %}
 
 [ \t]+ {}
+{UNICODE_CONTROL} {}
 [\n] { incLineNo(mod); }
 \/\/.* { }
 
+{UNICODE_RESERVED} { cant_happen("invalid unicode character detected"); }
+
 [0-9]*\.[0-9]+ { yylval->s = yytext; return IRRATIONAL; }
 [0-9]*\.[0-9]+i { yylval->s = yytext; return IRRATIONAL_I; }
 [0-9]+         { yylval->s = yytext; return NUMBER; }
@@ -54,8 +88,10 @@ struct PmModule *mod = yyextra;
 "assert"        { return ASSERT; }
 "alias"         { return ALIAS; }
 
-\"((\\.)|[^"])*\" { parseCS(yytext, '"'); yylval->s = yytext; return STRING; }
-\'((\\.)|[^'])\'  { parseCS(yytext, '\''); yylval->c = *yytext; return CHAR; }
+\"((\\{CODEPOINT})|[^"])*\" { parseCS(yytext, '"'); yylval->s = yytext; return STRING; }
+\'((\\{CODEPOINT})|[^'])\'  { parseCS(yytext, '\''); utf8_to_unicode_char(&yylval->c, yytext); return CHAR; }
+\'({UNICODE_ALL}|[^'])\'    { parseCS(yytext, '\''); utf8_to_unicode_char(&yylval->c, yytext); return CHAR; }
+\'\\u[0-9a-fA-F]+;\'        { parseCS(yytext, '\''); utf8_to_unicode_char(&yylval->c, yytext); return CHAR; }
 
 "->" { return ARROW; }
 "==" { return EQ; }
@@ -94,6 +130,108 @@ void parseCS(char *text, char delimiter) {
                 case 't':
                     *follow = '\t';
                     break;
+                case 'u': {
+                    *lead++;
+                    Character unicode = 0;
+                    bool finished = false;
+                    while (!finished) {
+                        switch (*lead) {
+                            case '0':
+                                unicode <<= 4;
+                                lead++;
+                                break;
+                            case '1':
+                                unicode <<= 4;
+                                unicode |= 0x01;
+                                lead++;
+                                break;
+                            case '2':
+                                unicode <<= 4;
+                                unicode |= 0x02;
+                                lead++;
+                                break;
+                            case '3':
+                                unicode <<= 4;
+                                unicode |= 0x03;
+                                lead++;
+                                break;
+                            case '4':
+                                unicode <<= 4;
+                                unicode |= 0x04;
+                                lead++;
+                                break;
+                            case '5':
+                                unicode <<= 4;
+                                unicode |= 0x05;
+                                lead++;
+                                break;
+                            case '6':
+                                unicode <<= 4;
+                                unicode |= 0x06;
+                                lead++;
+                                break;
+                            case '7':
+                                unicode <<= 4;
+                                unicode |= 0x07;
+                                lead++;
+                                break;
+                            case '8':
+                                unicode <<= 4;
+                                unicode |= 0x08;
+                                lead++;
+                                break;
+                            case '9':
+                                unicode <<= 4;
+                                unicode |= 0x09;
+                                lead++;
+                                break;
+                            case 'a':
+                            case 'A':
+                                unicode <<= 4;
+                                unicode |= 0x0A;
+                                lead++;
+                                break;
+                            case 'b':
+                            case 'B':
+                                unicode <<= 4;
+                                unicode |= 0x0B;
+                                lead++;
+                                break;
+                            case 'c':
+                            case 'C':
+                                unicode <<= 4;
+                                unicode |= 0x0C;
+                                lead++;
+                                break;
+                            case 'd':
+                            case 'D':
+                                unicode <<= 4;
+                                unicode |= 0x0D;
+                                lead++;
+                                break;
+                            case 'e':
+                            case 'E':
+                                unicode <<= 4;
+                                unicode |= 0x0E;
+                                lead++;
+                                break;
+                            case 'f':
+                            case 'F':
+                                unicode <<= 4;
+                                unicode |= 0x0F;
+                                lead++;
+                                break;
+                            case ';':
+                                finished = true;
+                                break;
+                            default:
+                                cant_happen("error parsing unicode escape");
+                        }
+                    }
+                    follow = writeChar(follow, unicode);
+                    follow--;
+                }
+                break;
                 default:
                     *follow = *lead;
             }

diff --git a/src/parser.y b/src/parser.y
@@ -23,6 +23,7 @@
 #include "lexer.h"
 #include "types.h"
 #include "print_generator.h"
+#include "utf8.h"
 
 AstStringArray *include_paths = NULL;
 
@@ -154,8 +155,14 @@ static MaybeBigInt *makeMaybeBigInt(char *digits, bool imag) {
 
 static AstCharArray *appendCharArray(AstCharArray *res, char *str) {
     while (*str) {
+#ifdef CHARACTER_IS_CHAR
         pushAstCharArray(res, *str);
         str++;
+#else
+        Character unicode;
+        str = utf8_to_unicode_char(&unicode, str);
+        pushAstCharArray(res, unicode);
+#endif
     }
     return res;
 }
@@ -304,7 +311,7 @@ static AstArg *makeAstLookupArg(PmModule *mod, HashSymbol *nsName, HashSymbol *s
 
 %union {
     char *s;
-    char c;
+    Character c;
     bool b;
     MaybeBigInt *bi;
     AstArg *arg;