Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Basic Unicode Support #90

Merged
merged 5 commits into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions docs/TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ More of a wish-list than a hard and fast plan.
* Combining characters need special handling.
* Much better error reporting.
* Error recovery.
* Fail on non-exhaustive pattern match (optional).
* User definable operators.
* With precedence and associativity.
* `infix 55 left >>= fn(l, r) { ... }`
Expand All @@ -42,4 +41,3 @@ More of a wish-list than a hard and fast plan.
* `alias some = maybe.some;`
* `alias string = list(char);`
* (internal) have a NEWZ variant of NEW that bzero's its result.
* (internal) replace the CEKF support code with generated code.
56 changes: 56 additions & 0 deletions docs/UNICODE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Plan for Unicode Support

Keep it simple, UTF8 only

The Value type has a type Character but this can be amended to a
`wchar_t`.
Although we stipulate UTF8 we should store characters in Values
as Unicode code points.
However, the C strings the compiler uses for e.g. symbol names
will remain UTF8 and so do not require a `wchar_t` modification.

Any future character input / reader implementation must be able
to decode UTF8.

Strings and characters should support a `\uxxxx;` escape allowing
unicode (hex) code points, as well as directly embedded UTF8.

We should not try too hard to categorize unicode characters, though
perhaps it might be possible to segregate punctuation from other
classes.

## Notes on UTF-8

| First | Last | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Bits |
|---------|---------|----------|----------|----------|----------|------|
| 0x00 | 0x7F | 0xxxxxxx | | | | 7 |
| 0x80 | 0x7FF | 110xxxxx | 10xxxxxx | | | 11 |
| 0x800 | 0xFFFF | 1110xxxx | 10xxxxxx | 10xxxxxx | | 16 |
| 0x10000 | 0x10FFFF | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx | 21 |

## UTF-8 Regexes

Generated using [Unicode.hs](https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html)

```
0x80 - 0x9F Control characters
\xC2[\x80-\x9F]

0xA0 - 0xD7FF Normal characters
\xC2[\xA0-\xBF]|[\xC3-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|\xED[\x80-\x9F]|[\xE1-\xEC][\x80-\xBF])[\x80-\xBF]

0xD800 - 0xDFFF Reserved
\xED[\xA0-\xBF][\x80-\xBF]

0xE000 - 0xFFFD Normal Characters
\xEF\xBF[\x80-\xBD]|(\xEF[\x80-\xBE]|\xEE[\x80-\xBF])[\x80-\xBF]

0xFFFE 0xFFFF Reserved
\xEF\xBF[\xBE-\xBF]

0x10000 - 0x10FFFF Normal Characters
(\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]

0x80 - 0x10FFFF All high-bit Unicode
[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]|(\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]
```
2 changes: 1 addition & 1 deletion fn/print.fn
Original file line number Diff line number Diff line change
@@ -1 +1 @@
print("Hello, World!")
print("Hello, WorldΨ")
8 changes: 4 additions & 4 deletions src/anf_normalize.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ static Exp *normalizeNamespaces(LamNamespaceArray *nsArray, Exp *tail);
static Exp *normalizeVar(HashSymbol *var, Exp *tail);
static Exp *normalizeMaybeBigInteger(MaybeBigInt *integer, Exp *tail);
static Exp *normalizeStdInteger(int integer, Exp *tail);
static Exp *normalizeCharacter(char character, Exp *tail);
static Exp *normalizeCharacter(Character character, Exp *tail);
static Exp *normalizeUnary(LamUnaryApp *app, Exp *tail);
static Exp *normalizeAnd(LamAnd *app, Exp *tail);
static Exp *normalizeOr(LamOr *app, Exp *tail);
Expand All @@ -60,7 +60,7 @@ static AexpPrimOp mapPrimOp(LamPrimOp op);
static Aexp *aexpNormalizeVar(HashSymbol *var);
static Aexp *aexpNormalizeMaybeBigInteger(MaybeBigInt *integer);
static Aexp *aexpNormalizeStdInteger(int integer);
static Aexp *aexpNormalizeCharacter(char character);
static Aexp *aexpNormalizeCharacter(Character character);
static Aexp *aexpNormalizeLam(LamLam *lamLam);
static AexpNamespaceArray *aexpNormalizeNamespaces(LamNamespaceArray *nsArray);
static AexpVarList *convertVarList(LamVarList *args);
Expand Down Expand Up @@ -648,7 +648,7 @@ static Exp *normalizeVar(HashSymbol *var, Exp *tail) {
return exp;
}

static Exp *normalizeCharacter(char character, Exp *tail) {
static Exp *normalizeCharacter(Character character, Exp *tail) {
ENTER(normalizeCharacter);
if (tail != NULL) {
LEAVE(normalizeCharacter);
Expand Down Expand Up @@ -871,7 +871,7 @@ static Aexp *aexpNormalizeStdInteger(int integer) {
return newAexp_Littleinteger(integer);
}

static Aexp *aexpNormalizeCharacter(char character) {
static Aexp *aexpNormalizeCharacter(Character character) {
return newAexp_Character(character);
}

Expand Down
13 changes: 10 additions & 3 deletions src/bytecode.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "bytecode.h"
#include "debug.h"
#include "common.h"
#include "utf8.h"

#ifdef DEBUG_BYTECODE
# include "debugging_on.h"
Expand Down Expand Up @@ -68,17 +69,23 @@ char *charRep(Character c) {
case '\0':
return "\\0";
default: {
static char buf[8];
static unsigned char buf[8];
#ifdef CHARACTER_IS_CHAR
sprintf(buf, "%c", c);
return buf;
#else
unsigned char *ptr = writeChar(buf, c);
*ptr = 0;
#endif
return (char *) buf;
}
}
}

static void addCharacter(ByteCodeArray *b, Character code) {
DEBUG("%04lx addCharacter %02x", b->size, code);
reserve(b, sizeof(Character));
b->entries[b->size++] = code;
memcpy(&b->entries[b->size], &code, sizeof(Character));
b->size += sizeof(Character);
}

static void addByte(ByteCodeArray *b, int code) {
Expand Down
5 changes: 4 additions & 1 deletion src/bytecode.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,10 @@ static inline Byte readByte(ByteCodeArray *b, Control *i) {
}

static inline Character readCharacter(ByteCodeArray *b, Control *i) {
return b->entries[(*i)++];
Character c;
memcpy(&c, &b->entries[*i], sizeof(Character));
(*i) += sizeof(Character);
return c;
}

static inline void _readWord(ByteCodeArray *b, Control *i, Word *a) {
Expand Down
2 changes: 1 addition & 1 deletion src/debug.c
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ void dumpByteCode(ByteCodeArray *bca) {
}
break;
case BYTECODES_TYPE_CHAR:{
char c = readByte(bca, &i);
Character c = readCharacter(bca, &i);
eprintf("CHAR [%s]\n", charRep(c));
}
break;
Expand Down
144 changes: 141 additions & 3 deletions src/lexer.l
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
%{
#include "parser.h"
#include "module.h"
#include "utf8.h"

void parseCS(char *text, char delimeter);
%}
Expand All @@ -12,7 +13,37 @@ void parseCS(char *text, char delimeter);
%option noyywrap
%option extra-type="struct PmModule *"

ID [a-zA-Z_][a-zA-Z_0-9]*
/* 0x80 - 0x9F Control characters */
UNICODE_CONTROL \xC2[\x80-\x9F]

/* 0xA0 - 0xD7FF Normal characters */
UNICODE_CHARACTERS_1 \xC2[\xA0-\xBF]|[\xC3-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|\xED[\x80-\x9F]|[\xE1-\xEC][\x80-\xBF])[\x80-\xBF]

/* 0xD800 - 0xDFFF Reserved */
UNICODE_RESERVED_1 \xED[\xA0-\xBF][\x80-\xBF]

/* 0xE000 - 0xFFFD Normal Characters */
UNICODE_CHARACTERS_2 \xEF\xBF[\x80-\xBD]|(\xEF[\x80-\xBE]|\xEE[\x80-\xBF])[\x80-\xBF]

/* 0xFFFE 0xFFFF Reserved */
UNICODE_RESERVED_2 \xEF\xBF[\xBE-\xBF]

/* 0x10000 - 0x10FFFF Normal Characters */
UNICODE_CHARACTERS_3 (\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]

/* 0x80 - 0x10FFFF All high-bit Unicode */
UNICODE_ALL [\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]|(\xF0[\x90-\xBF]|\xF4[\x80-\x8F]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF][\x80-\xBF]

UNICODE_RESERVED {UNICODE_RESERVED_1}|{UNICODE_RESERVED_2}

UNICODE_CHARACTERS {UNICODE_CHARACTERS_1}|{UNICODE_CHARACTERS_2}|{UNICODE_CHARACTERS_3}

ALPHA [a-zA-Z_]
ALNUM [a-zA-Z_0-9]

ID ({ALPHA}|{UNICODE_CHARACTERS})({ALNUM}|{UNICODE_CHARACTERS})*

CODEPOINT {UNICODE_ALL}|.

%%

Expand All @@ -21,9 +52,12 @@ struct PmModule *mod = yyextra;
%}

[ \t]+ {}
{UNICODE_CONTROL} {}
[\n] { incLineNo(mod); }
\/\/.* { }

{UNICODE_RESERVED} { cant_happen("invalid unicode character detected"); }

[0-9]*\.[0-9]+ { yylval->s = yytext; return IRRATIONAL; }
[0-9]*\.[0-9]+i { yylval->s = yytext; return IRRATIONAL_I; }
[0-9]+ { yylval->s = yytext; return NUMBER; }
Expand Down Expand Up @@ -54,8 +88,10 @@ struct PmModule *mod = yyextra;
"assert" { return ASSERT; }
"alias" { return ALIAS; }

\"((\\.)|[^"])*\" { parseCS(yytext, '"'); yylval->s = yytext; return STRING; }
\'((\\.)|[^'])\' { parseCS(yytext, '\''); yylval->c = *yytext; return CHAR; }
\"((\\{CODEPOINT})|[^"])*\" { parseCS(yytext, '"'); yylval->s = yytext; return STRING; }
\'((\\{CODEPOINT})|[^'])\' { parseCS(yytext, '\''); utf8_to_unicode_char(&yylval->c, yytext); return CHAR; }
\'({UNICODE_ALL}|[^'])\' { parseCS(yytext, '\''); utf8_to_unicode_char(&yylval->c, yytext); return CHAR; }
\'\\u[0-9a-fA-F]+;\' { parseCS(yytext, '\''); utf8_to_unicode_char(&yylval->c, yytext); return CHAR; }

"->" { return ARROW; }
"==" { return EQ; }
Expand Down Expand Up @@ -94,6 +130,108 @@ void parseCS(char *text, char delimiter) {
case 't':
*follow = '\t';
break;
case 'u': {
*lead++;
Character unicode = 0;
bool finished = false;
while (!finished) {
switch (*lead) {
case '0':
unicode <<= 4;
lead++;
break;
case '1':
unicode <<= 4;
unicode |= 0x01;
lead++;
break;
case '2':
unicode <<= 4;
unicode |= 0x02;
lead++;
break;
case '3':
unicode <<= 4;
unicode |= 0x03;
lead++;
break;
case '4':
unicode <<= 4;
unicode |= 0x04;
lead++;
break;
case '5':
unicode <<= 4;
unicode |= 0x05;
lead++;
break;
case '6':
unicode <<= 4;
unicode |= 0x06;
lead++;
break;
case '7':
unicode <<= 4;
unicode |= 0x07;
lead++;
break;
case '8':
unicode <<= 4;
unicode |= 0x08;
lead++;
break;
case '9':
unicode <<= 4;
unicode |= 0x09;
lead++;
break;
case 'a':
case 'A':
unicode <<= 4;
unicode |= 0x0A;
lead++;
break;
case 'b':
case 'B':
unicode <<= 4;
unicode |= 0x0B;
lead++;
break;
case 'c':
case 'C':
unicode <<= 4;
unicode |= 0x0C;
lead++;
break;
case 'd':
case 'D':
unicode <<= 4;
unicode |= 0x0D;
lead++;
break;
case 'e':
case 'E':
unicode <<= 4;
unicode |= 0x0E;
lead++;
break;
case 'f':
case 'F':
unicode <<= 4;
unicode |= 0x0F;
lead++;
break;
case ';':
finished = true;
break;
default:
cant_happen("error parsing unicode escape");
}
}
follow = writeChar(follow, unicode);
follow--;
}
break;
default:
*follow = *lead;
}
Expand Down
9 changes: 8 additions & 1 deletion src/parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include "lexer.h"
#include "types.h"
#include "print_generator.h"
#include "utf8.h"

AstStringArray *include_paths = NULL;

Expand Down Expand Up @@ -154,8 +155,14 @@ static MaybeBigInt *makeMaybeBigInt(char *digits, bool imag) {

/*
 * Appends the contents of the NUL-terminated string `str` to `res` and
 * returns `res`.
 *
 * When CHARACTER_IS_CHAR is defined each byte of `str` is pushed as-is.
 * Otherwise each call to utf8_to_unicode_char() fills in one Character,
 * which is pushed, and scanning resumes from the pointer that call returns.
 */
static AstCharArray *appendCharArray(AstCharArray *res, char *str) {
#ifdef CHARACTER_IS_CHAR
    for (char *cursor = str; *cursor != '\0'; cursor++) {
        pushAstCharArray(res, *cursor);
    }
#else
    char *cursor = str;
    while (*cursor != '\0') {
        Character codePoint;
        cursor = utf8_to_unicode_char(&codePoint, cursor);
        pushAstCharArray(res, codePoint);
    }
#endif
    return res;
}
Expand Down Expand Up @@ -304,7 +311,7 @@ static AstArg *makeAstLookupArg(PmModule *mod, HashSymbol *nsName, HashSymbol *s

%union {
char *s;
char c;
Character c;
bool b;
MaybeBigInt *bi;
AstArg *arg;
Expand Down
Loading
Loading