Skip to content

Commit

Permalink
compiler: Support long UTF-8 encoded atoms
Browse files Browse the repository at this point in the history
Support for atoms containing any Unicode code point was added
in Erlang/OTP 20 (PR-1078).

After that change, an atom can contain up to 255 Unicode code
points. However, atoms used in Erlang source code are still limited to
255 bytes because the atom table in the BEAM file only has a byte for
holding the length in bytes of the atom text. For instance, the `🟦`
character has a four-byte encoding (`<<240,159,159,166>>`), meaning
that Erlang source code containing a literal atom consisting of 64 or
more such characters cannot be compiled.

This commit changes the atom table in BEAM files to use two bytes
for the length of each atom. The header for the atom table is also
changed to indicate that two-byte lengths are used. Attempting to
load a BEAM file compiled with Erlang/OTP 28 in Erlang/OTP 27 or
earlier will result in the following error message:

    1> l(t).
    =ERROR REPORT==== 8-Oct-2024::08:49:01.750424 ===
    beam/beam_load.c(150): Error loading module t:
      corrupt atom table

    {error,badfile}

`beam_lib` is updated to handle the new format. External tools that
use `beam_lib:chunks(Beam, [atoms])` to read the atom table will
continue to work. External tools that do their own parsing of the atom
table will need to be updated.
  • Loading branch information
bjorng committed Oct 8, 2024
1 parent fa80932 commit 0b18e23
Show file tree
Hide file tree
Showing 6 changed files with 103 additions and 24 deletions.
52 changes: 43 additions & 9 deletions erts/emulator/beam/beam_file.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,17 @@ static int beamreader_read_u8(BeamReader *reader, byte *val) {
return 1;
}

/* Reads an unsigned 16-bit integer, most significant byte first, from the
 * current position of `reader`, stores it in `*val`, and advances the read
 * position by two bytes.
 *
 * Returns 1 on success. NOTE(review): LoadAssert is expected to leave the
 * function with a failure result when fewer than two bytes remain; confirm
 * against the macro definition. */
static int beamreader_read_u16(BeamReader *reader, Uint16 *val) {
    Uint16 high_byte;
    Uint16 low_byte;

    LoadAssert(beamreader_test(reader, sizeof(Uint16)));

    high_byte = (Uint16)reader->head[0];
    low_byte = (Uint16)reader->head[1];

    /* Big-endian: the first byte holds the upper eight bits. */
    *val = high_byte << 0x08 | low_byte << 0x00;

    reader->head += sizeof(Uint16);

    return 1;
}

static int beamreader_read_bytes(BeamReader *reader, size_t size, const byte **val) {
if (!beamreader_test(reader, size)) {
return 0;
Expand Down Expand Up @@ -241,13 +252,18 @@ static int parse_atom_chunk(BeamFile *beam,
BeamReader reader;
Sint32 count;
int i;
bool long_counts = false;

ASSERT(beam->atoms.entries == NULL);
atoms = &beam->atoms;

beamreader_init(chunk->data, chunk->size, &reader);

LoadAssert(beamreader_read_i32(&reader, &count));
if (count == -1) {
long_counts = true;
LoadAssert(beamreader_read_i32(&reader, &count));
}
LoadAssert(CHECK_ITEM_COUNT(count, 1, sizeof(atoms->entries[0])));

/* Reserve a slot for the empty list, which is encoded as atom 0 as we
Expand All @@ -262,18 +278,36 @@ static int parse_atom_chunk(BeamFile *beam,
atoms->entries[0] = THE_NON_VALUE;
atoms->count = count;

for (i = 1; i < count; i++) {
const byte *string;
byte length;
Eterm atom;
if (long_counts) {
/* Erlang/OTP 28 and later. */
for (i = 1; i < count; i++) {
const byte *string;
Uint16 length;
Eterm atom;

LoadAssert(beamreader_read_u8(&reader, &length));
LoadAssert(beamreader_read_bytes(&reader, length, &string));
LoadAssert(beamreader_read_u16(&reader, &length));
LoadAssert(beamreader_read_bytes(&reader, length, &string));

atom = erts_atom_put(string, length, ERTS_ATOM_ENC_UTF8, 1);
LoadAssert(atom != THE_NON_VALUE);
atom = erts_atom_put(string, length, ERTS_ATOM_ENC_UTF8, 1);
LoadAssert(atom != THE_NON_VALUE);

atoms->entries[i] = atom;
atoms->entries[i] = atom;
}
} else {
/* Before Erlang/OTP 28. */
for (i = 1; i < count; i++) {
const byte *string;
byte length;
Eterm atom;

LoadAssert(beamreader_read_u8(&reader, &length));
LoadAssert(beamreader_read_bytes(&reader, length, &string));

atom = erts_atom_put(string, length, ERTS_ATOM_ENC_UTF8, 1);
LoadAssert(atom != THE_NON_VALUE);

atoms->entries[i] = atom;
}
}

beam->module = atoms->entries[1];
Expand Down
16 changes: 14 additions & 2 deletions lib/compiler/src/beam_asm.erl
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, ExtraChunks0, CompileInfo, Com
Code),

%% Create the atom table chunk.
{NumAtoms, AtomTab} = beam_dict:atom_table(Dict),
AtomChunk = chunk(<<"AtU8">>, <<NumAtoms:32>>, AtomTab),
AtomChunk = build_atom_table(CompilerOpts, Dict),

%% Create the import table chunk.

Expand Down Expand Up @@ -287,6 +286,19 @@ build_attributes(Attr, Compile, MD5) ->
CompileBinary = term_to_binary([{version,?COMPILER_VSN}|Compile]),
{AttrBinary,CompileBinary}.

%% Builds the "AtU8" atom table chunk.
%%
%% beam_dict:atom_table/1 is matched here as a list of [<<Len:16>>,Text]
%% entries, i.e. every atom text is preceded by a two-byte length.
%%
%%   * Without the no_long_atoms option the entries are written as-is and
%%     the chunk header starts with -1 (as a 32-bit word) followed by the
%%     number of atoms; the -1 word marks the two-byte length format.
%%   * With no_long_atoms each two-byte length is rewritten as a single
%%     byte and the header holds only the number of atoms, which is the
%%     layout understood by Erlang/OTP 27 and earlier.
build_atom_table(Options, Dict) ->
    {NumAtoms, LongTab} = beam_dict:atom_table(Dict),
    case lists:member(no_long_atoms, Options) of
        true ->
            %% Atom table compatible with Erlang/OTP 27 and earlier:
            %% one length byte per atom.
            ShortTab = [[Len,Text] || [<<Len:16>>,Text] <- LongTab],
            chunk(<<"AtU8">>, <<NumAtoms:32>>, ShortTab);
        false ->
            %% Atom table for Erlang/OTP 28 and later: two length
            %% bytes per atom, announced by the leading -1 word.
            chunk(<<"AtU8">>, <<-1:32,NumAtoms:32>>, LongTab)
    end.

build_line_table(Dict, Options) ->
{NumLineInstrs,NumFnames0,Fnames0,NumLines,Lines0,ExecLine} =
beam_dict:line_table(Dict),
Expand Down
2 changes: 1 addition & 1 deletion lib/compiler/src/beam_dict.erl
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ atom_table(#asm{atoms=Atoms}) ->
Sorted = lists:keysort(2, maps:to_list(Atoms)),
{NumAtoms,[begin
L = atom_to_binary(A, utf8),
[byte_size(L),L]
[<<(byte_size(L)):16>>,L]
end || {A,_} <- Sorted]}.

%% Returns the table of local functions.
Expand Down
4 changes: 3 additions & 1 deletion lib/compiler/src/compile.erl
Original file line number Diff line number Diff line change
Expand Up @@ -1040,7 +1040,9 @@ expand_opt(r25, Os) ->
[no_ssa_opt_update_tuple, no_bs_match, no_min_max_bifs |
expand_opt(r26, Os)];
expand_opt(r26, Os) ->
[no_bsm_opt | Os];
[no_bsm_opt | expand_opt(r27, Os)];
expand_opt(r27, Os) ->
[no_long_atoms | Os];
expand_opt({debug_info_key,_}=O, Os) ->
[encrypt_debug_info,O|Os];
expand_opt(no_type_opt=O, Os) ->
Expand Down
30 changes: 21 additions & 9 deletions lib/compiler/test/compile_SUITE.erl
Original file line number Diff line number Diff line change
Expand Up @@ -943,19 +943,31 @@ test_sloppy() ->
Turtle.

%% Checks that atoms with non-Latin-1 text survive a compile/load round
%% trip, both for a short atom and for an atom whose UTF-8 encoding is
%% far longer than 255 bytes.
utf8_atoms(Config) when is_list(Config) ->
    %% A short atom consisting of multi-byte characters.
    ShortAtom = binary_to_atom(<<"こんにちは"/utf8>>, utf8),
    do_utf8_atom(ShortAtom),

    %% 255 copies of a character with a four-byte UTF-8 encoding, that
    %% is, 255 code points occupying 1020 bytes.
    FourByteChar = <<240,159,159,166>>,
    WideAtom = binary_to_atom(binary:copy(FourByteChar, 255)),
    do_utf8_atom(WideAtom),

    ok.

do_utf8_atom(Atom) ->
Mod = ?FUNCTION_NAME,
Anno = erl_anno:new(1),
Atom = binary_to_atom(<<"こんにちは"/utf8>>, utf8),
Forms = [{attribute,Anno,compile,[export_all]},
Forms = [{attribute,Anno,module,Mod},
{attribute,Anno,compile,[export_all]},
{function,Anno,atom,0,[{clause,Anno,[],[],[{atom,Anno,Atom}]}]}],

Utf8AtomForms = [{attribute,Anno,module,utf8_atom}|Forms],
{ok,utf8_atom,Utf8AtomBin} =
compile:forms(Utf8AtomForms, [binary]),
{ok,{utf8_atom,[{atoms,_}]}} =
beam_lib:chunks(Utf8AtomBin, [atoms]),
code:load_binary(utf8_atom, "compile_SUITE", Utf8AtomBin),
Atom = utf8_atom:atom(),
{ok,Mod,Utf8AtomBin} = compile:forms(Forms, [binary,report]),
{ok,{Mod,[{atoms,_}]}} = beam_lib:chunks(Utf8AtomBin, [atoms]),

code:load_binary(Mod, "compile_SUITE", Utf8AtomBin),

Atom = Mod:atom(),
true = is_atom(Atom),

true = code:delete(Mod),
false = code:purge(Mod),

ok.

utf8_functions(Config) when is_list(Config) ->
Expand Down
23 changes: 21 additions & 2 deletions lib/stdlib/src/beam_lib.erl
Original file line number Diff line number Diff line change
Expand Up @@ -977,8 +977,7 @@ scan_beam(FD, Pos, What, Mod, Data) ->
get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, Encoding) ->
NewCs = del_chunk(Id, Cs),
{NFD, Chunk} = get_chunk(Id, Pos, Size, FD),
<<_Num:32, Chunk2/binary>> = Chunk,
{Module, _} = extract_atom(Chunk2, Encoding),
Module = extract_module(Chunk, Encoding),
C = case Cs of
info ->
{Id, Pos, Size};
Expand Down Expand Up @@ -1202,6 +1201,15 @@ ensure_atoms({empty, AT}, Cs) ->
ensure_atoms(AT, _Cs) ->
AT.

%% Returns the module name, which is the first atom stored in the atom
%% table chunk.
%%
%% A UTF-8 chunk whose first 32-bit word is 16#FFFFFFFF (-1 when read as
%% a signed integer) uses two-byte atom lengths; the atom count follows
%% in the next word. Any other chunk starts directly with the atom count
%% and uses one-byte lengths.
extract_module(<<16#FFFFFFFF:32, _Count:32, Table/binary>>, utf8) ->
    {Mod, _Rest} = extract_long_atom(Table),
    Mod;
extract_module(<<_Count:32, Table/binary>>, Encoding) ->
    {Mod, _Rest} = extract_atom(Table, Encoding),
    Mod.

%% Reads all atoms of an atom table chunk into the table AT, numbering
%% them from 1.
%%
%% A UTF-8 chunk that begins with the 32-bit word 16#FFFFFFFF (-1 when
%% read as a signed integer) stores two-byte atom lengths and is handled
%% by extract_long_atoms/3; every other chunk is handled by
%% extract_atoms/4. The atom count in the header is not used.
extract_atoms(<<16#FFFFFFFF:32, _Count:32, Table/binary>>, AT, utf8) ->
    extract_long_atoms(Table, 1, AT);
extract_atoms(<<_Count:32, Table/binary>>, AT, Encoding) ->
    extract_atoms(Table, 1, AT, Encoding).

Expand All @@ -1216,6 +1224,17 @@ extract_atom(<<Len, B/binary>>, Encoding) ->
<<SB:Len/binary, Tail/binary>> = B,
{binary_to_atom(SB, Encoding), Tail}.

%% Walks an atom table that uses two-byte lengths and inserts each atom
%% into the ETS table Tab as {Index, Atom}, where Index is the running
%% atom number. Returns true once the whole binary has been consumed.
extract_long_atoms(<<>>, _Index, _Tab) ->
    true;
extract_long_atoms(Table, Index, Tab) ->
    {Atom, Rest} = extract_long_atom(Table),
    true = ets:insert(Tab, {Index, Atom}),
    extract_long_atoms(Rest, Index + 1, Tab).

%% Decodes one atom table entry that uses a two-byte length prefix.
%% Returns {Atom, Rest}, where Rest is the binary following the entry.
%% The atom text is interpreted as UTF-8.
extract_long_atom(<<Size:16, Rest0/binary>>) ->
    %% Kept as a separate match so that a truncated entry still fails
    %% with badmatch rather than function_clause.
    <<Name:Size/binary, Rest/binary>> = Rest0,
    {binary_to_atom(Name, utf8), Rest}.

%%% Utils.

-record(bb, {pos = 0 :: integer(),
Expand Down

0 comments on commit 0b18e23

Please sign in to comment.