diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index b23a58d7a..958f6188a 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -82,8 +82,8 @@ jobs: # - {std: 17, cxx: vs2019, bt: Release, os: windows-2019, bitlinks: static64 static32} - {std: 20, cxx: vs2019, bt: Release, os: windows-2019, bitlinks: static64 static32} - - {std: 17, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} - - {std: 20, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} + #- {std: 17, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} + #- {std: 20, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} # - {std: 17, cxx: xcode, xcver: 13, bt: Release, os: macos-11, bitlinks: static64} env: {BM: ON, STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"} diff --git a/README.md b/README.md index dff3d55d5..48a877f1f 100644 --- a/README.md +++ b/README.md @@ -432,37 +432,6 @@ CHECK(loc.col == 4u); ## Using ryml in your project -### Package managers - -ryml is available in most package managers (thanks to all the -contributors!) and linux distributions. But please be aware: those -packages are maintained downstream of this repository, so if you have -issues with the package, file a report with the respective maintainer. - -Here's a quick roundup (not maintained): -* Package managers: - * [conan](https://conan.io/center/recipes/rapidyaml) - * [vcpkg](https://vcpkg.io/en/packages.html): `vcpkg install ryml` - * [PyPI](https://pypi.org/project/rapidyaml/) -* Linux distributions: - * Arch Linux/Manjaro: - * [rapidyaml (aarch64)](https://archlinuxarm.org/packages/aarch64/rapidyaml) - * [rapidyaml-git (AUR)](https://aur.archlinux.org/packages/rapidyaml-git/) - * [python-rapidyaml-git (AUR)](https://aur.archlinux.org/packages/python-rapidyaml-git/) - * [Fedora Linux](https://getfedora.org/)/[EPEL](https://docs.fedoraproject.org/en-US/epel/): - * `dnf install rapidyaml-devel` - * `dnf install python3-rapidyaml` - * [Gentoo](https://packages.gentoo.org/packages/dev-cpp/rapidyaml) - * [OpenSuse](https://build.openbuildservice.org/package/show/Emulators/rapidyaml) - * [Slackbuilds](https://slackbuilds.org/repository/15.0/libraries/rapidyaml/) - * [AltLinux](https://packages.altlinux.org/en/sisyphus/srpms/rapidyaml/3006055151670528141) - -Although package managers are very useful for quickly getting up to -speed, the advised way is still to bring ryml as a submodule of your -project, building both together. This makes it easy to track any -upstream changes in ryml. Also, ryml is small and quick to build, so -there's not much of a cost for building it with your project. - ### Single header file ryml is provided chiefly as a cmake library project, but it can also be used as a single header file, and there is a [tool to @@ -531,6 +500,38 @@ If you omit `--recursive`, after cloning you will have to do `git submodule update --init --recursive` to ensure ryml's submodules are checked out. +### Package managers + +ryml is available in most package managers (thanks to all the +contributors!) and linux distributions. But please be aware: those +packages are maintained downstream of this repository, so if you have +issues with the package, file a report with the respective maintainer. + +Here's a quick roundup (not maintained): +* Package managers: + * [conan](https://conan.io/center/recipes/rapidyaml) + * [vcpkg](https://vcpkg.io/en/packages.html): `vcpkg install ryml` + * [PyPI](https://pypi.org/project/rapidyaml/) +* Linux distributions: + * Arch Linux/Manjaro: + * [rapidyaml (aarch64)](https://archlinuxarm.org/packages/aarch64/rapidyaml) + * [rapidyaml-git (AUR)](https://aur.archlinux.org/packages/rapidyaml-git/) + * [python-rapidyaml-git (AUR)](https://aur.archlinux.org/packages/python-rapidyaml-git/) + * [Fedora Linux](https://getfedora.org/)/[EPEL](https://docs.fedoraproject.org/en-US/epel/): + * `dnf install rapidyaml-devel` + * `dnf install python3-rapidyaml` + * [Gentoo](https://packages.gentoo.org/packages/dev-cpp/rapidyaml) + * [OpenSuse](https://build.openbuildservice.org/package/show/Emulators/rapidyaml) + * [Slackbuilds](https://slackbuilds.org/repository/15.0/libraries/rapidyaml/) + * [AltLinux](https://packages.altlinux.org/en/sisyphus/srpms/rapidyaml/3006055151670528141) + +Although package managers are very useful for quickly getting up to +speed, the advised way is still to bring ryml as a submodule of your +project, building both together. This makes it easy to track any +upstream changes in ryml. Also, ryml is small and quick to build, so +there's not much of a cost for building it with your project. + + ### Quickstart samples These samples show different ways of getting ryml into your application. All the @@ -555,6 +556,7 @@ more about each sample: | [`fetch_content`](./samples/fetch_content) | **yes** | [`CMakeLists.txt`](./samples/fetch_content/CMakeLists.txt) | [`run.sh`](./samples/fetch_content/run.sh) | | [`find_package`](./samples/find_package) | **no**
needs prior install or package | [`CMakeLists.txt`](./samples/find_package/CMakeLists.txt) | [`run.sh`](./samples/find_package/run.sh) | + ### CMake build settings for ryml The following cmake variables can be used to control the build behavior of ryml: @@ -726,20 +728,16 @@ See also [the roadmap](./ROADMAP.md) for a list of future work. ### Known limitations -ryml deliberately makes no effort to follow the standard in the +ryml deliberately makes no effort to follow the YAML standard in the following situations: -* ryml's tree does NOT accept containers are as mapping keys: keys - must be scalars. HOWEVER, this is a limitation only of the tree. The - event-based parser engine DOES parse container keys. The parser - engine is the result of a recent refactor and its usage is meant to - be used by other programming languages to create their native - data-structures. This engine is fully tested and fully conformant - (other than the general error permissiveness noted below). But - because it is recent, it is still undocumented, and it requires some - API cleanup before being ready for isolated use. Please get in touch - if you are interested in integrating the event-based parser engine - without the standalone `ryml::parse_*()` +* ryml's tree does NOT accept containers as map keys: keys stored in + the tree must always be scalars. HOWEVER, this is a limitation only + of the final tree. The event-based parse engine DOES parse container + keys, as it is is meant to be used by other programming languages to + create their native data-structures, and it is fully tested and + fully conformant (other than the general error permissiveness noted + below). * Tab characters after `:` and `-` are not accepted tokens, unless ryml is compiled with the macro `RYML_WITH_TAB_TOKENS`. This requirement exists because checking for tabs introduces branching @@ -774,11 +772,11 @@ following situations: If you do run into trouble and would like to investigate conformance of your YAML code, **beware** of existing online YAML linters, many of which are not fully conformant. Instead, try using -[https://play.yaml.io](https://play.yaml.io), an amazing tool which -lets you dynamically input your YAML and continuously see the results -from all the existing parsers (kudos to @ingydotnet and the people -from the YAML test suite). And of course, if you detect anything wrong -with ryml, please [open an +[https://play.yaml.io](https://play.yaml.io), an amazingly useful tool +which lets you dynamically input your YAML and continuously see the +results from all the existing parsers (kudos to @ingydotnet and the +people from the YAML test suite). And of course, if you detect +anything wrong with ryml, please [open an issue](https://github.com/biojppm/rapidyaml/issues) so that we can improve. @@ -786,44 +784,53 @@ improve. ### Test suite status As part of its CI testing, ryml uses the [YAML test -suite](https://github.com/yaml/yaml-test-suite). This is an extensive -set of reference cases covering the full YAML spec. Each of these -cases have several subparts: +suite](https://github.com/yaml/yaml-test-suite). (See also the test +suite results at +[https://matrix.yaml.info/](https://matrix.yaml.info/), but be aware +that the results there may be using an older version of ryml.) This is +an extensive and merciless set of reference cases covering the full +YAML spec. Each of these cases has several subparts: * `in-yaml`: mildly, plainly or extremely difficult-to-parse YAML * `in-json`: equivalent JSON (where possible/meaningful) * `out-yaml`: equivalent standard YAML * `emit-yaml`: equivalent standard YAML - * `events`: reference results (ie, expected tree) - -When testing, ryml parses each of the 4 yaml/json parts, then emits -the parsed tree, then parses the emitted result and verifies that -emission is idempotent, ie that the emitted result is semantically the -same as its input without any loss of information. To ensure -consistency, this happens over four levels of parse/emission -pairs. And to ensure correctness, each of the stages is compared -against the `events` spec from the test, which constitutes the -reference. The tests also check for equality between the reference + * `events`: reference events according to the YAML standard + +When testing, ryml parses each of the yaml/json parts, then emits the +parsed tree, then parses the emitted result and verifies that emission +is idempotent, ie that the round trip emitted result is semantically +the same as its input without any loss of information. + +To ensure consistency, this happens over four successive levels of +parse->emit round trips. And to ensure correctness, each of the stages +is compared against the `events` spec from the test, which constitutes +the reference. The tests also check for equality between the reference events in the test case and the events emitted by ryml from the data tree parsed from the test case input. All of this is then carried out -combining several variations: both unix `\n` vs windows `\r\n` line +with several variations: both unix `\n` vs windows `\r\n` line endings, emitting to string, file or streams, which results in ~250 -tests per case part. With multiple parts per case and ~400 reference +tests per case part. + +With multiple parts per case and ~400 reference cases in the test suite, this makes over several hundred thousand individual tests to which ryml is subjected, which are added to the -unit tests in ryml, which also employ the same extensive -combinatorial approach. +unit tests in ryml, which also employ the same extensive combinatorial +approach. -Also, note that in [their own words](http://matrix.yaml.io/), the +Also, note that in [their own words](http://matrix.yaml.info/), the tests from the YAML test suite *contain a lot of edge cases that don't play such an important role in real world examples*. And yet, despite the extreme focus of the test suite, currently ryml only fails a minor fraction of the test cases, mostly related with the deliberate -limitations noted above. Other than those limitations, by far the main -issue with ryml is that several standard-mandated parse errors fail to -materialize. For the up-to-date list of ryml failures in the -test-suite, refer to the [list of known +limitations noted above. + +Other than those limitations, by far the main issue with ryml is that +several standard-mandated parse errors fail to materialize (this will +be addressed in the coming releases). For the up-to-date list of ryml +failures in the test-suite, refer to the [list of known exceptions](test/test_suite/test_suite_parts.cpp) from ryml's test -suite runner, which is used as part of ryml's CI process. +suite runner, which is used as part of ryml's CI setup. + ------ diff --git a/ROADMAP.md b/ROADMAP.md index 8df21edc4..c78da795c 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,2 +1,2 @@ -Moved! See ryml's [Kanban board on github](https://github.com/biojppm/rapidyaml/projects/1). +Moved! See rapidyaml's [Kanban board on github](https://github.com/users/biojppm/projects/1/views/1). diff --git a/changelog/current.md b/changelog/current.md index 599197491..69a39c9df 100644 --- a/changelog/current.md +++ b/changelog/current.md @@ -1,11 +1,19 @@ -Most of the changes are from the giant Parser refactor described below. Before getting to that, a couple of other minor changes first. +Most of the changes are from the giant Parser refactor described below. Before getting to that, some other minor changes first. ### Fixes - [#PR431](https://github.com/biojppm/rapidyaml/pull/431) - Emitter: prevent stack overflows when emitting malicious trees by providing a max tree depth for the emit visitor. This was done by adding an `EmitOptions` structure as an argument both to the emitter and to the emit functions, which is then forwarded to the emitter. This `EmitOptions` structure has a max tree depth setting with a default value of 64. - [#PR431](https://github.com/biojppm/rapidyaml/pull/431) - Fix `_RYML_CB_ALLOC()` using `(T)` in parenthesis, making the macro unusable. -- [#434] - Ensure empty vals are not deserialized ([#PR436](https://github.com/biojppm/rapidyaml/pull/436)). +- [#434](https://github.com/biojppm/rapidyaml/issues/434) - Ensure empty vals are not deserialized ([#PR436](https://github.com/biojppm/rapidyaml/pull/436)). +- [#PR433](https://github.com/biojppm/rapidyaml/pull/433): + - Fix some corner cases causing read-after-free in the tree's arena when it is relocated while filtering scalar. + - Improve YAML error conformance - detect YAML-mandated parse errors when: + - directives are misplaced (eg [9MMA](https://matrix.yaml.info/details/9MMA.html), [9HCY](https://matrix.yaml.info/details/9HCY.html), [B63P](https://matrix.yaml.info/details/B63P.html), [EB22](https://matrix.yaml.info/details/EB22.html), [SF5V](https://matrix.yaml.info/details/SF5V.html)). + - comments are misplaced (eg [MUS6/00](https://matrix.yaml.info/details/MUS6:00.html), [9JBA](https://matrix.yaml.info/details/9JBA.html), [SU5Z](https://matrix.yaml.info/details/SU5Z.html)) + - a node has both an anchor and an alias (eg [SR86](https://matrix.yaml.info/details/SR86.html), [SU74](https://matrix.yaml.info/details/SU74.html)). + - tags contain [invalid characters](https://yaml.org/spec/1.2.2/#tag-shorthands) `,{}[]` (eg [LHL4](https://matrix.yaml.info/details/LHL4.html), [U99R](https://matrix.yaml.info/details/U99R.html), [WZ62](https://matrix.yaml.info/details/WZ62.html)). + ### New features diff --git a/doc/index.rst b/doc/index.rst index b26b178cd..67bd88711 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -35,10 +35,10 @@ ryml is written in C++11, and compiles cleanly with: * Visual Studio 2015 and later -* clang++ 3.9 and later - * g++ 4.8 and later +* clang++ 3.9 and later + * Intel Compiler .. note:: diff --git a/doc/sphinx_quicklinks.rst b/doc/sphinx_quicklinks.rst index b4c8c7c1e..133d4f80d 100644 --- a/doc/sphinx_quicklinks.rst +++ b/doc/sphinx_quicklinks.rst @@ -13,7 +13,7 @@ Quick links * `Pull Requests `_ - * `Kanban board `_ + * `Kanban board `_ * Latest release: `0.6.0 `_ diff --git a/doc/sphinx_yaml_standard.rst b/doc/sphinx_yaml_standard.rst index fdf5c13e8..390f833de 100644 --- a/doc/sphinx_yaml_standard.rst +++ b/doc/sphinx_yaml_standard.rst @@ -17,7 +17,7 @@ welcome. linters**, many of which are not fully conformant; instead, try using `https://play.yaml.io `__, - an amazing tool which lets you dynamically input your YAML and + an amazingly useful tool which lets you dynamically input your YAML and continuously see the results from all the existing parsers (kudos to @ingydotnet and the people from the YAML test suite). And of course, if you detect anything wrong with ryml, please `open an @@ -31,17 +31,13 @@ Deliberate deviations ryml deliberately makes no effort to follow the standard in the following situations: -- ryml's tree does NOT accept containers are as mapping keys: keys - must be scalars. HOWEVER, this is a limitation only of the tree. The - event-based parser engine DOES parse container keys. The parser - engine is the result of a recent refactor and its usage is meant to - be used by other programming languages to create their native - data-structures. This engine is fully tested and fully conformant - (other than the general error permissiveness noted below). But - because it is recent, it is still undocumented, and it requires some - API cleanup before being ready for isolated use. Please get in touch - if you are interested in integrating the event-based parser engine - without the standalone `ryml::parse_*()` +- ryml's tree does NOT accept containers as map keys: keys stored in + the tree must always be scalars. HOWEVER, this is a limitation only + of the final tree. The event-based parse engine DOES parse container + keys, as it is is meant to be used by other programming languages to + create their native data-structures, and it is fully tested and + fully conformant (other than the general error permissiveness noted + below). - Tab characters after ``:`` and ``-`` are not accepted tokens, unless ryml is compiled with the macro ``RYML_WITH_TAB_TOKENS``. This requirement exists because checking for tabs introduces branching @@ -83,42 +79,52 @@ generally a source of problems; this is good practice anyway. YAML test suite --------------- -As part of its CI testing, ryml uses the `YAML test -suite `__. This is an extensive -set of reference cases covering the full YAML spec. Each of these cases -have several subparts: - +As part of its CI testing, ryml uses the `YAML test suite `__. This is +an extensive and merciless set of reference cases covering the full +YAML spec. Each of these cases has several subparts: - ``in-yaml``: mildly, plainly or extremely difficult-to-parse YAML - ``in-json``: equivalent JSON (where possible/meaningful) - ``out-yaml``: equivalent standard YAML - ``emit-yaml``: equivalent standard YAML - - ``events``: reference results (ie, expected tree) + - ``events``: reference events according to the YAML standard + +.. note:: -When testing, ryml parses each of the 4 yaml/json parts, then emits the + See the test suite results at + `https://matrix.yaml.info/ `__, + but be aware that the results there may be using an older + version of ryml. + +When testing, ryml parses each of the yaml/json parts, then emits the parsed tree, then parses the emitted result and verifies that emission -is idempotent, ie that the emitted result is semantically the same as -its input without any loss of information. To ensure consistency, this -happens over four levels of parse/emission pairs. And to ensure -correctness, each of the stages is compared against the ``events`` spec -from the test, which constitutes the reference. The tests also check for -equality between the reference events in the test case and the events -emitted by ryml from the data tree parsed from the test case input. All -of this is then carried out combining several variations: both unix -``\n`` vs windows ``\r\n`` line endings, emitting to string, file or -streams, which results in ~250 tests per case part. With multiple parts -per case and ~400 reference cases in the test suite, this makes over -several hundred thousand individual tests to which ryml is subjected, -which are added to the unit tests in ryml, which also employ the same -extensive combinatorial approach. +is idempotent, ie that the round trip emitted result is semantically +the same as its input without any loss of information. + +To ensure consistency, this happens over four successive levels of +parse->emit round trips. And to ensure correctness, each of the stages +is compared against the ``events`` spec from the test, which constitutes +the reference. The tests also check for equality between the reference +events in the test case and the events emitted by ryml from the data +tree parsed from the test case input. All of this is then carried out +with several variations: both unix ``\n`` vs windows ``\r\n`` line +endings, emitting to string, file or streams, which results in ~250 +tests per case part. + +With multiple parts per case and ~400 reference +cases in the test suite, this makes over several hundred thousand +individual tests to which ryml is subjected, which are added to the +unit tests in ryml, which also employ the same extensive combinatorial +approach. Also, note that in `their own words `__, the -tests from the YAML test suite *contain a lot of edge cases that don’t +tests from the YAML test suite *contain a lot of edge cases that don't play such an important role in real world examples*. And yet, despite the extreme focus of the test suite, currently ryml only fails a minor fraction of the test cases, mostly related with the deliberate -limitations noted above. Other than those limitations, by far the main -issue with ryml is that several standard-mandated parse errors fail to -materialize. For the up-to-date list of ryml failures in the test-suite, -refer to the `list of known -exceptions `__ from ryml’s test -suite runner, which is used as part of ryml’s CI process. +limitations noted above. + +Other than those limitations, by far the main issue with ryml is that +several standard-mandated parse errors fail to materialize (this will +be addressed in the coming releases). For the up-to-date list of ryml +failures in the test-suite, refer to the `list of known exceptions `__ from ryml's test +suite runner, which is used as part of ryml's CI setup. diff --git a/samples/quickstart.cpp b/samples/quickstart.cpp index b306871c3..d27b738bb 100644 --- a/samples/quickstart.cpp +++ b/samples/quickstart.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #ifdef C4_EXCEPTIONS #include @@ -543,12 +542,14 @@ john: doe)"; // The downside, of course, is that when you are not sure, release // builds may be doing something crazy. // - // So you can override this behavior and enable/disable - // assertions, by defining the macro RYML_USE_ASSERT to a proper - // value (see c4/yml/common.hpp). + // So in that case, you can either use the appropriate ryml + // predicates to check your intent (as in the examples above), or + // you can override this behavior and enable/disable assertions, + // by defining the macro RYML_USE_ASSERT to a proper value (see + // c4/yml/common.hpp). // // Also, to be clear, this does not apply to parse errors - // happening when the YAML is parsed. Checking for these errors is + // occurring when the YAML is parsed. Checking for these errors is // always enabled and cannot be switched off. @@ -578,19 +579,19 @@ john: doe)"; // node. It can be used to read from the node, but not write to it // or modify the hierarchy of the node. If any modification is // desired then a NodeRef must be used instead: - ryml::NodeRef wroot = tree.rootref(); + ryml::NodeRef wroot = tree.rootref(); // writeable root // operator= assigns an existing string to the receiving node. - // The contents are NOT copied, and this pointer will be in effect - // until the tree goes out of scope! So BEWARE to only assign from - // strings outliving the tree. + // The contents are NOT copied, and the string pointer will be in + // effect until the tree goes out of scope! So BEWARE to only + // assign from strings outliving the tree. wroot["foo"] = "says you"; wroot["bar"][0] = "-2"; wroot["bar"][1] = "-3"; wroot["john"] = "ron"; // Now the tree is _pointing_ at the memory of the strings above. - // In this case it is OK because those are static strings and will - // outlive the tree. + // In this case it is OK because those are static strings, located + // in the executable's static section, and will outlive the tree. CHECK(root["foo"].val() == "says you"); CHECK(root["bar"][0].val() == "-2"); CHECK(root["bar"][1].val() == "-3"); @@ -602,9 +603,12 @@ john: doe)"; // } // CHECK(root["john"] == "dangling"); // CRASH! the string was deallocated - // operator<< first serializes the input to the tree's arena, then - // assigns the serialized string to the receiving node. This avoids - // constraints with the lifetime, since the arena lives with the tree. + // operator<<: for cases where the lifetime of the string is + // problematic WRT the tree, you can create and save a string in + // the tree using operator<<. It first serializes values to a + // string arena owned by the tree, then assigns the serialized + // string to the receiving node. This avoids constraints with the + // lifetime, since the arena lives with the tree. CHECK(tree.arena().empty()); wroot["foo"] << "says who"; // requires to_chars(). see serialization samples below. wroot["bar"][0] << 20; diff --git a/src/c4/yml/detail/stack.hpp b/src/c4/yml/detail/stack.hpp index df3e27d37..8a5d3783c 100644 --- a/src/c4/yml/detail/stack.hpp +++ b/src/c4/yml/detail/stack.hpp @@ -34,11 +34,11 @@ class stack public: - T m_buf[size_t(N)]; - T * m_stack; - id_type m_size; - id_type m_capacity; - Callbacks m_callbacks; + T m_buf[size_t(N)]; + T *C4_RESTRICT m_stack; + id_type m_size; + id_type m_capacity; + Callbacks m_callbacks; public: diff --git a/src/c4/yml/emit.def.hpp b/src/c4/yml/emit.def.hpp index 5ea901f89..3b8896e20 100644 --- a/src/c4/yml/emit.def.hpp +++ b/src/c4/yml/emit.def.hpp @@ -643,19 +643,31 @@ void Emitter::_write(NodeScalar const& C4_RESTRICT sc, NodeType flags, i template void Emitter::_write_json(NodeScalar const& C4_RESTRICT sc, NodeType flags) { - if(C4_UNLIKELY( ! sc.tag.empty())) - _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have tags"); + if(flags & (KEYTAG|VALTAG)) + if(m_opts.json_error_flags() & EmitOptions::JSON_ERR_ON_TAG) + _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have tags"); if(C4_UNLIKELY(flags.has_anchor())) - _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have anchors"); - // use double quoted style... - // if it is a key (mandatory in JSON) - // if the style is marked quoted - bool dquoted = ((flags & (KEY|VALQUO)) - || (scalar_style_json_choose(sc.scalar) & SCALAR_DQUO)); // choose the style - if(dquoted) - _write_scalar_json_dquo(sc.scalar); + if(m_opts.json_error_flags() & EmitOptions::JSON_ERR_ON_ANCHOR) + _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have anchors"); + if(sc.scalar.len) + { + // use double quoted style... + // if it is a key (mandatory in JSON) + // if the style is marked quoted + bool dquoted = ((flags & (KEY|VALQUO)) + || (scalar_style_json_choose(sc.scalar) & SCALAR_DQUO)); // choose the style + if(dquoted) + _write_scalar_json_dquo(sc.scalar); + else + this->Writer::_do_write(sc.scalar); + } else - this->Writer::_do_write(sc.scalar); + { + if(sc.scalar.str || (flags & (KEY|VALQUO|KEYTAG|VALTAG))) + this->Writer::_do_write("\"\""); + else + this->Writer::_do_write("null"); + } } template diff --git a/src/c4/yml/emit.hpp b/src/c4/yml/emit.hpp index 8c788b614..879f5f5cf 100644 --- a/src/c4/yml/emit.hpp +++ b/src/c4/yml/emit.hpp @@ -74,7 +74,24 @@ typedef enum { /** A lightweight object containing options to be used when emitting. */ struct EmitOptions { + typedef enum : uint32_t { + DEFAULT_FLAGS = 0, + JSON_ERR_ON_TAG = 1 << 0, + JSON_ERR_ON_ANCHOR = 1 << 1, + _JSON_ERR_MASK = JSON_ERR_ON_TAG|JSON_ERR_ON_ANCHOR, + } EmitOptionFlags_e; + public: + + /** @name option flags + * + * @{ */ + C4_ALWAYS_INLINE EmitOptionFlags_e json_error_flags() const noexcept { return m_option_flags; } + EmitOptions& json_error_flags(EmitOptionFlags_e d) noexcept { m_option_flags = (EmitOptionFlags_e)(d & _JSON_ERR_MASK); return *this; } + /** @} */ + +public: + /** @name max depth for the emitted tree * * This makes the emitter fail when emitting trees exceeding the @@ -85,8 +102,21 @@ struct EmitOptions EmitOptions& max_depth(id_type d) noexcept { m_max_depth = d; return *this; } static constexpr const id_type max_depth_default = 64; /** @} */ + +public: + + bool operator== (const EmitOptions& that) const noexcept + { + return m_max_depth == that.m_max_depth && + m_option_flags == that.m_option_flags; + } + private: + + /** @cond dev */ id_type m_max_depth{max_depth_default}; + EmitOptionFlags_e m_option_flags{DEFAULT_FLAGS}; + /** @endcond */ }; @@ -442,6 +472,7 @@ inline OStream& operator<< (OStream& s, as_yaml const& y) * @param id the node where to start emitting. * @param opts emit options. * @param buf the output buffer. + * @param opts emit options. * @param error_on_excess Raise an error if the space in the buffer is insufficient. * @return a substr trimmed to the result in the output buffer. If the buffer is * insufficient (when error_on_excess is false), the string pointer of the @@ -462,6 +493,7 @@ inline substr emit_yaml(Tree const& t, id_type id, substr buf, bool error_on_exc * @param id the node where to start emitting. * @param opts emit options. * @param buf the output buffer. + * @param opts emit options. * @param error_on_excess Raise an error if the space in the buffer is insufficient. * @return a substr trimmed to the result in the output buffer. If the buffer is * insufficient (when error_on_excess is false), the string pointer of the diff --git a/src/c4/yml/event_handler_stack.hpp b/src/c4/yml/event_handler_stack.hpp index 4bb780772..30e969530 100644 --- a/src/c4/yml/event_handler_stack.hpp +++ b/src/c4/yml/event_handler_stack.hpp @@ -25,6 +25,10 @@ namespace yml { /** @addtogroup doc_event_handlers * @{ */ +namespace detail { +using pfn_relocate_arena = void (*)(void*, csubstr prev_arena, substr next_arena); +} // detail + /** Use this class a base of implementations of event handler to * simplify the stack logic. */ template @@ -34,17 +38,36 @@ struct EventHandlerStack "ParserState must be a base of HandlerState"); using state = HandlerState; + using pfn_relocate_arena = detail::pfn_relocate_arena; public: detail::stack m_stack; state *C4_RESTRICT m_curr; ///< current stack level: top of the stack. cached here for easier access. state *C4_RESTRICT m_parent; ///< parent of the current stack level. + pfn_relocate_arena m_relocate_arena; ///< callback when the arena gets relocated + void * m_relocate_arena_data; protected: - EventHandlerStack() : m_stack(), m_curr(), m_parent() {} - EventHandlerStack(Callbacks const& cb) : m_stack(cb), m_curr(), m_parent() {} + EventHandlerStack() : m_stack(), m_curr(), m_parent(), m_relocate_arena(), m_relocate_arena_data() {} + EventHandlerStack(Callbacks const& cb) : m_stack(cb), m_curr(), m_parent(), m_relocate_arena(), m_relocate_arena_data() {} + +protected: + + void _stack_start_parse(const char *filename, pfn_relocate_arena relocate_arena, void *relocate_arena_data) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, m_curr != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, relocate_arena != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, relocate_arena_data != nullptr); + m_curr->start_parse(filename, m_curr->node_id); + m_relocate_arena = relocate_arena; + m_relocate_arena_data = relocate_arena_data; + } + + void _stack_finish_parse() + { + } protected: @@ -89,15 +112,25 @@ struct EventHandlerStack #endif } - substr _stack_relocate_to_new_arena(csubstr s, csubstr prev, substr curr) +protected: + + // undefined at the end + #define _has_any_(bits) (static_cast(this)->template _has_any__()) + + bool _stack_should_push_on_begin_doc() const { - _RYML_CB_ASSERT(m_stack.m_callbacks, prev.is_super(s)); - auto pos = s.str - prev.str; - substr out = {curr.str + pos, s.len}; - _RYML_CB_ASSERT(m_stack.m_callbacks, curr.is_super(out)); - return out; + const bool is_root = (m_stack.size() == 1u); + return is_root && (_has_any_(DOC|VAL|MAP|SEQ) || m_curr->has_children); } + bool _stack_should_pop_on_end_doc() const + { + const bool is_root = (m_stack.size() == 1u); + return !is_root && _has_any_(DOC); + } + +protected: + void _stack_relocate_to_new_arena(csubstr prev, substr curr) { for(state &st : m_stack) @@ -109,23 +142,18 @@ struct EventHandlerStack if(st.line_contents.stripped.is_sub(prev)) st.line_contents.stripped = _stack_relocate_to_new_arena(st.line_contents.stripped, prev, curr); } + _RYML_CB_ASSERT(m_stack.m_callbacks, m_relocate_arena != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_relocate_arena_data != nullptr); + m_relocate_arena(m_relocate_arena_data, prev, curr); } -protected: - - // undefined at the end - #define _has_any_(bits) (static_cast(this)->template _has_any__()) - - bool _stack_should_push_on_begin_doc() const - { - const bool is_root = (m_stack.size() == 1u); - return is_root && (_has_any_(DOC|VAL|MAP|SEQ) || m_curr->has_children); - } - - bool _stack_should_pop_on_end_doc() const + substr _stack_relocate_to_new_arena(csubstr s, csubstr prev, substr curr) { - const bool is_root = (m_stack.size() == 1u); - return !is_root && _has_any_(DOC); + _RYML_CB_ASSERT(m_stack.m_callbacks, prev.is_super(s)); + auto pos = s.str - prev.str; + substr out = {curr.str + pos, s.len}; + _RYML_CB_ASSERT(m_stack.m_callbacks, curr.is_super(out)); + return out; } public: diff --git a/src/c4/yml/event_handler_tree.hpp b/src/c4/yml/event_handler_tree.hpp index b46238f4d..6c08ecadc 100644 --- a/src/c4/yml/event_handler_tree.hpp +++ b/src/c4/yml/event_handler_tree.hpp @@ -43,6 +43,8 @@ struct EventHandlerTree : public EventHandlerStack(); _c4dbgpf("node[{}]: enable {}", m_curr->node_id, #bits) @@ -59,21 +61,23 @@ struct EventHandlerTree : public EventHandlerStackcallbacks()), m_tree(tree), m_id(id) + EventHandlerTree() : EventHandlerStack(), m_tree(), m_id(NONE), m_num_directives(), m_yaml_directive() {} + EventHandlerTree(Callbacks const& cb) : EventHandlerStack(cb), m_tree(), m_id(NONE), m_num_directives(), m_yaml_directive() {} + EventHandlerTree(Tree *tree, id_type id) : EventHandlerStack(tree->callbacks()), m_tree(tree), m_id(id), m_num_directives(), m_yaml_directive() { reset(tree, id); } void reset(Tree *tree, id_type id) { - RYML_CHECK(tree); - RYML_CHECK(id < tree->capacity()); - if(!tree->is_root(id)) - if(tree->is_map(tree->parent(id))) - if(!tree->has_key(id)) - c4::yml::error("destination node belongs to a map and has no key"); + if(C4_UNLIKELY(!tree)) + _RYML_CB_ERR(m_stack.m_callbacks, "null tree"); + if(C4_UNLIKELY(id >= tree->capacity())) + _RYML_CB_ERR(tree->callbacks(), "invalid node"); + if(C4_UNLIKELY(!tree->is_root(id))) + if(C4_UNLIKELY(tree->is_map(tree->parent(id)))) + if(C4_UNLIKELY(!tree->has_key(id))) + _RYML_CB_ERR(tree->callbacks(), "destination node belongs to a map and has no key"); m_tree = tree; m_id = id; if(m_tree->is_root(id)) @@ -87,6 +91,8 @@ struct EventHandlerTree : public EventHandlerStackparent(id)); _reset_parser_state(m_curr, id, id); } + m_num_directives = 0; + m_yaml_directive = false; } /** @} */ @@ -96,13 +102,16 @@ struct EventHandlerTree : public EventHandlerStackstart_parse(filename, m_curr->node_id); + this->_stack_start_parse(filename, relocate_arena, relocate_arena_data); } void finish_parse() { + if(m_num_directives && !m_tree->is_stream(m_tree->root_id())) + _RYML_CB_ERR_(m_stack.m_callbacks, "directives cannot be used without a document", {}); + this->_stack_finish_parse(); /* This pointer is temporary. Remember that: * * - this handler object may be held by the user @@ -114,7 +123,7 @@ struct EventHandlerTree : public EventHandlerStacknode_id, anchor.len, anchor); - _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(KEYREF))) + _RYML_CB_ERR_(m_tree->callbacks(), "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), !anchor.begins_with('&')); _enable_(KEYANCH); m_curr->tr_data->m_key.anchor = anchor; } - void set_val_anchor(csubstr anchor) RYML_NOEXCEPT + void set_val_anchor(csubstr anchor) { _c4dbgpf("node[{}]: set val anchor: [{}]~~~{}~~~", m_curr->node_id, anchor.len, anchor); - _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(VALREF))) + _RYML_CB_ERR_(m_tree->callbacks(), "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), !anchor.begins_with('&')); _enable_(VALANCH); m_curr->tr_data->m_val.anchor = anchor; } - void set_key_ref(csubstr ref) RYML_NOEXCEPT + void set_key_ref(csubstr ref) { _c4dbgpf("node[{}]: set key ref: [{}]~~~{}~~~", m_curr->node_id, ref.len, ref); - _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(KEYANCH))) + _RYML_CB_ERR_(m_tree->callbacks(), "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), ref.begins_with('*')); _enable_(KEY|KEYREF); m_curr->tr_data->m_key.anchor = ref.sub(1); m_curr->tr_data->m_key.scalar = ref; } - void set_val_ref(csubstr ref) RYML_NOEXCEPT + void set_val_ref(csubstr ref) { _c4dbgpf("node[{}]: set val ref: [{}]~~~{}~~~", m_curr->node_id, ref.len, ref); - _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(VALANCH))) + _RYML_CB_ERR_(m_tree->callbacks(), "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), ref.begins_with('*')); _enable_(VAL|VALREF); m_curr->tr_data->m_val.anchor = ref.sub(1); m_curr->tr_data->m_val.scalar = ref; @@ -491,23 +509,27 @@ struct EventHandlerTree : public EventHandlerStackcallbacks(), directive.begins_with('%')); if(directive.begins_with("%TAG")) { - // TODO do not use directives in the tree - _RYML_CB_CHECK(m_stack.m_callbacks, m_tree->add_tag_directive(directive)); + if(C4_UNLIKELY(!m_tree->add_tag_directive(directive))) + _RYML_CB_ERR_(m_stack.m_callbacks, "failed to add directive", m_curr->pos); } else if(directive.begins_with("%YAML")) { _c4dbgpf("%YAML directive! ignoring...: {}", directive); + if(C4_UNLIKELY(m_yaml_directive)) + _RYML_CB_ERR_(m_stack.m_callbacks, "multiple yaml directives", m_curr->pos); + m_yaml_directive = true; } else { - _c4dbgpf("% directive unknown! ignoring...: {}", directive); + _c4dbgpf("unknown directive! ignoring... {}", directive); } + ++m_num_directives; } /** @} */ @@ -583,7 +605,7 @@ struct EventHandlerTree : public EventHandlerStackcallbacks(), "cannot append to node"); } if(type.is_doc()) { @@ -644,15 +666,15 @@ struct EventHandlerTree : public EventHandlerStackroot_id() == 0u); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_curr->node_id == 0u); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->root_id() == 0u); + _RYML_CB_ASSERT(m_tree->callbacks(), m_curr->node_id == 0u); const bool hack = !m_tree->has_children(m_curr->node_id) && !m_tree->is_val(m_curr->node_id); if(hack) m_tree->_p(m_tree->root_id())->m_type.add(VAL); m_tree->set_root_as_stream(); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_stream(m_tree->root_id())); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_children(m_tree->root_id())); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(m_tree->first_child(m_tree->root_id()))); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_stream(m_tree->root_id())); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->has_children(m_tree->root_id())); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_doc(m_tree->first_child(m_tree->root_id()))); if(hack) m_tree->_p(m_tree->first_child(m_tree->root_id()))->m_type.rem(VAL); _set_state_(m_curr, m_tree->root_id()); @@ -675,7 +697,7 @@ struct EventHandlerTree : public EventHandlerStacksize() > 0); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->size() > 0); const id_type last_added = m_tree->size() - 1; if(m_tree->has_parent(last_added)) if(m_tree->_p(last_added)->m_type == NOTYPE) @@ -684,9 +706,9 @@ struct EventHandlerTree : public EventHandlerStacksize() > 0); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->size() > 0); const id_type last_added = m_tree->size() - 1; - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_parent(last_added)); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->has_parent(last_added)); if(m_tree->_p(last_added)->m_type == NOTYPE) { _c4dbgpf("remove speculative node with parent. parent={} node={} parent(node)={}", m_parent->node_id, last_added, m_tree->parent(last_added)); @@ -696,7 +718,7 @@ struct EventHandlerTree : public EventHandlerStack_p(m_curr->node_id)->m_val.scalar.len == 0); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->_p(m_curr->node_id)->m_val.scalar.len == 0); m_tree->_p(m_curr->node_id)->m_val.scalar.str = m_curr->line_contents.rem.str; } diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index 4faf68fce..972c742d3 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -364,6 +364,33 @@ void ParseEngine::_reset() m_was_inside_qmrk = false; } + +//----------------------------------------------------------------------------- + +template +void ParseEngine::_relocate_arena(csubstr prev_arena, substr next_arena) +{ + #define _ryml_relocate(s) \ + if(s.is_sub(prev_arena)) \ + { \ + s.str = next_arena.str + (s.str - prev_arena.str); \ + } + _ryml_relocate(m_buf); + _ryml_relocate(m_newline_offsets_buf); + for(size_t i = 0; i < m_pending_tags.num_entries; ++i) + _ryml_relocate(m_pending_tags.annotations[i].str); + for(size_t i = 0; i < m_pending_anchors.num_entries; ++i) + _ryml_relocate(m_pending_anchors.annotations[i].str); + #undef _ryml_relocate +} + +template +void ParseEngine::_s_relocate_arena(void* data, csubstr prev_arena, substr next_arena) +{ + ((ParseEngine*)data)->_relocate_arena(prev_arena, next_arena); +} + + //----------------------------------------------------------------------------- template @@ -525,14 +552,37 @@ void ParseEngine::_skipchars(const char (&chars)[N]) _line_progressed(pos); } +template +void ParseEngine::_skip_comment() +{ + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, m_state->line_contents.rem.begins_with('#')); + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, m_state->line_contents.rem.is_sub(m_state->line_contents.full)); + csubstr rem = m_state->line_contents.rem; + csubstr full = m_state->line_contents.full; + // raise an error if the comment is not preceded by whitespace + if(!full.begins_with('#')) + { + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.str > full.str); + const char c = full[(size_t)(rem.str - full.str - 1)]; + if(C4_UNLIKELY(c != ' ' && c != '\t')) + _RYML_CB_ERR(m_evt_handler->m_stack.m_callbacks, "comment not preceded by whitespace"); + } + else + { + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.str == full.str); + } + _c4dbgpf("comment was '{}'", rem); + _line_progressed(rem.len); +} + template void ParseEngine::_maybe_skip_comment() { csubstr s = m_state->line_contents.rem.triml(' '); if(s.begins_with('#')) { - _c4dbgpf("comment was '{}'", s); - _line_progressed(m_state->line_contents.rem.len); + _line_progressed((size_t)(s.str - m_state->line_contents.rem.str)); + _skip_comment(); } } @@ -559,6 +609,29 @@ bool ParseEngine::_maybe_scan_following_colon() noexcept return false; } +template +bool ParseEngine::_maybe_scan_following_comma() noexcept +{ + if(m_state->line_contents.rem.len) + { + if(m_state->line_contents.rem.str[0] == ' ' || m_state->line_contents.rem.str[0] == '\t') + { + size_t pos = m_state->line_contents.rem.first_not_of(" \t"); + if(pos == npos) + pos = m_state->line_contents.rem.len; // maybe the line has only spaces + _c4dbgpf("skip {}x'{}'", pos, ' '); + _line_progressed(pos); + } + if(m_state->line_contents.rem.len && (m_state->line_contents.rem.str[0] == ',')) + { + _c4dbgp("found ',' comma next"); + _line_progressed(1); + return true; + } + } + return false; +} + //----------------------------------------------------------------------------- @@ -602,7 +675,10 @@ csubstr ParseEngine::_scan_tag() if(rem.begins_with("!!")) { _c4dbgp("begins with '!!'"); - t = rem.left_of(rem.first_of(" ,")); + if(has_any(FLOW)) + t = rem.left_of(rem.first_of(" ,")); + else + t = rem.left_of(rem.first_of(' ')); } else if(rem.begins_with("!<")) { @@ -620,7 +696,10 @@ csubstr ParseEngine::_scan_tag() { _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.begins_with('!')); _c4dbgp("begins with '!'"); - t = rem.left_of(rem.first_of(' ')); + if(has_any(FLOW)) + t = rem.left_of(rem.first_of(" ,")); + else + t = rem.left_of(rem.first_of(' ')); } _line_progressed(t.len); _maybe_skip_whitespace_tokens(); @@ -752,9 +831,14 @@ bool ParseEngine::_scan_scalar_plain_seq_flow(ScannedScalar *C4_RE _c4dbgpf("found terminating character at {}: '{}'", i, c); _line_progressed(i); if(m_state->pos.offset + i > start_offset) + { goto ended_scalar; + } else - _c4err("parse error"); + { + _c4dbgp("at the beginning. no scalar here."); + return false; + } break; case ']': _c4dbgpf("found terminating character at {}: '{}'", i, c); @@ -2525,8 +2609,10 @@ void ParseEngine::_filter_dquoted_backslash(FilterProcessor &C4_RE uint32_t codepoint_val = {}; if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val))) _c4err("failed to parse \\u codepoint. scalar pos={}", proc.rpos); - size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); - C4_ASSERT(numbytes <= 4); + const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + if(C4_UNLIKELY(numbytes == 0)) + _c4err("failed to decode code point={}", proc.rpos); + _RYML_CB_ASSERT(callbacks(), numbytes <= 4); proc.translate_esc_bulk(readbuf, numbytes, /*nread*/5u); } else if(next == 'U') // UTF32 @@ -2538,8 +2624,10 @@ void ParseEngine::_filter_dquoted_backslash(FilterProcessor &C4_RE uint32_t codepoint_val = {}; if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val))) _c4err("failed to parse \\U codepoint. scalar pos={}", proc.rpos); - size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); - C4_ASSERT(numbytes <= 4); + const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + if(C4_UNLIKELY(numbytes == 0)) + _c4err("failed to decode code point={}", proc.rpos); + _RYML_CB_ASSERT(callbacks(), numbytes <= 4); proc.translate_esc_bulk(readbuf, numbytes, /*nread*/9u); } // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char @@ -2715,7 +2803,6 @@ void ParseEngine::_filter_chomp(FilterProcessor &C4_RESTRICT proc, { const char curr = proc.curr(); _c4dbgchomp("curr='{}'", _c4prc(curr)); - _RYML_CB_ASSERT(this->callbacks(), curr == '\n' || curr == '\r'); switch(curr) { case '\n': @@ -2754,6 +2841,9 @@ void ParseEngine::_filter_chomp(FilterProcessor &C4_RESTRICT proc, case '\r': proc.skip(); break; + default: + _c4err("parse error"); + break; } } } @@ -4017,13 +4107,31 @@ bool ParseEngine::_annotations_require_key_container() const return m_pending_tags.num_entries > 1 || m_pending_anchors.num_entries > 1; } +template +void ParseEngine::_check_tag(csubstr tag) +{ + if(!tag.begins_with("!<")) + { + if(C4_UNLIKELY(tag.first_of("[]{},") != npos)) + _RYML_CB_ERR_(m_evt_handler->m_stack.m_callbacks, "tags must not contain any of '[]{},'", m_state->pos); + } + else + { + if(C4_UNLIKELY(!tag.ends_with('>'))) + _RYML_CB_ERR_(m_evt_handler->m_stack.m_callbacks, "malformed tag", m_state->pos); + } +} + template void ParseEngine::_handle_annotations_before_blck_key_scalar() { + _c4dbgpf("annotations_before_blck_key_scalar, node={}", m_state->node_id); if(m_pending_tags.num_entries) { + _c4dbgpf("annotations_before_blck_key_scalar, #tags={}", m_pending_tags.num_entries); if(C4_LIKELY(m_pending_tags.num_entries == 1)) { + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); } @@ -4034,6 +4142,7 @@ void ParseEngine::_handle_annotations_before_blck_key_scalar() } if(m_pending_anchors.num_entries) { + _c4dbgpf("annotations_before_blck_key_scalar, #anchors={}", m_pending_anchors.num_entries); if(C4_LIKELY(m_pending_anchors.num_entries == 1)) { m_evt_handler->set_key_anchor(m_pending_anchors.annotations[0].str); @@ -4049,10 +4158,13 @@ void ParseEngine::_handle_annotations_before_blck_key_scalar() template void ParseEngine::_handle_annotations_before_blck_val_scalar() { + _c4dbgpf("annotations_before_blck_val_scalar, node={}", m_state->node_id); if(m_pending_tags.num_entries) { + _c4dbgpf("annotations_before_blck_val_scalar, #tags={}", m_pending_tags.num_entries); if(C4_LIKELY(m_pending_tags.num_entries == 1)) { + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); } @@ -4063,6 +4175,7 @@ void ParseEngine::_handle_annotations_before_blck_val_scalar() } if(m_pending_anchors.num_entries) { + _c4dbgpf("annotations_before_blck_val_scalar, #anchors={}", m_pending_anchors.num_entries); if(C4_LIKELY(m_pending_anchors.num_entries == 1)) { m_evt_handler->set_val_anchor(m_pending_anchors.annotations[0].str); @@ -4082,6 +4195,7 @@ void ParseEngine::_handle_annotations_before_start_mapblck(size_t if(m_pending_tags.num_entries == 2) { _c4dbgp("2 tags, setting entry 0"); + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); } else if(m_pending_tags.num_entries == 1) @@ -4090,6 +4204,7 @@ void ParseEngine::_handle_annotations_before_start_mapblck(size_t if(m_pending_tags.annotations[0].line < current_line) { _c4dbgp("...tag is for the map. setting it."); + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); } @@ -4118,6 +4233,7 @@ void ParseEngine::_handle_annotations_before_start_mapblck_as_key( _c4dbgp("annotations_before_start_mapblck_as_key"); if(m_pending_tags.num_entries == 2) { + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); } if(m_pending_anchors.num_entries == 2) @@ -4138,10 +4254,12 @@ void ParseEngine::_handle_annotations_and_indentation_after_start_ switch(m_pending_tags.num_entries) { case 1u: + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); break; case 2u: + _check_tag(m_pending_tags.annotations[1].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[1].str); _clear_annotations(&m_pending_tags); break; @@ -4186,6 +4304,27 @@ size_t ParseEngine::_select_indentation_from_annotations(size_t va return curr->line < val_line ? val_indentation : curr->indentation; } +template +void ParseEngine::_handle_directive(csubstr rem) +{ + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.is_sub(m_state->line_contents.rem)); + const size_t pos = rem.find('#'); + _c4dbgpf("handle_directive: pos={} rem={}", pos, rem); + if(pos == npos) // no comments + { + m_evt_handler->add_directive(rem); + _line_progressed(rem.len); + } + else + { + csubstr to_comment = rem.first(pos); + csubstr trimmed = to_comment.trimr(" \t"); + m_evt_handler->add_directive(trimmed); + _line_progressed(pos); + _skip_comment(); + } +} + //----------------------------------------------------------------------------- @@ -4352,7 +4491,7 @@ void ParseEngine::_handle_map_json() ScannedScalar sc = _scan_scalar_dquot(); csubstr maybe_filtered = _maybe_filter_key_scalar_dquot(sc); m_evt_handler->set_key_scalar_dquoted(maybe_filtered); - addrem_flags(RKCL, RKEY|QMRK); + addrem_flags(RKCL, RKEY); break; } case '}': // this happens on a trailing comma like ", }" @@ -4816,11 +4955,9 @@ void ParseEngine::_handle_seq_flow() else if(first == ']') // this happens on a trailing comma like ", ]" { _c4dbgp("seqflow[RVAL]: end!"); - rem_flags(RSEQ); - m_evt_handler->end_seq(); _line_progressed(1); - if(!has_all(RSEQ|FLOW)) - goto seqflow_finish; + m_evt_handler->end_seq(); + goto seqflow_finish; } else if(first == '*') { @@ -4834,12 +4971,25 @@ void ParseEngine::_handle_seq_flow() csubstr anchor = _scan_anchor(); _c4dbgpf("seqflow[RVAL]: anchor! [{}]~~~{}~~~", anchor.len, anchor); m_evt_handler->set_val_anchor(anchor); + if(_maybe_scan_following_comma()) + { + _c4dbgp("seqflow[RVAL]: empty scalar!"); + m_evt_handler->set_val_scalar_plain({}); + m_evt_handler->add_sibling(); + } } else if(first == '!') { csubstr tag = _scan_tag(); _c4dbgpf("seqflow[RVAL]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_val_tag(tag); + if(_maybe_scan_following_comma()) + { + _c4dbgp("seqflow[RVAL]: empty scalar!"); + m_evt_handler->set_val_scalar_plain({}); + m_evt_handler->add_sibling(); + } } else if(first == ':') { @@ -4863,7 +5013,6 @@ void ParseEngine::_handle_seq_flow() _line_progressed(1); _maybe_skip_whitespace_tokens(); goto seqflow_finish; - } else { @@ -5042,6 +5191,7 @@ void ParseEngine::_handle_map_flow() { csubstr tag = _scan_tag(); _c4dbgpf("mapflow[RKEY]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_key_tag(tag); } else @@ -5163,6 +5313,7 @@ void ParseEngine::_handle_map_flow() { csubstr tag = _scan_tag(); _c4dbgpf("mapflow[RVAL]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_val_tag(tag); } else @@ -5289,6 +5440,7 @@ void ParseEngine::_handle_map_flow() { csubstr tag = _scan_tag(); _c4dbgpf("mapflow[QMRK]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_key_tag(tag); } else @@ -5636,6 +5788,8 @@ void ParseEngine::_handle_seq_block() // handle indentation // _c4dbgpf("seqblck[RNXT]: indref={} indentation={}", m_state->indref, m_state->line_contents.indentation); + if(C4_UNLIKELY(!_at_line_begin())) + _c4err("parse error"); if(m_state->indentation_ge()) { _c4dbgpf("seqblck[RNXT]: skip {} from indref", m_state->indref); @@ -5918,6 +6072,7 @@ void ParseEngine::_handle_map_block() { csubstr ref = _scan_ref_map(); _c4dbgpf("mapblck[RKEY]: key ref! [{}]~~~{}~~~", ref.len, ref); + _handle_annotations_before_blck_key_scalar(); m_evt_handler->set_key_ref(ref); addrem_flags(RVAL, RKEY); if(!_maybe_scan_following_colon()) @@ -6043,7 +6198,7 @@ void ParseEngine::_handle_map_block() else if(first == '?') { _c4dbgp("mapblck[RKCL]: got '?'. val was empty"); - _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, m_was_inside_qmrk); + _RYML_CB_CHECK(m_evt_handler->m_stack.m_callbacks, m_was_inside_qmrk); m_evt_handler->set_val_scalar_plain({}); m_evt_handler->add_sibling(); addrem_flags(QMRK, RKCL); @@ -6055,7 +6210,7 @@ void ParseEngine::_handle_map_block() if(m_state->indref == 0 || m_state->line_contents.indentation == 0 || _is_doc_begin_token(rem)) { _c4dbgp("mapblck[RKCL]: end+start doc"); - _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, _is_doc_begin_token(rem)); + _RYML_CB_CHECK(m_evt_handler->m_stack.m_callbacks, _is_doc_begin_token(rem)); _start_doc_suddenly(); _line_progressed(3); _maybe_skip_whitespace_tokens(); @@ -6411,6 +6566,7 @@ void ParseEngine::_handle_map_block() else { _c4dbgp("mapblck[RVAL]: was val ref"); + _handle_annotations_before_blck_val_scalar(); m_evt_handler->set_val_ref(ref); addrem_flags(RNXT, RVAL); } @@ -7080,8 +7236,9 @@ void ParseEngine::_handle_unk() else if(first == '%') { _c4dbgpf("directive: {}", rem); - m_evt_handler->add_directive(rem); - _line_progressed(rem.len); + if(C4_UNLIKELY(!m_doc_empty && has_none(NDOC))) + _RYML_CB_ERR(m_evt_handler->m_stack.m_callbacks, "need document footer before directives"); + _handle_directive(rem); return; } } @@ -7848,7 +8005,7 @@ void ParseEngine::parse_json_in_place_ev(csubstr filename, substr m_file = filename; m_buf = src; _reset(); - m_evt_handler->start_parse(filename.str); + m_evt_handler->start_parse(filename.str, &_s_relocate_arena, this); m_evt_handler->begin_stream(); while( ! _finished_file()) { @@ -7892,7 +8049,7 @@ void ParseEngine::parse_in_place_ev(csubstr filename, substr src) m_file = filename; m_buf = src; _reset(); - m_evt_handler->start_parse(filename.str); + m_evt_handler->start_parse(filename.str, &_s_relocate_arena, this); m_evt_handler->begin_stream(); while( ! _finished_file()) { diff --git a/src/c4/yml/parse_engine.hpp b/src/c4/yml/parse_engine.hpp index 06f660ab9..f63653ebc 100644 --- a/src/c4/yml/parse_engine.hpp +++ b/src/c4/yml/parse_engine.hpp @@ -571,6 +571,7 @@ class ParseEngine void _handle_indentation_pop(ParserState const* dst); void _maybe_skip_comment(); + void _skip_comment(); void _maybe_skip_whitespace_tokens(); void _maybe_skipchars(char c); #ifdef RYML_NO_COVERAGE__TO_BE_DELETED @@ -579,6 +580,7 @@ class ParseEngine template void _skipchars(const char (&chars)[N]); bool _maybe_scan_following_colon() noexcept; + bool _maybe_scan_following_comma() noexcept; public: @@ -631,6 +633,9 @@ class ParseEngine return m_evt_handler->m_curr->line_contents.rem.begin() == m_evt_handler->m_curr->line_contents.full.begin(); } + void _relocate_arena(csubstr prev_arena, substr next_arena); + static void _s_relocate_arena(void*, csubstr prev_arena, substr next_arena); + private: C4_ALWAYS_INLINE bool has_all(ParserFlag_t f) const noexcept { return (m_evt_handler->m_curr->flags & f) == f; } @@ -696,6 +701,7 @@ class ParseEngine void _add_annotation(Annotation *C4_RESTRICT dst, csubstr str, size_t indentation, size_t line); void _clear_annotations(Annotation *C4_RESTRICT dst); + bool _has_pending_annotations() const { return m_pending_tags.num_entries || m_pending_anchors.num_entries; } #ifdef RYML_NO_COVERAGE__TO_BE_DELETED bool _handle_indentation_from_annotations(); #endif @@ -706,6 +712,9 @@ class ParseEngine void _handle_annotations_before_start_mapblck_as_key(); void _handle_annotations_and_indentation_after_start_mapblck(size_t key_indentation, size_t key_line); size_t _select_indentation_from_annotations(size_t val_indentation, size_t val_line); + void _handle_directive(csubstr rem); + + void _check_tag(csubstr tag); private: diff --git a/src/c4/yml/tag.cpp b/src/c4/yml/tag.cpp index a74537d9a..674b8c285 100644 --- a/src/c4/yml/tag.cpp +++ b/src/c4/yml/tag.cpp @@ -248,8 +248,10 @@ size_t TagDirective::transform(csubstr tag, substr output, Callbacks const& call _c4dbgpf("%TAG: rest={}", rest); if(rest.begins_with('<')) { - rest = rest.offs(1, 1); _c4dbgpf("%TAG: begins with <. rest={}", rest); + if(C4_UNLIKELY(!rest.ends_with('>'))) + _RYML_CB_ERR(callbacks, "malformed tag"); + rest = rest.offs(1, 1); if(rest.begins_with(prefix)) { _c4dbgpf("%TAG: already transformed! actual={}", rest.sub(prefix.len)); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 905292cef..f49083261 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -265,3 +265,62 @@ if(RYML_TEST_SUITE) ryml_add_test_from_suite(${case}) endforeach() endif(RYML_TEST_SUITE) + + +#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ + +# run every known fuzz crash +set(fuzzdefault ON) +if(EMSCRIPTEN) + set(fuzzdefault OFF) +endif() +option(RYML_TEST_FUZZ "Enable tests for known problematic fuzz cases." ${fuzzdefault}) + +if(RYML_TEST_FUZZ) + c4_download_remote_proj(rapidyaml-data rapidyaml_data_dir + GIT_REPOSITORY https://github.com/biojppm/rapidyaml-data + GIT_TAG master) + if(NOT EXISTS ${rapidyaml_data_dir}/fuzz/yaml.dict) + c4_err("cannot find rapidyaml-data at ${rapidyaml_data_dir} -- was there an error downloading the project?") + endif() + + set(corpus_suite_dir ${rapidyaml_data_dir}/fuzz/yaml_test_suite) + set(corpus_generated_dir ${rapidyaml_data_dir}/fuzz/yaml_generated) + set(corpus_artifacts_dir ${rapidyaml_data_dir}/fuzz/yaml_artifacts) + set(corpus_merged_dir ${rapidyaml_data_dir}/fuzz/yaml_merged) + set(yaml_dict ${rapidyaml_data_dir}/fuzz/yaml.dict) + file(GLOB_RECURSE fuzz_files RELATIVE "${corpus_artifacts_dir}" "${corpus_artifacts_dir}/*") + file(GLOB_RECURSE suite_files RELATIVE "${corpus_suite_dir}" "${corpus_suite_dir}/*") + + # add individual tests for problematic fuzz files + function(ryml_add_fuzz_test name) + c4_add_executable(ryml-test-fuzz-${name} + SOURCES + test_fuzz/test_fuzz_common.hpp + test_fuzz/test_fuzz_${name}.cpp + test_fuzz/test_fuzz_main.cpp + ${ARGN} + INC_DIRS ${CMAKE_CURRENT_LIST_DIR} + LIBS ryml c4fs + FOLDER test/fuzz) + if(RYML_DBG) + target_compile_definitions(ryml-test-fuzz-${name} PUBLIC RYML_DBG) + endif() + add_dependencies(ryml-test-build ryml-test-fuzz-${name}) + ryml_get_target_exe(ryml-test-fuzz-${name} tgtexe) + function(ryml_add_fuzz_test_file name_ dir file) + string(REPLACE "/" "_" fuzz_name "${file}") + add_test(NAME ryml-test-fuzz-${name_}-${fuzz_name} + COMMAND ${tgtexe} ${dir}/${file}) + endfunction() + foreach(fuzz_file ${fuzz_files}) + ryml_add_fuzz_test_file(${name} ${corpus_artifacts_dir} ${fuzz_file}) + endforeach() + endfunction() + ryml_add_fuzz_test(parse_emit) + ryml_add_fuzz_test(events + ../test/test_suite/test_suite_event_handler.hpp + ../test/test_suite/test_suite_event_handler.cpp) +endif() diff --git a/test/test_emit.cpp b/test/test_emit.cpp index 055afc960..f7cea26e7 100644 --- a/test/test_emit.cpp +++ b/test/test_emit.cpp @@ -72,11 +72,6 @@ std::string emitrs_append(csubstr first_part, Emit &&fn) //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -bool operator== (const EmitOptions lhs, const EmitOptions rhs) -{ - return memcmp(&lhs, &rhs, sizeof(lhs)) == 0; -} - TEST(as_yaml, basic) { Tree et; diff --git a/test/test_fuzz/test_fuzz_common.hpp b/test/test_fuzz/test_fuzz_common.hpp new file mode 100644 index 000000000..65b0d762b --- /dev/null +++ b/test/test_fuzz/test_fuzz_common.hpp @@ -0,0 +1,135 @@ +#pragma once +#ifndef TEST_FUZZ_COMMON_H +#define TEST_FUZZ_COMMON_H + +#ifdef RYML_SINGLE_HEADER +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#include +#include + +#ifdef C4_EXCEPTIONS +#include +#else +#include +std::jmp_buf jmp_env = {}; +c4::csubstr jmp_msg = {}; +#endif + + +#ifdef RYML_DBG +#define _if_dbg(...) __VA_ARGS__ +bool report_errors = true; +#else +#define _if_dbg(...) +bool report_errors = false; +#endif + +inline void report_error(const char* msg, size_t length, c4::yml::Location loc, FILE *f) +{ + if(!report_errors) + return; + if(!loc.name.empty()) + { + fwrite(loc.name.str, 1, loc.name.len, f); + fputc(':', f); + } + fprintf(f, "%zu:", loc.line); + if(loc.col) + fprintf(f, "%zu:", loc.col); + if(loc.offset) + fprintf(f, " (%zuB):", loc.offset); + fputc(' ', f); + fprintf(f, "%.*s\n", static_cast(length), msg); + fflush(f); +} + +// watchout: VS2022 requires C4_NORETURN to come before inline +C4_NORETURN inline void errcallback(const char *msg, size_t msg_len, c4::yml::Location location, void *) +{ + report_error(msg, msg_len, location, stderr); + C4_IF_EXCEPTIONS( + throw std::runtime_error({msg, msg_len}); + , + jmp_msg.assign(msg, msg_len); + std::longjmp(jmp_env, 1); + ); +} + +inline c4::yml::Callbacks create_custom_callbacks() +{ + c4::set_error_flags(c4::ON_ERROR_CALLBACK); + c4::set_error_callback([](const char *msg, size_t msg_len){ + errcallback(msg, msg_len, {}, nullptr); + }); + c4::yml::Callbacks callbacks = {}; + callbacks.m_error = errcallback; + return callbacks; +} + +namespace c4 { +namespace yml { + +inline int fuzztest_parse_emit(uint32_t case_number, csubstr src) +{ + C4_UNUSED(case_number); + set_callbacks(create_custom_callbacks()); + Tree tree(create_custom_callbacks()); + bool parse_success = false; + C4_IF_EXCEPTIONS_(try, if(setjmp(jmp_env) == 0)) + { + RYML_ASSERT(tree.empty()); + _if_dbg(_dbg_printf("in[{}]: [{}]~~~\n{}\n~~~\n", case_number, src.len, src); fflush(NULL)); + parse_in_arena(src, &tree); + parse_success = true; + _if_dbg(print_tree("parsed tree", tree)); + _if_dbg(_dbg_printf("in[{}]: [{}]~~~\n{}\n~~~\n", case_number, src.len, src); fflush(NULL)); + std::string dst = emitrs_yaml(tree); + _if_dbg(_dbg_printf("emitted[{}]: [{}]~~~\n{}\n~~~\n", case_number, dst.size(), to_csubstr(dst)); fflush(NULL)); + C4_DONT_OPTIMIZE(dst); + C4_DONT_OPTIMIZE(parse_success); + } + C4_IF_EXCEPTIONS_(catch(std::exception const&), else) + { + // if an exception leaks from here, it is likely because of a greedy noexcept + _if_dbg(if(parse_success) print_tree("parsed tree", tree)); + return 1; + } + return 0; +} + +inline int fuzztest_yaml_events(uint32_t case_number, csubstr src) +{ + C4_UNUSED(case_number); + set_callbacks(create_custom_callbacks()); + EventHandlerYamlStd::EventSink sink = {}; + EventHandlerYamlStd handler(&sink, create_custom_callbacks()); + ParseEngine parser(&handler); + std::string str(src.begin(), src.end()); + C4_IF_EXCEPTIONS_(try, if(setjmp(jmp_env) == 0)) + { + _if_dbg(_dbg_printf("in[{}]: [{}]~~~\n{}\n~~~\n", case_number, src.len, src); fflush(NULL)); + parser.parse_in_place_ev("input", c4::to_substr(str)); + _if_dbg(_dbg_printf("evts[{}]: ~~~\n{}\n~~~\n", case_number, sink); fflush(NULL)); + C4_DONT_OPTIMIZE(sink); + } + C4_IF_EXCEPTIONS_(catch(std::exception const&), else) + { + // if an exception leaks from here, it is likely because of a greedy noexcept + _if_dbg(fprintf(stdout, "err\n"); fflush(NULL)); + return 1; + } + return 0; +} + +} // namespace yml +} // namespace c4 + +#endif /* TEST_FUZZ_COMMON_H */ diff --git a/test/test_fuzz/test_fuzz_events.cpp b/test/test_fuzz/test_fuzz_events.cpp new file mode 100644 index 000000000..a857a2fe6 --- /dev/null +++ b/test/test_fuzz/test_fuzz_events.cpp @@ -0,0 +1,9 @@ +#include "./test_fuzz_common.hpp" +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *str, size_t len) +{ + static std::atomic case_number{0}; + c4::csubstr src = {reinterpret_cast(str), len}; + return c4::yml::fuzztest_yaml_events(case_number++, src); +} diff --git a/test/test_fuzz/test_fuzz_main.cpp b/test/test_fuzz/test_fuzz_main.cpp new file mode 100644 index 000000000..45f21b28e --- /dev/null +++ b/test/test_fuzz/test_fuzz_main.cpp @@ -0,0 +1,16 @@ +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *str, size_t len); + +int main(int argc, const char *argv[]) +{ + if(argc < 2) + return 1; + const char *filename = argv[1]; + if(!c4::fs::file_exists(filename)) + return 1; + std::string file = c4::fs::file_get_contents(filename); + (void)LLVMFuzzerTestOneInput(reinterpret_cast(&file[0]), file.size()); + return 0; +} diff --git a/test/test_fuzz/test_fuzz_parse_emit.cpp b/test/test_fuzz/test_fuzz_parse_emit.cpp new file mode 100644 index 000000000..dc4c08057 --- /dev/null +++ b/test/test_fuzz/test_fuzz_parse_emit.cpp @@ -0,0 +1,9 @@ +#include "./test_fuzz_common.hpp" +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *str, size_t len) +{ + static std::atomic case_number{0}; + c4::csubstr src = {reinterpret_cast(str), len}; + return c4::yml::fuzztest_parse_emit(case_number++, src); +} diff --git a/test/test_lib/test_engine.cpp b/test/test_lib/test_engine.cpp index 43f18e6f2..f2870ee21 100644 --- a/test/test_lib/test_engine.cpp +++ b/test/test_lib/test_engine.cpp @@ -72,8 +72,9 @@ void test_new_parser_events_from_yaml(ReferenceYaml const& yaml, std::string con ParseEngine parser(&handler); std::string copy = yaml.parsed; parser.parse_in_place_ev("(testyaml)", to_substr(copy)); - _c4dbgpf("~~~\n{}~~~\n", sink.result); - EXPECT_EQ(sink.result, expected_events); + csubstr result = sink; + _c4dbgpf("~~~\n{}~~~\n", result); + EXPECT_EQ(std::string(result.str, result.len), expected_events); } void test_new_parser_tree_from_yaml(ReferenceYaml const& yaml) diff --git a/test/test_lib/test_engine.hpp b/test/test_lib/test_engine.hpp index 5bf24bccd..d749263d2 100644 --- a/test/test_lib/test_engine.hpp +++ b/test/test_lib/test_engine.hpp @@ -42,8 +42,9 @@ C4_NO_INLINE void test_new_parser_str_from_events(std::string const& expected_ev handler.reset(); EventProducerFn event_producer; event_producer(handler); - _c4dbgpf("~~~\n{}~~~\n", sink.result); - EXPECT_EQ(sink.result, expected_events); + csubstr result = sink; + _c4dbgpf("~~~\n{}~~~\n", result); + EXPECT_EQ(std::string(result.str, result.len), expected_events); } template class EventProducerFn> @@ -77,8 +78,8 @@ void test_new_parser_events_from_yaml(ReferenceYaml const& yaml, std::string con void test_new_parser_tree_from_yaml(ReferenceYaml const& yaml); void test_new_parser_events_from_yaml_with_comments(ReferenceYaml const& yaml, std::string const& expected_events); void test_new_parser_tree_from_yaml_with_comments(ReferenceYaml const& yaml); -void test_expected_error_events_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location); -void test_expected_error_tree_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location); +void test_expected_error_events_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location={}); +void test_expected_error_tree_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location={}); @@ -91,6 +92,7 @@ void test_expected_error_tree_from_yaml(std::string const& parsed_yaml, Location #define _RYML_SHOWFILELINE(name) #endif + //----------------------------------------------------------------------------- #define ENGINE_TEST_ERRLOC(name, location, refyaml) \ @@ -114,6 +116,29 @@ TEST(EngineTest, name##_err_tree_from_yaml) \ } +//----------------------------------------------------------------------------- + +#define ENGINE_TEST_ERR(name, refyaml) \ + \ + \ +TEST(EngineTest, name##_err_events_from_yaml) \ +{ \ + _RYML_SHOWFILELINE(name); \ + SCOPED_TRACE(#name "_err_events_from_yaml"); \ + test_expected_error_events_from_yaml(refyaml); \ + _RYML_SHOWFILELINE(name); \ +} \ + \ + \ +TEST(EngineTest, name##_err_tree_from_yaml) \ +{ \ + _RYML_SHOWFILELINE(name); \ + SCOPED_TRACE(#name "_err_tree_from_yaml"); \ + test_expected_error_tree_from_yaml(refyaml); \ + _RYML_SHOWFILELINE(name); \ +} + + //----------------------------------------------------------------------------- @@ -184,7 +209,7 @@ TEST(EngineTest, name##_events_from_yaml_with_comments) \ TEST(EngineTest, name##_tree_from_yaml_with_comments) \ { \ _RYML_SHOWFILELINE(name); \ - SCOPED_TRACE(#name "_wtree_from_yaml"); \ + SCOPED_TRACE(#name "_tree_from_yaml"); \ ReferenceYaml yaml refyaml; \ test_new_parser_tree_from_yaml_with_comments(yaml); \ _RYML_SHOWFILELINE(name); \ @@ -203,9 +228,9 @@ void name##_impl(Ps &ps) #if !defined(RYML_DBG) #define ___(stmt) stmt #else -inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt) +inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt, const char *file, int line) { - _c4dbgpf("{}", stmt); + _dbg_printf("{}:{}: {}", file, line, stmt); auto indent = [](id_type n){ for(id_type level = 0; level < n; ++level) { @@ -214,31 +239,31 @@ inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt) }; for(id_type i = 0; i < ps.m_stack.size(); ++i) { - auto const& str = ps._buf_(i).get(); + csubstr const& str = ps._buf_(i); indent(i); _dbg_printf("[{}]\n", i); - for(csubstr line : str.split('\n')) + for(csubstr ln : str.split('\n')) { indent(i); - _dbg_printf("{}\n", line); + _dbg_printf("{}\n", ln); } } } -inline void _print_handler_info(EventHandlerTree const& ps, csubstr stmt) +inline void _print_handler_info(EventHandlerTree const& ps, csubstr stmt, const char *file, int line) { if(ps.m_parent) - _c4dbgpf("parent.id={} curr.id={} {}\n", - ps.m_parent->node_id, ps.m_curr->node_id, stmt); + _dbg_printf("{}:{}: parent.id={} curr.id={} {}\n", + file, line, ps.m_parent->node_id, ps.m_curr->node_id, stmt); else - _c4dbgpf("parent.id=-- curr.id={} {}\n", - ps.m_curr->node_id, stmt); + _dbg_printf("{}:{}: parent.id=-- curr.id={} {}\n", + file, line, ps.m_curr->node_id, stmt); print_tree(*ps.m_tree); } #define ___(stmt) \ do \ { \ stmt; \ - _print_handler_info(ps, #stmt); \ + _print_handler_info(ps, #stmt, __FILE__, __LINE__); \ } while(0) #endif diff --git a/test/test_parse_engine_4_anchor.cpp b/test/test_parse_engine_4_anchor.cpp index c21b6c50f..4174080f8 100644 --- a/test/test_parse_engine_4_anchor.cpp +++ b/test/test_parse_engine_4_anchor.cpp @@ -1111,6 +1111,30 @@ ENGINE_TEST(DoubleAnchorKeyMap, ___(ps.end_stream()); } +ENGINE_TEST_ERRLOC(AnchorAndAliasAsBlockMapKey_SU74, Location(2,10), "" + "key1: &alias value1\n" + "&b *alias : value2\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsFlowMapKey_SU74, Location(2,10), "" + "{key1: &alias value1,\n" + "&b *alias : value2}\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsFlowMapVal_SU74, Location(2,18), "" + "key1: &alias value1\n" + "value2: &b *alias\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsBlockMapVal_SU74, Location(2,18), "" + "{key1: &alias value1,\n" + "value2: &b *alias\n}") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsFlowSeqVal_SU74, Location(2,12), "" + "- &alias value1\n" + "- &b *alias\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsBlockSeqVal_SU74, Location(2,10), "" + "[&alias value1,\n" + "&b *alias]\n") + } // namespace yml } // namespace c4 diff --git a/test/test_parse_engine_5_tag.cpp b/test/test_parse_engine_5_tag.cpp index c1279c2ac..540deaf27 100644 --- a/test/test_parse_engine_5_tag.cpp +++ b/test/test_parse_engine_5_tag.cpp @@ -93,6 +93,261 @@ ENGINE_TEST(DirectiveAndTag, //----------------------------------------------------------------------------- +ENGINE_TEST_ERRLOC(TagTestSuiteU99R_0, Location(2,1), + "- !!str, xxx\n") + +ENGINE_TEST_ERRLOC(TagTestSuiteU99R_1, Location(2,1), + "- !str, xxx\n") + +ENGINE_TEST(TagTestSuiteU99R_2, + ("[!!str, xxx]", "[!!str ,xxx]"), + "+STR\n" + "+DOC\n" + "+SEQ []\n" + "=VAL :\n" + "=VAL :xxx\n" + "-SEQ\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_seq_val_flow()); + ___(ps.set_val_tag("!!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_val_scalar_plain("xxx")); + ___(ps.end_seq()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteU99R_2_1, + ("[!str, xxx]","[!str ,xxx]"), + "+STR\n" + "+DOC\n" + "+SEQ []\n" + "=VAL :\n" + "=VAL :xxx\n" + "-SEQ\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_seq_val_flow()); + ___(ps.set_val_tag("!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_val_scalar_plain("xxx")); + ___(ps.end_seq()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteU99R_3, + ("{!!str, xxx}", "{!!str : ,xxx: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :xxx\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_tag("!!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_scalar_plain("xxx")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteU99R_3_1, + ("{!str, xxx}", "{!str : ,xxx: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :xxx\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_tag("!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_scalar_plain("xxx")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_0_0, + ("{foo: !!str , !!str : bar}", "{foo: !!str ,!!str : bar}"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :bar\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain("bar")); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_0_1, + ("{foo: !str , !str : bar}", "{foo: !str ,!str : bar}"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :bar\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain("bar")); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_1_0, + ("{foo: !!str, !!str: bar}", "{foo: !!str ,!!str: bar: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :bar\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!!str:")); + ___(ps.set_key_scalar_plain("bar")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_1_1, + ("{foo: !str, !str: bar}", "{foo: !str ,!str: bar: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :bar\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!str:")); + ___(ps.set_key_scalar_plain("bar")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST_ERRLOC(TagTestSuiteLHL4_0, Location(3,1), + "---\n" + "!invalid{}tag scalar\n") + +ENGINE_TEST_ERRLOC(TagTestSuiteLHL4_1, Location(3,1), + "---\n" + "!!invalid{}tag scalar\n") + +ENGINE_TEST(TagTestSuiteUGM3, + ("--- !\n" + "invoice: 34843\n" + "date: 2001-01-23\n"), + "+STR\n" + "+DOC ---\n" + "+MAP \n" + "=VAL :invoice\n" + "=VAL :34843\n" + "=VAL :date\n" + "=VAL :2001-01-23\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc_expl()); + ___(ps.set_val_tag("!")); + ___(ps.begin_map_val_block()); + ___(ps.set_key_scalar_plain("invoice")); + ___(ps.set_val_scalar_plain("34843")); + ___(ps.add_sibling()); + ___(ps.set_key_scalar_plain("date")); + ___(ps.set_val_scalar_plain("2001-01-23")); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + ENGINE_TEST(TagTestSuiteUKK6_02_0, ("!", "! \n"), "+STR\n" @@ -163,6 +418,105 @@ ENGINE_TEST(TagTestSuite6WLZ_1, ___(ps.end_stream()); } + +//----------------------------------------------------------------------------- + +ENGINE_TEST_ERR(DirectiveTestSuite9MMA_YAML, + "%YAML 1.2") + +ENGINE_TEST_ERR(DirectiveTestSuite9MMA_TAG, + "%TAG ! tag:example.com,2000:app/\n") + +ENGINE_TEST_ERR(DirectiveTestSuiteMUS6_00, + "%YAML 1.1#...\n" + "---\n") + +ENGINE_TEST_ERR(DirectiveTestSuite9HCY, + "!foo \"bar\"\n" + "%TAG ! tag:example.com,2000:app/\n" + "---\n" + "!foo \"bar\"\n") + +ENGINE_TEST(DirectiveTestSuiteMUS6, + ("%YAM 1.1\n" + "---\n", + "---\n"), + "+STR\n" + "+DOC ---\n" + "=VAL :\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.add_directive("%YAM 1.1")); + ___(ps.begin_doc_expl()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(DirectiveMultipleYAML_W4TN, + ("" + "%YAML 1.2\n" + "---\n" + "foo\n" + "...\n" + "%YAML 1.2\n" + "---\n" + "# Empty\n" + "...\n" + "", + "--- foo\n" + "---\n" + ""), + "+STR\n" + "+DOC ---\n" + "=VAL :foo\n" + "-DOC ...\n" + "+DOC ---\n" + "=VAL :\n" + "-DOC ...\n" + "-STR\n" + "") +{ + ___(ps.begin_stream()); + ___(ps.add_directive("%YAML 1.2")); + ___(ps.begin_doc_expl()); + ___(ps.set_val_scalar_plain("foo")); + ___(ps.end_doc_expl()); + ___(ps.add_directive("%YAML 1.2")); + ___(ps.begin_doc_expl()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_doc_expl()); + ___(ps.end_stream()); +} + +ENGINE_TEST_ERR(DirectiveMultipleYAML_0, + "%YAML 1.2\n" + "...\n" + "%YAML 1.2\n" + "---\n" + "bar") + +ENGINE_TEST_ERR(DirectiveMultipleYAML_1, + "%YAML 1.1\n" + "...\n" + "%YAML 1.2\n" + "---\n" + "bar") + +ENGINE_TEST_ERR(DirectiveMultipleYAML_2, + "%YAML 1.2\n" + "%YAML 1.2\n" + "---\n" + "bar") + +ENGINE_TEST_ERR(DirectiveMultipleYAML_3, + "%YAML 1.1\n" + "%YAML 1.2\n" + "---\n" + "bar") + } // namespace yml } // namespace c4 diff --git a/test/test_parse_engine_6_qmrk.cpp b/test/test_parse_engine_6_qmrk.cpp index fe182ef29..d495e41df 100644 --- a/test/test_parse_engine_6_qmrk.cpp +++ b/test/test_parse_engine_6_qmrk.cpp @@ -292,6 +292,32 @@ ENGINE_TEST(Qmrk3, ___(ps.end_stream()); } +ENGINE_TEST(Qmrk4_0, + ("[?baz:,]", + "[{?baz: }]"), + "+STR\n" + "+DOC\n" + "+SEQ []\n" + "+MAP {}\n" + "=VAL :?baz\n" + "=VAL :\n" + "-MAP\n" + "-SEQ\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_seq_val_flow()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("?baz")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_seq()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + ENGINE_TEST(Qmrk4, ("[ ? an explicit key, ? foo,? bar,?baz:,?bat]", "[{an explicit key: },{foo: },{bar: },{?baz: },?bat]"), @@ -959,6 +985,70 @@ ENGINE_TEST(QmrkFlow1MapTag, ___(ps.end_stream()); } +#ifdef TODO_FIXME // this is the only failing suite test +ENGINE_TEST(QmrkTestSuiteM2N8_01_0, + (HAS_CONTAINER_KEYS, + "? []: x"), + "+STR\n" + "+DOC\n" + "+MAP\n" + "+MAP\n" + "+SEQ []\n" + "-SEQ\n" + "=VAL :x\n" + "-MAP\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n" +) +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_block()); + ___(ps.begin_map_key_block()); + ___(ps.begin_seq_key_flow()); + ___(ps.end_seq()); + ___(ps.set_val_scalar_plain("x")); + ___(ps.end_map()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(QmrkTestSuiteM2N8_01_1, + (HAS_CONTAINER_KEYS, + "? {}: x"), + "+STR\n" + "+DOC\n" + "+MAP\n" + "+MAP\n" + "+MAP {}\n" + "-MAP\n" + "=VAL :x\n" + "-MAP\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n" +) +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_block()); + ___(ps.begin_map_key_block()); + ___(ps.begin_map_key_flow()); + ___(ps.end_map()); + ___(ps.set_val_scalar_plain("x")); + ___(ps.end_map()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} +#endif + } // namespace yml } // namespace c4 diff --git a/test/test_parser.cpp b/test/test_parser.cpp index c1f6969f9..cb5159c8d 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -402,7 +402,12 @@ TEST(Parser, alloc_arena) { Tree tree; Parser::handler_type evt_handler = {}; + int data = 0; + auto relocate = [](void*, csubstr prev, substr next_arena){ + EXPECT_FALSE(prev.overlaps(next_arena)); + }; evt_handler.reset(&tree, tree.root_id()); + evt_handler.start_parse("filename", relocate, &data); substr bufa = evt_handler.alloc_arena(64); bufa.fill('a'); csubstr prev = bufa; @@ -2001,7 +2006,7 @@ TEST_F(ParseToMapFlowTest, map_flow__to__map_flow__new_child) TEST_F(ParseToMapFlowTest, map_flow__to__map_flow__new_child_no_key) { NodeRef dst = dst_map_flow.rootref().append_child(); - ExpectError::do_check([&]{ + ExpectError::do_check(dst.tree(), [&]{ parse_in_arena(to_csubstr(map_flow), dst); }); } @@ -2663,7 +2668,7 @@ TEST_F(ParseToMapBlockTest, map_flow__to__map_flow__new_child) TEST_F(ParseToMapBlockTest, map_flow__to__map_flow__new_child_no_key) { NodeRef dst = dst_map_blck.rootref().append_child(); - ExpectError::do_check([&]{ + ExpectError::do_check(dst.tree(), [&]{ parse_in_arena(to_csubstr(map_flow), dst); }); } diff --git a/test/test_suite.cpp b/test/test_suite.cpp index 2ba6b0292..194998e8c 100644 --- a/test/test_suite.cpp +++ b/test/test_suite.cpp @@ -299,11 +299,11 @@ struct TestSequenceLevel if(prev) receive_src(*prev); _nfo_logf("level[{}]: parsing source:\n{}", level, src_evts); - evt_str_sink.reset(); + evt_str_sink.clear(); evt_handler_str_sink.reset(); evt_handler_str_sink.m_stack.m_callbacks = get_callbacks(); parser_str_sink.parse_in_place_ev(filename, to_substr(src_evts)); - EXPECT_NE(evt_str_sink.result, ""); + EXPECT_NE(evt_str_sink.size(), 0); events_were_generated = true; } @@ -655,7 +655,8 @@ struct TestSequenceData for(size_t i = 0; i < num; ++i) { levels[i].parse_yaml_to_events(); - events->compare_emitted_events_to_reference_events(levels[i].evt_str_sink.result, + csubstr result = levels[i].evt_str_sink; + events->compare_emitted_events_to_reference_events(std::string(result.str, result.len), /*ignore_container_style*/false, /*ignore_scalar_style*/(num>0)); } diff --git a/test/test_suite/string.hpp b/test/test_suite/string.hpp new file mode 100644 index 000000000..a61a8ea5b --- /dev/null +++ b/test/test_suite/string.hpp @@ -0,0 +1,424 @@ +#ifndef _C4_YML_EXTRA_STRING_HPP_ +#define _C4_YML_EXTRA_STRING_HPP_ + +#ifdef RYML_SINGLE_HEADER +#ifndef _RYML_SINGLE_HEADER_AMALGAMATED_HPP_ +#include +#endif +#else +#ifndef _C4_YML_COMMON_HPP_ +#include "c4/yml/common.hpp" +#endif +#endif + +#include + +C4_SUPPRESS_WARNING_GCC_CLANG_PUSH +C4_SUPPRESS_WARNING_GCC_CLANG("-Wold-style-cast") +C4_SUPPRESS_WARNING_GCC("-Wuseless-cast") + +#ifndef RYML_STRING_SSO_SIZE +#define RYML_STRING_SSO_SIZE 128 +#endif + +#ifndef RYML_STRING_LIST_SSO_SIZE +#define RYML_STRING_LIST_SSO_SIZE 66 +#endif + +namespace c4 { +namespace yml { +namespace extra { + +/** an owning string class used by the yaml std event handler (and the + * YamlScript handler). we use this instead of std::string because: + * 1) this spares the dependency on the standard library + * 2) enables possibility of adding borrowing semantics (in the future) */ +struct string +{ + enum : id_type { sso_size = RYML_STRING_SSO_SIZE }; + char m_buf[sso_size]; + char *C4_RESTRICT m_str; + id_type m_size; + id_type m_capacity; + +public: + + string() + : m_buf() + , m_str(m_buf) + , m_size(0) + , m_capacity(sso_size) + {} + ~string() noexcept + { + _free(); + } + + string(string const& that) RYML_NOEXCEPT : string() + { + resize(that.m_size); + _cp(&that); + } + + string(string &&that) noexcept : string() + { + _mv(&that); + } + + string& operator= (string const& that) RYML_NOEXCEPT + { + resize(that.m_size); + _cp(&that); + return *this; + } + + string& operator= (string &&that) noexcept + { + _mv(&that); + return *this; + } + +public: + + C4_ALWAYS_INLINE C4_HOT operator csubstr() const noexcept { return {m_str, m_size}; } + C4_ALWAYS_INLINE C4_HOT operator substr() noexcept { return {m_str, m_size}; } + +public: + + id_type size() const noexcept { return m_size; } + id_type capacity() const noexcept { return m_capacity; } + + void clear() + { + m_size = 0; + } + + void resize(id_type sz) + { + reserve(sz); + m_size = sz; + } + + void reserve(id_type sz) + { + if(sz <= m_capacity) + return; + id_type cap = m_capacity < string::sso_size ? string::sso_size : 2 * m_capacity; + cap = cap > sz ? cap : sz; + if(cap <= sso_size) + return; + Callbacks cb = get_callbacks(); + char *buf = (char*) _RYML_CB_ALLOC(cb, char, cap); + if(m_size) + memcpy(buf, m_str, (size_t)m_size); + if(m_str != m_buf) + { + _RYML_CB_FREE(cb, m_str, char, m_size); + } + m_str = buf; + m_capacity = cap; + } + +public: + + C4_ALWAYS_INLINE C4_HOT void append(char c) + { + if(C4_UNLIKELY(m_size == m_capacity)) + reserve(m_size + 1); + m_str[m_size++] = c; + } + C4_ALWAYS_INLINE C4_HOT void append(csubstr cs) + { + if(cs.len) + { + const id_type ilen = (id_type)cs.len; + if(C4_UNLIKELY(m_size + ilen > m_capacity)) + reserve(m_size + ilen); + memcpy(m_str + m_size, cs.str, cs.len); + m_size += ilen; + } + } + C4_ALWAYS_INLINE void insert(char c, id_type pos) + { + RYML_ASSERT(pos <= m_size); + if(pos < m_size) + { + if(C4_UNLIKELY(m_size == m_capacity)) + reserve(m_size + 1); + char *C4_RESTRICT src = m_str + pos; + memmove(src + 1, src, m_size - pos); + *src = c; + ++m_size; + } + else + { + append(c); + } + } + C4_NO_INLINE void insert(csubstr cs, id_type pos) + { + RYML_ASSERT(pos <= m_size); + if(cs.len) + { + if(pos < m_size) + { + const id_type ilen = (id_type)cs.len; + if(C4_UNLIKELY(m_size + ilen > m_capacity)) + reserve(m_size + ilen); + char *C4_RESTRICT src = m_str + pos; + memmove(src + cs.len, src, m_size - pos); + memcpy(src, cs.str, cs.len); + m_size += ilen; + } + else + { + append(cs); + } + } + } + C4_NO_INLINE size_t find_last(csubstr pattern) RYML_NOEXCEPT + { + RYML_ASSERT(pattern.len); + if(m_size >= pattern.len) + { + for(size_t i = m_size - pattern.len; i != (size_t)-1; --i) + { + if(m_str[i] == pattern[0]) + { + bool gotit = true; + for(size_t j = 1; j < pattern.len; ++j) + { + if(m_str[i + j] != pattern[j]) + { + gotit = false; + break; + } + } + if(gotit) + return i; + } + } + } + return npos; + } + +public: + + void _free() + { + RYML_ASSERT(m_str != nullptr); // this structure cannot be memset() to zero + if(m_str != m_buf) + { + _RYML_CB_FREE(get_callbacks(), m_str, char, (size_t)m_capacity); + m_str = m_buf; + m_capacity = sso_size; + } + RYML_ASSERT(m_capacity == sso_size); + m_size = 0; + } + + void _cp(string const* C4_RESTRICT that) + { + #if RYML_USE_ASSERT + if(that->m_str != that->m_buf) + { + RYML_ASSERT(that->m_capacity > sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + } + else + { + RYML_ASSERT(that->m_capacity <= sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + } + #endif + memcpy(m_str, that->m_str, that->m_size); + m_size = that->m_size; + m_capacity = that->m_size < sso_size ? sso_size : that->m_size; + } + + void _mv(string *C4_RESTRICT that) + { + if(that->m_str != that->m_buf) + { + RYML_ASSERT(that->m_capacity > sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + m_str = that->m_str; + } + else + { + RYML_ASSERT(that->m_capacity <= sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + memcpy(m_buf, that->m_buf, that->m_size); + m_str = m_buf; + } + m_size = that->m_size; + m_capacity = that->m_capacity; + // make sure no deallocation happens on destruction + RYML_ASSERT(that->m_str != this->m_buf); + that->m_str = that->m_buf; + that->m_capacity = sso_size; + that->m_size = 0; + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** a string collection used by the event handler. using this instead of + * std::vector spares the dependency on the standard library. */ +struct string_vector +{ + enum : id_type { sso_size = RYML_STRING_LIST_SSO_SIZE }; + union { + alignas(string) string m_buf[sso_size]; + alignas(string) char m_buf_bytes[sso_size * sizeof(string)]; + }; + string *C4_RESTRICT m_arr; + id_type m_size; + id_type m_capacity; + +public: + + string_vector() + : m_arr(m_buf) + , m_size(0) + , m_capacity(sso_size) + {} + ~string_vector() noexcept + { + _free(); + } + + string_vector(string_vector const& that) RYML_NOEXCEPT : string_vector() + { + reserve(that.m_size); + m_size = that.m_size; + for(id_type i = 0; i < that.m_size; ++i) + new ((void*)(m_arr+i)) string(that.m_arr[i]); + } + + string_vector(string_vector &&that) noexcept : string_vector() + { + reserve(that.m_size); + m_size = that.m_size; + for(id_type i = 0; i < that.m_size; ++i) + new ((void*)(m_arr+i)) string(std::move(that.m_arr[i])); + that.~string_vector(); + } + + string_vector& operator= (string_vector const& that) RYML_NOEXCEPT + { + _free(); + reserve(that.m_size); + for(id_type i = 0; i < that.m_size; ++i) + m_arr[i].operator=(that.m_arr[i]); + m_size = that.m_size; + return *this; + } + + string_vector& operator= (string_vector &&that) noexcept + { + _free(); + reserve(that.m_size); + for(id_type i = 0; i < that.m_size; ++i) + m_arr[i].operator=(std::move(that.m_arr[i])); + m_size = that.m_size; + that.~string_vector(); + return *this; + } + + void _free() + { + RYML_ASSERT(m_arr != nullptr); // this structure cannot be memset() to zero + for(id_type i = 0; i < m_size; ++i) + m_arr[i].~string(); + if(m_arr != m_buf) + { + _RYML_CB_FREE(get_callbacks(), m_arr, string, (size_t)m_capacity); + m_arr = m_buf; + m_capacity = sso_size; + } + RYML_ASSERT(m_capacity == sso_size); + m_size = 0; + } + +public: + + id_type size() const noexcept { return m_size; } + id_type capacity() const noexcept { return m_capacity; } + + void clear() + { + resize(0); + } + + void resize(id_type sz) + { + reserve(sz); + if(sz >= m_size) + { + for(id_type i = m_size; i < sz; ++i) + new ((void*)(m_arr + i)) string(); + } + else + { + for(id_type i = sz; i < m_size; ++i) + m_arr[i].~string(); + } + m_size = sz; + } + + void reserve(id_type sz) + { + if(sz <= m_capacity) + return; + id_type cap = m_capacity < string::sso_size ? string::sso_size : 2 * m_capacity; + cap = cap > sz ? cap : sz; + if(cap <= sso_size) + return; + Callbacks cb = get_callbacks(); + string *buf = (string*) _RYML_CB_ALLOC(cb, string, cap); + for(id_type i = 0; i < m_size; ++i) + new ((void*)(buf + i)) string(std::move(m_arr[i])); + if(m_arr != m_buf) + { + _RYML_CB_FREE(cb, m_arr, string, m_size); + } + m_arr = buf; + m_capacity = cap; + } + +public: + + string& emplace_back() + { + RYML_ASSERT(m_size < m_capacity); + if(m_size == m_capacity) + reserve(m_size + 1); + string& ret = m_arr[m_size++]; + new ((void*)&ret) string(); + return ret; + } + string& operator[] (id_type i) + { + RYML_ASSERT(m_size <= m_capacity); + RYML_ASSERT(i < m_size); + return m_arr[i]; + } + string const& operator[] (id_type i) const + { + RYML_ASSERT(m_size <= m_capacity); + RYML_ASSERT(i < m_size); + return m_arr[i]; + } +}; + +} // namespace extra +} // namespace yml +} // namespace c4 + +C4_SUPPRESS_WARNING_GCC_POP + +#endif /* _C4_YML_EXTRA_STRING_HPP_ */ diff --git a/test/test_suite/test_suite_event_handler.cpp b/test/test_suite/test_suite_event_handler.cpp index 7642d3544..ec1fd1806 100644 --- a/test/test_suite/test_suite_event_handler.cpp +++ b/test/test_suite/test_suite_event_handler.cpp @@ -12,15 +12,14 @@ namespace yml { // instantiate the template template class ParseEngine; - -void EventHandlerYamlStd::EventSink::append_escaped(csubstr val) +void append_escaped(extra::string *es, csubstr val) { - #define _c4flush_use_instead(repl, skip) \ - do { \ - this->append(val.range(prev, i)); \ - this->append(repl); \ - prev = i + skip; \ - } \ + #define _c4flush_use_instead(i, repl, skip) \ + do { \ + es->append(val.range(prev, i)); \ + es->append(repl); \ + prev = i + skip; \ + } \ while(0) uint8_t const* C4_RESTRICT s = reinterpret_cast(val.str); size_t prev = 0; @@ -29,33 +28,33 @@ void EventHandlerYamlStd::EventSink::append_escaped(csubstr val) switch(s[i]) { case UINT8_C(0x0a): // \n - _c4flush_use_instead("\\n", 1); break; + _c4flush_use_instead(i, "\\n", 1); break; case UINT8_C(0x5c): // '\\' - _c4flush_use_instead("\\\\", 1); break; + _c4flush_use_instead(i, "\\\\", 1); break; case UINT8_C(0x09): // \t - _c4flush_use_instead("\\t", 1); break; + _c4flush_use_instead(i, "\\t", 1); break; case UINT8_C(0x0d): // \r - _c4flush_use_instead("\\r", 1); break; + _c4flush_use_instead(i, "\\r", 1); break; case UINT8_C(0x00): // \0 - _c4flush_use_instead("\\0", 1); break; + _c4flush_use_instead(i, "\\0", 1); break; case UINT8_C(0x0c): // \f (form feed) - _c4flush_use_instead("\\f", 1); break; + _c4flush_use_instead(i, "\\f", 1); break; case UINT8_C(0x08): // \b (backspace) - _c4flush_use_instead("\\b", 1); break; + _c4flush_use_instead(i, "\\b", 1); break; case UINT8_C(0x07): // \a (bell) - _c4flush_use_instead("\\a", 1); break; + _c4flush_use_instead(i, "\\a", 1); break; case UINT8_C(0x0b): // \v (vertical tab) - _c4flush_use_instead("\\v", 1); break; + _c4flush_use_instead(i, "\\v", 1); break; case UINT8_C(0x1b): // \e (escape) - _c4flush_use_instead("\\e", 1); break; + _c4flush_use_instead(i, "\\e", 1); break; case UINT8_C(0xc2): if(i+1 < val.len) { const uint8_t np1 = s[i+1]; if(np1 == UINT8_C(0xa0)) - _c4flush_use_instead("\\_", 2); + _c4flush_use_instead(i, "\\_", 2); else if(np1 == UINT8_C(0x85)) - _c4flush_use_instead("\\N", 2); + _c4flush_use_instead(i, "\\N", 2); } break; case UINT8_C(0xe2): @@ -64,16 +63,16 @@ void EventHandlerYamlStd::EventSink::append_escaped(csubstr val) if(s[i+1] == UINT8_C(0x80)) { if(s[i+2] == UINT8_C(0xa8)) - _c4flush_use_instead("\\L", 3); + _c4flush_use_instead(i, "\\L", 3); else if(s[i+2] == UINT8_C(0xa9)) - _c4flush_use_instead("\\P", 3); + _c4flush_use_instead(i, "\\P", 3); } } break; } } // flush the rest - this->append(val.sub(prev)); + es->append(val.sub(prev)); #undef _c4flush_use_instead } diff --git a/test/test_suite/test_suite_event_handler.hpp b/test/test_suite/test_suite_event_handler.hpp index c830e1d6b..ad5f7e9ec 100644 --- a/test/test_suite/test_suite_event_handler.hpp +++ b/test/test_suite/test_suite_event_handler.hpp @@ -7,15 +7,14 @@ #ifndef _C4_YML_EVENT_HANDLER_STACK_HPP_ #include "c4/yml/event_handler_stack.hpp" #endif -#ifndef _C4_YML_STD_STRING_HPP_ -#include "c4/yml/std/string.hpp" -#endif #ifndef _C4_YML_DETAIL_PRINT_HPP_ #include "c4/yml/detail/print.hpp" #endif #endif -#include +#ifndef _C4_YML_EXTRA_STRING_HPP_ +#include "./string.hpp" +#endif C4_SUPPRESS_WARNING_GCC_CLANG_PUSH C4_SUPPRESS_WARNING_GCC_CLANG("-Wold-style-cast") @@ -28,6 +27,9 @@ namespace yml { /** @addtogroup doc_event_handlers * @{ */ +void append_escaped(extra::string *s, csubstr val); + + /** The stack state needed specifically by @ref EventHandlerYamlStd */ struct EventHandlerYamlStdState : public ParserState { @@ -53,19 +55,7 @@ struct EventHandlerYamlStd : public EventHandlerStack m_val_buffers; - char m_key_tag_buf[256]; - char m_val_tag_buf[256]; + extra::string_vector m_val_buffers; + extra::string m_key_tag_buf; + extra::string m_val_tag_buf; TagDirective m_tag_directives[RYML_MAX_TAG_DIRECTIVES]; - std::string m_arena; + bool m_has_yaml_directive; + extra::string m_arena; + bool m_has_docs; // undefined at the end #define _enable_(bits) _enable__() @@ -90,9 +82,9 @@ struct EventHandlerYamlStd : public EventHandlerStackflags |= RUNK|RTOP; - for(auto &td : m_tag_directives) + m_has_yaml_directive = false; + for(TagDirective &td : m_tag_directives) td = {}; + m_val_buffers.clear(); m_val_buffers.resize((size_t)m_stack.size()); m_arena.clear(); m_arena.reserve(1024); + m_key_tag_buf.resize(256); + m_val_tag_buf.resize(256); + m_has_docs = false; } /** @} */ @@ -116,13 +113,16 @@ struct EventHandlerYamlStd : public EventHandlerStackstart_parse(filename, m_curr->node_id); + this->_stack_start_parse(filename, relocate_arena, relocate_arena_data); } void finish_parse() { + if((_num_tag_directives() || m_has_yaml_directive) && !m_has_docs) + _RYML_CB_ERR_(m_stack.m_callbacks, "directives cannot be used without a document", {}); + this->_stack_finish_parse(); } void cancel_parse() @@ -168,6 +168,7 @@ struct EventHandlerYamlStd : public EventHandlerStacknode_id, anchor.len, anchor); - RYML_ASSERT(!anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(KEYANCH))) + _RYML_CB_ERR_(m_stack.m_callbacks, "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); _enable_(KEYANCH); m_curr->ev_data.m_key.anchor = anchor; } void set_val_anchor(csubstr anchor) { _c4dbgpf("node[{}]: set val anchor: [{}]~~~{}~~~", m_curr->node_id, anchor.len, anchor); - RYML_ASSERT(!anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(VALREF))) + _RYML_CB_ERR_(m_stack.m_callbacks, "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); _enable_(VALANCH); m_curr->ev_data.m_val.anchor = anchor; } - C4_ALWAYS_INLINE bool has_key_anchor() const - { - return _has_any_(KEYANCH); - } - - C4_ALWAYS_INLINE bool has_val_anchor() const - { - return _has_any_(VALANCH); - } - void set_key_ref(csubstr ref) { _c4dbgpf("node[{}]: set key ref: [{}]~~~{}~~~", m_curr->node_id, ref.len, ref); - RYML_ASSERT(ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(KEYANCH))) + _RYML_CB_ERR_(m_stack.m_callbacks, "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); _enable_(KEY|KEYREF); _send_("=ALI "); _send_(ref); @@ -513,7 +512,9 @@ struct EventHandlerYamlStd : public EventHandlerStacknode_id, ref.len, ref); - RYML_ASSERT(ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(VALANCH))) + _RYML_CB_ERR_(m_stack.m_callbacks, "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); _enable_(VAL|VALREF); _send_("=ALI "); _send_(ref); @@ -531,13 +532,13 @@ struct EventHandlerYamlStd : public EventHandlerStacknode_id, tag.len, tag); _enable_(KEYTAG); - m_curr->ev_data.m_key.tag = _transform_directive(tag, m_key_tag_buf); + m_curr->ev_data.m_key.tag = _transform_directive(tag, &m_key_tag_buf); } void set_val_tag(csubstr tag) { _c4dbgpf("node[{}]: set val tag: [{}]~~~{}~~~", m_curr->node_id, tag.len, tag); _enable_(VALTAG); - m_curr->ev_data.m_val.tag = _transform_directive(tag, m_val_tag_buf); + m_curr->ev_data.m_val.tag = _transform_directive(tag, &m_val_tag_buf); } /** @} */ @@ -553,8 +554,21 @@ struct EventHandlerYamlStd : public EventHandlerStack= RYML_MAX_TAG_DIRECTIVES)) + _RYML_CB_ERR_(m_stack.m_callbacks, "too many directives", m_curr->pos); + if(C4_UNLIKELY(!m_tag_directives[pos].create_from_str(directive))) + _RYML_CB_ERR_(m_stack.m_callbacks, "failed to add directive", m_curr->pos); + } + else if(directive.begins_with("%YAML")) + { + _c4dbgpf("%YAML directive! ignoring...: {}", directive); + if(C4_UNLIKELY(m_has_yaml_directive)) + _RYML_CB_ERR_(m_stack.m_callbacks, "multiple yaml directives", m_curr->pos); + m_has_yaml_directive = true; + } + else + { + _c4dbgpf("unknown directive! ignoring... {}", directive); } } @@ -568,7 +582,7 @@ struct EventHandlerYamlStd : public EventHandlerStackev_data = {}; _c4dbgpf("pushed! level={}", m_curr->level); } @@ -634,26 +648,26 @@ struct EventHandlerYamlStd : public EventHandlerStacklevel < m_val_buffers.size()); - return m_val_buffers[(size_t)m_curr->level]; + _RYML_CB_ASSERT(m_stack.m_callbacks, m_curr->level < m_val_buffers.size()); + return m_val_buffers[m_curr->level]; } EventSink& _buf_(id_type level) noexcept { - _RYML_CB_ASSERT(m_stack.m_callbacks, (size_t)level < m_val_buffers.size()); - return m_val_buffers[(size_t)level]; + _RYML_CB_ASSERT(m_stack.m_callbacks, level < m_val_buffers.size()); + return m_val_buffers[level]; } EventSink const& _buf_(id_type level) const noexcept { - _RYML_CB_ASSERT(m_stack.m_callbacks, (size_t)level < m_val_buffers.size()); - return m_val_buffers[(size_t)level]; + _RYML_CB_ASSERT(m_stack.m_callbacks, level < m_val_buffers.size()); + return m_val_buffers[level]; } static void _buf_flush_to_(EventSink &C4_RESTRICT src, EventSink &C4_RESTRICT dst) noexcept { - dst.append(src.get()); - src.reset(); + dst.append(src); + src.clear(); } void _buf_flush_to_(id_type level_src, id_type level_dst) noexcept @@ -670,8 +684,8 @@ struct EventHandlerYamlStd : public EventHandlerStack m_val_buffers.size()) - m_val_buffers.resize((size_t)size_needed); + if(size_needed > m_val_buffers.size()) + m_val_buffers.resize(size_needed); } C4_ALWAYS_INLINE void _send_(csubstr s) noexcept { _buf_().append(s); } @@ -683,7 +697,7 @@ struct EventHandlerYamlStd : public EventHandlerStack output->size()) + { + _RYML_CB_CHECK(m_stack.m_callbacks, !retry); + retry = true; + output->resize(len); + output->resize(output->capacity()); + goto again1; + } + return csubstr(*output).first(len); } } if(tag.begins_with('!')) @@ -783,15 +806,23 @@ struct EventHandlerYamlStd : public EventHandlerStackpos); } } - csubstr result = normalize_tag_long(tag, output); - _RYML_CB_CHECK(m_stack.m_callbacks, result.len > 0); - _RYML_CB_CHECK(m_stack.m_callbacks, result.str); + bool retry = false; + again2: + csubstr result = normalize_tag_long(tag, *output); + if(!result.str) + { + _RYML_CB_CHECK(m_stack.m_callbacks, !retry); + retry = true; + output->resize(result.len); + output->resize(output->capacity()); + goto again2; + } return result; } -#undef _enable_ -#undef _disable_ -#undef _has_any_ + #undef _enable_ + #undef _disable_ + #undef _has_any_ /** @endcond */ }; diff --git a/test/test_suite/test_suite_events.cpp b/test/test_suite/test_suite_events.cpp index d4b33f338..881fab342 100644 --- a/test/test_suite/test_suite_events.cpp +++ b/test/test_suite/test_suite_events.cpp @@ -15,7 +15,8 @@ std::string emit_events_from_source(substr src) EventHandlerYamlStd handler(&sink); ParseEngine parser(&handler); parser.parse_in_place_ev("(testyaml)", src); - return sink.result; + csubstr result = sink; + return std::string(result.str, result.len); } diff --git a/test/test_suite/test_suite_parts.cpp b/test/test_suite/test_suite_parts.cpp index 9c22090fb..54e70176f 100644 --- a/test/test_suite/test_suite_parts.cpp +++ b/test/test_suite/test_suite_parts.cpp @@ -21,16 +21,12 @@ constexpr const AllowedFailure allowed_failures[] = { // errors that fail to materialize _("3HFZ-error" , "should not accept scalar after ..."), - _("4EJS-error" , "should not accept double anchor for scalar"), + _("4EJS-error" , "should not accept tabs as indendation in a mapping"), _("5TRB-error" , "should not accept document-end marker in double quoted string"), _("5U3A-error" , "should not accept opening a sequence on same line as map key"), _("7LBH-error" , "should not accept multiline double quoted implicit keys"), _("9C9N-error" , "should not accept non-indented flow sequence"), - _("9HCY-error" , "should not accept tag directive in non-doc scope"), - _("9JBA-error" , "should not accept comment after flow seq terminating ]"), - _("9MMA-error" , "should not accept empty doc after %YAML directive"), _("9MQT_01-error" , "should not accept scalars after ..."), - _("B63P-error" , "should not accept directive without doc"), _("C2SP-error" , "should not accept flow sequence with terminating ] on the next line"), _("CVW2-error" , "should not accept flow sequence with comment after ,"), _("CXX2-error" , "should not accept mapping with anchor on document start line"), @@ -38,26 +34,16 @@ constexpr const AllowedFailure allowed_failures[] = { _("DK4H-error" , "should not accept implicit key followed by newline"), _("DK95_01-error" , "should not accept leading tabs in double quoted multiline scalar"), _("DK95_06-error" , "should not accept tab indentation"), - _("EB22-error" , "should not accept missing document-end marker before directive"), _("G5U8-error" , "should not accept [-, -]"), _("H7TQ-error" , "should not accept extra words after directive"), _("JKF3-error" , "should not accept multiline unindented double quoted scalar"), - _("LHL4-error" , "should not accept tag"), - _("MUS6_00-error" , "should not accept #... at the end of %YAML directive"), - _("MUS6_01-error" , "should not accept #... at the end of %YAML directive"), _("N782-error" , "TBD"), - _("P2EQ-error" , "should not accept sequence item on same line as previous item"), _("QB6E-error" , "should not accept indented multiline quoted scalar"), _("QLJ7-error" , "tag directives should apply only to the next doc (?)"), _("RXY3-error" , "should not accept document-end marker in single quoted string"), _("S4GJ-error" , "should not accept text after block scalar indicator"), _("S98Z-error" , "should not accept block scalar with more spaces than first content line"), - _("SF5V-error" , "should not accept duplicate YAML directive"), - _("SR86-error" , "TBD"), - _("SU5Z-error" , "should not accept comment without whitespace after double quoted scalar"), - _("SU74-error" , "should not accept anchor and alias as mapping key"), _("SY6V-error" , "TBD"), - _("U99R-error" , "should not accept comma in a tag"), _("VJP3_00-error" , "should not accept flow collections over many lines"), _("X4QW-error" , "should not accept comment without whitespace after block scalar indicator"), _("Y79Y_003-error" , "should not accept leading tabs in seq elmt"), diff --git a/test/test_tag_property.cpp b/test/test_tag_property.cpp index 27469d6fa..27aab0551 100644 --- a/test/test_tag_property.cpp +++ b/test/test_tag_property.cpp @@ -1041,6 +1041,19 @@ TEST(tags, EHF6) } } +TEST(tags, fuzzcrash0) +{ + Tree tree; + ExpectError::do_check(&tree, [&]{ + parse_in_arena("%TAG !! " "\n" + "})" "\n" + "" "\n" + "!!<" "\n" + , + &tree); + }); +} + //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- diff --git a/test/test_yaml_events.cpp b/test/test_yaml_events.cpp index 7af519930..92f4cc8e5 100644 --- a/test/test_yaml_events.cpp +++ b/test/test_yaml_events.cpp @@ -58,9 +58,12 @@ TEST_P(EventsTest, from_parser) ParseEngine parser(&handler); std::string src_copy(ec.src.str, ec.src.len); parser.parse_in_place_ev("(testyaml)", to_substr(src_copy)); - _c4dbgpf("~~~\n{}~~~\n", sink.result); + csubstr result = sink; + _c4dbgpf("~~~\n{}~~~\n", result); + // use the diff from std::string which is nice std::string exp_copy(ec.expected_events_from_parser.str, ec.expected_events_from_parser.len); - EXPECT_EQ(sink.result, exp_copy); // use the diff from std::string which is nice + std::string result_copy(result.str, result.len); + EXPECT_EQ(result_copy, exp_copy); } TEST_P(EventsTest, from_tree) diff --git a/tools/parse_emit.cpp b/tools/parse_emit.cpp index dda9729f9..c9c3f29fe 100644 --- a/tools/parse_emit.cpp +++ b/tools/parse_emit.cpp @@ -4,12 +4,21 @@ #include #include #include +#include #endif #include #include #include +#ifdef C4_EXCEPTIONS +#include +#else +#include +std::jmp_buf jmp_env = {}; +c4::csubstr jmp_msg = {}; +#endif + using namespace c4; @@ -18,6 +27,12 @@ C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wold-style-cast") C4_SUPPRESS_WARNING_GCC("-Wuseless-cast") +bool quiet = false; +bool emit_as_json = false; +bool timed_sections = false; +bool emit_to_string = false; + + //----------------------------------------------------------------------------- struct timed_section @@ -32,90 +47,214 @@ struct timed_section timed_section(csubstr n) : name(n), start(myclock::now()) {} ~timed_section() { - fprintf(stderr, "%.6fms: %.*s\n", since().count(), (int)name.len, name.str); - fflush(stderr); + if(timed_sections) + { + fprintf(stderr, "%.6fms: %.*s\n", since().count(), (int)name.len, name.str); + fflush(stderr); + } } }; -#define TS(name) timed_section name##__##__LINE__(#name) +#define TS(name) timed_section C4_XCAT(__, C4_XCAT(name, __LINE__))(#name) +// LCOV_EXCL_START -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- -//----------------------------------------------------------------------------- +constexpr const char *usage = R"(usage: -int main(int argc, const char *argv[]) + %s + +Parse yaml from file (or stdin when file is `-`) and emit to stdout. + +Options: + + -q,--quiet do not emit (default: emit yaml) + -d,--debug print rapidyaml tree + -j,--json emit json instead of yaml (default: emit yaml) + -s,--string emit to string before dumping to stdout. + otherwise, emit directly to stdout + -t,--timed time sections (print timings to stderr) + +)"; + +csubstr parse_args(int argc, const char *argv[]) { - bool print_emitted_to_stdout = true; - csubstr file; - // LCOV_EXCL_START - auto show_usage = [argv]{ - printf("usage: %s [-s] \n", argv[0]); - }; - if(argc == 2) + if(argc < 2) { - file = to_csubstr(argv[1]); + printf(usage, argv[0]); + yml::error("unknown argument"); } - else if(argc > 2) + csubstr file = to_csubstr(argv[argc - 1]); + for(int i = 1; i+1 < argc; ++i) { - file = to_csubstr(argv[2]); - csubstr arg = to_csubstr(argv[1]); - if(arg == "-s") + csubstr arg = to_csubstr(argv[i]); + if(arg == "-q" || arg == "--quiet") + quiet = true; + else if(arg == "-t" || arg == "--timed") + timed_sections = true; + else if(arg == "-s" || arg == "--string") + emit_to_string = true; + else if(arg == "-j" || arg == "--json") + emit_as_json = true; + else { - print_emitted_to_stdout = false; + printf(usage, argv[0]); + yml::error("unknown argument"); } - else + } + return file; +} + +void load_file(csubstr filename, std::string *buf) +{ + buf->clear(); + if(filename == "-") // read from stdin + { + for(int c = std::getchar(); c != EOF; c = std::getchar()) { - show_usage(); - return 1; + buf->push_back((char)c); + if(buf->size() == buf->capacity()) + buf->reserve(2u * (buf->capacity() >= 128u ? buf->capacity() : 128u)); } } else { - show_usage(); - return 1; + if(!fs::path_exists(filename.str)) + { + std::fprintf(stderr, "cannot find file: %s (cwd=%s)\n", filename.str, fs::cwd().c_str()); + yml::error("file not found"); + } + fs::file_get_contents(filename.str, buf); } - // LCOV_EXCL_STOP - - TS(TOTAL); - - C4_CHECK_MSG(fs::path_exists(file.str), "cannot find file: %s (cwd=%s)", file.str, fs::cwd().c_str()); +} - { - TS(objects); - std::string contents, output; - yml::Tree tree; +void emit_json_docs(yml::Tree const& tree, std::string *dst=nullptr) +{ + auto emitnode = [&](yml::ConstNodeRef node){ + if(dst) { - TS(read_file); - fs::file_get_contents(file.str, &contents); + emitrs_json(node, dst, /*append*/true); + *dst += '\n'; } + else { - TS(tree_reserve); - yml::id_type cap; - { - TS(estimate_capacity); - cap = yml::estimate_tree_capacity(to_csubstr(contents)); - } - fprintf(stderr, "reserving capacity=%zu\n", (size_t)cap); - tree.reserve(cap); + emit_json(node, stdout); } + }; + yml::ConstNodeRef root = tree.rootref(); + if(!root.is_stream()) + emitnode(root); + else + for(yml::ConstNodeRef doc : root.children()) + emitnode(doc); +} + +void report_error(const char* msg, size_t length, yml::Location loc, FILE *f) +{ + if(!loc.name.empty()) + { + fwrite(loc.name.str, 1, loc.name.len, f); + fputc(':', f); + } + fprintf(f, "%zu:", loc.line); + if(loc.col) + fprintf(f, "%zu:", loc.col); + if(loc.offset) + fprintf(f, " (%zuB):", loc.offset); + fputc(' ', f); + fprintf(f, "%.*s\n", static_cast(length), msg); + fflush(f); +} + +yml::Callbacks create_custom_callbacks() +{ + yml::Callbacks callbacks = {}; + callbacks.m_error = [](const char *msg, size_t msg_len, yml::Location location, void *) + { + report_error(msg, msg_len, location, stderr); + C4_IF_EXCEPTIONS( + throw std::runtime_error({msg, msg_len}); + , + jmp_msg.assign(msg, msg_len); + std::longjmp(jmp_env, 1); + ); + }; + return callbacks; +} + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +void process_file(csubstr file) +{ + TS(objects); + std::string contents; + yml::Tree tree(yml::get_callbacks()); + { + TS(read_file); + load_file(file, &contents); + } + { + TS(tree_reserve); + yml::id_type cap; { - TS(parse_yml); - yml::parse_in_place(file, to_substr(contents), &tree); + TS(estimate_capacity); + cap = yml::estimate_tree_capacity(to_csubstr(contents)); } + fprintf(stderr, "reserving capacity=%zu\n", (size_t)cap); + tree.reserve(cap); + } + { + TS(parse_yml); + yml::parse_in_place(file, to_substr(contents), &tree); + } + if(emit_as_json) + { + TS(resolve_refs); + tree.resolve(); + } + if(emit_to_string) + { + std::string output; { TS(emit_to_buffer); output.resize(contents.size()); // resize, not just reserve - yml::emitrs_yaml(tree, &output); + if(!emit_as_json) + yml::emitrs_yaml(tree, &output); + else + emit_json_docs(tree, &output); } - if(print_emitted_to_stdout) + if(!quiet) { TS(print_stdout); fwrite(output.data(), 1, output.size(), stdout); - putchar('\n'); } } + else if(!quiet) + { + TS(emit_to_stdout); + if(!emit_as_json) + yml::emit_yaml(tree); + else + emit_json_docs(tree); + } +} +// LCOV_EXCL_STOP +int main(int argc, const char *argv[]) +{ + TS(TOTAL); + set_callbacks(create_custom_callbacks()); + C4_IF_EXCEPTIONS_(try, if(setjmp(jmp_env) == 0)) + { + csubstr file = parse_args(argc, argv); + process_file(file); + } + C4_IF_EXCEPTIONS_(catch(std::exception const&), else) + { + return 1; + } return 0; } diff --git a/tools/yaml_events.cpp b/tools/yaml_events.cpp index edbfae927..b0d8009b3 100644 --- a/tools/yaml_events.cpp +++ b/tools/yaml_events.cpp @@ -118,7 +118,8 @@ std::string emit_events_direct(csubstr filename, substr filecontents) EventHandlerYamlStd handler(&sink, create_custom_callbacks()); ParseEngine parser(&handler); parser.parse_in_place_ev(filename, filecontents); - return sink.result; + csubstr result = sink; + return std::string(result.str, result.len); } @@ -150,8 +151,6 @@ bool Args::parse(Args *args, int argc, const char *argv[]) return true; } -C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wold-style-cast") - std::string load_file(csubstr filename) { if(filename == "-") // read from stdin @@ -159,7 +158,7 @@ std::string load_file(csubstr filename) std::string buf; for(int c = std::getchar(); c != EOF; c = std::getchar()) { - buf.push_back((char)c); + buf.push_back(static_cast(c)); if(buf.size() == buf.capacity()) buf.reserve(2u * (buf.capacity() >= 128u ? buf.capacity() : 128u)); } @@ -186,12 +185,10 @@ void report_error(const char* msg, size_t length, Location loc, FILE *f) if(loc.offset) fprintf(f, " (%zuB):", loc.offset); fputc(' ', f); - fprintf(f, "%.*s\n", (int)length, msg); + fprintf(f, "%.*s\n", static_cast(length), msg); fflush(f); } -C4_SUPPRESS_WARNING_GCC_CLANG_POP - Callbacks create_custom_callbacks() { Callbacks callbacks = {};