From dbfa4ed14fe4f813c922ef6ba09388df28674f29 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Wed, 22 May 2024 18:51:14 +0100 Subject: [PATCH 01/18] tag: ensure no overflow on malformed tags --- src/c4/yml/tag.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/c4/yml/tag.cpp b/src/c4/yml/tag.cpp index a74537d9..674b8c28 100644 --- a/src/c4/yml/tag.cpp +++ b/src/c4/yml/tag.cpp @@ -248,8 +248,10 @@ size_t TagDirective::transform(csubstr tag, substr output, Callbacks const& call _c4dbgpf("%TAG: rest={}", rest); if(rest.begins_with('<')) { - rest = rest.offs(1, 1); _c4dbgpf("%TAG: begins with <. rest={}", rest); + if(C4_UNLIKELY(!rest.ends_with('>'))) + _RYML_CB_ERR(callbacks, "malformed tag"); + rest = rest.offs(1, 1); if(rest.begins_with(prefix)) { _c4dbgpf("%TAG: already transformed! actual={}", rest.sub(prefix.len)); From 3b294ea6a319bcce4278dc628756ba10fdef0fd4 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Wed, 22 May 2024 18:52:01 +0100 Subject: [PATCH 02/18] arena: ensure the source buffer is relocated when the arena grows --- src/c4/yml/emit.hpp | 2 + src/c4/yml/event_handler_stack.hpp | 72 ++++++++++++++------ src/c4/yml/event_handler_tree.hpp | 5 +- src/c4/yml/parse_engine.def.hpp | 31 ++++++++- src/c4/yml/parse_engine.hpp | 3 + test/test_parser.cpp | 5 ++ test/test_suite/test_suite_event_handler.hpp | 4 +- 7 files changed, 94 insertions(+), 28 deletions(-) diff --git a/src/c4/yml/emit.hpp b/src/c4/yml/emit.hpp index 8c788b61..31811da4 100644 --- a/src/c4/yml/emit.hpp +++ b/src/c4/yml/emit.hpp @@ -442,6 +442,7 @@ inline OStream& operator<< (OStream& s, as_yaml const& y) * @param id the node where to start emitting. * @param opts emit options. * @param buf the output buffer. + * @param opts emit options. * @param error_on_excess Raise an error if the space in the buffer is insufficient. * @return a substr trimmed to the result in the output buffer. If the buffer is * insufficient (when error_on_excess is false), the string pointer of the @@ -462,6 +463,7 @@ inline substr emit_yaml(Tree const& t, id_type id, substr buf, bool error_on_exc * @param id the node where to start emitting. * @param opts emit options. * @param buf the output buffer. + * @param opts emit options. * @param error_on_excess Raise an error if the space in the buffer is insufficient. * @return a substr trimmed to the result in the output buffer. If the buffer is * insufficient (when error_on_excess is false), the string pointer of the diff --git a/src/c4/yml/event_handler_stack.hpp b/src/c4/yml/event_handler_stack.hpp index 4bb78077..4e6d94f5 100644 --- a/src/c4/yml/event_handler_stack.hpp +++ b/src/c4/yml/event_handler_stack.hpp @@ -25,6 +25,10 @@ namespace yml { /** @addtogroup doc_event_handlers * @{ */ +namespace detail { +using pfn_relocate_arena = void (*)(void*, csubstr prev_arena, substr next_arena); +} // detail + /** Use this class a base of implementations of event handler to * simplify the stack logic. */ template @@ -34,17 +38,36 @@ struct EventHandlerStack "ParserState must be a base of HandlerState"); using state = HandlerState; + using pfn_relocate_arena = detail::pfn_relocate_arena; public: detail::stack m_stack; state *C4_RESTRICT m_curr; ///< current stack level: top of the stack. cached here for easier access. state *C4_RESTRICT m_parent; ///< parent of the current stack level. + pfn_relocate_arena m_relocate_arena; ///< callback when the arena gets relocated + void *C4_RESTRICT m_relocate_arena_data; protected: - EventHandlerStack() : m_stack(), m_curr(), m_parent() {} - EventHandlerStack(Callbacks const& cb) : m_stack(cb), m_curr(), m_parent() {} + EventHandlerStack() : m_stack(), m_curr(), m_parent(), m_relocate_arena(), m_relocate_arena_data() {} + EventHandlerStack(Callbacks const& cb) : m_stack(cb), m_curr(), m_parent(), m_relocate_arena(), m_relocate_arena_data() {} + +protected: + + void _stack_start_parse(const char *filename, pfn_relocate_arena relocate_arena, void *relocate_arena_data) + { + _RYML_CB_ASSERT(m_stack.m_callbacks, m_curr != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, relocate_arena != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, relocate_arena_data != nullptr); + m_curr->start_parse(filename, m_curr->node_id); + m_relocate_arena = relocate_arena; + m_relocate_arena_data = relocate_arena_data; + } + + void _stack_finish_parse() + { + } protected: @@ -89,15 +112,25 @@ struct EventHandlerStack #endif } - substr _stack_relocate_to_new_arena(csubstr s, csubstr prev, substr curr) +protected: + + // undefined at the end + #define _has_any_(bits) (static_cast(this)->template _has_any__()) + + bool _stack_should_push_on_begin_doc() const { - _RYML_CB_ASSERT(m_stack.m_callbacks, prev.is_super(s)); - auto pos = s.str - prev.str; - substr out = {curr.str + pos, s.len}; - _RYML_CB_ASSERT(m_stack.m_callbacks, curr.is_super(out)); - return out; + const bool is_root = (m_stack.size() == 1u); + return is_root && (_has_any_(DOC|VAL|MAP|SEQ) || m_curr->has_children); } + bool _stack_should_pop_on_end_doc() const + { + const bool is_root = (m_stack.size() == 1u); + return !is_root && _has_any_(DOC); + } + +protected: + void _stack_relocate_to_new_arena(csubstr prev, substr curr) { for(state &st : m_stack) @@ -109,23 +142,18 @@ struct EventHandlerStack if(st.line_contents.stripped.is_sub(prev)) st.line_contents.stripped = _stack_relocate_to_new_arena(st.line_contents.stripped, prev, curr); } + _RYML_CB_ASSERT(m_stack.m_callbacks, m_relocate_arena != nullptr); + _RYML_CB_ASSERT(m_stack.m_callbacks, m_relocate_arena_data != nullptr); + m_relocate_arena(m_relocate_arena_data, prev, curr); } -protected: - - // undefined at the end - #define _has_any_(bits) (static_cast(this)->template _has_any__()) - - bool _stack_should_push_on_begin_doc() const - { - const bool is_root = (m_stack.size() == 1u); - return is_root && (_has_any_(DOC|VAL|MAP|SEQ) || m_curr->has_children); - } - - bool _stack_should_pop_on_end_doc() const + substr _stack_relocate_to_new_arena(csubstr s, csubstr prev, substr curr) { - const bool is_root = (m_stack.size() == 1u); - return !is_root && _has_any_(DOC); + _RYML_CB_ASSERT(m_stack.m_callbacks, prev.is_super(s)); + auto pos = s.str - prev.str; + substr out = {curr.str + pos, s.len}; + _RYML_CB_ASSERT(m_stack.m_callbacks, curr.is_super(out)); + return out; } public: diff --git a/src/c4/yml/event_handler_tree.hpp b/src/c4/yml/event_handler_tree.hpp index b46238f4..d4fe0481 100644 --- a/src/c4/yml/event_handler_tree.hpp +++ b/src/c4/yml/event_handler_tree.hpp @@ -96,13 +96,14 @@ struct EventHandlerTree : public EventHandlerStackstart_parse(filename, m_curr->node_id); + this->_stack_start_parse(filename, relocate_arena, relocate_arena_data); } void finish_parse() { + this->_stack_finish_parse(); /* This pointer is temporary. Remember that: * * - this handler object may be held by the user diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index 4faf68fc..af5e32b3 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -364,6 +364,33 @@ void ParseEngine::_reset() m_was_inside_qmrk = false; } + +//----------------------------------------------------------------------------- + +template +void ParseEngine::_relocate_arena(csubstr prev_arena, substr next_arena) +{ + #define _ryml_relocate(s) \ + if(s.is_sub(prev_arena)) \ + { \ + s.str = next_arena.str + (s.str - prev_arena.str); \ + } + _ryml_relocate(m_buf); + _ryml_relocate(m_newline_offsets_buf); + for(size_t i = 0; i < m_pending_tags.num_entries; ++i) + _ryml_relocate(m_pending_tags.annotations[i].str); + for(size_t i = 0; i < m_pending_anchors.num_entries; ++i) + _ryml_relocate(m_pending_anchors.annotations[i].str); + #undef _ryml_relocate +} + +template +void ParseEngine::_s_relocate_arena(void* data, csubstr prev_arena, substr next_arena) +{ + ((ParseEngine*)data)->_relocate_arena(prev_arena, next_arena); +} + + //----------------------------------------------------------------------------- template @@ -7848,7 +7875,7 @@ void ParseEngine::parse_json_in_place_ev(csubstr filename, substr m_file = filename; m_buf = src; _reset(); - m_evt_handler->start_parse(filename.str); + m_evt_handler->start_parse(filename.str, &_s_relocate_arena, this); m_evt_handler->begin_stream(); while( ! _finished_file()) { @@ -7892,7 +7919,7 @@ void ParseEngine::parse_in_place_ev(csubstr filename, substr src) m_file = filename; m_buf = src; _reset(); - m_evt_handler->start_parse(filename.str); + m_evt_handler->start_parse(filename.str, &_s_relocate_arena, this); m_evt_handler->begin_stream(); while( ! _finished_file()) { diff --git a/src/c4/yml/parse_engine.hpp b/src/c4/yml/parse_engine.hpp index 06f660ab..759a842b 100644 --- a/src/c4/yml/parse_engine.hpp +++ b/src/c4/yml/parse_engine.hpp @@ -631,6 +631,9 @@ class ParseEngine return m_evt_handler->m_curr->line_contents.rem.begin() == m_evt_handler->m_curr->line_contents.full.begin(); } + void _relocate_arena(csubstr prev_arena, substr next_arena); + static void _s_relocate_arena(void*, csubstr prev_arena, substr next_arena); + private: C4_ALWAYS_INLINE bool has_all(ParserFlag_t f) const noexcept { return (m_evt_handler->m_curr->flags & f) == f; } diff --git a/test/test_parser.cpp b/test/test_parser.cpp index c1f6969f..ff8ef06f 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -402,7 +402,12 @@ TEST(Parser, alloc_arena) { Tree tree; Parser::handler_type evt_handler = {}; + int data = 0; + auto relocate = [](void*, csubstr prev, substr next_arena){ + EXPECT_FALSE(prev.overlaps(next_arena)); + }; evt_handler.reset(&tree, tree.root_id()); + evt_handler.start_parse("filename", relocate, &data); substr bufa = evt_handler.alloc_arena(64); bufa.fill('a'); csubstr prev = bufa; diff --git a/test/test_suite/test_suite_event_handler.hpp b/test/test_suite/test_suite_event_handler.hpp index c830e1d6..55eb650d 100644 --- a/test/test_suite/test_suite_event_handler.hpp +++ b/test/test_suite/test_suite_event_handler.hpp @@ -116,9 +116,9 @@ struct EventHandlerYamlStd : public EventHandlerStackstart_parse(filename, m_curr->node_id); + this->_stack_start_parse(filename, relocate_arena, relocate_arena_data); } void finish_parse() From c7734c88f7a27d2123156e5c6d5285c40c5d6130 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Thu, 23 May 2024 23:54:04 +0100 Subject: [PATCH 03/18] fix assertions that should be checks --- src/c4/yml/parse_engine.def.hpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index af5e32b3..c535c115 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -2742,7 +2742,6 @@ void ParseEngine::_filter_chomp(FilterProcessor &C4_RESTRICT proc, { const char curr = proc.curr(); _c4dbgchomp("curr='{}'", _c4prc(curr)); - _RYML_CB_ASSERT(this->callbacks(), curr == '\n' || curr == '\r'); switch(curr) { case '\n': @@ -2781,6 +2780,9 @@ void ParseEngine::_filter_chomp(FilterProcessor &C4_RESTRICT proc, case '\r': proc.skip(); break; + default: + _c4err("parse error"); + break; } } } @@ -5663,6 +5665,8 @@ void ParseEngine::_handle_seq_block() // handle indentation // _c4dbgpf("seqblck[RNXT]: indref={} indentation={}", m_state->indref, m_state->line_contents.indentation); + if(C4_UNLIKELY(!_at_line_begin())) + _c4err("parse error"); if(m_state->indentation_ge()) { _c4dbgpf("seqblck[RNXT]: skip {} from indref", m_state->indref); @@ -6070,7 +6074,7 @@ void ParseEngine::_handle_map_block() else if(first == '?') { _c4dbgp("mapblck[RKCL]: got '?'. val was empty"); - _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, m_was_inside_qmrk); + _RYML_CB_CHECK(m_evt_handler->m_stack.m_callbacks, m_was_inside_qmrk); m_evt_handler->set_val_scalar_plain({}); m_evt_handler->add_sibling(); addrem_flags(QMRK, RKCL); @@ -6082,7 +6086,7 @@ void ParseEngine::_handle_map_block() if(m_state->indref == 0 || m_state->line_contents.indentation == 0 || _is_doc_begin_token(rem)) { _c4dbgp("mapblck[RKCL]: end+start doc"); - _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, _is_doc_begin_token(rem)); + _RYML_CB_CHECK(m_evt_handler->m_stack.m_callbacks, _is_doc_begin_token(rem)); _start_doc_suddenly(); _line_progressed(3); _maybe_skip_whitespace_tokens(); From da6e90b333316d4ca9524e1f1678cb29020cb9b2 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Fri, 24 May 2024 01:34:22 +0100 Subject: [PATCH 04/18] fix problem with popping seqs into seqimaps --- src/c4/yml/parse_engine.def.hpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index c535c115..a91889b1 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -4845,11 +4845,9 @@ void ParseEngine::_handle_seq_flow() else if(first == ']') // this happens on a trailing comma like ", ]" { _c4dbgp("seqflow[RVAL]: end!"); - rem_flags(RSEQ); - m_evt_handler->end_seq(); _line_progressed(1); - if(!has_all(RSEQ|FLOW)) - goto seqflow_finish; + m_evt_handler->end_seq(); + goto seqflow_finish; } else if(first == '*') { From bd0306c72b2055649e0ee56180ff1772cbe068c2 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 25 May 2024 13:30:55 +0100 Subject: [PATCH 05/18] ensure proper error handling for bad utf codepoints --- src/c4/yml/parse_engine.def.hpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index a91889b1..806ee019 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -2552,8 +2552,10 @@ void ParseEngine::_filter_dquoted_backslash(FilterProcessor &C4_RE uint32_t codepoint_val = {}; if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val))) _c4err("failed to parse \\u codepoint. scalar pos={}", proc.rpos); - size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); - C4_ASSERT(numbytes <= 4); + const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + if(C4_UNLIKELY(numbytes == 0)) + _c4err("failed to decode code point={}", proc.rpos); + _RYML_CB_ASSERT(callbacks(), numbytes <= 4); proc.translate_esc_bulk(readbuf, numbytes, /*nread*/5u); } else if(next == 'U') // UTF32 @@ -2565,8 +2567,10 @@ void ParseEngine::_filter_dquoted_backslash(FilterProcessor &C4_RE uint32_t codepoint_val = {}; if(C4_UNLIKELY(!read_hex(codepoint, &codepoint_val))) _c4err("failed to parse \\U codepoint. scalar pos={}", proc.rpos); - size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); - C4_ASSERT(numbytes <= 4); + const size_t numbytes = decode_code_point((uint8_t*)readbuf, sizeof(readbuf), codepoint_val); + if(C4_UNLIKELY(numbytes == 0)) + _c4err("failed to decode code point={}", proc.rpos); + _RYML_CB_ASSERT(callbacks(), numbytes <= 4); proc.translate_esc_bulk(readbuf, numbytes, /*nread*/9u); } // https://yaml.org/spec/1.2.2/#rule-c-ns-esc-char From e4eba399b4cec15f69543c8284d23c07d7563f31 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sun, 26 May 2024 16:22:45 +0100 Subject: [PATCH 06/18] do not use std::string in the event handlers --- test/test_lib/test_engine.cpp | 5 +- test/test_lib/test_engine.hpp | 7 +- test/test_suite.cpp | 7 +- test/test_suite/string.hpp | 424 +++++++++++++++++++ test/test_suite/test_suite_event_handler.cpp | 43 +- test/test_suite/test_suite_event_handler.hpp | 90 ++-- test/test_suite/test_suite_events.cpp | 3 +- test/test_yaml_events.cpp | 7 +- tools/yaml_events.cpp | 3 +- 9 files changed, 516 insertions(+), 73 deletions(-) create mode 100644 test/test_suite/string.hpp diff --git a/test/test_lib/test_engine.cpp b/test/test_lib/test_engine.cpp index 43f18e6f..f2870ee2 100644 --- a/test/test_lib/test_engine.cpp +++ b/test/test_lib/test_engine.cpp @@ -72,8 +72,9 @@ void test_new_parser_events_from_yaml(ReferenceYaml const& yaml, std::string con ParseEngine parser(&handler); std::string copy = yaml.parsed; parser.parse_in_place_ev("(testyaml)", to_substr(copy)); - _c4dbgpf("~~~\n{}~~~\n", sink.result); - EXPECT_EQ(sink.result, expected_events); + csubstr result = sink; + _c4dbgpf("~~~\n{}~~~\n", result); + EXPECT_EQ(std::string(result.str, result.len), expected_events); } void test_new_parser_tree_from_yaml(ReferenceYaml const& yaml) diff --git a/test/test_lib/test_engine.hpp b/test/test_lib/test_engine.hpp index 5bf24bcc..880bd67d 100644 --- a/test/test_lib/test_engine.hpp +++ b/test/test_lib/test_engine.hpp @@ -42,8 +42,9 @@ C4_NO_INLINE void test_new_parser_str_from_events(std::string const& expected_ev handler.reset(); EventProducerFn event_producer; event_producer(handler); - _c4dbgpf("~~~\n{}~~~\n", sink.result); - EXPECT_EQ(sink.result, expected_events); + csubstr result = sink; + _c4dbgpf("~~~\n{}~~~\n", result); + EXPECT_EQ(std::string(result.str, result.len), expected_events); } template class EventProducerFn> @@ -214,7 +215,7 @@ inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt) }; for(id_type i = 0; i < ps.m_stack.size(); ++i) { - auto const& str = ps._buf_(i).get(); + csubstr const& str = ps._buf_(i); indent(i); _dbg_printf("[{}]\n", i); for(csubstr line : str.split('\n')) diff --git a/test/test_suite.cpp b/test/test_suite.cpp index 2ba6b029..194998e8 100644 --- a/test/test_suite.cpp +++ b/test/test_suite.cpp @@ -299,11 +299,11 @@ struct TestSequenceLevel if(prev) receive_src(*prev); _nfo_logf("level[{}]: parsing source:\n{}", level, src_evts); - evt_str_sink.reset(); + evt_str_sink.clear(); evt_handler_str_sink.reset(); evt_handler_str_sink.m_stack.m_callbacks = get_callbacks(); parser_str_sink.parse_in_place_ev(filename, to_substr(src_evts)); - EXPECT_NE(evt_str_sink.result, ""); + EXPECT_NE(evt_str_sink.size(), 0); events_were_generated = true; } @@ -655,7 +655,8 @@ struct TestSequenceData for(size_t i = 0; i < num; ++i) { levels[i].parse_yaml_to_events(); - events->compare_emitted_events_to_reference_events(levels[i].evt_str_sink.result, + csubstr result = levels[i].evt_str_sink; + events->compare_emitted_events_to_reference_events(std::string(result.str, result.len), /*ignore_container_style*/false, /*ignore_scalar_style*/(num>0)); } diff --git a/test/test_suite/string.hpp b/test/test_suite/string.hpp new file mode 100644 index 00000000..a61a8ea5 --- /dev/null +++ b/test/test_suite/string.hpp @@ -0,0 +1,424 @@ +#ifndef _C4_YML_EXTRA_STRING_HPP_ +#define _C4_YML_EXTRA_STRING_HPP_ + +#ifdef RYML_SINGLE_HEADER +#ifndef _RYML_SINGLE_HEADER_AMALGAMATED_HPP_ +#include +#endif +#else +#ifndef _C4_YML_COMMON_HPP_ +#include "c4/yml/common.hpp" +#endif +#endif + +#include + +C4_SUPPRESS_WARNING_GCC_CLANG_PUSH +C4_SUPPRESS_WARNING_GCC_CLANG("-Wold-style-cast") +C4_SUPPRESS_WARNING_GCC("-Wuseless-cast") + +#ifndef RYML_STRING_SSO_SIZE +#define RYML_STRING_SSO_SIZE 128 +#endif + +#ifndef RYML_STRING_LIST_SSO_SIZE +#define RYML_STRING_LIST_SSO_SIZE 66 +#endif + +namespace c4 { +namespace yml { +namespace extra { + +/** an owning string class used by the yaml std event handler (and the + * YamlScript handler). we use this instead of std::string because: + * 1) this spares the dependency on the standard library + * 2) enables possibility of adding borrowing semantics (in the future) */ +struct string +{ + enum : id_type { sso_size = RYML_STRING_SSO_SIZE }; + char m_buf[sso_size]; + char *C4_RESTRICT m_str; + id_type m_size; + id_type m_capacity; + +public: + + string() + : m_buf() + , m_str(m_buf) + , m_size(0) + , m_capacity(sso_size) + {} + ~string() noexcept + { + _free(); + } + + string(string const& that) RYML_NOEXCEPT : string() + { + resize(that.m_size); + _cp(&that); + } + + string(string &&that) noexcept : string() + { + _mv(&that); + } + + string& operator= (string const& that) RYML_NOEXCEPT + { + resize(that.m_size); + _cp(&that); + return *this; + } + + string& operator= (string &&that) noexcept + { + _mv(&that); + return *this; + } + +public: + + C4_ALWAYS_INLINE C4_HOT operator csubstr() const noexcept { return {m_str, m_size}; } + C4_ALWAYS_INLINE C4_HOT operator substr() noexcept { return {m_str, m_size}; } + +public: + + id_type size() const noexcept { return m_size; } + id_type capacity() const noexcept { return m_capacity; } + + void clear() + { + m_size = 0; + } + + void resize(id_type sz) + { + reserve(sz); + m_size = sz; + } + + void reserve(id_type sz) + { + if(sz <= m_capacity) + return; + id_type cap = m_capacity < string::sso_size ? string::sso_size : 2 * m_capacity; + cap = cap > sz ? cap : sz; + if(cap <= sso_size) + return; + Callbacks cb = get_callbacks(); + char *buf = (char*) _RYML_CB_ALLOC(cb, char, cap); + if(m_size) + memcpy(buf, m_str, (size_t)m_size); + if(m_str != m_buf) + { + _RYML_CB_FREE(cb, m_str, char, m_size); + } + m_str = buf; + m_capacity = cap; + } + +public: + + C4_ALWAYS_INLINE C4_HOT void append(char c) + { + if(C4_UNLIKELY(m_size == m_capacity)) + reserve(m_size + 1); + m_str[m_size++] = c; + } + C4_ALWAYS_INLINE C4_HOT void append(csubstr cs) + { + if(cs.len) + { + const id_type ilen = (id_type)cs.len; + if(C4_UNLIKELY(m_size + ilen > m_capacity)) + reserve(m_size + ilen); + memcpy(m_str + m_size, cs.str, cs.len); + m_size += ilen; + } + } + C4_ALWAYS_INLINE void insert(char c, id_type pos) + { + RYML_ASSERT(pos <= m_size); + if(pos < m_size) + { + if(C4_UNLIKELY(m_size == m_capacity)) + reserve(m_size + 1); + char *C4_RESTRICT src = m_str + pos; + memmove(src + 1, src, m_size - pos); + *src = c; + ++m_size; + } + else + { + append(c); + } + } + C4_NO_INLINE void insert(csubstr cs, id_type pos) + { + RYML_ASSERT(pos <= m_size); + if(cs.len) + { + if(pos < m_size) + { + const id_type ilen = (id_type)cs.len; + if(C4_UNLIKELY(m_size + ilen > m_capacity)) + reserve(m_size + ilen); + char *C4_RESTRICT src = m_str + pos; + memmove(src + cs.len, src, m_size - pos); + memcpy(src, cs.str, cs.len); + m_size += ilen; + } + else + { + append(cs); + } + } + } + C4_NO_INLINE size_t find_last(csubstr pattern) RYML_NOEXCEPT + { + RYML_ASSERT(pattern.len); + if(m_size >= pattern.len) + { + for(size_t i = m_size - pattern.len; i != (size_t)-1; --i) + { + if(m_str[i] == pattern[0]) + { + bool gotit = true; + for(size_t j = 1; j < pattern.len; ++j) + { + if(m_str[i + j] != pattern[j]) + { + gotit = false; + break; + } + } + if(gotit) + return i; + } + } + } + return npos; + } + +public: + + void _free() + { + RYML_ASSERT(m_str != nullptr); // this structure cannot be memset() to zero + if(m_str != m_buf) + { + _RYML_CB_FREE(get_callbacks(), m_str, char, (size_t)m_capacity); + m_str = m_buf; + m_capacity = sso_size; + } + RYML_ASSERT(m_capacity == sso_size); + m_size = 0; + } + + void _cp(string const* C4_RESTRICT that) + { + #if RYML_USE_ASSERT + if(that->m_str != that->m_buf) + { + RYML_ASSERT(that->m_capacity > sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + } + else + { + RYML_ASSERT(that->m_capacity <= sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + } + #endif + memcpy(m_str, that->m_str, that->m_size); + m_size = that->m_size; + m_capacity = that->m_size < sso_size ? sso_size : that->m_size; + } + + void _mv(string *C4_RESTRICT that) + { + if(that->m_str != that->m_buf) + { + RYML_ASSERT(that->m_capacity > sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + m_str = that->m_str; + } + else + { + RYML_ASSERT(that->m_capacity <= sso_size); + RYML_ASSERT(that->m_size <= that->m_capacity); + memcpy(m_buf, that->m_buf, that->m_size); + m_str = m_buf; + } + m_size = that->m_size; + m_capacity = that->m_capacity; + // make sure no deallocation happens on destruction + RYML_ASSERT(that->m_str != this->m_buf); + that->m_str = that->m_buf; + that->m_capacity = sso_size; + that->m_size = 0; + } +}; + + +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- +//----------------------------------------------------------------------------- + +/** a string collection used by the event handler. using this instead of + * std::vector spares the dependency on the standard library. */ +struct string_vector +{ + enum : id_type { sso_size = RYML_STRING_LIST_SSO_SIZE }; + union { + alignas(string) string m_buf[sso_size]; + alignas(string) char m_buf_bytes[sso_size * sizeof(string)]; + }; + string *C4_RESTRICT m_arr; + id_type m_size; + id_type m_capacity; + +public: + + string_vector() + : m_arr(m_buf) + , m_size(0) + , m_capacity(sso_size) + {} + ~string_vector() noexcept + { + _free(); + } + + string_vector(string_vector const& that) RYML_NOEXCEPT : string_vector() + { + reserve(that.m_size); + m_size = that.m_size; + for(id_type i = 0; i < that.m_size; ++i) + new ((void*)(m_arr+i)) string(that.m_arr[i]); + } + + string_vector(string_vector &&that) noexcept : string_vector() + { + reserve(that.m_size); + m_size = that.m_size; + for(id_type i = 0; i < that.m_size; ++i) + new ((void*)(m_arr+i)) string(std::move(that.m_arr[i])); + that.~string_vector(); + } + + string_vector& operator= (string_vector const& that) RYML_NOEXCEPT + { + _free(); + reserve(that.m_size); + for(id_type i = 0; i < that.m_size; ++i) + m_arr[i].operator=(that.m_arr[i]); + m_size = that.m_size; + return *this; + } + + string_vector& operator= (string_vector &&that) noexcept + { + _free(); + reserve(that.m_size); + for(id_type i = 0; i < that.m_size; ++i) + m_arr[i].operator=(std::move(that.m_arr[i])); + m_size = that.m_size; + that.~string_vector(); + return *this; + } + + void _free() + { + RYML_ASSERT(m_arr != nullptr); // this structure cannot be memset() to zero + for(id_type i = 0; i < m_size; ++i) + m_arr[i].~string(); + if(m_arr != m_buf) + { + _RYML_CB_FREE(get_callbacks(), m_arr, string, (size_t)m_capacity); + m_arr = m_buf; + m_capacity = sso_size; + } + RYML_ASSERT(m_capacity == sso_size); + m_size = 0; + } + +public: + + id_type size() const noexcept { return m_size; } + id_type capacity() const noexcept { return m_capacity; } + + void clear() + { + resize(0); + } + + void resize(id_type sz) + { + reserve(sz); + if(sz >= m_size) + { + for(id_type i = m_size; i < sz; ++i) + new ((void*)(m_arr + i)) string(); + } + else + { + for(id_type i = sz; i < m_size; ++i) + m_arr[i].~string(); + } + m_size = sz; + } + + void reserve(id_type sz) + { + if(sz <= m_capacity) + return; + id_type cap = m_capacity < string::sso_size ? string::sso_size : 2 * m_capacity; + cap = cap > sz ? cap : sz; + if(cap <= sso_size) + return; + Callbacks cb = get_callbacks(); + string *buf = (string*) _RYML_CB_ALLOC(cb, string, cap); + for(id_type i = 0; i < m_size; ++i) + new ((void*)(buf + i)) string(std::move(m_arr[i])); + if(m_arr != m_buf) + { + _RYML_CB_FREE(cb, m_arr, string, m_size); + } + m_arr = buf; + m_capacity = cap; + } + +public: + + string& emplace_back() + { + RYML_ASSERT(m_size < m_capacity); + if(m_size == m_capacity) + reserve(m_size + 1); + string& ret = m_arr[m_size++]; + new ((void*)&ret) string(); + return ret; + } + string& operator[] (id_type i) + { + RYML_ASSERT(m_size <= m_capacity); + RYML_ASSERT(i < m_size); + return m_arr[i]; + } + string const& operator[] (id_type i) const + { + RYML_ASSERT(m_size <= m_capacity); + RYML_ASSERT(i < m_size); + return m_arr[i]; + } +}; + +} // namespace extra +} // namespace yml +} // namespace c4 + +C4_SUPPRESS_WARNING_GCC_POP + +#endif /* _C4_YML_EXTRA_STRING_HPP_ */ diff --git a/test/test_suite/test_suite_event_handler.cpp b/test/test_suite/test_suite_event_handler.cpp index 7642d354..55d939f1 100644 --- a/test/test_suite/test_suite_event_handler.cpp +++ b/test/test_suite/test_suite_event_handler.cpp @@ -12,15 +12,14 @@ namespace yml { // instantiate the template template class ParseEngine; - -void EventHandlerYamlStd::EventSink::append_escaped(csubstr val) +void append_escaped(extra::string *es, csubstr val) { - #define _c4flush_use_instead(repl, skip) \ - do { \ - this->append(val.range(prev, i)); \ - this->append(repl); \ - prev = i + skip; \ - } \ + #define _c4flush_use_instead(i, repl, skip) \ + do { \ + es->append(val.range(prev, i)); \ + es->append(repl); \ + prev = i + skip; \ + } \ while(0) uint8_t const* C4_RESTRICT s = reinterpret_cast(val.str); size_t prev = 0; @@ -29,33 +28,33 @@ void EventHandlerYamlStd::EventSink::append_escaped(csubstr val) switch(s[i]) { case UINT8_C(0x0a): // \n - _c4flush_use_instead("\\n", 1); break; + _c4flush_use_instead(i, "\\n", 1); break; case UINT8_C(0x5c): // '\\' - _c4flush_use_instead("\\\\", 1); break; + _c4flush_use_instead(i, "\\\\", 1); break; case UINT8_C(0x09): // \t - _c4flush_use_instead("\\t", 1); break; + _c4flush_use_instead(i, "\\t", 1); break; case UINT8_C(0x0d): // \r - _c4flush_use_instead("\\r", 1); break; + _c4flush_use_instead(i, "\\r", 1); break; case UINT8_C(0x00): // \0 - _c4flush_use_instead("\\0", 1); break; + _c4flush_use_instead(i, "\\0", 1); break; case UINT8_C(0x0c): // \f (form feed) - _c4flush_use_instead("\\f", 1); break; + _c4flush_use_instead(i, "\\f", 1); break; case UINT8_C(0x08): // \b (backspace) - _c4flush_use_instead("\\b", 1); break; + _c4flush_use_instead(i, "\\b", 1); break; case UINT8_C(0x07): // \a (bell) - _c4flush_use_instead("\\a", 1); break; + _c4flush_use_instead(i, "\\a", 1); break; case UINT8_C(0x0b): // \v (vertical tab) - _c4flush_use_instead("\\v", 1); break; + _c4flush_use_instead(i, "\\v", 1); break; case UINT8_C(0x1b): // \e (escape) - _c4flush_use_instead("\\e", 1); break; + _c4flush_use_instead(i, "\\e", 1); break; case UINT8_C(0xc2): if(i+1 < val.len) { const uint8_t np1 = s[i+1]; if(np1 == UINT8_C(0xa0)) - _c4flush_use_instead("\\_", 2); + _c4flush_use_instead(i, "\\_", 2); else if(np1 == UINT8_C(0x85)) - _c4flush_use_instead("\\N", 2); + _c4flush_use_instead(i, "\\N", 2); } break; case UINT8_C(0xe2): @@ -64,9 +63,9 @@ void EventHandlerYamlStd::EventSink::append_escaped(csubstr val) if(s[i+1] == UINT8_C(0x80)) { if(s[i+2] == UINT8_C(0xa8)) - _c4flush_use_instead("\\L", 3); + _c4flush_use_instead(i, "\\L", 3); else if(s[i+2] == UINT8_C(0xa9)) - _c4flush_use_instead("\\P", 3); + _c4flush_use_instead(i, "\\P", 3); } } break; diff --git a/test/test_suite/test_suite_event_handler.hpp b/test/test_suite/test_suite_event_handler.hpp index 55eb650d..5730374d 100644 --- a/test/test_suite/test_suite_event_handler.hpp +++ b/test/test_suite/test_suite_event_handler.hpp @@ -15,7 +15,9 @@ #endif #endif -#include +#ifndef _C4_YML_EXTRA_STRING_HPP_ +#include "./string.hpp" +#endif C4_SUPPRESS_WARNING_GCC_CLANG_PUSH C4_SUPPRESS_WARNING_GCC_CLANG("-Wold-style-cast") @@ -28,6 +30,9 @@ namespace yml { /** @addtogroup doc_event_handlers * @{ */ +void append_escaped(extra::string *s, csubstr val); + + /** The stack state needed specifically by @ref EventHandlerYamlStd */ struct EventHandlerYamlStdState : public ParserState { @@ -53,19 +58,7 @@ struct EventHandlerYamlStd : public EventHandlerStack m_val_buffers; - char m_key_tag_buf[256]; - char m_val_tag_buf[256]; + extra::string_vector m_val_buffers; + extra::string m_key_tag_buf; + extra::string m_val_tag_buf; TagDirective m_tag_directives[RYML_MAX_TAG_DIRECTIVES]; - std::string m_arena; + extra::string m_arena; // undefined at the end #define _enable_(bits) _enable__() @@ -90,9 +83,9 @@ struct EventHandlerYamlStd : public EventHandlerStacknode_id, tag.len, tag); _enable_(KEYTAG); - m_curr->ev_data.m_key.tag = _transform_directive(tag, m_key_tag_buf); + m_curr->ev_data.m_key.tag = _transform_directive(tag, &m_key_tag_buf); } void set_val_tag(csubstr tag) { _c4dbgpf("node[{}]: set val tag: [{}]~~~{}~~~", m_curr->node_id, tag.len, tag); _enable_(VALTAG); - m_curr->ev_data.m_val.tag = _transform_directive(tag, m_val_tag_buf); + m_curr->ev_data.m_val.tag = _transform_directive(tag, &m_val_tag_buf); } /** @} */ @@ -568,7 +563,7 @@ struct EventHandlerYamlStd : public EventHandlerStackev_data = {}; _c4dbgpf("pushed! level={}", m_curr->level); } @@ -652,8 +647,8 @@ struct EventHandlerYamlStd : public EventHandlerStack output->size()) + { + _RYML_CB_CHECK(m_stack.m_callbacks, !retry); + retry = true; + output->resize(len); + output->resize(output->capacity()); + goto again1; + } + return csubstr(*output).first(len); } } if(tag.begins_with('!')) @@ -783,15 +787,23 @@ struct EventHandlerYamlStd : public EventHandlerStackpos); } } - csubstr result = normalize_tag_long(tag, output); - _RYML_CB_CHECK(m_stack.m_callbacks, result.len > 0); - _RYML_CB_CHECK(m_stack.m_callbacks, result.str); + bool retry = false; + again2: + csubstr result = normalize_tag_long(tag, *output); + if(!result.str) + { + _RYML_CB_CHECK(m_stack.m_callbacks, !retry); + retry = true; + output->resize(result.len); + output->resize(output->capacity()); + goto again2; + } return result; } -#undef _enable_ -#undef _disable_ -#undef _has_any_ + #undef _enable_ + #undef _disable_ + #undef _has_any_ /** @endcond */ }; diff --git a/test/test_suite/test_suite_events.cpp b/test/test_suite/test_suite_events.cpp index d4b33f33..881fab34 100644 --- a/test/test_suite/test_suite_events.cpp +++ b/test/test_suite/test_suite_events.cpp @@ -15,7 +15,8 @@ std::string emit_events_from_source(substr src) EventHandlerYamlStd handler(&sink); ParseEngine parser(&handler); parser.parse_in_place_ev("(testyaml)", src); - return sink.result; + csubstr result = sink; + return std::string(result.str, result.len); } diff --git a/test/test_yaml_events.cpp b/test/test_yaml_events.cpp index 7af51993..92f4cc8e 100644 --- a/test/test_yaml_events.cpp +++ b/test/test_yaml_events.cpp @@ -58,9 +58,12 @@ TEST_P(EventsTest, from_parser) ParseEngine parser(&handler); std::string src_copy(ec.src.str, ec.src.len); parser.parse_in_place_ev("(testyaml)", to_substr(src_copy)); - _c4dbgpf("~~~\n{}~~~\n", sink.result); + csubstr result = sink; + _c4dbgpf("~~~\n{}~~~\n", result); + // use the diff from std::string which is nice std::string exp_copy(ec.expected_events_from_parser.str, ec.expected_events_from_parser.len); - EXPECT_EQ(sink.result, exp_copy); // use the diff from std::string which is nice + std::string result_copy(result.str, result.len); + EXPECT_EQ(result_copy, exp_copy); } TEST_P(EventsTest, from_tree) diff --git a/tools/yaml_events.cpp b/tools/yaml_events.cpp index edbfae92..b8c849a5 100644 --- a/tools/yaml_events.cpp +++ b/tools/yaml_events.cpp @@ -118,7 +118,8 @@ std::string emit_events_direct(csubstr filename, substr filecontents) EventHandlerYamlStd handler(&sink, create_custom_callbacks()); ParseEngine parser(&handler); parser.parse_in_place_ev(filename, filecontents); - return sink.result; + csubstr result = sink; + return std::string(result.str, result.len); } From 702803a1cd37f248107e4c15f460505888163163 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Wed, 29 May 2024 11:36:46 +0100 Subject: [PATCH 07/18] fix: scalar emit in json --- src/c4/yml/emit.def.hpp | 34 ++++++++++------ src/c4/yml/parse_engine.def.hpp | 2 +- test/test_parse_engine_6_qmrk.cpp | 64 +++++++++++++++++++++++++++++++ 3 files changed, 88 insertions(+), 12 deletions(-) diff --git a/src/c4/yml/emit.def.hpp b/src/c4/yml/emit.def.hpp index 5ea901f8..3b8896e2 100644 --- a/src/c4/yml/emit.def.hpp +++ b/src/c4/yml/emit.def.hpp @@ -643,19 +643,31 @@ void Emitter::_write(NodeScalar const& C4_RESTRICT sc, NodeType flags, i template void Emitter::_write_json(NodeScalar const& C4_RESTRICT sc, NodeType flags) { - if(C4_UNLIKELY( ! sc.tag.empty())) - _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have tags"); + if(flags & (KEYTAG|VALTAG)) + if(m_opts.json_error_flags() & EmitOptions::JSON_ERR_ON_TAG) + _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have tags"); if(C4_UNLIKELY(flags.has_anchor())) - _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have anchors"); - // use double quoted style... - // if it is a key (mandatory in JSON) - // if the style is marked quoted - bool dquoted = ((flags & (KEY|VALQUO)) - || (scalar_style_json_choose(sc.scalar) & SCALAR_DQUO)); // choose the style - if(dquoted) - _write_scalar_json_dquo(sc.scalar); + if(m_opts.json_error_flags() & EmitOptions::JSON_ERR_ON_ANCHOR) + _RYML_CB_ERR(m_tree->callbacks(), "JSON does not have anchors"); + if(sc.scalar.len) + { + // use double quoted style... + // if it is a key (mandatory in JSON) + // if the style is marked quoted + bool dquoted = ((flags & (KEY|VALQUO)) + || (scalar_style_json_choose(sc.scalar) & SCALAR_DQUO)); // choose the style + if(dquoted) + _write_scalar_json_dquo(sc.scalar); + else + this->Writer::_do_write(sc.scalar); + } else - this->Writer::_do_write(sc.scalar); + { + if(sc.scalar.str || (flags & (KEY|VALQUO|KEYTAG|VALTAG))) + this->Writer::_do_write("\"\""); + else + this->Writer::_do_write("null"); + } } template diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index 806ee019..5831bb84 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -4385,7 +4385,7 @@ void ParseEngine::_handle_map_json() ScannedScalar sc = _scan_scalar_dquot(); csubstr maybe_filtered = _maybe_filter_key_scalar_dquot(sc); m_evt_handler->set_key_scalar_dquoted(maybe_filtered); - addrem_flags(RKCL, RKEY|QMRK); + addrem_flags(RKCL, RKEY); break; } case '}': // this happens on a trailing comma like ", }" diff --git a/test/test_parse_engine_6_qmrk.cpp b/test/test_parse_engine_6_qmrk.cpp index fe182ef2..118f6375 100644 --- a/test/test_parse_engine_6_qmrk.cpp +++ b/test/test_parse_engine_6_qmrk.cpp @@ -959,6 +959,70 @@ ENGINE_TEST(QmrkFlow1MapTag, ___(ps.end_stream()); } +#ifdef TODO_FIXME // this is the only failing suite test +ENGINE_TEST(QmrkTestSuiteM2N8_01_0, + (HAS_CONTAINER_KEYS, + "? []: x"), + "+STR\n" + "+DOC\n" + "+MAP\n" + "+MAP\n" + "+SEQ []\n" + "-SEQ\n" + "=VAL :x\n" + "-MAP\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n" +) +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_block()); + ___(ps.begin_map_key_block()); + ___(ps.begin_seq_key_flow()); + ___(ps.end_seq()); + ___(ps.set_val_scalar_plain("x")); + ___(ps.end_map()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(QmrkTestSuiteM2N8_01_1, + (HAS_CONTAINER_KEYS, + "? {}: x"), + "+STR\n" + "+DOC\n" + "+MAP\n" + "+MAP\n" + "+MAP {}\n" + "-MAP\n" + "=VAL :x\n" + "-MAP\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n" +) +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_block()); + ___(ps.begin_map_key_block()); + ___(ps.begin_map_key_flow()); + ___(ps.end_map()); + ___(ps.set_val_scalar_plain("x")); + ___(ps.end_map()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} +#endif + } // namespace yml } // namespace c4 From d9fd048cf0d80febbc52fe144e438362e3a1707e Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Fri, 31 May 2024 18:11:12 +0100 Subject: [PATCH 08/18] fix: should not accept both anchor and ref --- src/c4/yml/event_handler_tree.hpp | 48 ++++++++++++-------- src/c4/yml/parse_engine.def.hpp | 8 ++++ test/test_parse_engine_4_anchor.cpp | 24 ++++++++++ test/test_suite/test_suite_event_handler.hpp | 26 +++++------ 4 files changed, 72 insertions(+), 34 deletions(-) diff --git a/src/c4/yml/event_handler_tree.hpp b/src/c4/yml/event_handler_tree.hpp index d4fe0481..e84440c0 100644 --- a/src/c4/yml/event_handler_tree.hpp +++ b/src/c4/yml/event_handler_tree.hpp @@ -433,33 +433,41 @@ struct EventHandlerTree : public EventHandlerStacknode_id, anchor.len, anchor); - _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(KEYREF))) + _RYML_CB_ERR_(m_tree->callbacks(), "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), !anchor.begins_with('&')); _enable_(KEYANCH); m_curr->tr_data->m_key.anchor = anchor; } - void set_val_anchor(csubstr anchor) RYML_NOEXCEPT + void set_val_anchor(csubstr anchor) { _c4dbgpf("node[{}]: set val anchor: [{}]~~~{}~~~", m_curr->node_id, anchor.len, anchor); - _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(VALREF))) + _RYML_CB_ERR_(m_tree->callbacks(), "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), !anchor.begins_with('&')); _enable_(VALANCH); m_curr->tr_data->m_val.anchor = anchor; } - void set_key_ref(csubstr ref) RYML_NOEXCEPT + void set_key_ref(csubstr ref) { _c4dbgpf("node[{}]: set key ref: [{}]~~~{}~~~", m_curr->node_id, ref.len, ref); - _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(KEYANCH))) + _RYML_CB_ERR_(m_tree->callbacks(), "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), ref.begins_with('*')); _enable_(KEY|KEYREF); m_curr->tr_data->m_key.anchor = ref.sub(1); m_curr->tr_data->m_key.scalar = ref; } - void set_val_ref(csubstr ref) RYML_NOEXCEPT + void set_val_ref(csubstr ref) { _c4dbgpf("node[{}]: set val ref: [{}]~~~{}~~~", m_curr->node_id, ref.len, ref); - _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(VALANCH))) + _RYML_CB_ERR_(m_tree->callbacks(), "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_tree->callbacks(), ref.begins_with('*')); _enable_(VAL|VALREF); m_curr->tr_data->m_val.anchor = ref.sub(1); m_curr->tr_data->m_val.scalar = ref; @@ -495,11 +503,11 @@ struct EventHandlerTree : public EventHandlerStackcallbacks(), directive.begins_with('%')); if(directive.begins_with("%TAG")) { // TODO do not use directives in the tree - _RYML_CB_CHECK(m_stack.m_callbacks, m_tree->add_tag_directive(directive)); + _RYML_CB_CHECK(m_tree->callbacks(), m_tree->add_tag_directive(directive)); } else if(directive.begins_with("%YAML")) { @@ -584,7 +592,7 @@ struct EventHandlerTree : public EventHandlerStackcallbacks(), "cannot append to node"); } if(type.is_doc()) { @@ -645,15 +653,15 @@ struct EventHandlerTree : public EventHandlerStackroot_id() == 0u); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_curr->node_id == 0u); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->root_id() == 0u); + _RYML_CB_ASSERT(m_tree->callbacks(), m_curr->node_id == 0u); const bool hack = !m_tree->has_children(m_curr->node_id) && !m_tree->is_val(m_curr->node_id); if(hack) m_tree->_p(m_tree->root_id())->m_type.add(VAL); m_tree->set_root_as_stream(); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_stream(m_tree->root_id())); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_children(m_tree->root_id())); - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->is_doc(m_tree->first_child(m_tree->root_id()))); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_stream(m_tree->root_id())); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->has_children(m_tree->root_id())); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->is_doc(m_tree->first_child(m_tree->root_id()))); if(hack) m_tree->_p(m_tree->first_child(m_tree->root_id()))->m_type.rem(VAL); _set_state_(m_curr, m_tree->root_id()); @@ -676,7 +684,7 @@ struct EventHandlerTree : public EventHandlerStacksize() > 0); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->size() > 0); const id_type last_added = m_tree->size() - 1; if(m_tree->has_parent(last_added)) if(m_tree->_p(last_added)->m_type == NOTYPE) @@ -685,9 +693,9 @@ struct EventHandlerTree : public EventHandlerStacksize() > 0); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->size() > 0); const id_type last_added = m_tree->size() - 1; - _RYML_CB_ASSERT(m_stack.m_callbacks, m_tree->has_parent(last_added)); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->has_parent(last_added)); if(m_tree->_p(last_added)->m_type == NOTYPE) { _c4dbgpf("remove speculative node with parent. parent={} node={} parent(node)={}", m_parent->node_id, last_added, m_tree->parent(last_added)); @@ -697,7 +705,7 @@ struct EventHandlerTree : public EventHandlerStack_p(m_curr->node_id)->m_val.scalar.len == 0); + _RYML_CB_ASSERT(m_tree->callbacks(), m_tree->_p(m_curr->node_id)->m_val.scalar.len == 0); m_tree->_p(m_curr->node_id)->m_val.scalar.str = m_curr->line_contents.rem.str; } diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index 5831bb84..b40cbdff 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -4053,8 +4053,10 @@ bool ParseEngine::_annotations_require_key_container() const template void ParseEngine::_handle_annotations_before_blck_key_scalar() { + _c4dbgpf("annotations_before_blck_key_scalar, node={}", m_state->node_id); if(m_pending_tags.num_entries) { + _c4dbgpf("annotations_before_blck_key_scalar, #tags={}", m_pending_tags.num_entries); if(C4_LIKELY(m_pending_tags.num_entries == 1)) { m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); @@ -4067,6 +4069,7 @@ void ParseEngine::_handle_annotations_before_blck_key_scalar() } if(m_pending_anchors.num_entries) { + _c4dbgpf("annotations_before_blck_key_scalar, #anchors={}", m_pending_anchors.num_entries); if(C4_LIKELY(m_pending_anchors.num_entries == 1)) { m_evt_handler->set_key_anchor(m_pending_anchors.annotations[0].str); @@ -4082,8 +4085,10 @@ void ParseEngine::_handle_annotations_before_blck_key_scalar() template void ParseEngine::_handle_annotations_before_blck_val_scalar() { + _c4dbgpf("annotations_before_blck_val_scalar, node={}", m_state->node_id); if(m_pending_tags.num_entries) { + _c4dbgpf("annotations_before_blck_val_scalar, #tags={}", m_pending_tags.num_entries); if(C4_LIKELY(m_pending_tags.num_entries == 1)) { m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); @@ -4096,6 +4101,7 @@ void ParseEngine::_handle_annotations_before_blck_val_scalar() } if(m_pending_anchors.num_entries) { + _c4dbgpf("annotations_before_blck_val_scalar, #anchors={}", m_pending_anchors.num_entries); if(C4_LIKELY(m_pending_anchors.num_entries == 1)) { m_evt_handler->set_val_anchor(m_pending_anchors.annotations[0].str); @@ -5951,6 +5957,7 @@ void ParseEngine::_handle_map_block() { csubstr ref = _scan_ref_map(); _c4dbgpf("mapblck[RKEY]: key ref! [{}]~~~{}~~~", ref.len, ref); + _handle_annotations_before_blck_key_scalar(); m_evt_handler->set_key_ref(ref); addrem_flags(RVAL, RKEY); if(!_maybe_scan_following_colon()) @@ -6444,6 +6451,7 @@ void ParseEngine::_handle_map_block() else { _c4dbgp("mapblck[RVAL]: was val ref"); + _handle_annotations_before_blck_val_scalar(); m_evt_handler->set_val_ref(ref); addrem_flags(RNXT, RVAL); } diff --git a/test/test_parse_engine_4_anchor.cpp b/test/test_parse_engine_4_anchor.cpp index c21b6c50..4174080f 100644 --- a/test/test_parse_engine_4_anchor.cpp +++ b/test/test_parse_engine_4_anchor.cpp @@ -1111,6 +1111,30 @@ ENGINE_TEST(DoubleAnchorKeyMap, ___(ps.end_stream()); } +ENGINE_TEST_ERRLOC(AnchorAndAliasAsBlockMapKey_SU74, Location(2,10), "" + "key1: &alias value1\n" + "&b *alias : value2\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsFlowMapKey_SU74, Location(2,10), "" + "{key1: &alias value1,\n" + "&b *alias : value2}\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsFlowMapVal_SU74, Location(2,18), "" + "key1: &alias value1\n" + "value2: &b *alias\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsBlockMapVal_SU74, Location(2,18), "" + "{key1: &alias value1,\n" + "value2: &b *alias\n}") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsFlowSeqVal_SU74, Location(2,12), "" + "- &alias value1\n" + "- &b *alias\n") + +ENGINE_TEST_ERRLOC(AnchorAndAliasAsBlockSeqVal_SU74, Location(2,10), "" + "[&alias value1,\n" + "&b *alias]\n") + } // namespace yml } // namespace c4 diff --git a/test/test_suite/test_suite_event_handler.hpp b/test/test_suite/test_suite_event_handler.hpp index 5730374d..562bc0d2 100644 --- a/test/test_suite/test_suite_event_handler.hpp +++ b/test/test_suite/test_suite_event_handler.hpp @@ -474,32 +474,28 @@ struct EventHandlerYamlStd : public EventHandlerStacknode_id, anchor.len, anchor); - RYML_ASSERT(!anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(KEYANCH))) + _RYML_CB_ERR_(m_stack.m_callbacks, "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); _enable_(KEYANCH); m_curr->ev_data.m_key.anchor = anchor; } void set_val_anchor(csubstr anchor) { _c4dbgpf("node[{}]: set val anchor: [{}]~~~{}~~~", m_curr->node_id, anchor.len, anchor); - RYML_ASSERT(!anchor.begins_with('&')); + if(C4_UNLIKELY(_has_any_(VALREF))) + _RYML_CB_ERR_(m_stack.m_callbacks, "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, !anchor.begins_with('&')); _enable_(VALANCH); m_curr->ev_data.m_val.anchor = anchor; } - C4_ALWAYS_INLINE bool has_key_anchor() const - { - return _has_any_(KEYANCH); - } - - C4_ALWAYS_INLINE bool has_val_anchor() const - { - return _has_any_(VALANCH); - } - void set_key_ref(csubstr ref) { _c4dbgpf("node[{}]: set key ref: [{}]~~~{}~~~", m_curr->node_id, ref.len, ref); - RYML_ASSERT(ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(KEYANCH))) + _RYML_CB_ERR_(m_stack.m_callbacks, "key cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); _enable_(KEY|KEYREF); _send_("=ALI "); _send_(ref); @@ -508,7 +504,9 @@ struct EventHandlerYamlStd : public EventHandlerStacknode_id, ref.len, ref); - RYML_ASSERT(ref.begins_with('*')); + if(C4_UNLIKELY(_has_any_(VALANCH))) + _RYML_CB_ERR_(m_stack.m_callbacks, "val cannot have both anchor and ref", m_curr->pos); + _RYML_CB_ASSERT(m_stack.m_callbacks, ref.begins_with('*')); _enable_(VAL|VALREF); _send_("=ALI "); _send_(ref); From f7f03f434ec985a18cd3a8a7cc5e3183478b69e4 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Fri, 31 May 2024 20:03:35 +0100 Subject: [PATCH 09/18] fix: directive conformance --- changelog/current.md | 12 ++- src/c4/yml/event_handler_tree.hpp | 29 ++++-- src/c4/yml/parse_engine.def.hpp | 53 ++++++++++- src/c4/yml/parse_engine.hpp | 2 + test/test_lib/test_engine.hpp | 50 +++++++--- test/test_parse_engine_5_tag.cpp | 99 ++++++++++++++++++++ test/test_suite/test_suite_event_handler.hpp | 35 +++++-- test/test_suite/test_suite_parts.cpp | 7 +- tools/parse_emit.cpp | 68 ++++++++++++++ 9 files changed, 315 insertions(+), 40 deletions(-) diff --git a/changelog/current.md b/changelog/current.md index 59919749..f3cd77fd 100644 --- a/changelog/current.md +++ b/changelog/current.md @@ -1,11 +1,19 @@ -Most of the changes are from the giant Parser refactor described below. Before getting to that, a couple of other minor changes first. +Most of the changes are from the giant Parser refactor described below. Before getting to that, some other minor changes first. ### Fixes - [#PR431](https://github.com/biojppm/rapidyaml/pull/431) - Emitter: prevent stack overflows when emitting malicious trees by providing a max tree depth for the emit visitor. This was done by adding an `EmitOptions` structure as an argument both to the emitter and to the emit functions, which is then forwarded to the emitter. This `EmitOptions` structure has a max tree depth setting with a default value of 64. - [#PR431](https://github.com/biojppm/rapidyaml/pull/431) - Fix `_RYML_CB_ALLOC()` using `(T)` in parenthesis, making the macro unusable. -- [#434] - Ensure empty vals are not deserialized ([#PR436](https://github.com/biojppm/rapidyaml/pull/436)). +- [#434](https://github.com/biojppm/rapidyaml/issues/434) - Ensure empty vals are not deserialized ([#PR436](https://github.com/biojppm/rapidyaml/pull/436)). +- [#PR433](https://github.com/biojppm/rapidyaml/pull/433): + - Fix some corner cases causing read-after-free in the tree's arena when it is relocated while filtering scalar. + - Improve YAML error conformance and - detect YAML-mandated parse errors when: + - directives are misplaced (eg [9MMA](https://matrix.yaml.info/details/9MMA.html), [9HCY](https://matrix.yaml.info/details/9HCY.html), [B63P](https://matrix.yaml.info/details/B63P.html), [EB22](https://matrix.yaml.info/details/EB22.html), [SF5V](https://matrix.yaml.info/details/SF5V.html)). + - comments are misplaced (eg [MUS6/00](https://matrix.yaml.info/details/MUS6:00.html), [9JBA](https://matrix.yaml.info/details/9JBA.html), [SU5Z](https://matrix.yaml.info/details/SU5Z.html)) + - a node has both an anchor and an alias (eg [SR86](https://matrix.yaml.info/details/SR86.html), [SU74](https://matrix.yaml.info/details/SU74.html)). + - tags contain [invalid characters](https://yaml.org/spec/1.2.2/#tag-shorthands) `,{}[]` (eg [LHL4](https://matrix.yaml.info/details/LHL4.html), [U99R](https://matrix.yaml.info/details/U99R.html), [WZ62](https://matrix.yaml.info/details/WZ62.html)). + ### New features diff --git a/src/c4/yml/event_handler_tree.hpp b/src/c4/yml/event_handler_tree.hpp index e84440c0..a349894b 100644 --- a/src/c4/yml/event_handler_tree.hpp +++ b/src/c4/yml/event_handler_tree.hpp @@ -43,6 +43,8 @@ struct EventHandlerTree : public EventHandlerStack(); _c4dbgpf("node[{}]: enable {}", m_curr->node_id, #bits) @@ -59,9 +61,9 @@ struct EventHandlerTree : public EventHandlerStackcallbacks()), m_tree(tree), m_id(id) + EventHandlerTree() : EventHandlerStack(), m_tree(), m_id(NONE), m_num_directives(), m_yaml_directive() {} + EventHandlerTree(Callbacks const& cb) : EventHandlerStack(cb), m_tree(), m_id(NONE), m_num_directives(), m_yaml_directive() {} + EventHandlerTree(Tree *tree, id_type id) : EventHandlerStack(tree->callbacks()), m_tree(tree), m_id(id), m_num_directives(), m_yaml_directive() { reset(tree, id); } @@ -73,7 +75,7 @@ struct EventHandlerTree : public EventHandlerStackis_root(id)) if(tree->is_map(tree->parent(id))) if(!tree->has_key(id)) - c4::yml::error("destination node belongs to a map and has no key"); + _RYML_CB_ERR(m_stack.m_callbacks, "destination node belongs to a map and has no key"); m_tree = tree; m_id = id; if(m_tree->is_root(id)) @@ -87,6 +89,8 @@ struct EventHandlerTree : public EventHandlerStackparent(id)); _reset_parser_state(m_curr, id, id); } + m_num_directives = 0; + m_yaml_directive = false; } /** @} */ @@ -103,6 +107,8 @@ struct EventHandlerTree : public EventHandlerStackis_stream(m_tree->root_id())) + _RYML_CB_ERR_(m_stack.m_callbacks, "directives cannot be used without a document", {}); this->_stack_finish_parse(); /* This pointer is temporary. Remember that: * @@ -115,7 +121,7 @@ struct EventHandlerTree : public EventHandlerStackcallbacks(), directive.begins_with('%')); if(directive.begins_with("%TAG")) { - // TODO do not use directives in the tree - _RYML_CB_CHECK(m_tree->callbacks(), m_tree->add_tag_directive(directive)); + if(C4_UNLIKELY(!m_tree->add_tag_directive(directive))) + _RYML_CB_ERR_(m_stack.m_callbacks, "failed to add directive", m_curr->pos); } else if(directive.begins_with("%YAML")) { _c4dbgpf("%YAML directive! ignoring...: {}", directive); + if(C4_UNLIKELY(m_yaml_directive)) + _RYML_CB_ERR_(m_stack.m_callbacks, "multiple yaml directives", m_curr->pos); + m_yaml_directive = true; } else { - _c4dbgpf("% directive unknown! ignoring...: {}", directive); + _c4dbgpf("unknown directive! ignoring... {}", directive); } + ++m_num_directives; } /** @} */ diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index b40cbdff..aae1f6fd 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -552,14 +552,37 @@ void ParseEngine::_skipchars(const char (&chars)[N]) _line_progressed(pos); } +template +void ParseEngine::_skip_comment() +{ + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, m_state->line_contents.rem.begins_with('#')); + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, m_state->line_contents.rem.is_sub(m_state->line_contents.full)); + csubstr rem = m_state->line_contents.rem; + csubstr full = m_state->line_contents.full; + // raise an error if the comment is not preceded by whitespace + if(!full.begins_with('#')) + { + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.str > full.str); + const char c = full[(size_t)(rem.str - full.str - 1)]; + if(C4_UNLIKELY(c != ' ' && c != '\t')) + _RYML_CB_ERR(m_evt_handler->m_stack.m_callbacks, "comment not preceded by whitespace"); + } + else + { + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.str == full.str); + } + _c4dbgpf("comment was '{}'", rem); + _line_progressed(rem.len); +} + template void ParseEngine::_maybe_skip_comment() { csubstr s = m_state->line_contents.rem.triml(' '); if(s.begins_with('#')) { - _c4dbgpf("comment was '{}'", s); - _line_progressed(m_state->line_contents.rem.len); + _line_progressed((size_t)(s.str - m_state->line_contents.rem.str)); + _skip_comment(); } } @@ -4225,6 +4248,27 @@ size_t ParseEngine::_select_indentation_from_annotations(size_t va return curr->line < val_line ? val_indentation : curr->indentation; } +template +void ParseEngine::_handle_directive(csubstr rem) +{ + _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.is_sub(m_state->line_contents.rem)); + const size_t pos = rem.find('#'); + _c4dbgpf("handle_directive: pos={} rem={}", pos, rem); + if(pos == npos) // no comments + { + m_evt_handler->add_directive(rem); + _line_progressed(rem.len); + } + else + { + csubstr to_comment = rem.first(pos); + csubstr trimmed = to_comment.trimr(" \t"); + m_evt_handler->add_directive(trimmed); + _line_progressed(pos); + _skip_comment(); + } +} + //----------------------------------------------------------------------------- @@ -7121,8 +7165,9 @@ void ParseEngine::_handle_unk() else if(first == '%') { _c4dbgpf("directive: {}", rem); - m_evt_handler->add_directive(rem); - _line_progressed(rem.len); + if(C4_UNLIKELY(!m_doc_empty && has_none(NDOC))) + _RYML_CB_ERR(m_evt_handler->m_stack.m_callbacks, "need document footer before directives"); + _handle_directive(rem); return; } } diff --git a/src/c4/yml/parse_engine.hpp b/src/c4/yml/parse_engine.hpp index 759a842b..8435841d 100644 --- a/src/c4/yml/parse_engine.hpp +++ b/src/c4/yml/parse_engine.hpp @@ -571,6 +571,7 @@ class ParseEngine void _handle_indentation_pop(ParserState const* dst); void _maybe_skip_comment(); + void _skip_comment(); void _maybe_skip_whitespace_tokens(); void _maybe_skipchars(char c); #ifdef RYML_NO_COVERAGE__TO_BE_DELETED @@ -709,6 +710,7 @@ class ParseEngine void _handle_annotations_before_start_mapblck_as_key(); void _handle_annotations_and_indentation_after_start_mapblck(size_t key_indentation, size_t key_line); size_t _select_indentation_from_annotations(size_t val_indentation, size_t val_line); + void _handle_directive(csubstr rem); private: diff --git a/test/test_lib/test_engine.hpp b/test/test_lib/test_engine.hpp index 880bd67d..d749263d 100644 --- a/test/test_lib/test_engine.hpp +++ b/test/test_lib/test_engine.hpp @@ -78,8 +78,8 @@ void test_new_parser_events_from_yaml(ReferenceYaml const& yaml, std::string con void test_new_parser_tree_from_yaml(ReferenceYaml const& yaml); void test_new_parser_events_from_yaml_with_comments(ReferenceYaml const& yaml, std::string const& expected_events); void test_new_parser_tree_from_yaml_with_comments(ReferenceYaml const& yaml); -void test_expected_error_events_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location); -void test_expected_error_tree_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location); +void test_expected_error_events_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location={}); +void test_expected_error_tree_from_yaml(std::string const& parsed_yaml, Location const& expected_error_location={}); @@ -92,6 +92,7 @@ void test_expected_error_tree_from_yaml(std::string const& parsed_yaml, Location #define _RYML_SHOWFILELINE(name) #endif + //----------------------------------------------------------------------------- #define ENGINE_TEST_ERRLOC(name, location, refyaml) \ @@ -115,6 +116,29 @@ TEST(EngineTest, name##_err_tree_from_yaml) \ } +//----------------------------------------------------------------------------- + +#define ENGINE_TEST_ERR(name, refyaml) \ + \ + \ +TEST(EngineTest, name##_err_events_from_yaml) \ +{ \ + _RYML_SHOWFILELINE(name); \ + SCOPED_TRACE(#name "_err_events_from_yaml"); \ + test_expected_error_events_from_yaml(refyaml); \ + _RYML_SHOWFILELINE(name); \ +} \ + \ + \ +TEST(EngineTest, name##_err_tree_from_yaml) \ +{ \ + _RYML_SHOWFILELINE(name); \ + SCOPED_TRACE(#name "_err_tree_from_yaml"); \ + test_expected_error_tree_from_yaml(refyaml); \ + _RYML_SHOWFILELINE(name); \ +} + + //----------------------------------------------------------------------------- @@ -185,7 +209,7 @@ TEST(EngineTest, name##_events_from_yaml_with_comments) \ TEST(EngineTest, name##_tree_from_yaml_with_comments) \ { \ _RYML_SHOWFILELINE(name); \ - SCOPED_TRACE(#name "_wtree_from_yaml"); \ + SCOPED_TRACE(#name "_tree_from_yaml"); \ ReferenceYaml yaml refyaml; \ test_new_parser_tree_from_yaml_with_comments(yaml); \ _RYML_SHOWFILELINE(name); \ @@ -204,9 +228,9 @@ void name##_impl(Ps &ps) #if !defined(RYML_DBG) #define ___(stmt) stmt #else -inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt) +inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt, const char *file, int line) { - _c4dbgpf("{}", stmt); + _dbg_printf("{}:{}: {}", file, line, stmt); auto indent = [](id_type n){ for(id_type level = 0; level < n; ++level) { @@ -218,28 +242,28 @@ inline void _print_handler_info(EventHandlerYamlStd const& ps, csubstr stmt) csubstr const& str = ps._buf_(i); indent(i); _dbg_printf("[{}]\n", i); - for(csubstr line : str.split('\n')) + for(csubstr ln : str.split('\n')) { indent(i); - _dbg_printf("{}\n", line); + _dbg_printf("{}\n", ln); } } } -inline void _print_handler_info(EventHandlerTree const& ps, csubstr stmt) +inline void _print_handler_info(EventHandlerTree const& ps, csubstr stmt, const char *file, int line) { if(ps.m_parent) - _c4dbgpf("parent.id={} curr.id={} {}\n", - ps.m_parent->node_id, ps.m_curr->node_id, stmt); + _dbg_printf("{}:{}: parent.id={} curr.id={} {}\n", + file, line, ps.m_parent->node_id, ps.m_curr->node_id, stmt); else - _c4dbgpf("parent.id=-- curr.id={} {}\n", - ps.m_curr->node_id, stmt); + _dbg_printf("{}:{}: parent.id=-- curr.id={} {}\n", + file, line, ps.m_curr->node_id, stmt); print_tree(*ps.m_tree); } #define ___(stmt) \ do \ { \ stmt; \ - _print_handler_info(ps, #stmt); \ + _print_handler_info(ps, #stmt, __FILE__, __LINE__); \ } while(0) #endif diff --git a/test/test_parse_engine_5_tag.cpp b/test/test_parse_engine_5_tag.cpp index c1279c2a..d58083fb 100644 --- a/test/test_parse_engine_5_tag.cpp +++ b/test/test_parse_engine_5_tag.cpp @@ -163,6 +163,105 @@ ENGINE_TEST(TagTestSuite6WLZ_1, ___(ps.end_stream()); } + +//----------------------------------------------------------------------------- + +ENGINE_TEST_ERR(DirectiveTestSuite9MMA_YAML, + "%YAML 1.2") + +ENGINE_TEST_ERR(DirectiveTestSuite9MMA_TAG, + "%TAG ! tag:example.com,2000:app/\n") + +ENGINE_TEST_ERR(DirectiveTestSuiteMUS6_00, + "%YAML 1.1#...\n" + "---\n") + +ENGINE_TEST_ERR(DirectiveTestSuite9HCY, + "!foo \"bar\"\n" + "%TAG ! tag:example.com,2000:app/\n" + "---\n" + "!foo \"bar\"\n") + +ENGINE_TEST(DirectiveTestSuiteMUS6, + ("%YAM 1.1\n" + "---\n", + "---\n"), + "+STR\n" + "+DOC ---\n" + "=VAL :\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.add_directive("%YAM 1.1")); + ___(ps.begin_doc_expl()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(DirectiveMultipleYAML_W4TN, + ("" + "%YAML 1.2\n" + "---\n" + "foo\n" + "...\n" + "%YAML 1.2\n" + "---\n" + "# Empty\n" + "...\n" + "", + "--- foo\n" + "---\n" + ""), + "+STR\n" + "+DOC ---\n" + "=VAL :foo\n" + "-DOC ...\n" + "+DOC ---\n" + "=VAL :\n" + "-DOC ...\n" + "-STR\n" + "") +{ + ___(ps.begin_stream()); + ___(ps.add_directive("%YAML 1.2")); + ___(ps.begin_doc_expl()); + ___(ps.set_val_scalar_plain("foo")); + ___(ps.end_doc_expl()); + ___(ps.add_directive("%YAML 1.2")); + ___(ps.begin_doc_expl()); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_doc_expl()); + ___(ps.end_stream()); +} + +ENGINE_TEST_ERR(DirectiveMultipleYAML_0, + "%YAML 1.2\n" + "...\n" + "%YAML 1.2\n" + "---\n" + "bar") + +ENGINE_TEST_ERR(DirectiveMultipleYAML_1, + "%YAML 1.1\n" + "...\n" + "%YAML 1.2\n" + "---\n" + "bar") + +ENGINE_TEST_ERR(DirectiveMultipleYAML_2, + "%YAML 1.2\n" + "%YAML 1.2\n" + "---\n" + "bar") + +ENGINE_TEST_ERR(DirectiveMultipleYAML_3, + "%YAML 1.1\n" + "%YAML 1.2\n" + "---\n" + "bar") + } // namespace yml } // namespace c4 diff --git a/test/test_suite/test_suite_event_handler.hpp b/test/test_suite/test_suite_event_handler.hpp index 562bc0d2..6d6e087c 100644 --- a/test/test_suite/test_suite_event_handler.hpp +++ b/test/test_suite/test_suite_event_handler.hpp @@ -70,7 +70,9 @@ struct EventHandlerYamlStd : public EventHandlerStack() @@ -83,9 +85,9 @@ struct EventHandlerYamlStd : public EventHandlerStackflags |= RUNK|RTOP; - for(auto &td : m_tag_directives) + m_has_yaml_directive = false; + for(TagDirective &td : m_tag_directives) td = {}; m_val_buffers.resize((size_t)m_stack.size()); m_arena.clear(); m_arena.reserve(1024); m_key_tag_buf.resize(256); m_val_tag_buf.resize(256); + m_has_docs = false; } /** @} */ @@ -118,6 +122,9 @@ struct EventHandlerYamlStd : public EventHandlerStack_stack_finish_parse(); } void cancel_parse() @@ -163,6 +170,7 @@ struct EventHandlerYamlStd : public EventHandlerStack= RYML_MAX_TAG_DIRECTIVES)) + _RYML_CB_ERR_(m_stack.m_callbacks, "too many directives", m_curr->pos); + if(C4_UNLIKELY(!m_tag_directives[pos].create_from_str(directive))) + _RYML_CB_ERR_(m_stack.m_callbacks, "failed to add directive", m_curr->pos); + } + else if(directive.begins_with("%YAML")) + { + _c4dbgpf("%YAML directive! ignoring...: {}", directive); + if(C4_UNLIKELY(m_has_yaml_directive)) + _RYML_CB_ERR_(m_stack.m_callbacks, "multiple yaml directives", m_curr->pos); + m_has_yaml_directive = true; + } + else + { + _c4dbgpf("unknown directive! ignoring... {}", directive); } } diff --git a/test/test_suite/test_suite_parts.cpp b/test/test_suite/test_suite_parts.cpp index 9c22090f..6bf8e45a 100644 --- a/test/test_suite/test_suite_parts.cpp +++ b/test/test_suite/test_suite_parts.cpp @@ -21,14 +21,12 @@ constexpr const AllowedFailure allowed_failures[] = { // errors that fail to materialize _("3HFZ-error" , "should not accept scalar after ..."), - _("4EJS-error" , "should not accept double anchor for scalar"), + _("4EJS-error" , "should not accept tabs as indendation in a mapping"), _("5TRB-error" , "should not accept document-end marker in double quoted string"), _("5U3A-error" , "should not accept opening a sequence on same line as map key"), _("7LBH-error" , "should not accept multiline double quoted implicit keys"), _("9C9N-error" , "should not accept non-indented flow sequence"), - _("9HCY-error" , "should not accept tag directive in non-doc scope"), _("9JBA-error" , "should not accept comment after flow seq terminating ]"), - _("9MMA-error" , "should not accept empty doc after %YAML directive"), _("9MQT_01-error" , "should not accept scalars after ..."), _("B63P-error" , "should not accept directive without doc"), _("C2SP-error" , "should not accept flow sequence with terminating ] on the next line"), @@ -43,8 +41,6 @@ constexpr const AllowedFailure allowed_failures[] = { _("H7TQ-error" , "should not accept extra words after directive"), _("JKF3-error" , "should not accept multiline unindented double quoted scalar"), _("LHL4-error" , "should not accept tag"), - _("MUS6_00-error" , "should not accept #... at the end of %YAML directive"), - _("MUS6_01-error" , "should not accept #... at the end of %YAML directive"), _("N782-error" , "TBD"), _("P2EQ-error" , "should not accept sequence item on same line as previous item"), _("QB6E-error" , "should not accept indented multiline quoted scalar"), @@ -55,7 +51,6 @@ constexpr const AllowedFailure allowed_failures[] = { _("SF5V-error" , "should not accept duplicate YAML directive"), _("SR86-error" , "TBD"), _("SU5Z-error" , "should not accept comment without whitespace after double quoted scalar"), - _("SU74-error" , "should not accept anchor and alias as mapping key"), _("SY6V-error" , "TBD"), _("U99R-error" , "should not accept comma in a tag"), _("VJP3_00-error" , "should not accept flow collections over many lines"), diff --git a/tools/parse_emit.cpp b/tools/parse_emit.cpp index dda9729f..23bfec35 100644 --- a/tools/parse_emit.cpp +++ b/tools/parse_emit.cpp @@ -44,6 +44,74 @@ struct timed_section //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- +void process_file(csubstr file) +{ + TS(objects); + std::string contents; + yml::Tree tree(yml::get_callbacks()); + { + TS(read_file); + load_file(file, &contents); + } + { + TS(tree_reserve); + yml::id_type cap; + { + TS(estimate_capacity); + cap = yml::estimate_tree_capacity(to_csubstr(contents)); + } + if(debug_tree) + fprintf(stderr, "reserving capacity=%zu\n", (size_t)cap); + tree.reserve(cap); + } + { + TS(parse_yml); + yml::parse_in_place(file, to_substr(contents), &tree); + } + if(debug_tree) + { + TS(debug_tree); + yml::print_tree(tree); + } + if(emit_as_json) + { + { + TS(resolve_refs); + tree.resolve(); + } + if(debug_tree) + { + TS(debug_resolved_tree); + yml::print_tree(tree); + } + } + if(emit_to_string) + { + std::string output; + { + TS(emit_to_buffer); + output.resize(contents.size()); // resize, not just reserve + if(!emit_as_json) + yml::emitrs_yaml(tree, &output); + else + emit_json_docs(tree, &output); + } + if(!quiet) + { + TS(print_stdout); + fwrite(output.data(), 1, output.size(), stdout); + } + } + else if(!quiet) + { + TS(emit_to_stdout); + if(!emit_as_json) + yml::emit_yaml(tree); + else + emit_json_docs(tree); + } +} + int main(int argc, const char *argv[]) { bool print_emitted_to_stdout = true; From 024cfeae71f5a97ca3a104be9700882821532e58 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 1 Jun 2024 01:27:07 +0100 Subject: [PATCH 10/18] fix: error when tags have invalid characters --- src/c4/yml/parse_engine.def.hpp | 79 ++++++++- src/c4/yml/parse_engine.hpp | 4 + test/test_parse_engine_5_tag.cpp | 255 +++++++++++++++++++++++++++ test/test_suite/test_suite_parts.cpp | 9 - 4 files changed, 334 insertions(+), 13 deletions(-) diff --git a/src/c4/yml/parse_engine.def.hpp b/src/c4/yml/parse_engine.def.hpp index aae1f6fd..972c742d 100644 --- a/src/c4/yml/parse_engine.def.hpp +++ b/src/c4/yml/parse_engine.def.hpp @@ -609,6 +609,29 @@ bool ParseEngine::_maybe_scan_following_colon() noexcept return false; } +template +bool ParseEngine::_maybe_scan_following_comma() noexcept +{ + if(m_state->line_contents.rem.len) + { + if(m_state->line_contents.rem.str[0] == ' ' || m_state->line_contents.rem.str[0] == '\t') + { + size_t pos = m_state->line_contents.rem.first_not_of(" \t"); + if(pos == npos) + pos = m_state->line_contents.rem.len; // maybe the line has only spaces + _c4dbgpf("skip {}x'{}'", pos, ' '); + _line_progressed(pos); + } + if(m_state->line_contents.rem.len && (m_state->line_contents.rem.str[0] == ',')) + { + _c4dbgp("found ',' comma next"); + _line_progressed(1); + return true; + } + } + return false; +} + //----------------------------------------------------------------------------- @@ -652,7 +675,10 @@ csubstr ParseEngine::_scan_tag() if(rem.begins_with("!!")) { _c4dbgp("begins with '!!'"); - t = rem.left_of(rem.first_of(" ,")); + if(has_any(FLOW)) + t = rem.left_of(rem.first_of(" ,")); + else + t = rem.left_of(rem.first_of(' ')); } else if(rem.begins_with("!<")) { @@ -670,7 +696,10 @@ csubstr ParseEngine::_scan_tag() { _RYML_CB_ASSERT(m_evt_handler->m_stack.m_callbacks, rem.begins_with('!')); _c4dbgp("begins with '!'"); - t = rem.left_of(rem.first_of(' ')); + if(has_any(FLOW)) + t = rem.left_of(rem.first_of(" ,")); + else + t = rem.left_of(rem.first_of(' ')); } _line_progressed(t.len); _maybe_skip_whitespace_tokens(); @@ -802,9 +831,14 @@ bool ParseEngine::_scan_scalar_plain_seq_flow(ScannedScalar *C4_RE _c4dbgpf("found terminating character at {}: '{}'", i, c); _line_progressed(i); if(m_state->pos.offset + i > start_offset) + { goto ended_scalar; + } else - _c4err("parse error"); + { + _c4dbgp("at the beginning. no scalar here."); + return false; + } break; case ']': _c4dbgpf("found terminating character at {}: '{}'", i, c); @@ -4073,6 +4107,21 @@ bool ParseEngine::_annotations_require_key_container() const return m_pending_tags.num_entries > 1 || m_pending_anchors.num_entries > 1; } +template +void ParseEngine::_check_tag(csubstr tag) +{ + if(!tag.begins_with("!<")) + { + if(C4_UNLIKELY(tag.first_of("[]{},") != npos)) + _RYML_CB_ERR_(m_evt_handler->m_stack.m_callbacks, "tags must not contain any of '[]{},'", m_state->pos); + } + else + { + if(C4_UNLIKELY(!tag.ends_with('>'))) + _RYML_CB_ERR_(m_evt_handler->m_stack.m_callbacks, "malformed tag", m_state->pos); + } +} + template void ParseEngine::_handle_annotations_before_blck_key_scalar() { @@ -4082,6 +4131,7 @@ void ParseEngine::_handle_annotations_before_blck_key_scalar() _c4dbgpf("annotations_before_blck_key_scalar, #tags={}", m_pending_tags.num_entries); if(C4_LIKELY(m_pending_tags.num_entries == 1)) { + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); } @@ -4114,6 +4164,7 @@ void ParseEngine::_handle_annotations_before_blck_val_scalar() _c4dbgpf("annotations_before_blck_val_scalar, #tags={}", m_pending_tags.num_entries); if(C4_LIKELY(m_pending_tags.num_entries == 1)) { + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); } @@ -4144,6 +4195,7 @@ void ParseEngine::_handle_annotations_before_start_mapblck(size_t if(m_pending_tags.num_entries == 2) { _c4dbgp("2 tags, setting entry 0"); + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); } else if(m_pending_tags.num_entries == 1) @@ -4152,6 +4204,7 @@ void ParseEngine::_handle_annotations_before_start_mapblck(size_t if(m_pending_tags.annotations[0].line < current_line) { _c4dbgp("...tag is for the map. setting it."); + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_val_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); } @@ -4180,6 +4233,7 @@ void ParseEngine::_handle_annotations_before_start_mapblck_as_key( _c4dbgp("annotations_before_start_mapblck_as_key"); if(m_pending_tags.num_entries == 2) { + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); } if(m_pending_anchors.num_entries == 2) @@ -4200,10 +4254,12 @@ void ParseEngine::_handle_annotations_and_indentation_after_start_ switch(m_pending_tags.num_entries) { case 1u: + _check_tag(m_pending_tags.annotations[0].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[0].str); _clear_annotations(&m_pending_tags); break; case 2u: + _check_tag(m_pending_tags.annotations[1].str); m_evt_handler->set_key_tag(m_pending_tags.annotations[1].str); _clear_annotations(&m_pending_tags); break; @@ -4915,12 +4971,25 @@ void ParseEngine::_handle_seq_flow() csubstr anchor = _scan_anchor(); _c4dbgpf("seqflow[RVAL]: anchor! [{}]~~~{}~~~", anchor.len, anchor); m_evt_handler->set_val_anchor(anchor); + if(_maybe_scan_following_comma()) + { + _c4dbgp("seqflow[RVAL]: empty scalar!"); + m_evt_handler->set_val_scalar_plain({}); + m_evt_handler->add_sibling(); + } } else if(first == '!') { csubstr tag = _scan_tag(); _c4dbgpf("seqflow[RVAL]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_val_tag(tag); + if(_maybe_scan_following_comma()) + { + _c4dbgp("seqflow[RVAL]: empty scalar!"); + m_evt_handler->set_val_scalar_plain({}); + m_evt_handler->add_sibling(); + } } else if(first == ':') { @@ -4944,7 +5013,6 @@ void ParseEngine::_handle_seq_flow() _line_progressed(1); _maybe_skip_whitespace_tokens(); goto seqflow_finish; - } else { @@ -5123,6 +5191,7 @@ void ParseEngine::_handle_map_flow() { csubstr tag = _scan_tag(); _c4dbgpf("mapflow[RKEY]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_key_tag(tag); } else @@ -5244,6 +5313,7 @@ void ParseEngine::_handle_map_flow() { csubstr tag = _scan_tag(); _c4dbgpf("mapflow[RVAL]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_val_tag(tag); } else @@ -5370,6 +5440,7 @@ void ParseEngine::_handle_map_flow() { csubstr tag = _scan_tag(); _c4dbgpf("mapflow[QMRK]: tag! [{}]~~~{}~~~", tag.len, tag); + _check_tag(tag); m_evt_handler->set_key_tag(tag); } else diff --git a/src/c4/yml/parse_engine.hpp b/src/c4/yml/parse_engine.hpp index 8435841d..f63653eb 100644 --- a/src/c4/yml/parse_engine.hpp +++ b/src/c4/yml/parse_engine.hpp @@ -580,6 +580,7 @@ class ParseEngine template void _skipchars(const char (&chars)[N]); bool _maybe_scan_following_colon() noexcept; + bool _maybe_scan_following_comma() noexcept; public: @@ -700,6 +701,7 @@ class ParseEngine void _add_annotation(Annotation *C4_RESTRICT dst, csubstr str, size_t indentation, size_t line); void _clear_annotations(Annotation *C4_RESTRICT dst); + bool _has_pending_annotations() const { return m_pending_tags.num_entries || m_pending_anchors.num_entries; } #ifdef RYML_NO_COVERAGE__TO_BE_DELETED bool _handle_indentation_from_annotations(); #endif @@ -712,6 +714,8 @@ class ParseEngine size_t _select_indentation_from_annotations(size_t val_indentation, size_t val_line); void _handle_directive(csubstr rem); + void _check_tag(csubstr tag); + private: ParserOptions m_options; diff --git a/test/test_parse_engine_5_tag.cpp b/test/test_parse_engine_5_tag.cpp index d58083fb..540deaf2 100644 --- a/test/test_parse_engine_5_tag.cpp +++ b/test/test_parse_engine_5_tag.cpp @@ -93,6 +93,261 @@ ENGINE_TEST(DirectiveAndTag, //----------------------------------------------------------------------------- +ENGINE_TEST_ERRLOC(TagTestSuiteU99R_0, Location(2,1), + "- !!str, xxx\n") + +ENGINE_TEST_ERRLOC(TagTestSuiteU99R_1, Location(2,1), + "- !str, xxx\n") + +ENGINE_TEST(TagTestSuiteU99R_2, + ("[!!str, xxx]", "[!!str ,xxx]"), + "+STR\n" + "+DOC\n" + "+SEQ []\n" + "=VAL :\n" + "=VAL :xxx\n" + "-SEQ\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_seq_val_flow()); + ___(ps.set_val_tag("!!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_val_scalar_plain("xxx")); + ___(ps.end_seq()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteU99R_2_1, + ("[!str, xxx]","[!str ,xxx]"), + "+STR\n" + "+DOC\n" + "+SEQ []\n" + "=VAL :\n" + "=VAL :xxx\n" + "-SEQ\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_seq_val_flow()); + ___(ps.set_val_tag("!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_val_scalar_plain("xxx")); + ___(ps.end_seq()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteU99R_3, + ("{!!str, xxx}", "{!!str : ,xxx: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :xxx\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_tag("!!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_scalar_plain("xxx")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteU99R_3_1, + ("{!str, xxx}", "{!str : ,xxx: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :xxx\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_tag("!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_scalar_plain("xxx")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_0_0, + ("{foo: !!str , !!str : bar}", "{foo: !!str ,!!str : bar}"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :bar\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain("bar")); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_0_1, + ("{foo: !str , !str : bar}", "{foo: !str ,!str : bar}"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :\n" + "=VAL :bar\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!str")); + ___(ps.set_key_scalar_plain({})); + ___(ps.set_val_scalar_plain("bar")); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_1_0, + ("{foo: !!str, !!str: bar}", "{foo: !!str ,!!str: bar: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :bar\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!!str:")); + ___(ps.set_key_scalar_plain("bar")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST(TagTestSuiteWZ62_1_1, + ("{foo: !str, !str: bar}", "{foo: !str ,!str: bar: }"), + "+STR\n" + "+DOC\n" + "+MAP {}\n" + "=VAL :foo\n" + "=VAL :\n" + "=VAL :bar\n" + "=VAL :\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("foo")); + ___(ps.set_val_tag("!str")); + ___(ps.set_val_scalar_plain({})); + ___(ps.add_sibling()); + ___(ps.set_key_tag("!str:")); + ___(ps.set_key_scalar_plain("bar")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + +ENGINE_TEST_ERRLOC(TagTestSuiteLHL4_0, Location(3,1), + "---\n" + "!invalid{}tag scalar\n") + +ENGINE_TEST_ERRLOC(TagTestSuiteLHL4_1, Location(3,1), + "---\n" + "!!invalid{}tag scalar\n") + +ENGINE_TEST(TagTestSuiteUGM3, + ("--- !\n" + "invoice: 34843\n" + "date: 2001-01-23\n"), + "+STR\n" + "+DOC ---\n" + "+MAP \n" + "=VAL :invoice\n" + "=VAL :34843\n" + "=VAL :date\n" + "=VAL :2001-01-23\n" + "-MAP\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc_expl()); + ___(ps.set_val_tag("!")); + ___(ps.begin_map_val_block()); + ___(ps.set_key_scalar_plain("invoice")); + ___(ps.set_val_scalar_plain("34843")); + ___(ps.add_sibling()); + ___(ps.set_key_scalar_plain("date")); + ___(ps.set_val_scalar_plain("2001-01-23")); + ___(ps.end_map()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + ENGINE_TEST(TagTestSuiteUKK6_02_0, ("!", "! \n"), "+STR\n" diff --git a/test/test_suite/test_suite_parts.cpp b/test/test_suite/test_suite_parts.cpp index 6bf8e45a..54e70176 100644 --- a/test/test_suite/test_suite_parts.cpp +++ b/test/test_suite/test_suite_parts.cpp @@ -26,9 +26,7 @@ constexpr const AllowedFailure allowed_failures[] = { _("5U3A-error" , "should not accept opening a sequence on same line as map key"), _("7LBH-error" , "should not accept multiline double quoted implicit keys"), _("9C9N-error" , "should not accept non-indented flow sequence"), - _("9JBA-error" , "should not accept comment after flow seq terminating ]"), _("9MQT_01-error" , "should not accept scalars after ..."), - _("B63P-error" , "should not accept directive without doc"), _("C2SP-error" , "should not accept flow sequence with terminating ] on the next line"), _("CVW2-error" , "should not accept flow sequence with comment after ,"), _("CXX2-error" , "should not accept mapping with anchor on document start line"), @@ -36,23 +34,16 @@ constexpr const AllowedFailure allowed_failures[] = { _("DK4H-error" , "should not accept implicit key followed by newline"), _("DK95_01-error" , "should not accept leading tabs in double quoted multiline scalar"), _("DK95_06-error" , "should not accept tab indentation"), - _("EB22-error" , "should not accept missing document-end marker before directive"), _("G5U8-error" , "should not accept [-, -]"), _("H7TQ-error" , "should not accept extra words after directive"), _("JKF3-error" , "should not accept multiline unindented double quoted scalar"), - _("LHL4-error" , "should not accept tag"), _("N782-error" , "TBD"), - _("P2EQ-error" , "should not accept sequence item on same line as previous item"), _("QB6E-error" , "should not accept indented multiline quoted scalar"), _("QLJ7-error" , "tag directives should apply only to the next doc (?)"), _("RXY3-error" , "should not accept document-end marker in single quoted string"), _("S4GJ-error" , "should not accept text after block scalar indicator"), _("S98Z-error" , "should not accept block scalar with more spaces than first content line"), - _("SF5V-error" , "should not accept duplicate YAML directive"), - _("SR86-error" , "TBD"), - _("SU5Z-error" , "should not accept comment without whitespace after double quoted scalar"), _("SY6V-error" , "TBD"), - _("U99R-error" , "should not accept comma in a tag"), _("VJP3_00-error" , "should not accept flow collections over many lines"), _("X4QW-error" , "should not accept comment without whitespace after block scalar indicator"), _("Y79Y_003-error" , "should not accept leading tabs in seq elmt"), From 1103a9af80bdd8cfe2febd55dad7aed70cf25596 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 1 Jun 2024 14:34:35 +0100 Subject: [PATCH 11/18] fix: error reporting when resetting parser --- src/c4/yml/event_handler_tree.hpp | 14 ++++++++------ test/test_parser.cpp | 4 ++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/c4/yml/event_handler_tree.hpp b/src/c4/yml/event_handler_tree.hpp index a349894b..6c08ecad 100644 --- a/src/c4/yml/event_handler_tree.hpp +++ b/src/c4/yml/event_handler_tree.hpp @@ -70,12 +70,14 @@ struct EventHandlerTree : public EventHandlerStackcapacity()); - if(!tree->is_root(id)) - if(tree->is_map(tree->parent(id))) - if(!tree->has_key(id)) - _RYML_CB_ERR(m_stack.m_callbacks, "destination node belongs to a map and has no key"); + if(C4_UNLIKELY(!tree)) + _RYML_CB_ERR(m_stack.m_callbacks, "null tree"); + if(C4_UNLIKELY(id >= tree->capacity())) + _RYML_CB_ERR(tree->callbacks(), "invalid node"); + if(C4_UNLIKELY(!tree->is_root(id))) + if(C4_UNLIKELY(tree->is_map(tree->parent(id)))) + if(C4_UNLIKELY(!tree->has_key(id))) + _RYML_CB_ERR(tree->callbacks(), "destination node belongs to a map and has no key"); m_tree = tree; m_id = id; if(m_tree->is_root(id)) diff --git a/test/test_parser.cpp b/test/test_parser.cpp index ff8ef06f..cb5159c8 100644 --- a/test/test_parser.cpp +++ b/test/test_parser.cpp @@ -2006,7 +2006,7 @@ TEST_F(ParseToMapFlowTest, map_flow__to__map_flow__new_child) TEST_F(ParseToMapFlowTest, map_flow__to__map_flow__new_child_no_key) { NodeRef dst = dst_map_flow.rootref().append_child(); - ExpectError::do_check([&]{ + ExpectError::do_check(dst.tree(), [&]{ parse_in_arena(to_csubstr(map_flow), dst); }); } @@ -2668,7 +2668,7 @@ TEST_F(ParseToMapBlockTest, map_flow__to__map_flow__new_child) TEST_F(ParseToMapBlockTest, map_flow__to__map_flow__new_child_no_key) { NodeRef dst = dst_map_blck.rootref().append_child(); - ExpectError::do_check([&]{ + ExpectError::do_check(dst.tree(), [&]{ parse_in_arena(to_csubstr(map_flow), dst); }); } From 0e28299f7e40033f737cf7e8409724137da21385 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 1 Jun 2024 14:35:00 +0100 Subject: [PATCH 12/18] fix: EmitOptions comparison --- src/c4/yml/emit.hpp | 30 ++++++++++++++++++++++++++++++ test/test_emit.cpp | 5 ----- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/c4/yml/emit.hpp b/src/c4/yml/emit.hpp index 31811da4..879f5f5c 100644 --- a/src/c4/yml/emit.hpp +++ b/src/c4/yml/emit.hpp @@ -74,7 +74,24 @@ typedef enum { /** A lightweight object containing options to be used when emitting. */ struct EmitOptions { + typedef enum : uint32_t { + DEFAULT_FLAGS = 0, + JSON_ERR_ON_TAG = 1 << 0, + JSON_ERR_ON_ANCHOR = 1 << 1, + _JSON_ERR_MASK = JSON_ERR_ON_TAG|JSON_ERR_ON_ANCHOR, + } EmitOptionFlags_e; + +public: + + /** @name option flags + * + * @{ */ + C4_ALWAYS_INLINE EmitOptionFlags_e json_error_flags() const noexcept { return m_option_flags; } + EmitOptions& json_error_flags(EmitOptionFlags_e d) noexcept { m_option_flags = (EmitOptionFlags_e)(d & _JSON_ERR_MASK); return *this; } + /** @} */ + public: + /** @name max depth for the emitted tree * * This makes the emitter fail when emitting trees exceeding the @@ -85,8 +102,21 @@ struct EmitOptions EmitOptions& max_depth(id_type d) noexcept { m_max_depth = d; return *this; } static constexpr const id_type max_depth_default = 64; /** @} */ + +public: + + bool operator== (const EmitOptions& that) const noexcept + { + return m_max_depth == that.m_max_depth && + m_option_flags == that.m_option_flags; + } + private: + + /** @cond dev */ id_type m_max_depth{max_depth_default}; + EmitOptionFlags_e m_option_flags{DEFAULT_FLAGS}; + /** @endcond */ }; diff --git a/test/test_emit.cpp b/test/test_emit.cpp index 055afc96..f7cea26e 100644 --- a/test/test_emit.cpp +++ b/test/test_emit.cpp @@ -72,11 +72,6 @@ std::string emitrs_append(csubstr first_part, Emit &&fn) //----------------------------------------------------------------------------- //----------------------------------------------------------------------------- -bool operator== (const EmitOptions lhs, const EmitOptions rhs) -{ - return memcmp(&lhs, &rhs, sizeof(lhs)) == 0; -} - TEST(as_yaml, basic) { Tree et; From 924d83c84aaf80536a54a44f5dd23e716b244233 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 1 Jun 2024 14:35:25 +0100 Subject: [PATCH 13/18] do not clear the arena relocator callback --- changelog/current.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/changelog/current.md b/changelog/current.md index f3cd77fd..69a39c9d 100644 --- a/changelog/current.md +++ b/changelog/current.md @@ -8,7 +8,7 @@ Most of the changes are from the giant Parser refactor described below. Before g - [#434](https://github.com/biojppm/rapidyaml/issues/434) - Ensure empty vals are not deserialized ([#PR436](https://github.com/biojppm/rapidyaml/pull/436)). - [#PR433](https://github.com/biojppm/rapidyaml/pull/433): - Fix some corner cases causing read-after-free in the tree's arena when it is relocated while filtering scalar. - - Improve YAML error conformance and - detect YAML-mandated parse errors when: + - Improve YAML error conformance - detect YAML-mandated parse errors when: - directives are misplaced (eg [9MMA](https://matrix.yaml.info/details/9MMA.html), [9HCY](https://matrix.yaml.info/details/9HCY.html), [B63P](https://matrix.yaml.info/details/B63P.html), [EB22](https://matrix.yaml.info/details/EB22.html), [SF5V](https://matrix.yaml.info/details/SF5V.html)). - comments are misplaced (eg [MUS6/00](https://matrix.yaml.info/details/MUS6:00.html), [9JBA](https://matrix.yaml.info/details/9JBA.html), [SU5Z](https://matrix.yaml.info/details/SU5Z.html)) - a node has both an anchor and an alias (eg [SR86](https://matrix.yaml.info/details/SR86.html), [SU74](https://matrix.yaml.info/details/SU74.html)). From dab7f994600ac611076637f5d72503182d418551 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sun, 2 Jun 2024 13:18:24 +0100 Subject: [PATCH 14/18] minor changes in documentation --- README.md | 55 +++++++++++++++++------------- doc/sphinx_yaml_standard.rst | 66 +++++++++++++++++++++--------------- samples/quickstart.cpp | 32 +++++++++-------- 3 files changed, 88 insertions(+), 65 deletions(-) diff --git a/README.md b/README.md index dff3d55d..2add4977 100644 --- a/README.md +++ b/README.md @@ -786,44 +786,53 @@ improve. ### Test suite status As part of its CI testing, ryml uses the [YAML test -suite](https://github.com/yaml/yaml-test-suite). This is an extensive -set of reference cases covering the full YAML spec. Each of these -cases have several subparts: +suite](https://github.com/yaml/yaml-test-suite). (See also the test +suite results at +[https://matrix.yaml.info/](https://matrix.yaml.info/), but be aware +that the results there may be using an older version of ryml.) This is +an extensive and merciless set of reference cases covering the full +YAML spec. Each of these cases has several subparts: * `in-yaml`: mildly, plainly or extremely difficult-to-parse YAML * `in-json`: equivalent JSON (where possible/meaningful) * `out-yaml`: equivalent standard YAML * `emit-yaml`: equivalent standard YAML - * `events`: reference results (ie, expected tree) - -When testing, ryml parses each of the 4 yaml/json parts, then emits -the parsed tree, then parses the emitted result and verifies that -emission is idempotent, ie that the emitted result is semantically the -same as its input without any loss of information. To ensure -consistency, this happens over four levels of parse/emission -pairs. And to ensure correctness, each of the stages is compared -against the `events` spec from the test, which constitutes the -reference. The tests also check for equality between the reference + * `events`: reference events according to the YAML standard + +When testing, ryml parses each of the yaml/json parts, then emits the +parsed tree, then parses the emitted result and verifies that emission +is idempotent, ie that the round trip emitted result is semantically +the same as its input without any loss of information. + +To ensure consistency, this happens over four successive levels of +parse->emit round trips. And to ensure correctness, each of the stages +is compared against the `events` spec from the test, which constitutes +the reference. The tests also check for equality between the reference events in the test case and the events emitted by ryml from the data tree parsed from the test case input. All of this is then carried out -combining several variations: both unix `\n` vs windows `\r\n` line +with several variations: both unix `\n` vs windows `\r\n` line endings, emitting to string, file or streams, which results in ~250 -tests per case part. With multiple parts per case and ~400 reference +tests per case part. + +With multiple parts per case and ~400 reference cases in the test suite, this makes over several hundred thousand individual tests to which ryml is subjected, which are added to the -unit tests in ryml, which also employ the same extensive -combinatorial approach. +unit tests in ryml, which also employ the same extensive combinatorial +approach. -Also, note that in [their own words](http://matrix.yaml.io/), the +Also, note that in [their own words](http://matrix.yaml.info/), the tests from the YAML test suite *contain a lot of edge cases that don't play such an important role in real world examples*. And yet, despite the extreme focus of the test suite, currently ryml only fails a minor fraction of the test cases, mostly related with the deliberate -limitations noted above. Other than those limitations, by far the main -issue with ryml is that several standard-mandated parse errors fail to -materialize. For the up-to-date list of ryml failures in the -test-suite, refer to the [list of known +limitations noted above. + +Other than those limitations, by far the main issue with ryml is that +several standard-mandated parse errors fail to materialize (this will +be addressed in the coming releases). For the up-to-date list of ryml +failures in the test-suite, refer to the [list of known exceptions](test/test_suite/test_suite_parts.cpp) from ryml's test -suite runner, which is used as part of ryml's CI process. +suite runner, which is used as part of ryml's CI setup. + ------ diff --git a/doc/sphinx_yaml_standard.rst b/doc/sphinx_yaml_standard.rst index fdf5c13e..00618a67 100644 --- a/doc/sphinx_yaml_standard.rst +++ b/doc/sphinx_yaml_standard.rst @@ -83,42 +83,52 @@ generally a source of problems; this is good practice anyway. YAML test suite --------------- -As part of its CI testing, ryml uses the `YAML test -suite `__. This is an extensive -set of reference cases covering the full YAML spec. Each of these cases -have several subparts: - +As part of its CI testing, ryml uses the `YAML test suite `__. This is +an extensive and merciless set of reference cases covering the full +YAML spec. Each of these cases has several subparts: - ``in-yaml``: mildly, plainly or extremely difficult-to-parse YAML - ``in-json``: equivalent JSON (where possible/meaningful) - ``out-yaml``: equivalent standard YAML - ``emit-yaml``: equivalent standard YAML - - ``events``: reference results (ie, expected tree) + - ``events``: reference events according to the YAML standard + +.. note:: -When testing, ryml parses each of the 4 yaml/json parts, then emits the + See the test suite results at + `https://matrix.yaml.info/ `__, + but be aware that the results there may be using an older + version of ryml. + +When testing, ryml parses each of the yaml/json parts, then emits the parsed tree, then parses the emitted result and verifies that emission -is idempotent, ie that the emitted result is semantically the same as -its input without any loss of information. To ensure consistency, this -happens over four levels of parse/emission pairs. And to ensure -correctness, each of the stages is compared against the ``events`` spec -from the test, which constitutes the reference. The tests also check for -equality between the reference events in the test case and the events -emitted by ryml from the data tree parsed from the test case input. All -of this is then carried out combining several variations: both unix -``\n`` vs windows ``\r\n`` line endings, emitting to string, file or -streams, which results in ~250 tests per case part. With multiple parts -per case and ~400 reference cases in the test suite, this makes over -several hundred thousand individual tests to which ryml is subjected, -which are added to the unit tests in ryml, which also employ the same -extensive combinatorial approach. +is idempotent, ie that the round trip emitted result is semantically +the same as its input without any loss of information. + +To ensure consistency, this happens over four successive levels of +parse->emit round trips. And to ensure correctness, each of the stages +is compared against the ``events`` spec from the test, which constitutes +the reference. The tests also check for equality between the reference +events in the test case and the events emitted by ryml from the data +tree parsed from the test case input. All of this is then carried out +with several variations: both unix ``\n`` vs windows ``\r\n`` line +endings, emitting to string, file or streams, which results in ~250 +tests per case part. + +With multiple parts per case and ~400 reference +cases in the test suite, this makes over several hundred thousand +individual tests to which ryml is subjected, which are added to the +unit tests in ryml, which also employ the same extensive combinatorial +approach. Also, note that in `their own words `__, the -tests from the YAML test suite *contain a lot of edge cases that don’t +tests from the YAML test suite *contain a lot of edge cases that don't play such an important role in real world examples*. And yet, despite the extreme focus of the test suite, currently ryml only fails a minor fraction of the test cases, mostly related with the deliberate -limitations noted above. Other than those limitations, by far the main -issue with ryml is that several standard-mandated parse errors fail to -materialize. For the up-to-date list of ryml failures in the test-suite, -refer to the `list of known -exceptions `__ from ryml’s test -suite runner, which is used as part of ryml’s CI process. +limitations noted above. + +Other than those limitations, by far the main issue with ryml is that +several standard-mandated parse errors fail to materialize (this will +be addressed in the coming releases). For the up-to-date list of ryml +failures in the test-suite, refer to the `list of known exceptions `__ from ryml's test +suite runner, which is used as part of ryml's CI setup. diff --git a/samples/quickstart.cpp b/samples/quickstart.cpp index b306871c..d27b738b 100644 --- a/samples/quickstart.cpp +++ b/samples/quickstart.cpp @@ -22,7 +22,6 @@ #include #include #include -#include #include #ifdef C4_EXCEPTIONS #include @@ -543,12 +542,14 @@ john: doe)"; // The downside, of course, is that when you are not sure, release // builds may be doing something crazy. // - // So you can override this behavior and enable/disable - // assertions, by defining the macro RYML_USE_ASSERT to a proper - // value (see c4/yml/common.hpp). + // So in that case, you can either use the appropriate ryml + // predicates to check your intent (as in the examples above), or + // you can override this behavior and enable/disable assertions, + // by defining the macro RYML_USE_ASSERT to a proper value (see + // c4/yml/common.hpp). // // Also, to be clear, this does not apply to parse errors - // happening when the YAML is parsed. Checking for these errors is + // occurring when the YAML is parsed. Checking for these errors is // always enabled and cannot be switched off. @@ -578,19 +579,19 @@ john: doe)"; // node. It can be used to read from the node, but not write to it // or modify the hierarchy of the node. If any modification is // desired then a NodeRef must be used instead: - ryml::NodeRef wroot = tree.rootref(); + ryml::NodeRef wroot = tree.rootref(); // writeable root // operator= assigns an existing string to the receiving node. - // The contents are NOT copied, and this pointer will be in effect - // until the tree goes out of scope! So BEWARE to only assign from - // strings outliving the tree. + // The contents are NOT copied, and the string pointer will be in + // effect until the tree goes out of scope! So BEWARE to only + // assign from strings outliving the tree. wroot["foo"] = "says you"; wroot["bar"][0] = "-2"; wroot["bar"][1] = "-3"; wroot["john"] = "ron"; // Now the tree is _pointing_ at the memory of the strings above. - // In this case it is OK because those are static strings and will - // outlive the tree. + // In this case it is OK because those are static strings, located + // in the executable's static section, and will outlive the tree. CHECK(root["foo"].val() == "says you"); CHECK(root["bar"][0].val() == "-2"); CHECK(root["bar"][1].val() == "-3"); @@ -602,9 +603,12 @@ john: doe)"; // } // CHECK(root["john"] == "dangling"); // CRASH! the string was deallocated - // operator<< first serializes the input to the tree's arena, then - // assigns the serialized string to the receiving node. This avoids - // constraints with the lifetime, since the arena lives with the tree. + // operator<<: for cases where the lifetime of the string is + // problematic WRT the tree, you can create and save a string in + // the tree using operator<<. It first serializes values to a + // string arena owned by the tree, then assigns the serialized + // string to the receiving node. This avoids constraints with the + // lifetime, since the arena lives with the tree. CHECK(tree.arena().empty()); wroot["foo"] << "says who"; // requires to_chars(). see serialization samples below. wroot["bar"][0] << 20; From 4a9ebe5386e67fca1767317e414bdb492ec354a7 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sun, 2 Jun 2024 13:32:49 +0100 Subject: [PATCH 15/18] refactor parse_emit --- tools/parse_emit.cpp | 227 ++++++++++++++++++++++++++++++------------- 1 file changed, 157 insertions(+), 70 deletions(-) diff --git a/tools/parse_emit.cpp b/tools/parse_emit.cpp index 23bfec35..26aeeae9 100644 --- a/tools/parse_emit.cpp +++ b/tools/parse_emit.cpp @@ -4,12 +4,22 @@ #include #include #include +#include #endif #include +#include #include #include +#ifdef C4_EXCEPTIONS +#include +#else +#include +std::jmp_buf jmp_env = {}; +c4::csubstr jmp_msg = {}; +#endif + using namespace c4; @@ -18,6 +28,13 @@ C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wold-style-cast") C4_SUPPRESS_WARNING_GCC("-Wuseless-cast") +bool quiet = false; +bool debug_tree = false; +bool emit_as_json = false; +bool timed_sections = false; +bool emit_to_string = false; + + //----------------------------------------------------------------------------- struct timed_section @@ -32,12 +49,142 @@ struct timed_section timed_section(csubstr n) : name(n), start(myclock::now()) {} ~timed_section() { - fprintf(stderr, "%.6fms: %.*s\n", since().count(), (int)name.len, name.str); - fflush(stderr); + if(timed_sections) + { + fprintf(stderr, "%.6fms: %.*s\n", since().count(), (int)name.len, name.str); + fflush(stderr); + } } }; -#define TS(name) timed_section name##__##__LINE__(#name) +#define TS(name) timed_section C4_XCAT(__, C4_XCAT(name, __LINE__))(#name) + +// LCOV_EXCL_START + +constexpr const char *usage = R"(usage: + + %s + +Parse yaml from file (or stdin when file is `-`) and emit to stdout. + +Options: + + -q,--quiet do not emit (default: emit yaml) + -d,--debug print rapidyaml tree + -j,--json emit json instead of yaml (default: emit yaml) + -s,--string emit to string before dumping to stdout. + otherwise, emit directly to stdout + -t,--timed time sections (print timings to stderr) + +)"; + +csubstr parse_args(int argc, const char *argv[]) +{ + if(argc < 2) + { + printf(usage, argv[0]); + yml::error("unknown argument"); + } + csubstr file = to_csubstr(argv[argc - 1]); + for(int i = 1; i+1 < argc; ++i) + { + csubstr arg = to_csubstr(argv[i]); + if(arg == "-q" || arg == "--quiet") + quiet = true; + else if(arg == "-t" || arg == "--timed") + timed_sections = true; + else if(arg == "-s" || arg == "--string") + emit_to_string = true; + else if(arg == "-d" || arg == "--debug") + debug_tree = true; + else if(arg == "-j" || arg == "--json") + emit_as_json = true; + else + { + printf(usage, argv[0]); + yml::error("unknown argument"); + } + } + return file; +} + +void load_file(csubstr filename, std::string *buf) +{ + buf->clear(); + if(filename == "-") // read from stdin + { + for(int c = std::getchar(); c != EOF; c = std::getchar()) + { + buf->push_back((char)c); + if(buf->size() == buf->capacity()) + buf->reserve(2u * (buf->capacity() >= 128u ? buf->capacity() : 128u)); + } + } + else + { + if(!fs::path_exists(filename.str)) + { + std::fprintf(stderr, "cannot find file: %s (cwd=%s)\n", filename.str, fs::cwd().c_str()); + yml::error("file not found"); + } + fs::file_get_contents(filename.str, buf); + } +} + +void emit_json_docs(yml::Tree const& tree, std::string *dst=nullptr) +{ + auto emitnode = [&](yml::ConstNodeRef node){ + if(dst) + { + emitrs_json(node, dst, /*append*/true); + *dst += '\n'; + } + else + { + emit_json(node, stdout); + } + }; + yml::ConstNodeRef root = tree.rootref(); + if(!root.is_stream()) + emitnode(root); + else + for(yml::ConstNodeRef doc : root.children()) + emitnode(doc); +} + +void report_error(const char* msg, size_t length, yml::Location loc, FILE *f) +{ + if(!loc.name.empty()) + { + fwrite(loc.name.str, 1, loc.name.len, f); + fputc(':', f); + } + fprintf(f, "%zu:", loc.line); + if(loc.col) + fprintf(f, "%zu:", loc.col); + if(loc.offset) + fprintf(f, " (%zuB):", loc.offset); + fputc(' ', f); + fprintf(f, "%.*s\n", static_cast(length), msg); + fflush(f); +} + +yml::Callbacks create_custom_callbacks() +{ + yml::Callbacks callbacks = {}; + callbacks.m_error = [](const char *msg, size_t msg_len, yml::Location location, void *) + { + report_error(msg, msg_len, location, stderr); + C4_IF_EXCEPTIONS( + throw std::runtime_error({msg, msg_len}); + , + jmp_msg.assign(msg, msg_len); + std::longjmp(jmp_env, 1); + ); + }; + return callbacks; +} +// LCOV_EXCL_STOP //----------------------------------------------------------------------------- @@ -60,8 +207,7 @@ void process_file(csubstr file) TS(estimate_capacity); cap = yml::estimate_tree_capacity(to_csubstr(contents)); } - if(debug_tree) - fprintf(stderr, "reserving capacity=%zu\n", (size_t)cap); + fprintf(stderr, "reserving capacity=%zu\n", (size_t)cap); tree.reserve(cap); } { @@ -114,76 +260,17 @@ void process_file(csubstr file) int main(int argc, const char *argv[]) { - bool print_emitted_to_stdout = true; - csubstr file; - // LCOV_EXCL_START - auto show_usage = [argv]{ - printf("usage: %s [-s] \n", argv[0]); - }; - if(argc == 2) - { - file = to_csubstr(argv[1]); - } - else if(argc > 2) + TS(TOTAL); + set_callbacks(create_custom_callbacks()); + C4_IF_EXCEPTIONS_(try, if(setjmp(jmp_env) == 0)) { - file = to_csubstr(argv[2]); - csubstr arg = to_csubstr(argv[1]); - if(arg == "-s") - { - print_emitted_to_stdout = false; - } - else - { - show_usage(); - return 1; - } + csubstr file = parse_args(argc, argv); + process_file(file); } - else + C4_IF_EXCEPTIONS_(catch(std::exception const&), else) { - show_usage(); return 1; } - // LCOV_EXCL_STOP - - TS(TOTAL); - - C4_CHECK_MSG(fs::path_exists(file.str), "cannot find file: %s (cwd=%s)", file.str, fs::cwd().c_str()); - - { - TS(objects); - std::string contents, output; - yml::Tree tree; - { - TS(read_file); - fs::file_get_contents(file.str, &contents); - } - { - TS(tree_reserve); - yml::id_type cap; - { - TS(estimate_capacity); - cap = yml::estimate_tree_capacity(to_csubstr(contents)); - } - fprintf(stderr, "reserving capacity=%zu\n", (size_t)cap); - tree.reserve(cap); - } - { - TS(parse_yml); - yml::parse_in_place(file, to_substr(contents), &tree); - } - { - TS(emit_to_buffer); - output.resize(contents.size()); // resize, not just reserve - yml::emitrs_yaml(tree, &output); - } - if(print_emitted_to_stdout) - { - TS(print_stdout); - fwrite(output.data(), 1, output.size(), stdout); - putchar('\n'); - } - } - return 0; } From aad6ef3f7167a3c322020ec73afc98e3ba5c4bb6 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sun, 2 Jun 2024 13:32:40 +0100 Subject: [PATCH 16/18] minor fixes --- README.md | 94 ++++++++++---------- ROADMAP.md | 2 +- doc/index.rst | 4 +- doc/sphinx_quicklinks.rst | 2 +- doc/sphinx_yaml_standard.rst | 20 ++--- src/c4/yml/detail/stack.hpp | 10 +-- src/c4/yml/event_handler_stack.hpp | 2 +- test/test_parse_engine_6_qmrk.cpp | 26 ++++++ test/test_suite/test_suite_event_handler.cpp | 2 +- test/test_suite/test_suite_event_handler.hpp | 24 +++-- test/test_tag_property.cpp | 13 +++ tools/parse_emit.cpp | 20 +---- tools/yaml_events.cpp | 8 +- 13 files changed, 119 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index 2add4977..48a877f1 100644 --- a/README.md +++ b/README.md @@ -432,37 +432,6 @@ CHECK(loc.col == 4u); ## Using ryml in your project -### Package managers - -ryml is available in most package managers (thanks to all the -contributors!) and linux distributions. But please be aware: those -packages are maintained downstream of this repository, so if you have -issues with the package, file a report with the respective maintainer. - -Here's a quick roundup (not maintained): -* Package managers: - * [conan](https://conan.io/center/recipes/rapidyaml) - * [vcpkg](https://vcpkg.io/en/packages.html): `vcpkg install ryml` - * [PyPI](https://pypi.org/project/rapidyaml/) -* Linux distributions: - * Arch Linux/Manjaro: - * [rapidyaml (aarch64)](https://archlinuxarm.org/packages/aarch64/rapidyaml) - * [rapidyaml-git (AUR)](https://aur.archlinux.org/packages/rapidyaml-git/) - * [python-rapidyaml-git (AUR)](https://aur.archlinux.org/packages/python-rapidyaml-git/) - * [Fedora Linux](https://getfedora.org/)/[EPEL](https://docs.fedoraproject.org/en-US/epel/): - * `dnf install rapidyaml-devel` - * `dnf install python3-rapidyaml` - * [Gentoo](https://packages.gentoo.org/packages/dev-cpp/rapidyaml) - * [OpenSuse](https://build.openbuildservice.org/package/show/Emulators/rapidyaml) - * [Slackbuilds](https://slackbuilds.org/repository/15.0/libraries/rapidyaml/) - * [AltLinux](https://packages.altlinux.org/en/sisyphus/srpms/rapidyaml/3006055151670528141) - -Although package managers are very useful for quickly getting up to -speed, the advised way is still to bring ryml as a submodule of your -project, building both together. This makes it easy to track any -upstream changes in ryml. Also, ryml is small and quick to build, so -there's not much of a cost for building it with your project. - ### Single header file ryml is provided chiefly as a cmake library project, but it can also be used as a single header file, and there is a [tool to @@ -531,6 +500,38 @@ If you omit `--recursive`, after cloning you will have to do `git submodule update --init --recursive` to ensure ryml's submodules are checked out. +### Package managers + +ryml is available in most package managers (thanks to all the +contributors!) and linux distributions. But please be aware: those +packages are maintained downstream of this repository, so if you have +issues with the package, file a report with the respective maintainer. + +Here's a quick roundup (not maintained): +* Package managers: + * [conan](https://conan.io/center/recipes/rapidyaml) + * [vcpkg](https://vcpkg.io/en/packages.html): `vcpkg install ryml` + * [PyPI](https://pypi.org/project/rapidyaml/) +* Linux distributions: + * Arch Linux/Manjaro: + * [rapidyaml (aarch64)](https://archlinuxarm.org/packages/aarch64/rapidyaml) + * [rapidyaml-git (AUR)](https://aur.archlinux.org/packages/rapidyaml-git/) + * [python-rapidyaml-git (AUR)](https://aur.archlinux.org/packages/python-rapidyaml-git/) + * [Fedora Linux](https://getfedora.org/)/[EPEL](https://docs.fedoraproject.org/en-US/epel/): + * `dnf install rapidyaml-devel` + * `dnf install python3-rapidyaml` + * [Gentoo](https://packages.gentoo.org/packages/dev-cpp/rapidyaml) + * [OpenSuse](https://build.openbuildservice.org/package/show/Emulators/rapidyaml) + * [Slackbuilds](https://slackbuilds.org/repository/15.0/libraries/rapidyaml/) + * [AltLinux](https://packages.altlinux.org/en/sisyphus/srpms/rapidyaml/3006055151670528141) + +Although package managers are very useful for quickly getting up to +speed, the advised way is still to bring ryml as a submodule of your +project, building both together. This makes it easy to track any +upstream changes in ryml. Also, ryml is small and quick to build, so +there's not much of a cost for building it with your project. + + ### Quickstart samples These samples show different ways of getting ryml into your application. All the @@ -555,6 +556,7 @@ more about each sample: | [`fetch_content`](./samples/fetch_content) | **yes** | [`CMakeLists.txt`](./samples/fetch_content/CMakeLists.txt) | [`run.sh`](./samples/fetch_content/run.sh) | | [`find_package`](./samples/find_package) | **no**
needs prior install or package | [`CMakeLists.txt`](./samples/find_package/CMakeLists.txt) | [`run.sh`](./samples/find_package/run.sh) | + ### CMake build settings for ryml The following cmake variables can be used to control the build behavior of ryml: @@ -726,20 +728,16 @@ See also [the roadmap](./ROADMAP.md) for a list of future work. ### Known limitations -ryml deliberately makes no effort to follow the standard in the +ryml deliberately makes no effort to follow the YAML standard in the following situations: -* ryml's tree does NOT accept containers are as mapping keys: keys - must be scalars. HOWEVER, this is a limitation only of the tree. The - event-based parser engine DOES parse container keys. The parser - engine is the result of a recent refactor and its usage is meant to - be used by other programming languages to create their native - data-structures. This engine is fully tested and fully conformant - (other than the general error permissiveness noted below). But - because it is recent, it is still undocumented, and it requires some - API cleanup before being ready for isolated use. Please get in touch - if you are interested in integrating the event-based parser engine - without the standalone `ryml::parse_*()` +* ryml's tree does NOT accept containers as map keys: keys stored in + the tree must always be scalars. HOWEVER, this is a limitation only + of the final tree. The event-based parse engine DOES parse container + keys, as it is is meant to be used by other programming languages to + create their native data-structures, and it is fully tested and + fully conformant (other than the general error permissiveness noted + below). * Tab characters after `:` and `-` are not accepted tokens, unless ryml is compiled with the macro `RYML_WITH_TAB_TOKENS`. This requirement exists because checking for tabs introduces branching @@ -774,11 +772,11 @@ following situations: If you do run into trouble and would like to investigate conformance of your YAML code, **beware** of existing online YAML linters, many of which are not fully conformant. Instead, try using -[https://play.yaml.io](https://play.yaml.io), an amazing tool which -lets you dynamically input your YAML and continuously see the results -from all the existing parsers (kudos to @ingydotnet and the people -from the YAML test suite). And of course, if you detect anything wrong -with ryml, please [open an +[https://play.yaml.io](https://play.yaml.io), an amazingly useful tool +which lets you dynamically input your YAML and continuously see the +results from all the existing parsers (kudos to @ingydotnet and the +people from the YAML test suite). And of course, if you detect +anything wrong with ryml, please [open an issue](https://github.com/biojppm/rapidyaml/issues) so that we can improve. diff --git a/ROADMAP.md b/ROADMAP.md index 8df21edc..c78da795 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -1,2 +1,2 @@ -Moved! See ryml's [Kanban board on github](https://github.com/biojppm/rapidyaml/projects/1). +Moved! See rapidyaml's [Kanban board on github](https://github.com/users/biojppm/projects/1/views/1). diff --git a/doc/index.rst b/doc/index.rst index b26b178c..67bd8871 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -35,10 +35,10 @@ ryml is written in C++11, and compiles cleanly with: * Visual Studio 2015 and later -* clang++ 3.9 and later - * g++ 4.8 and later +* clang++ 3.9 and later + * Intel Compiler .. note:: diff --git a/doc/sphinx_quicklinks.rst b/doc/sphinx_quicklinks.rst index b4c8c7c1..133d4f80 100644 --- a/doc/sphinx_quicklinks.rst +++ b/doc/sphinx_quicklinks.rst @@ -13,7 +13,7 @@ Quick links * `Pull Requests `_ - * `Kanban board `_ + * `Kanban board `_ * Latest release: `0.6.0 `_ diff --git a/doc/sphinx_yaml_standard.rst b/doc/sphinx_yaml_standard.rst index 00618a67..390f833d 100644 --- a/doc/sphinx_yaml_standard.rst +++ b/doc/sphinx_yaml_standard.rst @@ -17,7 +17,7 @@ welcome. linters**, many of which are not fully conformant; instead, try using `https://play.yaml.io `__, - an amazing tool which lets you dynamically input your YAML and + an amazingly useful tool which lets you dynamically input your YAML and continuously see the results from all the existing parsers (kudos to @ingydotnet and the people from the YAML test suite). And of course, if you detect anything wrong with ryml, please `open an @@ -31,17 +31,13 @@ Deliberate deviations ryml deliberately makes no effort to follow the standard in the following situations: -- ryml's tree does NOT accept containers are as mapping keys: keys - must be scalars. HOWEVER, this is a limitation only of the tree. The - event-based parser engine DOES parse container keys. The parser - engine is the result of a recent refactor and its usage is meant to - be used by other programming languages to create their native - data-structures. This engine is fully tested and fully conformant - (other than the general error permissiveness noted below). But - because it is recent, it is still undocumented, and it requires some - API cleanup before being ready for isolated use. Please get in touch - if you are interested in integrating the event-based parser engine - without the standalone `ryml::parse_*()` +- ryml's tree does NOT accept containers as map keys: keys stored in + the tree must always be scalars. HOWEVER, this is a limitation only + of the final tree. The event-based parse engine DOES parse container + keys, as it is is meant to be used by other programming languages to + create their native data-structures, and it is fully tested and + fully conformant (other than the general error permissiveness noted + below). - Tab characters after ``:`` and ``-`` are not accepted tokens, unless ryml is compiled with the macro ``RYML_WITH_TAB_TOKENS``. This requirement exists because checking for tabs introduces branching diff --git a/src/c4/yml/detail/stack.hpp b/src/c4/yml/detail/stack.hpp index df3e27d3..8a5d3783 100644 --- a/src/c4/yml/detail/stack.hpp +++ b/src/c4/yml/detail/stack.hpp @@ -34,11 +34,11 @@ class stack public: - T m_buf[size_t(N)]; - T * m_stack; - id_type m_size; - id_type m_capacity; - Callbacks m_callbacks; + T m_buf[size_t(N)]; + T *C4_RESTRICT m_stack; + id_type m_size; + id_type m_capacity; + Callbacks m_callbacks; public: diff --git a/src/c4/yml/event_handler_stack.hpp b/src/c4/yml/event_handler_stack.hpp index 4e6d94f5..30e96953 100644 --- a/src/c4/yml/event_handler_stack.hpp +++ b/src/c4/yml/event_handler_stack.hpp @@ -46,7 +46,7 @@ struct EventHandlerStack state *C4_RESTRICT m_curr; ///< current stack level: top of the stack. cached here for easier access. state *C4_RESTRICT m_parent; ///< parent of the current stack level. pfn_relocate_arena m_relocate_arena; ///< callback when the arena gets relocated - void *C4_RESTRICT m_relocate_arena_data; + void * m_relocate_arena_data; protected: diff --git a/test/test_parse_engine_6_qmrk.cpp b/test/test_parse_engine_6_qmrk.cpp index 118f6375..d495e41d 100644 --- a/test/test_parse_engine_6_qmrk.cpp +++ b/test/test_parse_engine_6_qmrk.cpp @@ -292,6 +292,32 @@ ENGINE_TEST(Qmrk3, ___(ps.end_stream()); } +ENGINE_TEST(Qmrk4_0, + ("[?baz:,]", + "[{?baz: }]"), + "+STR\n" + "+DOC\n" + "+SEQ []\n" + "+MAP {}\n" + "=VAL :?baz\n" + "=VAL :\n" + "-MAP\n" + "-SEQ\n" + "-DOC\n" + "-STR\n") +{ + ___(ps.begin_stream()); + ___(ps.begin_doc()); + ___(ps.begin_seq_val_flow()); + ___(ps.begin_map_val_flow()); + ___(ps.set_key_scalar_plain("?baz")); + ___(ps.set_val_scalar_plain({})); + ___(ps.end_map()); + ___(ps.end_seq()); + ___(ps.end_doc()); + ___(ps.end_stream()); +} + ENGINE_TEST(Qmrk4, ("[ ? an explicit key, ? foo,? bar,?baz:,?bat]", "[{an explicit key: },{foo: },{bar: },{?baz: },?bat]"), diff --git a/test/test_suite/test_suite_event_handler.cpp b/test/test_suite/test_suite_event_handler.cpp index 55d939f1..ec1fd180 100644 --- a/test/test_suite/test_suite_event_handler.cpp +++ b/test/test_suite/test_suite_event_handler.cpp @@ -72,7 +72,7 @@ void append_escaped(extra::string *es, csubstr val) } } // flush the rest - this->append(val.sub(prev)); + es->append(val.sub(prev)); #undef _c4flush_use_instead } diff --git a/test/test_suite/test_suite_event_handler.hpp b/test/test_suite/test_suite_event_handler.hpp index 6d6e087c..ad5f7e9e 100644 --- a/test/test_suite/test_suite_event_handler.hpp +++ b/test/test_suite/test_suite_event_handler.hpp @@ -7,9 +7,6 @@ #ifndef _C4_YML_EVENT_HANDLER_STACK_HPP_ #include "c4/yml/event_handler_stack.hpp" #endif -#ifndef _C4_YML_STD_STRING_HPP_ -#include "c4/yml/std/string.hpp" -#endif #ifndef _C4_YML_DETAIL_PRINT_HPP_ #include "c4/yml/detail/print.hpp" #endif @@ -100,6 +97,7 @@ struct EventHandlerYamlStd : public EventHandlerStacklevel < m_val_buffers.size()); - return m_val_buffers[(size_t)m_curr->level]; + _RYML_CB_ASSERT(m_stack.m_callbacks, m_curr->level < m_val_buffers.size()); + return m_val_buffers[m_curr->level]; } EventSink& _buf_(id_type level) noexcept { - _RYML_CB_ASSERT(m_stack.m_callbacks, (size_t)level < m_val_buffers.size()); - return m_val_buffers[(size_t)level]; + _RYML_CB_ASSERT(m_stack.m_callbacks, level < m_val_buffers.size()); + return m_val_buffers[level]; } EventSink const& _buf_(id_type level) const noexcept { - _RYML_CB_ASSERT(m_stack.m_callbacks, (size_t)level < m_val_buffers.size()); - return m_val_buffers[(size_t)level]; + _RYML_CB_ASSERT(m_stack.m_callbacks, level < m_val_buffers.size()); + return m_val_buffers[level]; } static void _buf_flush_to_(EventSink &C4_RESTRICT src, EventSink &C4_RESTRICT dst) noexcept @@ -686,8 +684,8 @@ struct EventHandlerYamlStd : public EventHandlerStack m_val_buffers.size()) - m_val_buffers.resize((size_t)size_needed); + if(size_needed > m_val_buffers.size()) + m_val_buffers.resize(size_needed); } C4_ALWAYS_INLINE void _send_(csubstr s) noexcept { _buf_().append(s); } @@ -699,7 +697,7 @@ struct EventHandlerYamlStd : public EventHandlerStack #endif #include -#include #include #include @@ -29,7 +28,6 @@ C4_SUPPRESS_WARNING_GCC("-Wuseless-cast") bool quiet = false; -bool debug_tree = false; bool emit_as_json = false; bool timed_sections = false; bool emit_to_string = false; @@ -95,8 +93,6 @@ csubstr parse_args(int argc, const char *argv[]) timed_sections = true; else if(arg == "-s" || arg == "--string") emit_to_string = true; - else if(arg == "-d" || arg == "--debug") - debug_tree = true; else if(arg == "-j" || arg == "--json") emit_as_json = true; else @@ -214,22 +210,10 @@ void process_file(csubstr file) TS(parse_yml); yml::parse_in_place(file, to_substr(contents), &tree); } - if(debug_tree) - { - TS(debug_tree); - yml::print_tree(tree); - } if(emit_as_json) { - { - TS(resolve_refs); - tree.resolve(); - } - if(debug_tree) - { - TS(debug_resolved_tree); - yml::print_tree(tree); - } + TS(resolve_refs); + tree.resolve(); } if(emit_to_string) { diff --git a/tools/yaml_events.cpp b/tools/yaml_events.cpp index b8c849a5..b0d8009b 100644 --- a/tools/yaml_events.cpp +++ b/tools/yaml_events.cpp @@ -151,8 +151,6 @@ bool Args::parse(Args *args, int argc, const char *argv[]) return true; } -C4_SUPPRESS_WARNING_GCC_CLANG_WITH_PUSH("-Wold-style-cast") - std::string load_file(csubstr filename) { if(filename == "-") // read from stdin @@ -160,7 +158,7 @@ std::string load_file(csubstr filename) std::string buf; for(int c = std::getchar(); c != EOF; c = std::getchar()) { - buf.push_back((char)c); + buf.push_back(static_cast(c)); if(buf.size() == buf.capacity()) buf.reserve(2u * (buf.capacity() >= 128u ? buf.capacity() : 128u)); } @@ -187,12 +185,10 @@ void report_error(const char* msg, size_t length, Location loc, FILE *f) if(loc.offset) fprintf(f, " (%zuB):", loc.offset); fputc(' ', f); - fprintf(f, "%.*s\n", (int)length, msg); + fprintf(f, "%.*s\n", static_cast(length), msg); fflush(f); } -C4_SUPPRESS_WARNING_GCC_CLANG_POP - Callbacks create_custom_callbacks() { Callbacks callbacks = {}; From 8c7c76ce8b5f56c0b607dcec5ee1d7b41646164c Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Sat, 8 Jun 2024 17:50:51 +0100 Subject: [PATCH 17/18] add unit tests for known-problematic fuzz inputs --- test/CMakeLists.txt | 59 +++++++++++ test/test_fuzz/test_fuzz_common.hpp | 135 ++++++++++++++++++++++++ test/test_fuzz/test_fuzz_events.cpp | 9 ++ test/test_fuzz/test_fuzz_main.cpp | 16 +++ test/test_fuzz/test_fuzz_parse_emit.cpp | 9 ++ tools/parse_emit.cpp | 2 +- 6 files changed, 229 insertions(+), 1 deletion(-) create mode 100644 test/test_fuzz/test_fuzz_common.hpp create mode 100644 test/test_fuzz/test_fuzz_events.cpp create mode 100644 test/test_fuzz/test_fuzz_main.cpp create mode 100644 test/test_fuzz/test_fuzz_parse_emit.cpp diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 905292ce..f4908326 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -265,3 +265,62 @@ if(RYML_TEST_SUITE) ryml_add_test_from_suite(${case}) endforeach() endif(RYML_TEST_SUITE) + + +#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ +#------------------------------------------------------------------------------ + +# run every known fuzz crash +set(fuzzdefault ON) +if(EMSCRIPTEN) + set(fuzzdefault OFF) +endif() +option(RYML_TEST_FUZZ "Enable tests for known problematic fuzz cases." ${fuzzdefault}) + +if(RYML_TEST_FUZZ) + c4_download_remote_proj(rapidyaml-data rapidyaml_data_dir + GIT_REPOSITORY https://github.com/biojppm/rapidyaml-data + GIT_TAG master) + if(NOT EXISTS ${rapidyaml_data_dir}/fuzz/yaml.dict) + c4_err("cannot find rapidyaml-data at ${rapidyaml_data_dir} -- was there an error downloading the project?") + endif() + + set(corpus_suite_dir ${rapidyaml_data_dir}/fuzz/yaml_test_suite) + set(corpus_generated_dir ${rapidyaml_data_dir}/fuzz/yaml_generated) + set(corpus_artifacts_dir ${rapidyaml_data_dir}/fuzz/yaml_artifacts) + set(corpus_merged_dir ${rapidyaml_data_dir}/fuzz/yaml_merged) + set(yaml_dict ${rapidyaml_data_dir}/fuzz/yaml.dict) + file(GLOB_RECURSE fuzz_files RELATIVE "${corpus_artifacts_dir}" "${corpus_artifacts_dir}/*") + file(GLOB_RECURSE suite_files RELATIVE "${corpus_suite_dir}" "${corpus_suite_dir}/*") + + # add individual tests for problematic fuzz files + function(ryml_add_fuzz_test name) + c4_add_executable(ryml-test-fuzz-${name} + SOURCES + test_fuzz/test_fuzz_common.hpp + test_fuzz/test_fuzz_${name}.cpp + test_fuzz/test_fuzz_main.cpp + ${ARGN} + INC_DIRS ${CMAKE_CURRENT_LIST_DIR} + LIBS ryml c4fs + FOLDER test/fuzz) + if(RYML_DBG) + target_compile_definitions(ryml-test-fuzz-${name} PUBLIC RYML_DBG) + endif() + add_dependencies(ryml-test-build ryml-test-fuzz-${name}) + ryml_get_target_exe(ryml-test-fuzz-${name} tgtexe) + function(ryml_add_fuzz_test_file name_ dir file) + string(REPLACE "/" "_" fuzz_name "${file}") + add_test(NAME ryml-test-fuzz-${name_}-${fuzz_name} + COMMAND ${tgtexe} ${dir}/${file}) + endfunction() + foreach(fuzz_file ${fuzz_files}) + ryml_add_fuzz_test_file(${name} ${corpus_artifacts_dir} ${fuzz_file}) + endforeach() + endfunction() + ryml_add_fuzz_test(parse_emit) + ryml_add_fuzz_test(events + ../test/test_suite/test_suite_event_handler.hpp + ../test/test_suite/test_suite_event_handler.cpp) +endif() diff --git a/test/test_fuzz/test_fuzz_common.hpp b/test/test_fuzz/test_fuzz_common.hpp new file mode 100644 index 00000000..65b0d762 --- /dev/null +++ b/test/test_fuzz/test_fuzz_common.hpp @@ -0,0 +1,135 @@ +#pragma once +#ifndef TEST_FUZZ_COMMON_H +#define TEST_FUZZ_COMMON_H + +#ifdef RYML_SINGLE_HEADER +#include +#else +#include +#include +#include +#include +#include +#endif +#include +#include +#include + +#ifdef C4_EXCEPTIONS +#include +#else +#include +std::jmp_buf jmp_env = {}; +c4::csubstr jmp_msg = {}; +#endif + + +#ifdef RYML_DBG +#define _if_dbg(...) __VA_ARGS__ +bool report_errors = true; +#else +#define _if_dbg(...) +bool report_errors = false; +#endif + +inline void report_error(const char* msg, size_t length, c4::yml::Location loc, FILE *f) +{ + if(!report_errors) + return; + if(!loc.name.empty()) + { + fwrite(loc.name.str, 1, loc.name.len, f); + fputc(':', f); + } + fprintf(f, "%zu:", loc.line); + if(loc.col) + fprintf(f, "%zu:", loc.col); + if(loc.offset) + fprintf(f, " (%zuB):", loc.offset); + fputc(' ', f); + fprintf(f, "%.*s\n", static_cast(length), msg); + fflush(f); +} + +// watchout: VS2022 requires C4_NORETURN to come before inline +C4_NORETURN inline void errcallback(const char *msg, size_t msg_len, c4::yml::Location location, void *) +{ + report_error(msg, msg_len, location, stderr); + C4_IF_EXCEPTIONS( + throw std::runtime_error({msg, msg_len}); + , + jmp_msg.assign(msg, msg_len); + std::longjmp(jmp_env, 1); + ); +} + +inline c4::yml::Callbacks create_custom_callbacks() +{ + c4::set_error_flags(c4::ON_ERROR_CALLBACK); + c4::set_error_callback([](const char *msg, size_t msg_len){ + errcallback(msg, msg_len, {}, nullptr); + }); + c4::yml::Callbacks callbacks = {}; + callbacks.m_error = errcallback; + return callbacks; +} + +namespace c4 { +namespace yml { + +inline int fuzztest_parse_emit(uint32_t case_number, csubstr src) +{ + C4_UNUSED(case_number); + set_callbacks(create_custom_callbacks()); + Tree tree(create_custom_callbacks()); + bool parse_success = false; + C4_IF_EXCEPTIONS_(try, if(setjmp(jmp_env) == 0)) + { + RYML_ASSERT(tree.empty()); + _if_dbg(_dbg_printf("in[{}]: [{}]~~~\n{}\n~~~\n", case_number, src.len, src); fflush(NULL)); + parse_in_arena(src, &tree); + parse_success = true; + _if_dbg(print_tree("parsed tree", tree)); + _if_dbg(_dbg_printf("in[{}]: [{}]~~~\n{}\n~~~\n", case_number, src.len, src); fflush(NULL)); + std::string dst = emitrs_yaml(tree); + _if_dbg(_dbg_printf("emitted[{}]: [{}]~~~\n{}\n~~~\n", case_number, dst.size(), to_csubstr(dst)); fflush(NULL)); + C4_DONT_OPTIMIZE(dst); + C4_DONT_OPTIMIZE(parse_success); + } + C4_IF_EXCEPTIONS_(catch(std::exception const&), else) + { + // if an exception leaks from here, it is likely because of a greedy noexcept + _if_dbg(if(parse_success) print_tree("parsed tree", tree)); + return 1; + } + return 0; +} + +inline int fuzztest_yaml_events(uint32_t case_number, csubstr src) +{ + C4_UNUSED(case_number); + set_callbacks(create_custom_callbacks()); + EventHandlerYamlStd::EventSink sink = {}; + EventHandlerYamlStd handler(&sink, create_custom_callbacks()); + ParseEngine parser(&handler); + std::string str(src.begin(), src.end()); + C4_IF_EXCEPTIONS_(try, if(setjmp(jmp_env) == 0)) + { + _if_dbg(_dbg_printf("in[{}]: [{}]~~~\n{}\n~~~\n", case_number, src.len, src); fflush(NULL)); + parser.parse_in_place_ev("input", c4::to_substr(str)); + _if_dbg(_dbg_printf("evts[{}]: ~~~\n{}\n~~~\n", case_number, sink); fflush(NULL)); + C4_DONT_OPTIMIZE(sink); + } + C4_IF_EXCEPTIONS_(catch(std::exception const&), else) + { + // if an exception leaks from here, it is likely because of a greedy noexcept + _if_dbg(fprintf(stdout, "err\n"); fflush(NULL)); + return 1; + } + return 0; +} + +} // namespace yml +} // namespace c4 + +#endif /* TEST_FUZZ_COMMON_H */ diff --git a/test/test_fuzz/test_fuzz_events.cpp b/test/test_fuzz/test_fuzz_events.cpp new file mode 100644 index 00000000..a857a2fe --- /dev/null +++ b/test/test_fuzz/test_fuzz_events.cpp @@ -0,0 +1,9 @@ +#include "./test_fuzz_common.hpp" +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *str, size_t len) +{ + static std::atomic case_number{0}; + c4::csubstr src = {reinterpret_cast(str), len}; + return c4::yml::fuzztest_yaml_events(case_number++, src); +} diff --git a/test/test_fuzz/test_fuzz_main.cpp b/test/test_fuzz/test_fuzz_main.cpp new file mode 100644 index 00000000..45f21b28 --- /dev/null +++ b/test/test_fuzz/test_fuzz_main.cpp @@ -0,0 +1,16 @@ +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *str, size_t len); + +int main(int argc, const char *argv[]) +{ + if(argc < 2) + return 1; + const char *filename = argv[1]; + if(!c4::fs::file_exists(filename)) + return 1; + std::string file = c4::fs::file_get_contents(filename); + (void)LLVMFuzzerTestOneInput(reinterpret_cast(&file[0]), file.size()); + return 0; +} diff --git a/test/test_fuzz/test_fuzz_parse_emit.cpp b/test/test_fuzz/test_fuzz_parse_emit.cpp new file mode 100644 index 00000000..dc4c0805 --- /dev/null +++ b/test/test_fuzz/test_fuzz_parse_emit.cpp @@ -0,0 +1,9 @@ +#include "./test_fuzz_common.hpp" +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *str, size_t len) +{ + static std::atomic case_number{0}; + c4::csubstr src = {reinterpret_cast(str), len}; + return c4::yml::fuzztest_parse_emit(case_number++, src); +} diff --git a/tools/parse_emit.cpp b/tools/parse_emit.cpp index 5342cec3..c9c3f29f 100644 --- a/tools/parse_emit.cpp +++ b/tools/parse_emit.cpp @@ -180,7 +180,6 @@ yml::Callbacks create_custom_callbacks() }; return callbacks; } -// LCOV_EXCL_STOP //----------------------------------------------------------------------------- @@ -241,6 +240,7 @@ void process_file(csubstr file) emit_json_docs(tree); } } +// LCOV_EXCL_STOP int main(int argc, const char *argv[]) { From ae1a1b18c1327e2aece021c2501bd1f00a5adfa8 Mon Sep 17 00:00:00 2001 From: Joao Paulo Magalhaes Date: Mon, 10 Jun 2024 20:41:10 +0100 Subject: [PATCH 18/18] disable bm for vs2022 - unexplainable error in the ci, could not reproduce --- .github/workflows/benchmarks.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index b23a58d7..958f6188 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -82,8 +82,8 @@ jobs: # - {std: 17, cxx: vs2019, bt: Release, os: windows-2019, bitlinks: static64 static32} - {std: 20, cxx: vs2019, bt: Release, os: windows-2019, bitlinks: static64 static32} - - {std: 17, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} - - {std: 20, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} + #- {std: 17, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} + #- {std: 20, cxx: vs2022, bt: Release, os: windows-2022, bitlinks: static64 static32} # - {std: 17, cxx: xcode, xcver: 13, bt: Release, os: macos-11, bitlinks: static64} env: {BM: ON, STD: "${{matrix.std}}", CXX_: "${{matrix.cxx}}", BT: "${{matrix.bt}}", BITLINKS: "${{matrix.bitlinks}}", VG: "${{matrix.vg}}", SAN: "${{matrix.san}}", LINT: "${{matrix.lint}}", OS: "${{matrix.os}}"}