From 2488846e38fe7a06e7436dc442f3e63d410c9731 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Tue, 21 Dec 2021 13:56:28 +1000 Subject: [PATCH] uri: improve hierarchical parsing reliability --- rfc3986.rl | 23 +++++----- test/uri.cpp | 1 + uri.cpp | 122 +++++++++++++++++++++++++++++++++++++++++++++++++-- uri.cpp.rl | 54 ++++++++++++----------- uri.hpp | 20 +++++---- 5 files changed, 173 insertions(+), 47 deletions(-) diff --git a/rfc3986.rl b/rfc3986.rl index 8aa14927..b0ae57d3 100644 --- a/rfc3986.rl +++ b/rfc3986.rl @@ -112,21 +112,24 @@ ## URI types hier_part = ( - '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end - ) | ( - path_absolute >path_begin %path_end - | path_rootless >path_begin %path_end - | path_empty >path_begin %path_end - ) >hier_begin %hier_end; + ( + '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end + ) | ( + path_absolute >path_begin %path_end + | path_rootless >path_begin %path_end + | path_empty >path_begin %path_end + ) >hier_begin %hier_end + ); uri = scheme ':' hier_part ('?' query)? ('#' fragment)?; - relative_part = - '//' authority path_abempty >path_begin %path_end - | path_absolute >path_begin %path_end + relative_part = ( + '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end + ) | ( + path_absolute >path_begin %path_end | path_noscheme >path_begin %path_end | path_empty >path_begin %path_end - ; + ) >hier_begin %hier_end; relative_ref = relative_part ('?' query)? ('#' fragment)?; diff --git a/test/uri.cpp b/test/uri.cpp index 0ed43bab..945b7623 100644 --- a/test/uri.cpp +++ b/test/uri.cpp @@ -175,6 +175,7 @@ test_normalise (cruft::TAP::logger &tap) { struct { char const *init; + char const *expected; } TESTS[] = { // { diff --git a/uri.cpp b/uri.cpp index 9bb2bf8a..c52c5b13 100644 --- a/uri.cpp +++ b/uri.cpp @@ -9,7 +9,74 @@ using cruft::uri; + /////////////////////////////////////////////////////////////////////////////// +#include +#include +#include + +#include + + +//----------------------------------------------------------------------------- +using uric = cruft::uri::component; + + +struct { + cruft::uri::component self; + // prev sibling + std::optional prev; + // parent + std::optional parent; +} COMPONENT_ORDER[cruft::uri::NUM_COMPONENTS] { + { uric::SCHEME, { }, {} }, + { uric::HIERARCHICAL, {uric::SCHEME }, {} }, + { uric::QUERY, {uric::HIERARCHICAL}, {} }, + { uric::FRAGMENT, {uric::QUERY }, {} }, + + { uric::AUTHORITY, { }, {uric::HIERARCHICAL} }, + { uric::PATH, {uric::AUTHORITY}, {uric::HIERARCHICAL} }, + + { uric::USER, { }, {uric::AUTHORITY} }, + { uric::HOST, {uric::USER}, {uric::AUTHORITY} }, + { uric::PORT, {uric::HOST}, {uric::AUTHORITY} }, +}; + + +/////////////////////////////////////////////////////////////////////////////// +cruft::uri::uri (std::string &&_value): + m_views { + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr + }, + m_value (std::move (_value)) +{ + parse (); + + for (auto const &order: COMPONENT_ORDER) { + if (m_views[order.self].data ()) + continue; + + if (order.prev) + m_views[order.self] = { m_views[*order.prev].end (), 0 }; + else if (order.parent) + m_views[order.self] = { m_views[*order.parent].begin (), 0 }; + else + m_views[order.self] = { m_value.data (), 0 }; + } + + CHECK_SANITY (*this); +} + + +//----------------------------------------------------------------------------- cruft::uri::uri (const char *str): uri (std::string (str)) { ; } @@ -33,8 +100,11 @@ uri::uri (uri const &rhs) , m_value (rhs.m_value) { auto const offset = rhs.m_value.data () - m_value.data (); + for (auto &i: m_views) - i = { i.begin () - offset, i.end () - offset }; + i -= offset; + + CHECK_SANITY (*this); } @@ -47,7 +117,6 @@ uri& uri::operator= (uri &&rhs) noexcept } - //----------------------------------------------------------------------------- static std::string combine_components ( @@ -163,6 +232,11 @@ cruft::query_to_map (std::string_view val) std::string cruft::map_to_query (std::map const &val) { + // Test for empty up front so that we can simplify the string + // concatenation below. + if (val.empty ()) + return ""; + std::string res; for (auto const &[k, v]: val) { res += k; @@ -171,6 +245,9 @@ cruft::map_to_query (std::map const &val) res += '&'; } + // The string must be non-zero length because we've tested for the empty + // set initially. + CHECK (!res.empty ()); res.resize (res.size () - 1); return res; } @@ -400,4 +477,43 @@ cruft::normalise (cruft::uri const &src) remove_dot_segments (src.path ()) ); return res; -} \ No newline at end of file +} + + +/////////////////////////////////////////////////////////////////////////////// +template <> +bool +cruft::debug::validator::is_valid (cruft::uri const &val) noexcept +{ + auto const &value = val.value (); + auto const &components = val.components (); + + // Each component should fall within the value string + for (auto const &i: components) { + RETURN_FALSE_UNLESS (i.begin () >= value.data ()); + RETURN_FALSE_UNLESS (i.end () <= value.data () + value.size ()); + } + + // Each component reference memory after the previous component. + // Empty components can be coincident with their siblings. + for (auto const [selfidx, previdx, parentidx]: COMPONENT_ORDER) { + auto const &self = components[selfidx]; + + if (previdx) { + auto const &prev = components[*previdx]; + + RETURN_FALSE_UNLESS (prev.begin () <= self.begin ()); + RETURN_FALSE_UNLESS (prev.end () <= self.end ()); + RETURN_FALSE_UNLESS (prev.end () <= self.begin ()); + } + + if (parentidx) { + auto const &parent = components[*parentidx]; + + RETURN_FALSE_UNLESS (parent.begin () <= self.begin ()); + RETURN_FALSE_UNLESS (parent.end () >= self.end ()); + } + } + + return true; +} diff --git a/uri.cpp.rl b/uri.cpp.rl index 30199d39..98ed18dc 100644 --- a/uri.cpp.rl +++ b/uri.cpp.rl @@ -8,6 +8,8 @@ #include "uri.hpp" +#include + #include #include @@ -26,36 +28,35 @@ using cruft::uri; action success {__success = true; } action failure {__success = false; } - action scheme_begin { m_views[SCHEME] = { p, p }; } - action scheme_end { m_views[SCHEME] = { m_views[SCHEME].begin (), p }; } + action scheme_begin { starts[SCHEME] = p; } + action scheme_end { CHECK (starts[SCHEME]); m_views[SCHEME] = { starts[SCHEME], p }; } - action hier_begin { m_views[HIERARCHICAL] = { p, p }; } - action hier_end { m_views[HIERARCHICAL] = { m_views[HIERARCHICAL].begin (), p }; } + action hier_begin { starts[HIERARCHICAL] = p; } + action hier_end { CHECK (starts[HIERARCHICAL]); m_views[HIERARCHICAL] = { starts[HIERARCHICAL], p }; } - action user_begin { m_views[USER] = { p, p }; } - action user_end { m_views[USER] = { m_views[USER].begin (), p }; } + action user_begin { starts[USER] = p; } + action user_end { CHECK (starts[USER]); m_views[USER] = { starts[USER], p }; } - action host_begin { m_views[HOST] = { p, p }; } - action host_end { m_views[HOST] = { m_views[HOST].begin (), p }; } + action host_begin { starts[HOST] = p; } + action host_end { CHECK (starts[HOST]); m_views[HOST] = { starts[HOST], p }; } - action port_begin { m_views[PORT] = { p, p }; } - action port_end { m_views[PORT] = { m_views[PORT].begin (), p }; } + action port_begin { starts[PORT] = p; } + action port_end { CHECK (starts[PORT]); m_views[PORT] = { starts[PORT], p }; } - action authority_begin { m_views[AUTHORITY] = { p, p}; } - action authority_end { m_views[AUTHORITY] = { m_views[AUTHORITY].begin (), p }; } + action authority_begin { starts[AUTHORITY] = p; } + action authority_end { CHECK (starts[AUTHORITY]); m_views[AUTHORITY] = { starts[AUTHORITY], p }; } - action path_begin { m_views[PATH] = { p, p}; } - action path_end { m_views[PATH] = { m_views[PATH].begin (), p }; } + action path_begin { starts[PATH] = p; } + action path_end { CHECK (starts[PATH]); m_views[PATH] = { starts[PATH], p }; } - action query_begin { m_views[QUERY] = { p, p}; } - action query_end { m_views[QUERY] = { m_views[QUERY].begin (), p }; } + action query_begin { starts[QUERY] = p; } + action query_end { CHECK (starts[QUERY]); m_views[QUERY] = { starts[QUERY], p }; } - - action fragment_begin { m_views[FRAGMENT] = { p, p}; } - action fragment_end { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; } + action fragment_begin { starts[FRAGMENT] = p; } + action fragment_end { CHECK (starts[FRAGMENT]); m_views[FRAGMENT] = { starts[FRAGMENT], p }; } action uri_begin {} - action uri_end {} + action uri_end {} include rfc3986 'rfc3986.rl'; @@ -69,8 +70,12 @@ using cruft::uri; /////////////////////////////////////////////////////////////////////////////// -cruft::uri::uri (std::string &&_value): - m_views { +void +cruft::uri::parse (void) +{ + char const *starts[NUM_COMPONENTS] = {}; + + m_views = { nullptr, nullptr, nullptr, @@ -80,9 +85,8 @@ cruft::uri::uri (std::string &&_value): nullptr, nullptr, nullptr - }, - m_value (std::move (_value)) -{ + }; + const char *p = m_value.data (); const char *pe = m_value.data () + m_value.size (); const char *eof = pe; diff --git a/uri.hpp b/uri.hpp index 84eebf68..83ee9769 100644 --- a/uri.hpp +++ b/uri.hpp @@ -69,15 +69,15 @@ namespace cruft { // QUERY: 'foo=bar' // FRAGMENT: 'fragment' enum component { - SCHEME, - HIERARCHICAL, - AUTHORITY, - USER, - HOST, - PORT, - PATH, - QUERY, - FRAGMENT, + /* 0 */ SCHEME, + /* 1 */ HIERARCHICAL, + /* 2 */ AUTHORITY, + /* 3 */ USER, + /* 4 */ HOST, + /* 5 */ PORT, + /* 6 */ PATH, + /* 7 */ QUERY, + /* 8 */ FRAGMENT, NUM_COMPONENTS }; @@ -108,6 +108,8 @@ namespace cruft { static std::string percent_decode (view); private: + void parse (void); + std::array, NUM_COMPONENTS> m_views; std::string m_value; };