From b4175e45930e849a057ed24d57a385ef391fd5e5 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Fri, 15 Dec 2017 18:57:10 +1100 Subject: [PATCH] uri: add more data fields we now provide more fine grained field extraction from URIs, focusing primarily on the utility of URLs. --- test/uri.cpp | 155 +++++++++++++++++++++++++++++++++++++++++------ test/version.cpp | 17 +++--- uri.cpp.rl | 84 +++++++++++++++++++------ uri.hpp | 56 ++++++++++++++--- 4 files changed, 261 insertions(+), 51 deletions(-) diff --git a/test/uri.cpp b/test/uri.cpp index 0f6e2367..7055c203 100644 --- a/test/uri.cpp +++ b/test/uri.cpp @@ -10,32 +10,149 @@ main (void) static const struct { const char *src; + const char *scheme; + const char *hierarchical; const char *authority; + const char *user; + const char *host; + const char *port; const char *path; const char *query; const char *fragment; } GOOD[] = { - { "ftp://ftp.is.co.za/rfc/rfc1808.txt", "ftp", "ftp.is.co.za", "/rfc/rfc1808.txt", "", "" }, - { "http://www.ietf.org/rfc/rfc2396.txt", "http", "www.ietf.org", "/rfc/rfc2396.txt", "", "" }, - { "ldap://[2001:db8::7]/c=GB?objectClass?one", "ldap", "[2001:db8::7]", "/c=GB", "objectClass?one", "" }, - { "mailto:John.Doe@example.com", "mailto", "", "John.Doe@example.com", "", "" }, - { "news:comp.infosystems.www.servers.unix", "news", "", "comp.infosystems.www.servers.unix", "", "" }, - { "tel:+1-816-555-1212", "tel", "", "+1-816-555-1212", "", "" }, - { "telnet://192.0.2.16:80/", "telnet", "192.0.2.16:80", "/", "", "" }, - { "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", "urn", "", "oasis:names:specification:docbook:dtd:xml:4.1.2", "", "" }, + // examples from rfc3986 + { + .src = "ftp://ftp.is.co.za/rfc/rfc1808.txt", + + .scheme = "ftp", + .hierarchical = "ftp.is.co.za/rfc/rfc1808.txt", + .authority = "ftp.is.co.za", + .user = "", + .host = "ftp.is.co.za", + .port = "", + .path = "/rfc/rfc1808.txt", + .query = "", + .fragment = "" }, + { + .src = "http://www.ietf.org/rfc/rfc2396.txt", + 
+ .scheme = "http", + .hierarchical = "www.ietf.org/rfc/rfc2396.txt", + .authority = "www.ietf.org", + .user = "", + .host = "www.ietf.org", + .port = "", + .path = "/rfc/rfc2396.txt", + .query = "", + .fragment = "" }, + + { + .src = "ldap://[2001:db8::7]/c=GB?objectClass?one", + + .scheme = "ldap", + .hierarchical = "[2001:db8::7]/c=GB", + .authority = "[2001:db8::7]", + .user = "", + .host = "[2001:db8::7]", + .port = "", + .path = "/c=GB", + .query = "objectClass?one", + .fragment = "" }, + + { + .src = "mailto:John.Doe@example.com", + + .scheme= "mailto", + .hierarchical = "John.Doe@example.com", + .authority= "", + .user = "", + .host = "", + .port = "", + .path= "John.Doe@example.com", + .query= "", + .fragment= "" }, + + { + .src = "news:comp.infosystems.www.servers.unix", + .scheme= "news", + .hierarchical = "comp.infosystems.www.servers.unix", + .authority= "", + .user = "", + .host = "", + .port = "", + .path= "comp.infosystems.www.servers.unix", + .query= "", + .fragment= "" }, + + { + .src = "tel:+1-816-555-1212", + + .scheme= "tel", + .hierarchical = "+1-816-555-1212", + .authority= "", + .user = "", + .host = "", + .port = "", + .path= "+1-816-555-1212", + .query= "", + .fragment= "" }, + + { + .src = "telnet://192.0.2.16:80/", + + .scheme= "telnet", + .hierarchical = "192.0.2.16:80/", + .authority= "192.0.2.16:80", + .user = "", + .host = "192.0.2.16", + .port = "80", + .path= "/", + .query= "", + .fragment= "" }, + + { + .src = "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", + + .scheme= "urn", + .hierarchical = "oasis:names:specification:docbook:dtd:xml:4.1.2", + .authority= "", + .user = "", + .host = "", + .port = "", + .path= "oasis:names:specification:docbook:dtd:xml:4.1.2", + .query= "", + .fragment= "" }, + + + // a case with all possible components + { + .src = "https://user:password@example.com:80/path/to?foo=bar#fragment", + + .scheme = "https", + .hierarchical = "user:password@example.com:80/path/to", + .authority = 
"user:password@example.com:80", + .user = "user:password", + .host = "example.com", + .port = "80", + .path = "/path/to", + .query = "foo=bar", + .fragment = "fragment" }, }; - for (auto i: GOOD) { - tap.expect_nothrow ([i] (void) { util::uri foo (i.src); }, "nothrow parsing '%s'", i.src); + for (auto t: GOOD) { + tap.expect_nothrow ([t] (void) { util::uri foo (t.src); }, "nothrow parsing '%s'", t.src); + util::uri u (t.src); - util::uri u (i.src); - - tap.expect (std::equal (u.get (util::uri::SCHEME).begin (), u.get (util::uri::SCHEME).end (), i.scheme), "extracting scheme for '%s'", i.src); - tap.expect (std::equal (u.get (util::uri::AUTHORITY).begin (), u.get (util::uri::AUTHORITY).end (), i.authority), "extracting authority '%s'", i.src); - tap.expect (std::equal (u.get (util::uri::PATH).begin (), u.get (util::uri::PATH).end (), i.path), "extracting path '%s'", i.src); - tap.expect (std::equal (u.get (util::uri::QUERY).begin (), u.get (util::uri::QUERY).end (), i.query), "extracting query '%s'", i.src); - tap.expect (std::equal (u.get (util::uri::FRAGMENT).begin (), u.get (util::uri::FRAGMENT).end (), i.fragment), "extracting fragment '%s'", i.src); + tap.expect (equal (u.scheme (), t.scheme), "scheme for '%s'", t.src); + tap.expect (equal (u.heirarchical (), t.hierarchical), "hierarchical for '%s'", t.src); + tap.expect (equal (u.authority (), t.authority), "authority for '%s'", t.src); + tap.expect (equal (u.host (), t.host), "host for '%s'", t.src); + tap.expect (equal (u.user (), t.user), "user for '%s'", t.src); + tap.expect (equal (u.port (), t.port), "port for '%s'", t.src); + tap.expect (equal (u.path (), t.path), "path for '%s'", t.src); + tap.expect (equal (u.query (), t.query), "query for '%s'", t.src); + tap.expect (equal (u.fragment (), t.fragment), "fragment for '%s'", t.src); } static const char* BAD[] = { @@ -43,7 +160,9 @@ main (void) }; for (auto i: BAD) - tap.expect_throw ([i] (void) { util::uri foo (i); }, "throw parsing '%s'", i); + 
tap.expect_throw ( + [i] (void) { util::uri foo (i); }, "throw parsing '%s'", i + ); return tap.status (); diff --git a/test/version.cpp b/test/version.cpp index f8abf25c..9c2b1d29 100644 --- a/test/version.cpp +++ b/test/version.cpp @@ -43,23 +43,26 @@ static const struct { }; -/////////////////////////////////////////////////////////////////////////////// +//----------------------------------------------------------------------------- int main () { util::TAP::logger tap; - for (const auto &i: PARSE_TESTS) { - util::version v (i.str); + for (const auto &t: PARSE_TESTS) { + util::version v (t.str); - tap.expect (std::equal (v.begin (), v.end (), i.parts) && v.release == i.release, "%s", i.msg); + bool parts = std::equal (v.begin (), v.end (), t.parts); + bool release = v.release == t.release; + + tap.expect (parts && release, "%s", t.msg); } for (const auto &t: CMP_TESTS) { - bool eq = t.a == t.b, - lt = t.a < t.b, - gt = t.a > t.b; + const bool eq = t.a == t.b; + const bool lt = t.a < t.b; + const bool gt = t.a > t.b; tap.expect (t.eq == eq, "%s: equality", t.msg); tap.expect (t.lt == lt, "%s: less-than", t.msg); diff --git a/uri.cpp.rl b/uri.cpp.rl index a5f02f23..a60f6f3c 100644 --- a/uri.cpp.rl +++ b/uri.cpp.rl @@ -20,6 +20,7 @@ #include "uri.hpp" #include "debug.hpp" +#include "iterator.hpp" #include #include @@ -32,19 +33,31 @@ action success {__success = true; } action failure {__success = false; } - action scheme_begin { m_views[SCHEME] = { p, nullptr }; } + action scheme_begin { m_views[SCHEME] = { p, p }; } action scheme_end { m_views[SCHEME] = { m_views[SCHEME].begin (), p }; } - action authority_begin { m_views[AUTHORITY] = { p, nullptr}; } + action hier_begin { m_views[HIERARCHICAL] = { p, p }; } + action hier_end { m_views[HIERARCHICAL] = { m_views[HIERARCHICAL].begin (), p }; } + + action user_begin { m_views[USER] = { p, p }; } + action user_end { m_views[USER] = { m_views[USER].begin (), p }; } + + action host_begin { m_views[HOST] = { p, p }; } + 
action host_end { m_views[HOST] = { m_views[HOST].begin (), p }; } + + action port_begin { m_views[PORT] = { p, p }; } + action port_end { m_views[PORT] = { m_views[PORT].begin (), p }; } + + action authority_begin { m_views[AUTHORITY] = { p, p}; } action authority_end { m_views[AUTHORITY] = { m_views[AUTHORITY].begin (), p }; } - action path_begin { m_views[PATH] = { p, nullptr}; } + action path_begin { m_views[PATH] = { p, p}; } action path_end { m_views[PATH] = { m_views[PATH].begin (), p }; } - action query_begin { m_views[QUERY] = { p, nullptr}; } + action query_begin { m_views[QUERY] = { p, p}; } action query_end { m_views[QUERY] = { m_views[QUERY].begin (), p }; } - action fragment_begin { m_views[FRAGMENT] = { p, nullptr}; } + action fragment_begin { m_views[FRAGMENT] = { p, p}; } action fragment_end { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; } ## Characters @@ -100,9 +113,18 @@ reserved = gen_delim | sub_delim; ## Authority - port = digit*; - host = ip_literal | ipv4address | reg_name; - userinfo = (unreserved | pct_encoded | sub_delim | ':')*; + port = ( + digit* + ) >port_begin %port_end; + + host = ( + ip_literal | ipv4address | reg_name + ) >host_begin %host_end; + + userinfo = ( + (unreserved | pct_encoded | sub_delim | ':')* + ) >user_begin %user_end; + authority = ( (userinfo '@')? host (':' port)? ) >authority_begin %authority_end; @@ -122,12 +144,13 @@ ) >fragment_begin %fragment_end; ## URI types - hier_part = - '//' authority path_abempty >path_begin %path_end - | path_absolute >path_begin %path_end + hier_part = ( + '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end + ) | ( + path_absolute >path_begin %path_end | path_rootless >path_begin %path_end | path_empty >path_begin %path_end - ; + ) >hier_begin %hier_end; uri = scheme ':' hier_part ('?' query)? 
('#' fragment); @@ -175,7 +198,17 @@ static const util::view NULL_VIEW { nullptr, nullptr }; //----------------------------------------------------------------------------- util::uri::uri (std::string &&_value): - m_views {NULL_VIEW, NULL_VIEW, NULL_VIEW, NULL_VIEW, NULL_VIEW}, + m_views { + NULL_VIEW, + NULL_VIEW, + NULL_VIEW, + NULL_VIEW, + NULL_VIEW, + NULL_VIEW, + NULL_VIEW, + NULL_VIEW, + NULL_VIEW + }, m_value (std::move (_value)) { const char *p = m_value.data (); @@ -196,7 +229,7 @@ util::uri::uri (std::string &&_value): //----------------------------------------------------------------------------- util::view -util::uri::get (util::uri::component c) +util::uri::get (util::uri::component c) const { CHECK_NEQ (c, NUM_COMPONENTS); return m_views[c]; @@ -264,16 +297,21 @@ util::uri::percent_decode (view s) } + //----------------------------------------------------------------------------- std::ostream& util::operator<< (std::ostream &os, util::uri::component c) { switch (c) { - case util::uri::SCHEME: return os << "SCHEME"; - case util::uri::AUTHORITY: return os << "AUTHORITY"; - case util::uri::PATH: return os << "PATH"; - case util::uri::QUERY: return os << "QUERY"; - case util::uri::FRAGMENT: return os << "FRAGMENT"; + case util::uri::SCHEME: return os << "SCHEME"; + case util::uri::HIERARCHICAL: return os << "HIERARCHICAL"; + case util::uri::AUTHORITY: return os << "AUTHORITY"; + case util::uri::USER: return os << "USER"; + case util::uri::HOST: return os << "HOST"; + case util::uri::PORT: return os << "PORT"; + case util::uri::PATH: return os << "PATH"; + case util::uri::QUERY: return os << "QUERY"; + case util::uri::FRAGMENT: return os << "FRAGMENT"; case util::uri::NUM_COMPONENTS: unreachable (); @@ -281,3 +319,11 @@ util::operator<< (std::ostream &os, util::uri::component c) unreachable (); } + + +//----------------------------------------------------------------------------- +std::ostream& +util::operator<< (std::ostream &os, const util::uri &val) +{ 
+ return os << '[' << util::make_infix (val.components ()) << ']'; +} diff --git a/uri.hpp b/uri.hpp index 92f4749a..f8ff25f1 100644 --- a/uri.hpp +++ b/uri.hpp @@ -11,20 +11,33 @@ * See the License for the specific language governing permissions and * limitations under the License. * - * Copyright 2015 Danny Robson + * Copyright 2015, 2017 Danny Robson */ -#ifndef __UTIL_URI_HPP -#define __UTIL_URI_HPP +#ifndef CRUFT_UTIL_URI_HPP +#define CRUFT_UTIL_URI_HPP #include "view.hpp" +#include #include #include namespace util { + // parsing of RFC 3986 uniform resource identifiers + // + // does not currently perform normalisation (scheme or protocol), + // comparison, or other associated operations, though these should be + // added in the future. + // + // note that the parsed results may not always conform to expectations + // for some protocols. e.g., mailto identifiers are complex to parse + // and would require a specialised parser to be handled reliably. + // + // not all fields will be present for all protocols (or all instances of + // any given protocol). e.g., a "tel" URI is unlikely to have a port number. 
class uri { public: explicit uri (std::string &&); @@ -34,25 +47,54 @@ namespace util { class parse_error : public std::runtime_error { using runtime_error::runtime_error; }; + + // URI: 'https://user:password@example.com:80/path/to?foo=bar#fragment' + // + // SCHEME: 'https' + // HIERARCHICAL: 'user:password@example.com:80/path/to' + // AUTHORITY: 'user:password@example.com:80' + // USER: 'user:password' + // HOST: 'example.com' + // PORT: '80' + // PATH: '/path/to' + // QUERY: 'foo=bar' + // FRAGMENT: 'fragment' enum component { SCHEME, - AUTHORITY, - PATH, + HIERARCHICAL, + AUTHORITY, + USER, + HOST, + PORT, + PATH, QUERY, FRAGMENT, NUM_COMPONENTS }; - view get (component); + view get (component) const; + + view scheme (void) const { return get (SCHEME); } + view heirarchical (void) const { return get (HIERARCHICAL); } + view authority (void) const { return get (AUTHORITY); } + view user (void) const { return get (USER); } + view host (void) const { return get (HOST); } + view port (void) const { return get (PORT); } + view path (void) const { return get (PATH); } + view query (void) const { return get (QUERY); } + view fragment (void) const { return get (FRAGMENT); } + + auto components (void) const noexcept { return m_views; } static std::string percent_decode (view); private: - view m_views[NUM_COMPONENTS]; + std::array, NUM_COMPONENTS> m_views; std::string m_value; }; + std::ostream& operator<< (std::ostream&, const uri&); std::ostream& operator<< (std::ostream&, uri::component); }