uri: add more data fields

We now provide more fine-grained field extraction from URIs, focusing
primarily on the components that are useful for URLs.
This commit is contained in:
Danny Robson 2017-12-15 18:57:10 +11:00
parent c1036d8337
commit b4175e4593
4 changed files with 261 additions and 51 deletions

View File

@ -10,32 +10,149 @@ main (void)
static const struct { static const struct {
const char *src; const char *src;
const char *scheme; const char *scheme;
const char *hierarchical;
const char *authority; const char *authority;
const char *user;
const char *host;
const char *port;
const char *path; const char *path;
const char *query; const char *query;
const char *fragment; const char *fragment;
} GOOD[] = { } GOOD[] = {
{ "ftp://ftp.is.co.za/rfc/rfc1808.txt", "ftp", "ftp.is.co.za", "/rfc/rfc1808.txt", "", "" }, // examples from rfc3986
{ "http://www.ietf.org/rfc/rfc2396.txt", "http", "www.ietf.org", "/rfc/rfc2396.txt", "", "" }, {
{ "ldap://[2001:db8::7]/c=GB?objectClass?one", "ldap", "[2001:db8::7]", "/c=GB", "objectClass?one", "" }, .src = "ftp://ftp.is.co.za/rfc/rfc1808.txt",
{ "mailto:John.Doe@example.com", "mailto", "", "John.Doe@example.com", "", "" },
{ "news:comp.infosystems.www.servers.unix", "news", "", "comp.infosystems.www.servers.unix", "", "" }, .scheme = "ftp",
{ "tel:+1-816-555-1212", "tel", "", "+1-816-555-1212", "", "" }, .hierarchical = "ftp.is.co.za/rfc/rfc1808.txt",
{ "telnet://192.0.2.16:80/", "telnet", "192.0.2.16:80", "/", "", "" }, .authority = "ftp.is.co.za",
{ "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", "urn", "", "oasis:names:specification:docbook:dtd:xml:4.1.2", "", "" }, .user = "",
.host = "ftp.is.co.za",
.port = "",
.path = "/rfc/rfc1808.txt",
.query = "",
.fragment = "" },
{
.src = "http://www.ietf.org/rfc/rfc2396.txt",
.scheme = "http",
.hierarchical = "www.ietf.org/rfc/rfc2396.txt",
.authority = "www.ietf.org",
.user = "",
.host = "www.ietf.org",
.port = "",
.path = "/rfc/rfc2396.txt",
.query = "",
.fragment = "" },
{
.src = "ldap://[2001:db8::7]/c=GB?objectClass?one",
.scheme = "ldap",
.hierarchical = "[2001:db8::7]/c=GB",
.authority = "[2001:db8::7]",
.user = "",
.host = "[2001:db8::7]",
.port = "",
.path = "/c=GB",
.query = "objectClass?one",
.fragment = "" },
{
.src = "mailto:John.Doe@example.com",
.scheme= "mailto",
.hierarchical = "John.Doe@example.com",
.authority= "",
.user = "",
.host = "",
.port = "",
.path= "John.Doe@example.com",
.query= "",
.fragment= "" },
{
.src = "news:comp.infosystems.www.servers.unix",
.scheme= "news",
.hierarchical = "comp.infosystems.www.servers.unix",
.authority= "",
.user = "",
.host = "",
.port = "",
.path= "comp.infosystems.www.servers.unix",
.query= "",
.fragment= "" },
{
.src = "tel:+1-816-555-1212",
.scheme= "tel",
.hierarchical = "+1-816-555-1212",
.authority= "",
.user = "",
.host = "",
.port = "",
.path= "+1-816-555-1212",
.query= "",
.fragment= "" },
{
.src = "telnet://192.0.2.16:80/",
.scheme= "telnet",
.hierarchical = "192.0.2.16:80/",
.authority= "192.0.2.16:80",
.user = "",
.host = "192.0.2.16",
.port = "80",
.path= "/",
.query= "",
.fragment= "" },
{
.src = "urn:oasis:names:specification:docbook:dtd:xml:4.1.2",
.scheme= "urn",
.hierarchical = "oasis:names:specification:docbook:dtd:xml:4.1.2",
.authority= "",
.user = "",
.host = "",
.port = "",
.path= "oasis:names:specification:docbook:dtd:xml:4.1.2",
.query= "",
.fragment= "" },
// a case with all possible components
{
.src = "https://user:password@example.com:80/path/to?foo=bar#fragment",
.scheme = "https",
.hierarchical = "user:password@example.com:80/path/to",
.authority = "user:password@example.com:80",
.user = "user:password",
.host = "example.com",
.port = "80",
.path = "/path/to",
.query = "foo=bar",
.fragment = "fragment" },
}; };
for (auto i: GOOD) { for (auto t: GOOD) {
tap.expect_nothrow ([i] (void) { util::uri foo (i.src); }, "nothrow parsing '%s'", i.src); tap.expect_nothrow ([t] (void) { util::uri foo (t.src); }, "nothrow parsing '%s'", t.src);
util::uri u (t.src);
util::uri u (i.src); tap.expect (equal (u.scheme (), t.scheme), "scheme for '%s'", t.src);
tap.expect (equal (u.heirarchical (), t.hierarchical), "hierarchical for '%s'", t.src);
tap.expect (std::equal (u.get (util::uri::SCHEME).begin (), u.get (util::uri::SCHEME).end (), i.scheme), "extracting scheme for '%s'", i.src); tap.expect (equal (u.authority (), t.authority), "authority for '%s'", t.src);
tap.expect (std::equal (u.get (util::uri::AUTHORITY).begin (), u.get (util::uri::AUTHORITY).end (), i.authority), "extracting authority '%s'", i.src); tap.expect (equal (u.host (), t.host), "host for '%s'", t.src);
tap.expect (std::equal (u.get (util::uri::PATH).begin (), u.get (util::uri::PATH).end (), i.path), "extracting path '%s'", i.src); tap.expect (equal (u.user (), t.user), "user for '%s'", t.src);
tap.expect (std::equal (u.get (util::uri::QUERY).begin (), u.get (util::uri::QUERY).end (), i.query), "extracting query '%s'", i.src); tap.expect (equal (u.port (), t.port), "port for '%s'", t.src);
tap.expect (std::equal (u.get (util::uri::FRAGMENT).begin (), u.get (util::uri::FRAGMENT).end (), i.fragment), "extracting fragment '%s'", i.src); tap.expect (equal (u.path (), t.path), "path for '%s'", t.src);
tap.expect (equal (u.query (), t.query), "query for '%s'", t.src);
tap.expect (equal (u.fragment (), t.fragment), "fragment for '%s'", t.src);
} }
static const char* BAD[] = { static const char* BAD[] = {
@ -43,7 +160,9 @@ main (void)
}; };
for (auto i: BAD) for (auto i: BAD)
tap.expect_throw<util::uri::parse_error> ([i] (void) { util::uri foo (i); }, "throw parsing '%s'", i); tap.expect_throw<util::uri::parse_error> (
[i] (void) { util::uri foo (i); }, "throw parsing '%s'", i
);
return tap.status (); return tap.status ();

View File

@ -43,23 +43,26 @@ static const struct {
}; };
/////////////////////////////////////////////////////////////////////////////// //-----------------------------------------------------------------------------
int int
main () { main () {
util::TAP::logger tap; util::TAP::logger tap;
for (const auto &i: PARSE_TESTS) { for (const auto &t: PARSE_TESTS) {
util::version v (i.str); util::version v (t.str);
tap.expect (std::equal (v.begin (), v.end (), i.parts) && v.release == i.release, "%s", i.msg); bool parts = std::equal (v.begin (), v.end (), t.parts);
bool release = v.release == t.release;
tap.expect (parts && release, "%s", t.msg);
} }
for (const auto &t: CMP_TESTS) { for (const auto &t: CMP_TESTS) {
bool eq = t.a == t.b, const bool eq = t.a == t.b;
lt = t.a < t.b, const bool lt = t.a < t.b;
gt = t.a > t.b; const bool gt = t.a > t.b;
tap.expect (t.eq == eq, "%s: equality", t.msg); tap.expect (t.eq == eq, "%s: equality", t.msg);
tap.expect (t.lt == lt, "%s: less-than", t.msg); tap.expect (t.lt == lt, "%s: less-than", t.msg);

View File

@ -20,6 +20,7 @@
#include "uri.hpp" #include "uri.hpp"
#include "debug.hpp" #include "debug.hpp"
#include "iterator.hpp"
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
@ -32,19 +33,31 @@
action success {__success = true; } action success {__success = true; }
action failure {__success = false; } action failure {__success = false; }
action scheme_begin { m_views[SCHEME] = { p, nullptr }; } action scheme_begin { m_views[SCHEME] = { p, p }; }
action scheme_end { m_views[SCHEME] = { m_views[SCHEME].begin (), p }; } action scheme_end { m_views[SCHEME] = { m_views[SCHEME].begin (), p }; }
action authority_begin { m_views[AUTHORITY] = { p, nullptr}; } action hier_begin { m_views[HIERARCHICAL] = { p, p }; }
action hier_end { m_views[HIERARCHICAL] = { m_views[HIERARCHICAL].begin (), p }; }
action user_begin { m_views[USER] = { p, p }; }
action user_end { m_views[USER] = { m_views[USER].begin (), p }; }
action host_begin { m_views[HOST] = { p, p }; }
action host_end { m_views[HOST] = { m_views[HOST].begin (), p }; }
action port_begin { m_views[PORT] = { p, p }; }
action port_end { m_views[PORT] = { m_views[PORT].begin (), p }; }
action authority_begin { m_views[AUTHORITY] = { p, p}; }
action authority_end { m_views[AUTHORITY] = { m_views[AUTHORITY].begin (), p }; } action authority_end { m_views[AUTHORITY] = { m_views[AUTHORITY].begin (), p }; }
action path_begin { m_views[PATH] = { p, nullptr}; } action path_begin { m_views[PATH] = { p, p}; }
action path_end { m_views[PATH] = { m_views[PATH].begin (), p }; } action path_end { m_views[PATH] = { m_views[PATH].begin (), p }; }
action query_begin { m_views[QUERY] = { p, nullptr}; } action query_begin { m_views[QUERY] = { p, p}; }
action query_end { m_views[QUERY] = { m_views[QUERY].begin (), p }; } action query_end { m_views[QUERY] = { m_views[QUERY].begin (), p }; }
action fragment_begin { m_views[FRAGMENT] = { p, nullptr}; } action fragment_begin { m_views[FRAGMENT] = { p, p}; }
action fragment_end { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; } action fragment_end { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; }
## Characters ## Characters
@ -100,9 +113,18 @@
reserved = gen_delim | sub_delim; reserved = gen_delim | sub_delim;
## Authority ## Authority
port = digit*; port = (
host = ip_literal | ipv4address | reg_name; digit*
userinfo = (unreserved | pct_encoded | sub_delim | ':')*; ) >port_begin %port_end;
host = (
ip_literal | ipv4address | reg_name
) >host_begin %host_end;
userinfo = (
(unreserved | pct_encoded | sub_delim | ':')*
) >user_begin %user_end;
authority = ( authority = (
(userinfo '@')? host (':' port)? (userinfo '@')? host (':' port)?
) >authority_begin %authority_end; ) >authority_begin %authority_end;
@ -122,12 +144,13 @@
) >fragment_begin %fragment_end; ) >fragment_begin %fragment_end;
## URI types ## URI types
hier_part = hier_part = (
'//' authority path_abempty >path_begin %path_end '//' (authority path_abempty >path_begin %path_end) >hier_begin %hier_end
| path_absolute >path_begin %path_end ) | (
path_absolute >path_begin %path_end
| path_rootless >path_begin %path_end | path_rootless >path_begin %path_end
| path_empty >path_begin %path_end | path_empty >path_begin %path_end
; ) >hier_begin %hier_end;
uri = scheme ':' hier_part ('?' query)? ('#' fragment); uri = scheme ':' hier_part ('?' query)? ('#' fragment);
@ -175,7 +198,17 @@ static const util::view<const char*> NULL_VIEW { nullptr, nullptr };
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
util::uri::uri (std::string &&_value): util::uri::uri (std::string &&_value):
m_views {NULL_VIEW, NULL_VIEW, NULL_VIEW, NULL_VIEW, NULL_VIEW}, m_views {
NULL_VIEW,
NULL_VIEW,
NULL_VIEW,
NULL_VIEW,
NULL_VIEW,
NULL_VIEW,
NULL_VIEW,
NULL_VIEW,
NULL_VIEW
},
m_value (std::move (_value)) m_value (std::move (_value))
{ {
const char *p = m_value.data (); const char *p = m_value.data ();
@ -196,7 +229,7 @@ util::uri::uri (std::string &&_value):
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
util::view<const char*> util::view<const char*>
util::uri::get (util::uri::component c) util::uri::get (util::uri::component c) const
{ {
CHECK_NEQ (c, NUM_COMPONENTS); CHECK_NEQ (c, NUM_COMPONENTS);
return m_views[c]; return m_views[c];
@ -264,16 +297,21 @@ util::uri::percent_decode (view<const char*> s)
} }
//----------------------------------------------------------------------------- //-----------------------------------------------------------------------------
std::ostream& std::ostream&
util::operator<< (std::ostream &os, util::uri::component c) util::operator<< (std::ostream &os, util::uri::component c)
{ {
switch (c) { switch (c) {
case util::uri::SCHEME: return os << "SCHEME"; case util::uri::SCHEME: return os << "SCHEME";
case util::uri::AUTHORITY: return os << "AUTHORITY"; case util::uri::HIERARCHICAL: return os << "HIERARCHICAL";
case util::uri::PATH: return os << "PATH"; case util::uri::AUTHORITY: return os << "AUTHORITY";
case util::uri::QUERY: return os << "QUERY"; case util::uri::USER: return os << "USER";
case util::uri::FRAGMENT: return os << "FRAGMENT"; case util::uri::HOST: return os << "HOST";
case util::uri::PORT: return os << "PORT";
case util::uri::PATH: return os << "PATH";
case util::uri::QUERY: return os << "QUERY";
case util::uri::FRAGMENT: return os << "FRAGMENT";
case util::uri::NUM_COMPONENTS: case util::uri::NUM_COMPONENTS:
unreachable (); unreachable ();
@ -281,3 +319,11 @@ util::operator<< (std::ostream &os, util::uri::component c)
unreachable (); unreachable ();
} }
//-----------------------------------------------------------------------------
// Stream a parsed uri as a bracketed list of its component views.
//
// The views come from uri::components () and are joined by
// util::make_infix; the separator is whatever make_infix emits by
// default (NOTE(review): presumably a comma -- confirm in iterator.hpp).
std::ostream&
util::operator<< (std::ostream &os, const util::uri &val)
{
    os << '[';
    // the temporary array returned by components () lives until the end
    // of this statement, which covers the whole insertion.
    os << util::make_infix (val.components ());
    os << ']';
    return os;
}

56
uri.hpp
View File

@ -11,20 +11,33 @@
* See the License for the specific language governing permissions and * See the License for the specific language governing permissions and
* limitations under the License. * limitations under the License.
* *
* Copyright 2015 Danny Robson <danny@nerdcruft.net> * Copyright 2015, 2017 Danny Robson <danny@nerdcruft.net>
*/ */
#ifndef __UTIL_URI_HPP #ifndef CRUFT_UTIL_URI_HPP
#define __UTIL_URI_HPP #define CRUFT_UTIL_URI_HPP
#include "view.hpp" #include "view.hpp"
#include <array>
#include <string> #include <string>
#include <stdexcept> #include <stdexcept>
namespace util { namespace util {
// parsing of rfc3986 uniform resource identifiers
//
// does not currently perform normalisation (scheme or protocol),
// comparison, or other associated operations, though these should be
// added in the future.
//
// note that the parsed results may not always conform to expectations
// for some protocols. eg, mailto identifiers are complex to parse and
// would require a specialised parser to be handled reliably.
//
// not all fields will be present for all protocols (or all instances of
// any given protocol). eg, a "tel" URI is unlikely to have a port number.
class uri { class uri {
public: public:
explicit uri (std::string &&); explicit uri (std::string &&);
@ -34,25 +47,54 @@ namespace util {
class parse_error : public std::runtime_error class parse_error : public std::runtime_error
{ using runtime_error::runtime_error; }; { using runtime_error::runtime_error; };
// URI: 'https://user:password@example.com:80/path/to?foo=bar#fragment'
//
// SCHEME: 'https'
// HIERARCHICAL: 'user:password@example.com:80/path/to'
// AUTHORITY: 'user:password@example.com:80'
// USER: 'user:password'
// HOST: 'example.com'
// PORT: '80'
// PATH: '/path/to'
// QUERY: 'foo=bar'
// FRAGMENT: 'fragment'
enum component { enum component {
SCHEME, SCHEME,
AUTHORITY, HIERARCHICAL,
PATH, AUTHORITY,
USER,
HOST,
PORT,
PATH,
QUERY, QUERY,
FRAGMENT, FRAGMENT,
NUM_COMPONENTS NUM_COMPONENTS
}; };
view<const char*> get (component); view<const char*> get (component) const;
// Named convenience accessors. Each one forwards to get () with the
// matching `component` enumerator and returns that component's view.
// The views refer to this object's own string storage, so they should
// not be used after the uri is destroyed.
view<const char*> scheme (void) const { return get (SCHEME); }

// Correctly spelled accessor for the HIERARCHICAL component.
view<const char*> hierarchical (void) const { return get (HIERARCHICAL); }
// Misspelled name, retained only so existing callers keep compiling;
// prefer hierarchical () in new code.
view<const char*> heirarchical (void) const { return hierarchical (); }

view<const char*> authority (void) const { return get (AUTHORITY); }
view<const char*> user (void) const { return get (USER); }
view<const char*> host (void) const { return get (HOST); }
view<const char*> port (void) const { return get (PORT); }
view<const char*> path (void) const { return get (PATH); }
view<const char*> query (void) const { return get (QUERY); }
view<const char*> fragment (void) const { return get (FRAGMENT); }

// Returns a copy of every component view, indexed by `component`.
auto components (void) const noexcept { return m_views; }
static std::string percent_decode (view<const char*>); static std::string percent_decode (view<const char*>);
private: private:
view<const char*> m_views[NUM_COMPONENTS]; std::array<view<const char*>, NUM_COMPONENTS> m_views;
std::string m_value; std::string m_value;
}; };
std::ostream& operator<< (std::ostream&, const uri&);
std::ostream& operator<< (std::ostream&, uri::component); std::ostream& operator<< (std::ostream&, uri::component);
} }