libcruft-util/uri.cpp

526 lines
14 KiB
C++
Raw Normal View History

#include "./uri.hpp"
#include "./cast.hpp"
#include "./string.hpp"
#include "./debug/panic.hpp"
#include "./debug/assert.hpp"
#include "./debug/warn.hpp"
#include <ostream>
using cruft::uri;
///////////////////////////////////////////////////////////////////////////////
// Percent-encode every space character in `str`, in place.
//
// Only ' ' is rewritten (to "%20"); every other character, including any
// existing percent triples, is left untouched.
//
// The encoded string is built in a single pass rather than calling
// `replace` once per space, which would shift the tail of the string for
// every space encountered.
static void
minimal_percent_encode (std::string &str)
{
    auto const first_space = str.find (' ');

    // Common case: nothing to encode, so avoid any allocation.
    if (first_space == std::string::npos)
        return;

    std::string encoded;
    // At least one space exists, so the result grows by at least two chars.
    encoded.reserve (str.size () + 2);
    encoded.append (str, 0, first_space);

    for (auto pos = first_space; pos < str.size (); ++pos) {
        if (str[pos] == ' ')
            encoded += "%20";
        else
            encoded += str[pos];
    }

    str = std::move (encoded);
}
///////////////////////////////////////////////////////////////////////////////
// Construct a URI by taking ownership of `_value`.
//
// Spaces in the string are percent-encoded first, then `parse ()` (defined
// elsewhere; presumably it fills `m_offsets` -- confirm) is run over the
// stored value.
cruft::uri::uri (std::string &&_value)
    : m_offsets {}
    , m_value (std::move (_value))
{
    minimal_percent_encode (m_value);
    parse ();

    // Make sure the offsets are well-ordered.
    //
    // Components that were not found should sit at (or after) the end of
    // the component before them; eg, the QUERY offsets must come after the
    // PATH offsets even when there is no QUERY data.
    for (int idx = 1; idx < NUM_COMPONENTS; ++idx) {
        auto       &current  = m_offsets[idx];
        auto const &previous = m_offsets[idx - 1];

        // Only components whose offsets are still both zero (ie, not found)
        // may be moved. Moving a found-but-empty component would prevent
        // us from computing `pqf` and retaining a trailing "?" for an empty
        // query.
        bool const untouched = !current.first and !current.second;
        if (untouched)
            current = { previous.second, previous.second };
    }

    CHECK_SANITY (*this);
}
//-----------------------------------------------------------------------------
cruft::uri::uri (const char *str):
uri (std::string (str))
{ ; }
//-----------------------------------------------------------------------------
uri::uri (cruft::view<const char *> _value):
uri (std::string (_value.begin (), _value.end ()))
{ ; }
//-----------------------------------------------------------------------------
uri::uri (const std::string &_value):
uri (std::string (_value))
{ ; }
//-----------------------------------------------------------------------------
// Assemble a URI string from its individual parts.
//
// Empty parts are omitted together with their delimiter: the scheme is
// followed by ":", the authority is preceded by "//", the query by "?" and
// the fragment by "#". The path is always appended verbatim.
static std::string
combine_components (
    std::string_view scheme,
    std::string_view authority,
    std::string_view path,
    std::string_view query,
    std::string_view fragment
) {
    // Upper bound on the output size: every part plus every delimiter.
    auto const capacity =
        scheme.size () + authority.size () + path.size () +
        query.size ()  + fragment.size ()  +
        strlen ("://") + strlen ("?") + strlen ("#");

    std::string out;
    out.reserve (capacity);

    if (!scheme.empty ())
        out.append (scheme).append (":");

    if (!authority.empty ())
        out.append ("//").append (authority);

    out.append (path);

    if (!query.empty ())
        out.append ("?").append (query);

    if (!fragment.empty ())
        out.append ("#").append (fragment);

    return out;
}
//-----------------------------------------------------------------------------
uri::uri (
std::string_view scheme,
std::string_view authority,
std::string_view path,
std::string_view query,
std::string_view fragment
) : uri (combine_components (scheme, authority, path, query, fragment))
{ ; }
///////////////////////////////////////////////////////////////////////////////
// Return the view spanning from the first non-empty component in
// [USER, PATH] through to the end of PATH.
//
// If every one of those components is empty, a zero-length view positioned
// at the USER offset is returned.
std::string_view
uri::heirarchical (void) const&
{
    char const *const base = m_value.data ();

    // Skip over leading components that have no content.
    int first = USER;
    while (first <= PATH and m_offsets[first].first == m_offsets[first].second)
        ++first;

    if (first > PATH)
        return { base + m_offsets[USER].first, 0 };

    return {
        base + m_offsets[first].first,
        base + m_offsets[PATH].second
    };
}
//-----------------------------------------------------------------------------
// Return the view spanning from the first non-empty component in
// [USER, PORT] through to the end of PORT.
//
// If USER, HOST and PORT are all empty, a zero-length view positioned at
// the USER offset is returned.
std::string_view
uri::authority (void) const&
{
    char const *const base = m_value.data ();

    for (int idx = USER; idx <= PORT; ++idx) {
        auto const &span = m_offsets[idx];
        if (span.first == span.second)
            continue;

        return {
            base + span.first,
            base + m_offsets[PORT].second
        };
    }

    return { base + m_offsets[USER].first, 0 };
}
///////////////////////////////////////////////////////////////////////////////
// Return a view over the stored text of component `c`.
//
// The view points into this object's value string, delimited by the
// offsets recorded for `c`.
std::string_view
uri::get (component c) const&
{
    CHECK_INDEX (c, NUM_COMPONENTS);

    auto const &span = m_offsets[c];
    char const *const base = m_value.data ();

    return { base + span.first, base + span.second };
}
//-----------------------------------------------------------------------------
// Replace the text of component `c` with `val`.
//
// The stored string is edited in place, the end offset of `c` is updated,
// and the offsets of every later component are shifted by the change in
// length. No delimiters are inserted or removed by this function.
void
uri::set (component c, std::string_view val)
{
    // Same bounds check as `get`; `c` is used to index `m_offsets`.
    CHECK_INDEX (c, NUM_COMPONENTS);

    auto &target = m_offsets[c];
    auto const old_size = target.second - target.first;

    // Compute the (possibly negative) change in length using signed
    // arithmetic rather than relying on unsigned wraparound.
    std::ptrdiff_t const diff =
        static_cast<std::ptrdiff_t> (val.size ()) -
        static_cast<std::ptrdiff_t> (old_size);

    m_value.replace (target.first, old_size, val);
    target.second = cruft::cast::lossless<int> (target.first + val.size ());

    // Everything after `c` moves by the same amount.
    for (int i = c + 1; i != component::NUM_COMPONENTS; ++i) {
        m_offsets[i].first  += diff;
        m_offsets[i].second += diff;
    }

    CHECK_SANITY (*this);
}
//-----------------------------------------------------------------------------
// Remove the fragment text, and the '#' that introduces it, from the URI.
//
// Does nothing when the fragment is already empty.
void uri::clear_fragment ()
{
    auto &fragment = m_offsets[FRAGMENT];

    auto const length = fragment.second - fragment.first;
    if (length == 0)
        return;

    m_value.erase (fragment.first, length);

    // With the fragment text gone the string must end in its delimiter.
    CHECK (m_value.back () == '#');
    m_value.pop_back ();

    // Keep the offsets meaningful rather than zeroing them: collapse the
    // fragment onto the end of the preceding component. Other code assumes
    // these offsets remain usable (eg, for offsetting during copy
    // construction).
    fragment.first = fragment.second = m_offsets[FRAGMENT - 1].second;
}
///////////////////////////////////////////////////////////////////////////////
// Split a query string into (key, value) pairs.
//
// The input is tokenised on '&' and each token is divided with
// `cruft::split_on (tok, '=')`. Both halves are copied into owned strings.
// No percent-decoding is performed here.
std::vector<std::pair<std::string, std::string>>
cruft::query_to_vector (std::string_view val)
{
    std::vector<std::pair<std::string, std::string>> pairs;

    for (auto const token: cruft::tokeniser (val, '&')) {
        auto const &[key, value] = cruft::split_on (token, '=');

        std::string owned_key   (key.begin (),   key.size ());
        std::string owned_value (value.begin (), value.size ());

        pairs.emplace_back (std::move (owned_key), std::move (owned_value));
    }

    return pairs;
}
//-----------------------------------------------------------------------------
// Join (key, value) pairs into a query string of the form "k1=v1&k2=v2".
//
// An empty input yields an empty string. Keys and values are emitted
// verbatim; no percent-encoding is performed here.
std::string
cruft::vector_to_query (std::vector<std::pair<std::string, std::string>> const &val)
{
    std::string res;

    // The separator is empty before the first pair and "&" afterwards, so
    // no trailing delimiter ever needs to be trimmed.
    char const *separator = "";

    for (auto const &[key, value]: val) {
        res += separator;
        res += key;
        res += '=';
        res += value;

        separator = "&";
    }

    return res;
}
///////////////////////////////////////////////////////////////////////////////
std::array<std::string_view, cruft::uri::NUM_COMPONENTS>
uri::components () const& noexcept
{
std::array<std::string_view, cruft::uri::NUM_COMPONENTS> res;
for (int i = 0; i != NUM_COMPONENTS; ++i)
res[i] = get (uri::component (i));
return res;
}
///////////////////////////////////////////////////////////////////////////////
// Two URIs compare equal when their stored string values are equal.
// No normalisation is applied before comparing.
bool
cruft::operator== (cruft::uri const &a, cruft::uri const &b) noexcept
{
    auto const &lhs = a.value ();
    auto const &rhs = b.value ();
    return lhs == rhs;
}
///////////////////////////////////////////////////////////////////////////////
// Convert a single hexadecimal digit (either case) to its numeric value.
//
// Any other character falls through to `unreachable ()`; callers are
// expected to pass only valid hex digits.
static uint8_t
hex_to_uint (char c)
{
    auto const within = [c] (char lo, char hi) {
        return lo <= c && c <= hi;
    };

    if (within ('0', '9'))
        return c - '0';
    if (within ('a', 'f'))
        return c - 'a' + 10;
    if (within ('A', 'F'))
        return c - 'A' + 10;

    unreachable ();
}
//-----------------------------------------------------------------------------
// Decode every "%XX" triple in `s` into the byte it encodes.
//
// Characters outside of a triple are copied through unchanged.
//
// Throws `parse_error` if a '%' appears within the final two characters
// (so the triple would run past the end), or if either character following
// a '%' is not a hexadecimal digit.
std::string
cruft::uri::percent_decode (view<const char*> s)
{
    if (s.size () == 0)
        return std::string ();

    // Early check for late percent-encoding so we can simplify the decode
    // loop: after this, every '%' is known to have two characters after it.
    {
        auto const tail = std::find (
            s.size () < 3 ? s.begin () : s.end () - 2,
            s.end (),
            '%'
        );

        if (tail != s.end ())
            throw parse_error ("triple overlaps end");
    }

    // The input is untrusted, so reject bad digits with an exception rather
    // than letting them reach `hex_to_uint`, which treats them as
    // unreachable.
    auto const is_hex = [] (char c) {
        return (c >= '0' && c <= '9')
            || (c >= 'A' && c <= 'F')
            || (c >= 'a' && c <= 'f');
    };

    // The decoded string can never be longer than the input.
    std::string out;
    out.reserve (s.size ());

    // Find the percent, copy until that, decode, advance, repeat.
    for (auto i = s.begin (); i != s.end (); ) {
        auto const cursor = std::find (i, s.end (), '%');
        out.append (i, cursor);

        if (cursor == s.end ())
            break;

        if (!is_hex (cursor[1]) || !is_hex (cursor[2]))
            throw parse_error ("invalid hex digit in percent-encoded triple");

        out.push_back (
            static_cast<char> (
                hex_to_uint (cursor[1]) << 4 | hex_to_uint (cursor[2])
            )
        );

        // Resume directly after the triple we just consumed.
        i = cursor + 3;
    }

    return out;
}
///////////////////////////////////////////////////////////////////////////////
// Write the symbolic name of a URI component (eg, "SCHEME") to `os`.
//
// `NUM_COMPONENTS`, and any value outside the enumeration, is treated as
// unreachable.
std::ostream&
cruft::operator<< (std::ostream &os, cruft::uri::component c)
{
    char const *name = nullptr;

    switch (c) {
    case cruft::uri::SCHEME:   name = "SCHEME";   break;
    case cruft::uri::USER:     name = "USER";     break;
    case cruft::uri::HOST:     name = "HOST";     break;
    case cruft::uri::PORT:     name = "PORT";     break;
    case cruft::uri::PATH:     name = "PATH";     break;
    case cruft::uri::QUERY:    name = "QUERY";    break;
    case cruft::uri::FRAGMENT: name = "FRAGMENT"; break;

    case cruft::uri::NUM_COMPONENTS:
        unreachable ();
    }

    if (!name)
        unreachable ();

    return os << name;
}
//-----------------------------------------------------------------------------
// Write the URI's full string value to `os`.
std::ostream&
cruft::operator<< (std::ostream &os, cruft::uri const &val)
{
    os << val.value ();
    return os;
}
///////////////////////////////////////////////////////////////////////////////
// Merge a relative path onto a base path.
//
// Everything in `base` after (and including) its right-most '/' is
// discarded, then "/" and `relative` are appended. If `base` contains no
// '/' at all the result is "/" followed by `relative`.
//
// NOTE(review): RFC 3986 section 5.2.3 drops the whole base path when it
// has no '/', which would give just `relative` without a leading slash.
// The long-standing behaviour of this function is kept as-is; confirm
// whether that difference is intended.
static std::string
merge (std::string_view base, std::string_view relative)
{
    // Use index based searching; constructing a string_view from
    // string_view iterators only works where the iterator is a raw pointer.
    auto const slash = base.rfind ('/');
    auto const parent = slash == std::string_view::npos
        ? std::string_view {}
        : base.substr (0, slash);

    std::string res;
    res.reserve (parent.size () + 1 + relative.size ());
    res += parent;
    res += '/';
    res += relative;
    return res;
}
//-----------------------------------------------------------------------------
// Remove "." and ".." segments from a path, in the manner of RFC 3986
// section 5.2.4.
//
// * "." and empty segments are dropped.
// * ".." removes the most recently kept segment, if there is one;
//   otherwise it is dropped.
// * A leading '/' is preserved.
// * A trailing '/' is emitted when the final input segment was empty, "."
//   or "..".
//
// The path is split with `cruft::tokeniser`, which is assumed to yield
// empty tokens for leading, trailing and adjacent separators -- TODO
// confirm.
static std::string
remove_dot_segments (std::string_view path)
{
    std::vector<std::string_view> src;
    for (auto const &i: cruft::tokeniser (path, '/'))
        src.push_back (std::string_view (i.begin (), i.size ()));

    bool const absolute = !path.empty () && path[0] == '/';
    bool const trailing = !src.empty () && (src.back () == "" or src.back () == "." or src.back () == "..");

    // `dst` only ever receives ordinary segments (never "..", "." or ""),
    // so a ".." always either pops the previous segment or is discarded.
    std::vector<std::string_view> dst;
    dst.reserve (src.size ());

    for (auto const &i: src) {
        if (i == "..") {
            if (!dst.empty ())
                dst.pop_back ();
        } else if (i != "." and i != "") {
            dst.push_back (i);
        }
    }

    std::string res = absolute ? "/" : "";
    for (auto const &i: dst) {
        res.append (i);
        res.append ("/");
    }

    // Every segment was written with a trailing '/'; drop the final one
    // unless the input asked for it.
    if (!trailing and !res.empty ())
        res.pop_back ();

    return res;
}
//-----------------------------------------------------------------------------
// Resolve `relative` against `base`.
//
// Uniform Resource Identifier (URI): Generic Syntax
// https://www.ietf.org/rfc/rfc3986.txt
// 5.2. Relative Resolution
//
// A component is treated as "defined" when it is non-empty.
//
// NOTE(review): when the reference carries its own scheme RFC 3986 section
// 5.2.2 also applies remove_dot_segments to its path. This function has
// always returned such a reference unmodified, and that behaviour is kept.
cruft::uri
cruft::resolve (cruft::uri const &base, cruft::uri const &relative)
{
    // A reference with a scheme is already absolute.
    if (!relative.scheme ().empty ())
        return relative;

    // The scheme always comes from the base and the fragment always comes
    // from the reference. The query comes from the reference in every case
    // except an empty reference path with an empty reference query.
    std::string_view const scheme   = base.scheme ();
    std::string_view const fragment = relative.fragment ();

    std::string_view authority = base.authority ();
    std::string_view query     = relative.query ();
    std::string      path;

    if (!relative.authority ().empty ()) {
        authority = relative.authority ();
        path      = remove_dot_segments (relative.path ());
    } else if (relative.path ().empty ()) {
        path = base.path ();
        if (relative.query ().empty ())
            query = base.query ();
    } else if (relative.path ().starts_with ("/")) {
        path = remove_dot_segments (relative.path ());
    } else {
        // A base with an authority but no path merges as though its path
        // were "/".
        if (!base.authority ().empty () and base.path ().empty ())
            path = fmt::format ("/{}", relative.path ());
        else
            path = merge (base.path (), relative.path ());

        path = remove_dot_segments (path);
    }

    return { scheme, authority, path, query, fragment };
}
///////////////////////////////////////////////////////////////////////////////
// Return a copy of `src` whose PATH component has had its dot segments
// removed. All other components are left as they are.
cruft::uri
cruft::normalise (cruft::uri const &src)
{
    std::string const cleaned = remove_dot_segments (src.path ());

    cruft::uri copy (src);
    copy.set (uri::component::PATH, cleaned);
    return copy;
}
///////////////////////////////////////////////////////////////////////////////
// Debug sanity check for a `cruft::uri`.
//
// Returns true when every component view lies within the URI's value
// string and the components appear in order without overlapping.
//
// `RETURN_FALSE_UNLESS` is defined elsewhere; presumably it returns false
// from this function when its condition does not hold -- confirm.
template <>
bool
cruft::debug::validator<cruft::uri>::is_valid (cruft::uri const &val) noexcept
{
auto const &value = val.value ();
// `components ()` returns its array by value; binding it to a const
// reference keeps that temporary alive until the end of this function.
auto const &components = val.components ();
// Each component should fall within the value string
for (auto const &i: components) {
RETURN_FALSE_UNLESS (i.begin () >= value.data ());
RETURN_FALSE_UNLESS (i.end () <= value.data () + value.size ());
}
// Compare each component against the one directly before it.
for (int i = 1; i < uri::NUM_COMPONENTS; ++i) {
auto const &a = components[i - 1];
auto const &b = components[i ];
// All of our data needs to occur before the next component
//
// It may not be coincident with a sibling if there is padding between
// (as is the case directly after the scheme).
//
// The final check (a ends no later than b begins) is the strictest of
// the three.
RETURN_FALSE_UNLESS (a.begin () <= b.begin ());
RETURN_FALSE_UNLESS (a.end () <= b.end ());
RETURN_FALSE_UNLESS (a.end () <= b.begin ());
}
return true;
}