libcruft-util/uri.cpp.rl

/*
 * This file is part of libgim.
 *
 * libgim is free software: you can redistribute it and/or modify it under the
 * terms of the GNU General Public License as published by the Free Software
 * Foundation, either version 3 of the License, or (at your option) any later
 * version.
 *
 * libgim is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 * details.
 *
 * You should have received a copy of the GNU General Public License
 * along with libgim.  If not, see <http://www.gnu.org/licenses/>.
 *
 * Copyright 2015 Danny Robson <danny@nerdcruft.net>
 */

#include "uri.hpp"

#include "debug.hpp"
#include "iterator.hpp"

#include <algorithm>
#include <iostream>


%%{
    machine uri;

    action trace { if (0) std::cerr << *p; }
    action success {__success = true; }
    action failure {__success = false; }

    action scheme_begin { m_views[SCHEME] = { p, p }; }
    action scheme_end   { m_views[SCHEME] = { m_views[SCHEME].begin (), p }; }

    action hier_begin { m_views[HIERARCHICAL] = { p, p }; }
    action hier_end   { m_views[HIERARCHICAL] = { m_views[HIERARCHICAL].begin (), p }; }

    action user_begin { m_views[USER] = { p, p }; }
    action user_end   { m_views[USER] = { m_views[USER].begin (), p }; }

    action host_begin { m_views[HOST] = { p, p }; }
    action host_end   { m_views[HOST] = { m_views[HOST].begin (), p }; }

    action port_begin { m_views[PORT] = { p, p }; }
    action port_end   { m_views[PORT] = { m_views[PORT].begin (), p }; }

    action authority_begin { m_views[AUTHORITY] = { p, p}; }
    action authority_end   { m_views[AUTHORITY] = { m_views[AUTHORITY].begin (), p }; }

    action path_begin { m_views[PATH] = { p, p}; }
    action path_end   { m_views[PATH] = { m_views[PATH].begin (), p }; }

    action query_begin { m_views[QUERY] = { p, p}; }
    action query_end   { m_views[QUERY] = { m_views[QUERY].begin (), p }; }

    action fragment_begin { m_views[FRAGMENT] = { p, p}; }
    action fragment_end   { m_views[FRAGMENT] = { m_views[FRAGMENT].begin (), p }; }

    ## Characters
    unreserved = alpha | digit | "-" | "." | "_" | "~";
    pct_encoded = '%' xdigit xdigit;
    gen_delim = ":" | "/" | "?" | "#" | "[" | "]" | "@";
    sub_delim = "!" | "$" | "&" | "'" | "(" | ")" | "*" | "+" | "," | ";" | "=";
    pchar = unreserved | pct_encoded | sub_delim | ':' | '@';

    ## Atoms
    reg_name = (unreserved | pct_encoded | sub_delim)*;

    ## IP-address
    ## Note: The address grammar is embedded in the RFC so we embed it too
    dec_octet = digit | [1-9] digit | '1' digit{2} | '2' [0-4] digit | '25' [0-5];

    ipv4address = dec_octet '.' dec_octet '.' dec_octet '.' dec_octet;

    h16  =  xdigit{1,4};
    ls32 = (h16 ":" h16) | ipv4address;

    ipv6address =
                                      (h16 ":"){6} ls32
        |                        "::" (h16 ":"){5} ls32
        | (                h16)? "::" (h16 ":"){4} ls32
        | ((h16 ":"){0,1}  h16)? "::" (h16 ":"){3} ls32
        | ((h16 ":"){0,2}  h16)? "::" (h16 ":"){2} ls32
        | ((h16 ":"){0,3}  h16)? "::" (h16 ":"){1} ls32
        | ((h16 ":"){0,4}  h16)? "::"              ls32
        | ((h16 ":"){0,5}  h16)? "::"              h16
        | ((h16 ":"){0,6}  h16)? "::"
    ;

    ipvfuture = 'v' xdigit{1,} '.' (unreserved | sub_delim | ':'){1,};
    ip_literal = '[' (ipv6address | ipvfuture) ']';

    ## Segments
    segment = pchar*;
    segment_nz = pchar{1,};
    segment_nz_nc = (unreserved | pct_encoded | sub_delim | '@'){1,};

    ## Paths
    path_abempty = ('/' segment)*;
    path_absolute = '/' (segment_nz ('/' segment)*)?;
    path_noscheme = segment_nz_nc ('/' segment)*;
    path_rootless = segment_nz ('/' segment)*;
    path_empty = '0' pchar;

    path = (
        path_abempty | path_absolute | path_noscheme | path_rootless | path_empty
    );

    reserved = gen_delim | sub_delim;

    ## Authority
    port = (
        digit*
    ) >port_begin %port_end;

    host = (
        ip_literal | ipv4address | reg_name
    ) >host_begin %host_end;

    userinfo = (
        (unreserved | pct_encoded | sub_delim | ':')*
    ) >user_begin %user_end;

    authority = (
        (userinfo '@')? host (':' port)?
    ) >authority_begin %authority_end;


    ## URI components
    scheme = (
        alpha (alpha | digit | '+' | '-' | '.')*
    ) >scheme_begin %scheme_end;

    query = (
        (pchar | '/' | '?')*
    ) >query_begin %query_end;

    fragment = (
        (pchar | '/' | '?')*
    ) >fragment_begin %fragment_end;

    ## URI types
    hier_part = (
          '//' (authority path_abempty  >path_begin %path_end) >hier_begin %hier_end
    ) | (
          path_absolute  >path_begin %path_end
        | path_rootless  >path_begin %path_end
        | path_empty     >path_begin %path_end
    ) >hier_begin %hier_end;

    uri = scheme ':' hier_part ('?' query)? ('#' fragment);

    relative_part =
          '//' authority path_abempty  >path_begin %path_end
        | path_absolute  >path_begin %path_end
        | path_noscheme  >path_begin %path_end
        | path_empty     >path_begin %path_end
    ;

    relative_ref = relative_part ('?' query)? ('#' fragment);

    uri_reference = uri | relative_ref;

    absolute_uri = scheme ':' hier_part ('?' query)?;

    URI := (
        absolute_uri | uri_reference
    )
    %success
    $!failure
    $trace;

    write data;
}%%


///////////////////////////////////////////////////////////////////////////////
// URI

util::uri::uri (const char *str):
    uri (std::string (str))
{ ; }


//-----------------------------------------------------------------------------
util::uri::uri (const char *first, const char *last):
    uri (std::string (first, last))
{ ; }


///////////////////////////////////////////////////////////////////////////////
static const util::view<const char*> NULL_VIEW { nullptr, nullptr };


//-----------------------------------------------------------------------------
util::uri::uri (std::string &&_value):
    m_views {
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW,
        NULL_VIEW
    },
    m_value (std::move (_value))
{
    const char *p   = m_value.data ();
    const char *pe  = m_value.data () + m_value.size ();
    const char *eof = pe;

    bool __success = false;

    int cs;

    %%write init;
    %%write exec;

    if (!__success)
        throw parse_error ("invalid uri");
}


//-----------------------------------------------------------------------------
util::view<const char*>
util::uri::get (util::uri::component c) const
{
    CHECK_NEQ (c, NUM_COMPONENTS);
    return m_views[c];
}


//-----------------------------------------------------------------------------
static uint8_t
hex_to_uint (char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';

    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;

    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;

    unreachable ();
}


//-----------------------------------------------------------------------------
std::string
util::uri::percent_decode (view<const char*> s)
{
    if (s.size () == 0)
        return std::string ();

    // Early check for late percent-encoding so we can simplify the decode loop
    {
        auto tail = std::find (s.size () < 3 ? s.begin ()
                                             : s.end () - 2,
                               s.end (),
                               '%');
        if (tail != s.end ())
            throw parse_error ("triple overlaps end");
    }

    // Allocate and size a potentially overlong output string. This allows us
    // to copy directly into its buffer. We'll shorten it at the end.
    std::string out;
    out.resize (s.size ());

    // Find the percent, copy until that, decode, advance, repeat.
    auto out_cursor = out.begin ();

    for (auto i = s.begin (); i < s.end (); ++i) {
        auto cursor = std::find (i, s.end (), '%');

        if (cursor == s.end ()) {
            out_cursor = std::copy (i, s.end (), out_cursor);
            break;
        }

        out_cursor = std::copy (i, cursor, out_cursor);
        *out_cursor = hex_to_uint (cursor[1]) << 4 | hex_to_uint(cursor[2]);

        i += 3;
    }

    out.resize (out.end () - out_cursor);
    return out;
}


//-----------------------------------------------------------------------------
std::ostream&
util::operator<< (std::ostream &os, util::uri::component c)
{
    switch (c) {
        case util::uri::SCHEME:         return os << "SCHEME";
        case util::uri::HIERARCHICAL:   return os << "HIERARCHICAL";
        case util::uri::AUTHORITY:      return os << "AUTHORITY";
        case util::uri::USER:           return os << "USER";
        case util::uri::HOST:           return os << "HOST";
        case util::uri::PORT:           return os << "PORT";
        case util::uri::PATH:           return os << "PATH";
        case util::uri::QUERY:          return os << "QUERY";
        case util::uri::FRAGMENT:       return os << "FRAGMENT";

        case util::uri::NUM_COMPONENTS:
            unreachable ();
    }

    unreachable ();
}


//-----------------------------------------------------------------------------
std::ostream&
util::operator<< (std::ostream &os, const util::uri &val)
{
    return os << '[' << util::make_infix (val.components ()) << ']';
}