utf8: add a trivial utf8 decoder

2017-10-02 15:25:59 +11:00 · 2017-10-02 15:25:59 +11:00 · 7ecaaacd93
commit 7ecaaacd93
parent 4641d43742
4 changed files with 550 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -408,6 +408,8 @@ list (
    types/traits.hpp
    uri.cpp
    uri.hpp
    utf8.cpp
    utf8.hpp
    variadic.cpp
    variadic.hpp
    vector.cpp
@ -531,6 +533,7 @@ if (TESTS)
        traits
        typeidx
        uri
        utf8
        vector
        version
        view
--- a/test/utf8.cpp
+++ b/test/utf8.cpp
@ -0,0 +1,325 @@
 #include "tap.hpp"
 #include "utf8.hpp"
 ///////////////////////////////////////////////////////////////////////////////
 static void
 simple_valid (util::TAP::logger &tap)
 {
    // well-formed strings paired with the number of codepoints we expect the
    // decoder to produce for them.
    struct valid_case {
        const char *data;
        size_t len;
        const char *message;
    };

    static constexpr valid_case CASES[] = {
        { "",        0, "empty string" },
        { "a",       1, "single ANSI character" },
        { "abc",     3, "multiple ANSI characters" },
        { u8"κόσμε", 5, "greek kosme" },
    };

    static constexpr char fmt[] = "valid length, %s";

    for (const auto &entry: CASES) {
        try {
            // decoding happens inside the try block so that any exception
            // is reported as a failure of this particular case.
            const auto decoded = util::utf8::decode (util::make_view (entry.data));
            tap.expect_eq (entry.len, decoded.size (), fmt, entry.message);
        } catch (...) {
            tap.fail (fmt, entry.message);
        }
    }
 }
 ///////////////////////////////////////////////////////////////////////////////
 static void
 single_boundaries (util::TAP::logger &tap)
 {
    // each entry holds the encoding of exactly one codepoint, the value it
    // should decode to, and a label used in the test description.
    struct boundary_case {
        std::vector<uint8_t> data;
        uint32_t value;
        const char *direction;
    };

    static const boundary_case CASES[] {
        { { 0x00                   }, 0x00000000, "low length boundary" },
        { { 0xC2, 0x80             }, 0x00000080, "low length boundary" },
        { { 0xE0, 0xA0, 0x80       }, 0x00000800, "low length boundary" },
        { { 0xF0, 0x90, 0x80, 0x80 }, 0x00010000, "low length boundary" },

        { { 0x7F                   }, 0x0000007F, "high length boundary" },
        { { 0xDF, 0xBF             }, 0x000007FF, "high length boundary" },
        // this is an invalid codepoint so we're going to fail to parse this
        // whatever the case. disable it for the time being.
        //{ { 0xEF, 0xBF, 0xBF,      }, 0x0000FFFF, "high length boundary" },
        { { 0xF7, 0xBF, 0xBF, 0xBF }, 0x001FFFFF, "high length boundary" },

        { { 0xED, 0x9F, 0xBF       }, 0x0000D7FF, "other" },
        { { 0xEE, 0x80, 0x80       }, 0x0000E000, "other" },
        { { 0xEF, 0xBF, 0xBD       }, 0x0000FFFD, "other" },
        { { 0xF4, 0x8F, 0xBF, 0xBF }, 0x0010FFFF, "other" },
        { { 0xF4, 0x90, 0x80, 0x80 }, 0x00110000, "other" },
    };

    static constexpr char fmt[] = "single character (%s), %!-byte sequence";

    for (const auto &entry: CASES) {
        // view the raw octets as std::byte for the primary decode overload
        const auto *raw = entry.data.data ();
        auto data = util::make_view (
            reinterpret_cast<const std::byte*> (raw),
            reinterpret_cast<const std::byte*> (raw) + entry.data.size ()
        );

        try {
            const auto decoded = util::utf8::decode (data);

            // exactly one codepoint must come back, and it must match
            const bool matches = decoded.size () == 1 && decoded[0] == entry.value;
            tap.expect (matches, fmt, entry.direction, entry.data.size ());
        } catch (const util::utf8::malformed_error&) {
            tap.fail (fmt, entry.direction, entry.data.size ());
        }
    }
 }
 ///////////////////////////////////////////////////////////////////////////////
 // Checks that structurally invalid input is rejected with malformed_error:
 // stray continuation bytes, start bytes followed by non-continuation data,
 // truncated sequences, and the bytes 0xfe/0xff.
 static void
 malformed (util::TAP::logger &tap)
 {
    static const struct {
        std::vector<uint8_t> data;
        const char *message;
    } TESTS[] = {
        { { 0x80                   }, "first continuation" },
        { { 0xBF                   }, "last continuation" },
        { { 0x80, 0xBF             }, "continuation sequence" },
        { { 0x80, 0xBF, 0x80       }, "continuation sequence" },
        { { 0x80, 0xBF, 0x80, 0xBF }, "continuation sequence" },
    };

    static constexpr char fmt[] = "malformed %! byte sequence, %s";

    for (const auto &t: TESTS) {
        auto data = util::make_view (
            reinterpret_cast<const std::byte*> (&*t.data.cbegin ()),
            reinterpret_cast<const std::byte*> (&*t.data.cbegin ()) + t.data.size ()
        );

        tap.expect_throw<util::utf8::malformed_error> (
            [&data] () {
                util::utf8::decode (data);
            },
            fmt,
            data.size (),
            t.message
        );
    }

    // test every continuation byte by itself. each one must be rejected with
    // malformed_error; a successful decode, or any other exception type,
    // clears the flag and stops the scan.
    {
        bool success = true;

        for (uint8_t c = 0x80; c <= 0xbf; ++c) {
            try {
                const auto v = c;
                util::utf8::decode (util::view { &v, &v+1 });
                success = false;
                break;
            }
            catch (const util::utf8::malformed_error&)
            { ; }
            catch (...)
            {
                // only malformed_error counts as the expected rejection,
                // matching the lonely start byte checks below.
                success = false;
                break;
            }
        }

        tap.expect (success, "individual continuation bytes");
    }

    // every combination of first-byte-then-space sequences
    static const struct {
        uint8_t first;
        uint8_t last;
        int length;
    } LONELY[] = {
        { 0xc0, 0xdf, 2 },
        { 0xe0, 0xef, 3 },
        { 0xf0, 0xf7, 4 },
    };

    for (const auto &t: LONELY) {
        // a plain char buffer is used (rather than a union of uint8_t and
        // char arrays) so that we never read an inactive union member; the
        // start byte is converted explicitly instead.
        char str[4];

        bool success = true;

        for (auto i = t.first; i <= t.last; ++i) {
            std::fill (std::begin (str), std::end (str), ' ');
            str[0] = static_cast<char> (i);

            try {
                util::utf8::decode (util::make_cview (str));
                success = false;
            }
            catch (const util::utf8::malformed_error&)
            { ; }
            catch (...)
            { success = false; }
        }

        tap.expect (success, "lonely start characters, %! bytes", t.length);
    }

    // start bytes that promise more continuation bytes than are supplied
    static const std::vector<uint8_t> MISSING[] = {
        { 0xC0 },
        { 0xE0, 0x80 },
        { 0xF0, 0x80, 0x80 },
    };

    for (const auto &t: MISSING) {
        util::view<const char*> data {
            reinterpret_cast<const char*> (&t[0]),
            reinterpret_cast<const char*> (&t[0]) + t.size ()
        };

        tap.expect_throw<util::utf8::malformed_error> (
            [&data] () { util::utf8::decode (data); },
            "%! byte sequence missing the last byte",
            t.size ()
        );
    }

    // bytes that match none of the start byte patterns the decoder accepts
    static const std::vector<uint8_t> IMPOSSIBLE[] = {
        { 0xfe },
        { 0xff },
        { 0xfe, 0xfe, 0xff, 0xff }
    };

    for (const auto &t: IMPOSSIBLE) {
        util::view<const char*> data {
            reinterpret_cast<const char*> (&t[0]),
            reinterpret_cast<const char*> (&t[0]) + t.size ()
        };

        tap.expect_throw<util::utf8::malformed_error> (
            [&data] () { util::utf8::decode (data); },
            "impossible %! byte sequence",
            t.size ()
        );
    }
 }
 ///////////////////////////////////////////////////////////////////////////////
 // Checks that sequences which encode a value in more bytes than necessary
 // are rejected with overlong_error.
 //
 // Given internal linkage to match the other test helpers in this file; it
 // is only called from main.
 static void
 overlong (util::TAP::logger &tap)
 {
    static const struct {
        std::vector<uint8_t> data;
        const char *message;
    } TESTS[] = {
        { { 0xc0, 0xaf             }, "simple ANSI" },
        { { 0xe0, 0x80, 0xaf       }, "simple ANSI" },
        { { 0xf0, 0x80, 0x80, 0xaf }, "simple ANSI" },

        { { 0xc1, 0xbf             }, "maximum" },
        { { 0xe0, 0x9f, 0xbf       }, "maximum" },
        { { 0xf0, 0x8f, 0xbf, 0xbf }, "maximum" },

        { { 0xc0, 0x80             }, "null" },
        { { 0xe0, 0x80, 0x80       }, "null" },
        { { 0xf0, 0x80, 0x80, 0x80 }, "null" },
    };

    for (const auto &t: TESTS) {
        // present the octets through the `const char*` decode overload
        auto data = util::make_view (
            reinterpret_cast<const char*> (t.data.data ()),
            reinterpret_cast<const char*> (t.data.data ()) + t.data.size ()
        );

        tap.expect_throw<util::utf8::overlong_error> (
            [&] () {
                util::utf8::decode (data);
            },
            "overlong %! byte sequence, %s",
            t.data.size (),
            t.message
        );
    }
 }
 ///////////////////////////////////////////////////////////////////////////////
 // Checks that well-formed encodings of codepoints the decoder refuses to
 // produce are rejected: utf16 surrogates (alone and in pairs) and the
 // noncharacters U+FFFE and U+FFFF.
 //
 // Given internal linkage to match the other test helpers in this file; it
 // is only called from main.
 static void
 illegal (util::TAP::logger &tap)
 {
    static const std::array<uint8_t,3> SINGLE[] = {
        { 0xed, 0xa0, 0x80 }, // U+D800
        { 0xed, 0xad, 0xbf }, // U+DB7F
        { 0xed, 0xae, 0x80 }, // U+DB80
        { 0xed, 0xaf, 0xbf }, // U+DBFF
        { 0xed, 0xb0, 0x80 }, // U+DC00
        { 0xed, 0xbe, 0x80 }, // U+DF80
        { 0xed, 0xbf, 0xbf }, // U+DFFF
    };

    for (const auto &t: SINGLE)
        tap.expect_throw<util::utf8::error> (
            [&t] () { util::utf8::decode (util::make_view (t)); },
            "reject utf16 single surrogate"
        );

    static const std::array<uint8_t,6> DOUBLE[] = {
        { 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 }, // U+D800 U+DC00
        { 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf }, // U+D800 U+DFFF
        { 0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80 }, // U+DB7F U+DC00
        { 0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf }, // U+DB7F U+DFFF
        { 0xed, 0xae, 0x80, 0xed, 0xb0, 0x80 }, // U+DB80 U+DC00
        { 0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf }, // U+DB80 U+DFFF
        { 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 }, // U+DBFF U+DC00
        { 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf }, // U+DBFF U+DFFF
    };

    for (const auto &t: DOUBLE)
        tap.expect_throw<util::utf8::error> (
            [&t] () { util::utf8::decode (util::make_view (t)); },
            "reject utf16 paired surrogate"
        );

    // U+FFFE and U+FFFF are noncharacters. the byte order mark itself is
    // U+FEFF, which is not what is being tested here.
    static const std::array<uint8_t,3> OTHER[] = {
        { 0xef, 0xbf, 0xbe }, // FFFE
        { 0xef, 0xbf, 0xbf }, // FFFF
    };

    for (const auto &t: OTHER)
        tap.expect_throw<util::utf8::error> (
            [&t] () { util::utf8::decode (util::make_view (t)); },
            "reject noncharacter"
        );
 }
 ///////////////////////////////////////////////////////////////////////////////
 int
 main()
 {
    util::TAP::logger tap;
    simple_valid (tap);
    single_boundaries (tap);
    malformed (tap);
    overlong (tap);
    illegal (tap);
    return tap.status ();
 };
--- a/utf8.cpp
+++ b/utf8.cpp
@ -0,0 +1,148 @@
 /*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2017 Danny Robson <danny@nerdcruft.net>
 */
 #include "./utf8.hpp"
 #include <stdexcept>
 ///////////////////////////////////////////////////////////////////////////////
 // A masked bit-pattern predicate.
 //
 // `mask` selects which bits of an input take part in the comparison and
 // `value` is what those bits must equal. `shift` is stored alongside them
 // but is not consulted by the predicate itself.
 template <typename T>
 struct test {
    T mask;
    T value;
    T shift;

    constexpr
    test (T m, T v, T s):
        mask  (m),
        value (v),
        shift (s)
    { ; }

    // true when the masked bits of `t` are exactly `value`
    constexpr bool
    operator() (T t) const
    {
        return value == (mask & t);
    }
 };
 //-----------------------------------------------------------------------------
 // Builds a bit test from a pattern string such as "0b110xxxxx".
 //
 // After the mandatory "0b" prefix each character describes one bit, most
 // significant first: '0' and '1' are bits that must match exactly, 'x' is a
 // bit that is ignored. The number of 'x' characters is recorded as the
 // `shift` member of the result.
 //
 // Throws std::invalid_argument if the prefix is missing, if the pattern
 // describes more bits than fit in the 32 bit result, or if any other
 // character is encountered.
 //
 // The suffix is spelled without whitespace after the quotes so that `_test`
 // is not parsed as a separate (reserved) identifier.
 static constexpr test<uint32_t>
 operator""_test (const char *str, size_t len)
 {
    uint32_t mask = 0;
    uint32_t value = 0;
    uint32_t shift = 0;

    if (str[0] != '0' || str[1] != 'b')
        throw std::invalid_argument ("invalid bit test prefix");

    // anything wider would silently shift the leading bits out of the result
    if (len - 2 > 32)
        throw std::invalid_argument ("bit test pattern is too long");

    for (size_t i = 2; i < len; ++i) {
        auto c = str[i];

        // make room for the next bit; an 'x' simply leaves it clear in both
        mask  <<= 1;
        value <<= 1;

        switch (c) {
        case '0': mask |= 0x1;               break;
        case '1': mask |= 0x1; value |= 0x1; break;
        case 'x': ++shift;                   break;
        default:
            throw std::invalid_argument ("invalid bit test character");
        }
    }

    return { mask, value, shift };
 }
 ///////////////////////////////////////////////////////////////////////////////
 // Decodes a range of utf8 encoded bytes into a vector of codepoints.
 //
 // Throws malformed_error when a byte matches none of the start byte
 // patterns, when a sequence is cut short by the end of the input, or when a
 // following byte lacks the 10xxxxxx continuation prefix. Throws
 // overlong_error when a multi-byte sequence encodes a value that would fit
 // in a shorter sequence. Throws illegal_codepoint for utf16 surrogates and
 // for U+FFFE/U+FFFF.
 //
 // NOTE: decoded values above MAX_CODEPOINT are not rejected here.
 std::vector<util::utf8::codepoint_t>
 util::utf8::decode (view<const std::byte*> src)
 {
    std::vector<codepoint_t> dst;
    dst.reserve (src.size ());

    // start byte patterns, indexed by the number of continuation bytes
    // that must follow.
    static constexpr
    test<codepoint_t> TESTS[] = {
        "0b0xxxxxxx"_test,
        "0b110xxxxx"_test,
        "0b1110xxxx"_test,
        "0b11110xxx"_test
    };

    // describes the bits required to be present for a valid minimally
    // sized codepoint of a given byte length.
    static constexpr
    codepoint_t LEVEL_MASK[] {
        0b00000000'00000000'01111111,
        0b00000000'00000111'10000000,
        0b00000000'11111000'00000000,
        0b00011111'00000000'00000000
    };

    for (auto cursor = src.cbegin (); cursor != src.cend (); ++cursor) {
        const codepoint_t c = std::to_integer<codepoint_t> (*cursor);

        // number of continuation bytes announced by the start byte
        const int len = TESTS[0] (c) ? 0 :
                        TESTS[1] (c) ? 1 :
                        TESTS[2] (c) ? 2 :
                        TESTS[3] (c) ? 3 :
                        throw malformed_error {};

        // compare distances rather than computing `cursor + len`, which
        // would form a pointer beyond one-past-the-end for truncated input.
        if (src.cend () - cursor <= len)
            throw malformed_error {};

        // get the simple ANSI case out of the way
        if (!len) {
            dst.push_back (c);
            continue;
        }

        // the payload bits of the start byte are the most significant
        codepoint_t accum = (c & ~TESTS[len].mask) << (len * 6);

        // check every following data byte has the appropriate prefix and
        // fold its six payload bits in; the last byte is least significant.
        for (int i = 1; i <= len; ++i) {
            const auto tail = std::to_integer<codepoint_t> (cursor[i]);

            if ((tail & 0b11'000000u) != 0b10'000000u)
                throw malformed_error {};

            accum |= (tail & 0b00'111111u) << ((len - i) * 6);
        }

        if (!(accum & LEVEL_MASK[len]))
            throw overlong_error {};

        // utf16 surrogates should not be present in utf8
        if (accum >= 0xD800 && accum <= 0xDFFF)
            throw illegal_codepoint {};

        // reject the noncharacters U+FFFE and U+FFFF
        if (accum == 0xfffe || accum == 0xffff)
            throw illegal_codepoint {};

        dst.push_back (accum);

        // skip the continuation bytes; the loop increment steps over the
        // start byte.
        cursor += len;
    }

    return dst;
 }
--- a/utf8.hpp
+++ b/utf8.hpp
@ -0,0 +1,74 @@
 /*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2017 Danny Robson <danny@nerdcruft.net>
 */
 #ifndef CRUFT_UTIL_UTF8_HPP
 #define CRUFT_UTIL_UTF8_HPP
 #include "./view.hpp"
 #include <cstddef>
 #include <cstdint>
 #include <vector>
 namespace util::utf8 {
    ///////////////////////////////////////////////////////////////////////////
    /// storage for a single decoded codepoint
    using codepoint_t = uint32_t;

    constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF;


    ///////////////////////////////////////////////////////////////////////////
    /// common base of every exception declared in this namespace
    struct error : std::exception { };

    /// the input is not structurally valid
    struct malformed_error : error { };

    /// a codepoint that is refused; catchable as malformed_error
    struct illegal_codepoint : malformed_error { };

    /// a value was encoded using more bytes than necessary
    struct overlong_error : error { };


    ///////////////////////////////////////////////////////////////////////////
    /// decode a range of bytes into a sequence of codepoints. this is the
    /// primary overload; the others below only adapt their argument.
    std::vector<codepoint_t>
    decode (util::view<const std::byte*>);


    //-------------------------------------------------------------------------
    /// reinterpret a range of `char` as bytes and decode it
    inline auto
    decode (util::view<const char*> data)
    {
        const auto first = reinterpret_cast<const std::byte*> (data.cbegin ());
        const auto last  = reinterpret_cast<const std::byte*> (data.cend   ());

        return decode (util::view<const std::byte*> { first, last });
    }


    /// reinterpret a range of `uint8_t` as `char` and decode it via the
    /// overload above
    inline auto
    decode (util::view<const uint8_t*> data)
    {
        const auto first = reinterpret_cast<const char*> (data.cbegin ());
        const auto last  = reinterpret_cast<const char*> (data.cend   ());

        return decode (util::view<const char*> { first, last });
    }


    ///////////////////////////////////////////////////////////////////////////
    /// declaration only; no definition accompanies it in this change
    std::vector<std::byte>
    encode (util::view<const char*>);
 }
 #endif