From 7ecaaacd93eaccf8ad06ca52ff0544fc62883e15 Mon Sep 17 00:00:00 2001
From: Danny Robson <danny@nerdcruft.net>
Date: Mon, 2 Oct 2017 15:25:59 +1100
Subject: [PATCH] utf8: add a trivial utf8 decoder

---
 CMakeLists.txt |   3 +
 test/utf8.cpp  | 325 +++++++++++++++++++++++++++++++++++++++++++++++++
 utf8.cpp       | 148 ++++++++++++++++++++++
 utf8.hpp       |  74 +++++++++++
 4 files changed, 550 insertions(+)
 create mode 100644 test/utf8.cpp
 create mode 100644 utf8.cpp
 create mode 100644 utf8.hpp
diff --git a/CMakeLists.txt b/CMakeLists.txt
index eb847c7c..d3ae3122 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -408,6 +408,8 @@ list (
     types/traits.hpp
     uri.cpp
     uri.hpp
+    utf8.cpp
+    utf8.hpp
     variadic.cpp
     variadic.hpp
     vector.cpp
@@ -531,6 +533,7 @@ if (TESTS)
         traits
         typeidx
         uri
+        utf8
         vector
         version
         view
diff --git a/test/utf8.cpp b/test/utf8.cpp
new file mode 100644
index 00000000..09bc1297
--- /dev/null
+++ b/test/utf8.cpp
@@ -0,0 +1,325 @@
+#include "tap.hpp"
+#include "utf8.hpp"
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Decode a handful of well-formed strings and check how many codepoints
+// each one yields.
+static void
+simple_valid (util::TAP::logger &tap)
+{
+    // `count` is the expected number of codepoints rather than bytes.
+    static constexpr struct {
+        const char *text;
+        size_t count;
+        const char *label;
+    } CASES[] = {
+        { "",        0, "empty string" },
+        { "a",       1, "single ANSI character" },
+        { "abc",     3, "multiple ANSI characters" },
+        { u8"κόσμε", 5, "greek kosme" },
+    };
+
+    static constexpr char FORMAT[] = "valid length, %s";
+
+    for (const auto &item: CASES) {
+        try {
+            const auto decoded = util::utf8::decode (util::make_view (item.text));
+            tap.expect_eq (item.count, decoded.size (), FORMAT, item.label);
+        } catch (...) {
+            // any exception while decoding valid input is reported as a failure
+            tap.fail (FORMAT, item.label);
+        }
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Decode single codepoints that sit on the edges of each encoded length and
+// check the decoded value. Any decoder error is reported as a test failure.
+static void
+single_boundaries (util::TAP::logger &tap)
+{
+    static const struct {
+        std::vector<uint8_t> data;
+        uint32_t value;
+        const char *direction;
+    } TESTS[] {
+        { { 0x00                   }, 0x00000000, "low length boundary" },
+        { { 0xC2, 0x80             }, 0x00000080, "low length boundary" },
+        { { 0xE0, 0xA0, 0x80       }, 0x00000800, "low length boundary" },
+        { { 0xF0, 0x90, 0x80, 0x80 }, 0x00010000, "low length boundary" },
+
+        { { 0x7F                   }, 0x0000007F, "high length boundary" },
+        { { 0xDF, 0xBF             }, 0x000007FF, "high length boundary" },
+        // U+FFFF is rejected by the decoder as an illegal codepoint, so this
+        // case can never succeed. it stays disabled for the time being.
+        //{ { 0xEF, 0xBF, 0xBF,      }, 0x0000FFFF, "high length boundary" },
+        { { 0xF7, 0xBF, 0xBF, 0xBF }, 0x001FFFFF, "high length boundary" },
+
+        { { 0xED, 0x9F, 0xBF       }, 0x0000D7FF, "other" },
+        { { 0xEE, 0x80, 0x80       }, 0x0000E000, "other" },
+        { { 0xEF, 0xBF, 0xBD       }, 0x0000FFFD, "other" },
+        { { 0xF4, 0x8F, 0xBF, 0xBF }, 0x0010FFFF, "other" },
+        { { 0xF4, 0x90, 0x80, 0x80 }, 0x00110000, "other" },
+    };
+
+    static constexpr char fmt[] = "single character (%s), %!-byte sequence";
+
+    for (const auto &t: TESTS) {
+        auto data = util::make_view (
+            reinterpret_cast<const std::byte*> (t.data.data ()),
+            reinterpret_cast<const std::byte*> (t.data.data ()) + t.data.size ()
+        );
+
+        try {
+            const auto codepoints = util::utf8::decode (data);
+            tap.expect (
+                codepoints.size () == 1 && codepoints[0] == t.value,
+                fmt, t.direction, t.data.size ()
+            );
+        } catch (const util::utf8::error&) {
+            // catch the base type: overlong_error is not a malformed_error
+            tap.fail (fmt, t.direction, t.data.size ());
+        }
+    }
+}
+
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Feed the decoder byte sequences that are structurally invalid and check
+// that every one of them is rejected with malformed_error.
+static void
+malformed (util::TAP::logger &tap)
+{
+    static const struct {
+        std::vector<uint8_t> data;
+        const char *message;
+    } TESTS[] = {
+        { { 0x80                   }, "first continuation" },
+        { { 0xBF                   }, "last continuation" },
+        { { 0x80, 0xBF             }, "continuation sequence" },
+        { { 0x80, 0xBF, 0x80       }, "continuation sequence" },
+        { { 0x80, 0xBF, 0x80, 0xBF }, "continuation sequence" },
+    };
+
+    static constexpr char fmt[] = "malformed %! byte sequence, %s";
+
+    for (const auto &t: TESTS) {
+        auto data = util::make_view (
+            reinterpret_cast<const std::byte*> (t.data.data ()),
+            reinterpret_cast<const std::byte*> (t.data.data ()) + t.data.size ()
+        );
+
+        tap.expect_throw<util::utf8::malformed_error> (
+            [&data] () { util::utf8::decode (data); },
+            fmt,
+            data.size (),
+            t.message
+        );
+    }
+
+    // test every continuation byte by itself. the flag is cleared if decode
+    // returns normally or throws anything other than malformed_error, the
+    // same policy the lonely start byte loop below applies.
+    {
+        bool success = true;
+        for (uint8_t c = 0x80; c <= 0xbf; ++c) {
+            try {
+                const auto v = c;
+                util::utf8::decode (util::view { &v, &v+1 });
+                success = false;
+            }
+            catch (const util::utf8::malformed_error&)
+            { ; }
+            catch (...)
+            { success = false; }
+        }
+
+        tap.expect (success, "individual continuation bytes");
+    }
+
+    // every combination of first-byte-then-space sequences
+    static const struct {
+        uint8_t first;
+        uint8_t last;
+        int length;
+    } LONELY[] = {
+        { 0xc0, 0xdf, 2 },
+        { 0xe0, 0xef, 3 },
+        { 0xf0, 0xf7, 4 },
+    };
+
+    for (const auto &t: LONELY) {
+        // a plain char buffer avoids reading the inactive member of a union
+        char str[4];
+        bool success = true;
+
+        for (auto i = t.first; i <= t.last; ++i) {
+            std::fill (std::begin (str), std::end (str), ' ');
+            str[0] = static_cast<char> (i);
+
+            try {
+                util::utf8::decode (util::make_cview (str));
+                success = false;
+            }
+            catch (const util::utf8::malformed_error&)
+            { ; }
+            catch (...)
+            { success = false; }
+        }
+
+        tap.expect (success, "lonely start characters, %! bytes", t.length);
+    }
+
+    // multi-byte sequences that stop one byte short
+    static const std::vector<uint8_t> MISSING[] = {
+        { 0xC0 },
+        { 0xE0, 0x80 },
+        { 0xF0, 0x80, 0x80 },
+    };
+
+    for (const auto &t: MISSING) {
+        util::view<const char*> data {
+            reinterpret_cast<const char*> (t.data ()),
+            reinterpret_cast<const char*> (t.data ()) + t.size ()
+        };
+
+        tap.expect_throw<util::utf8::malformed_error> (
+            [&data] () { util::utf8::decode (data); },
+            "%! byte sequence missing the lastbyte",
+            t.size ()
+        );
+    }
+
+    // lead bytes that match none of the prefixes the decoder recognises
+    static const std::vector<uint8_t> IMPOSSIBLE[] = {
+        { 0xfe },
+        { 0xff },
+        { 0xfe, 0xfe, 0xff, 0xff }
+    };
+
+    for (const auto &t: IMPOSSIBLE) {
+        util::view<const char*> data {
+            reinterpret_cast<const char*> (t.data ()),
+            reinterpret_cast<const char*> (t.data ()) + t.size ()
+        };
+
+        tap.expect_throw<util::utf8::malformed_error> (
+            [&data] () { util::utf8::decode (data); },
+            "impossible %! byte sequence",
+            t.size ()
+        );
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Check that non-minimal ("overlong") encodings are rejected with
+// overlong_error. static, like the other test groups in this file.
+static void
+overlong (util::TAP::logger &tap)
+{
+    static const struct {
+        std::vector<uint8_t> data;
+        const char *message;
+    } TESTS[] = {
+        { { 0xc0, 0xaf             }, "simple ANSI" },
+        { { 0xe0, 0x80, 0xaf       }, "simple ANSI" },
+        { { 0xf0, 0x80, 0x80, 0xaf }, "simple ANSI" },
+
+        { { 0xc1, 0xbf             }, "maximum" },
+        { { 0xe0, 0x9f, 0xbf       }, "maximum" },
+        { { 0xf0, 0x8f, 0xbf, 0xbf }, "maximum" },
+
+        { { 0xc0, 0x80             }, "null" },
+        { { 0xe0, 0x80, 0x80       }, "null" },
+        { { 0xf0, 0x80, 0x80, 0x80 }, "null" },
+    };
+
+    for (const auto &t: TESTS) {
+        auto data = util::make_view (
+            reinterpret_cast<const char*> (t.data.data ()),
+            reinterpret_cast<const char*> (t.data.data ()) + t.data.size ()
+        );
+
+        tap.expect_throw<util::utf8::overlong_error> (
+            [&data] () { util::utf8::decode (data); },
+            "overlong %! byte sequence, %s",
+            t.data.size (),
+            t.message
+        );
+    }
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Check that byte sequences which are structurally valid but encode a
+// forbidden codepoint are rejected. Only the common base class `error` is
+// required of the exception that is thrown.
+static void
+illegal (util::TAP::logger &tap)
+{
+    static const std::array<uint8_t,3> SINGLE[] = {
+        { 0xed, 0xa0, 0x80 }, // U+D800
+        { 0xed, 0xad, 0xbf }, // U+DB7F
+        { 0xed, 0xae, 0x80 }, // U+DB80
+        { 0xed, 0xaf, 0xbf }, // U+DBFF
+        { 0xed, 0xb0, 0x80 }, // U+DC00
+        { 0xed, 0xbe, 0x80 }, // U+DF80
+        { 0xed, 0xbf, 0xbf }, // U+DFFF
+    };
+
+    for (const auto &t: SINGLE)
+        tap.expect_throw<util::utf8::error> (
+            [&t] () { util::utf8::decode (util::make_view (t)); },
+            "reject utf16 single surrogate"
+        );
+
+    // a high surrogate followed immediately by a low surrogate
+    static const std::array<uint8_t,6> DOUBLE[] = {
+        { 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 }, // U+D800 U+DC00
+        { 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf }, // U+D800 U+DFFF
+        { 0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80 }, // U+DB7F U+DC00
+        { 0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf }, // U+DB7F U+DFFF
+        { 0xed, 0xae, 0x80, 0xed, 0xb0, 0x80 }, // U+DB80 U+DC00
+        { 0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf }, // U+DB80 U+DFFF
+        { 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 }, // U+DBFF U+DC00
+        { 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf }, // U+DBFF U+DFFF
+    };
+
+    for (const auto &t: DOUBLE)
+        tap.expect_throw<util::utf8::error> (
+            [&t] () { util::utf8::decode (util::make_view (t)); },
+            "reject utf16 paired surrogate"
+        );
+
+    // U+FFFE and U+FFFF are noncharacters; the BOM proper is U+FEFF
+    static const std::array<uint8_t,3> OTHER[] = {
+        { 0xef, 0xbf, 0xbe }, // FFFE
+        { 0xef, 0xbf, 0xbf }, // FFFF
+    };
+
+    for (const auto &t: OTHER)
+        tap.expect_throw<util::utf8::error> (
+            [&t] () { util::utf8::decode (util::make_view (t)); },
+            "reject noncharacter"
+        );
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Run each utf8 test group in turn and report the logger's final status.
+int
+main()
+{
+    util::TAP::logger log;
+    // every group records its results on the one shared logger
+    void (*const groups[]) (util::TAP::logger&) = {
+        simple_valid, single_boundaries, malformed, overlong, illegal,
+    };
+    for (auto run: groups)
+        run (log);
+    return log.status ();
+}
\ No newline at end of file
diff --git a/utf8.cpp b/utf8.cpp
new file mode 100644
index 00000000..fa2dc14c
--- /dev/null
+++ b/utf8.cpp
@@ -0,0 +1,148 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright 2017 Danny Robson <danny@nerdcruft.net>
+ */
+
+#include "./utf8.hpp"
+
+
+///////////////////////////////////////////////////////////////////////////////
+// A bit-pattern predicate: `mask` selects the bits that are compared and
+// `value` holds what they must equal. `shift` is stored alongside but is
+// not consulted by the predicate itself. kept in an unnamed namespace so the
+// very generic name `test` cannot collide with other translation units.
+namespace {
+template <typename T>
+struct test {
+    constexpr test (T _mask, T _value, T _shift):
+        mask (_mask), value (_value), shift (_shift)
+    { ; }
+
+    // true when the masked bits of `t` match `value`
+    constexpr bool operator() (T t) const { return (t & mask) == value; }
+
+    T mask;
+    T value;
+    T shift;
+};
+}
+
+
+//-----------------------------------------------------------------------------
+// Build a `test` from a pattern such as "0b110xxxxx": after the "0b" prefix
+// '0'/'1' are bits that must match and 'x' is a don't-care bit (counted in
+// `shift`). Throws std::invalid_argument for a bad prefix or character.
+static constexpr test<uint32_t>
+operator""_test (const char *str, size_t len)
+{
+    // check the length first so the prefix test never reads past `len`
+    if (len < 2 || str[0] != '0' || str[1] != 'b')
+        throw std::invalid_argument ("invalid bit test prefix");
+
+    uint32_t mask = 0;
+    uint32_t value = 0;
+    uint32_t shift = 0;
+
+    for (size_t i = 2; i < len; ++i) {
+        mask  <<= 1;
+        value <<= 1;
+
+        switch (str[i]) {
+        case '0': mask |= 0x1;               break;
+        case '1': mask |= 0x1; value |= 0x1; break;
+        case 'x': ++shift;                   break;
+        default:  throw std::invalid_argument ("invalid bit test character");
+        }
+    }
+
+    return { mask, value, shift };
+}
+
+
+///////////////////////////////////////////////////////////////////////////////
+// Decode a utf8 byte sequence into a vector of codepoints.
+// Throws malformed_error for an unrecognised lead byte, a truncated
+// sequence, or a bad continuation byte; overlong_error for a non-minimal
+// encoding; and illegal_codepoint for utf16 surrogates, U+FFFE and U+FFFF.
+// NOTE(review): nothing here rejects values above MAX_CODEPOINT (0x10FFFF).
+std::vector<util::utf8::codepoint_t>
+util::utf8::decode (view<const std::byte*> src)
+{
+    std::vector<codepoint_t> dst;
+    dst.reserve (src.size ());
+
+    // lead byte patterns, indexed by the number of continuation bytes
+    static constexpr
+    test<codepoint_t> TESTS[] = {
+        "0b0xxxxxxx"_test,
+        "0b110xxxxx"_test,
+        "0b1110xxxx"_test,
+        "0b11110xxx"_test
+    };
+
+    // describes the bits required to be present for a valid minimally
+    // sized codepoint of a given byte length.
+    static constexpr
+    codepoint_t LEVEL_MASK[] {
+        0b00000000'00000000'01111111,
+        0b00000000'00000111'10000000,
+        0b00000000'11111000'00000000,
+        0b00011111'00000000'00000000
+    };
+
+    for (auto cursor = src.cbegin (); cursor != src.cend (); ++cursor) {
+        codepoint_t c = std::to_integer<codepoint_t> (*cursor);
+        int len = TESTS[0] (c) ? 0 :
+                  TESTS[1] (c) ? 1 :
+                  TESTS[2] (c) ? 2 :
+                  TESTS[3] (c) ? 3 :
+                  throw malformed_error {};
+
+        // count remaining bytes; `cursor + len` could point past the end
+        if (src.cend () - cursor <= len)
+            throw malformed_error {};
+
+        // get the simple ANSI case out of the way
+        if (!len) {
+            dst.push_back (c);
+            continue;
+        }
+
+        // payload bits of the lead byte, moved above the continuation bits
+        codepoint_t accum = codepoint_t { c & ~TESTS[len].mask } << (len * 6);
+        // check each continuation byte's prefix and fold in its six payload bits
+        for (int i = 1; i <= len; ++i) {
+            const auto next = std::to_integer<codepoint_t> (cursor[i]);
+            if ((next & 0b11'000000u) != 0b10'000000u)
+                throw malformed_error {};
+            accum |= (next & 0b00'111111u) << ((len - i) * 6);
+        }
+
+        if (!(accum & LEVEL_MASK[len]))
+            throw overlong_error {};
+
+        // utf16 surrogates should not be present in utf8
+        if (accum >= 0xD800 && accum <= 0xDFFF)
+            throw illegal_codepoint {};
+
+        // U+FFFE and U+FFFF are noncharacters (the BOM itself is U+FEFF)
+        if (accum == 0xfffe || accum == 0xffff)
+            throw illegal_codepoint {};
+
+        dst.push_back (accum);
+        cursor += len;
+    }
+
+    return dst;
+}
\ No newline at end of file
diff --git a/utf8.hpp b/utf8.hpp
new file mode 100644
index 00000000..b89c1b2d
--- /dev/null
+++ b/utf8.hpp
@@ -0,0 +1,74 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright 2017 Danny Robson <danny@nerdcruft.net>
+ */
+
+
+#ifndef CRUFT_UTIL_UTF8_HPP
+#define CRUFT_UTIL_UTF8_HPP
+
+#include "./view.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <exception>
+#include <vector>
+
+namespace util::utf8 {
+    ///////////////////////////////////////////////////////////////////////////
+    using codepoint_t = uint32_t;
+
+    constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF;
+
+    ///////////////////////////////////////////////////////////////////////////
+    // decode utf8 bytes into codepoints; throws an `error` subtype on failure.
+    std::vector<codepoint_t>
+    decode (util::view<const std::byte*>);
+
+    //-------------------------------------------------------------------------
+    // overloads for other 8-bit element types; they forward to the above.
+    inline std::vector<codepoint_t>
+    decode (util::view<const char*> data)
+    {
+        return decode (util::view<const std::byte*> {
+            reinterpret_cast<const std::byte*> (data.cbegin ()),
+            reinterpret_cast<const std::byte*> (data.cend   ())
+        });
+    }
+
+    inline std::vector<codepoint_t>
+    decode (util::view<const uint8_t*> data)
+    {
+        return decode (util::view<const std::byte*> {
+            reinterpret_cast<const std::byte*> (data.cbegin ()),
+            reinterpret_cast<const std::byte*> (data.cend   ())
+        });
+    }
+
+    ///////////////////////////////////////////////////////////////////////////
+    // NOTE(review): no definition of encode is visible in this change.
+    std::vector<std::byte>
+    encode (util::view<const char*>);
+
+    ///////////////////////////////////////////////////////////////////////////
+    // exception hierarchy: every type below derives from `error`.
+    struct error : public std::exception {};
+
+    struct malformed_error : public error { };
+    struct illegal_codepoint : public malformed_error {};
+
+    struct overlong_error  : public error { };
+}
+
+#endif