From 7ecaaacd93eaccf8ad06ca52ff0544fc62883e15 Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Mon, 2 Oct 2017 15:25:59 +1100 Subject: [PATCH] utf8: add a trivial utf8 decoder --- CMakeLists.txt | 3 + test/utf8.cpp | 325 +++++++++++++++++++++++++++++++++++++++++++++++++ utf8.cpp | 148 ++++++++++++++++++++++ utf8.hpp | 74 +++++++++++ 4 files changed, 550 insertions(+) create mode 100644 test/utf8.cpp create mode 100644 utf8.cpp create mode 100644 utf8.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index eb847c7c..d3ae3122 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -408,6 +408,8 @@ list ( types/traits.hpp uri.cpp uri.hpp + utf8.cpp + utf8.hpp variadic.cpp variadic.hpp vector.cpp @@ -531,6 +533,7 @@ if (TESTS) traits typeidx uri + utf8 vector version view diff --git a/test/utf8.cpp b/test/utf8.cpp new file mode 100644 index 00000000..09bc1297 --- /dev/null +++ b/test/utf8.cpp @@ -0,0 +1,325 @@ +#include "tap.hpp" +#include "utf8.hpp" + + +/////////////////////////////////////////////////////////////////////////////// +static void +simple_valid (util::TAP::logger &tap) +{ + static constexpr struct { + const char *data; + size_t len; + const char *message; + } VALID[] = { + { "", 0, "empty string" }, + { "a", 1, "single ANSI character" }, + { "abc", 3, "multiple ANSI characters" }, + { u8"κόσμε", 5, "greek kosme" }, + }; + + static constexpr char fmt[] = "valid length, %s"; + + for (const auto &t: VALID) { + try { + tap.expect_eq ( + t.len, + util::utf8::decode (util::make_view (t.data)).size (), + fmt, + t.message + ); + } catch (...) { + tap.fail (fmt, t.message); + } + } +} + + +/////////////////////////////////////////////////////////////////////////////// +static void +single_boundaries (util::TAP::logger &tap) +{ + static const struct { + std::vector data; + uint32_t value; + const char *direction; + } TESTS[] { + { { 0x00 }, 0x00000000, "low length boundary" }, + { { 0xC2, 0x80 }, 0x00000080, "low length boundary" }, + { { 0xE0, 0xA0, 0x80 }, 0x00000800, "low length boundary" }, + { { 0xF0, 0x90, 0x80, 0x80 }, 0x00010000, "low length boundary" }, + + { { 0x7F }, 0x0000007F, "high length boundary" }, + { { 0xDF, 0xBF }, 0x000007FF, "high length boundary" }, + // this is an invalid codepoint so we're going to fail to parse this + // whatever the case. disable it for the time being. + //{ { 0xEF, 0xBF, 0xBF, }, 0x0000FFFF, "high length boundary" }, + { { 0xF7, 0xBF, 0xBF, 0xBF }, 0x001FFFFF, "high length boundary" }, + + { { 0xED, 0x9F, 0xBF }, 0x0000D7FF, "other" }, + { { 0xEE, 0x80, 0x80 }, 0x0000E000, "other" }, + { { 0xEF, 0xBF, 0xBD }, 0x0000FFFD, "other" }, + { { 0xF4, 0x8F, 0xBF, 0xBF }, 0x0010FFFF, "other" }, + { { 0xF4, 0x90, 0x80, 0x80 }, 0x00110000, "other" }, + }; + + static constexpr char fmt[] = "single character (%s), %!-byte sequence"; + + for (const auto &t: TESTS) { + auto data = util::make_view ( + reinterpret_cast (&*t.data.cbegin ()), + reinterpret_cast (&*t.data.cbegin ()) + t.data.size () + ); + + try { + auto codepoints = util::utf8::decode (data); + + tap.expect ( + codepoints.size () == 1 && codepoints[0] == t.value, + fmt, + t.direction, + t.data.size () + ); + } catch (const util::utf8::malformed_error&) { + tap.fail (fmt, t.direction, t.data.size ()); + } + } +}; + + + +/////////////////////////////////////////////////////////////////////////////// +static void +malformed (util::TAP::logger &tap) +{ + static const struct { + std::vector data; + const char *message; + } TESTS[] = { + { { 0x80 }, "first continuation" }, + { { 0xBF }, "last continuation" }, + { { 0x80, 0xBF }, "continuation sequence" }, + { { 0x80, 0xBF, 0x80 }, "continuation sequence" }, + { { 0x80, 0xBF, 0x80, 0xBF }, "continuation sequence" }, + }; + + static constexpr char fmt[] = "malformed %! byte sequence, %s"; + + for (const auto &t: TESTS) { + auto data = util::make_view ( + reinterpret_cast (&*t.data.cbegin ()), + reinterpret_cast (&*t.data.cbegin ()) + t.data.size () + ); + + tap.expect_throw ( + [&data] () { + util::utf8::decode (data); + }, + fmt, + data.size (), + t.message + ); + } + + // test every continuation byte by itself. we use a boolean flag that + // should never reach the line where we toggle it to false due to the + // expected exception. + { + bool success = true; + for (uint8_t c = 0x80; c <= 0xbf; ++c) { + try { + const auto v = c; + util::utf8::decode (util::view { &v, &v+1 }); + success = false; + break; + } catch (...) { ; } + } + + tap.expect (success, "individual continuation bytes"); + } + + // every combination of first-byte-then-space sequences + static const struct { + uint8_t first; + uint8_t last; + int length; + } LONELY[] = { + { 0xc0, 0xdf, 2 }, + { 0xe0, 0xef, 3 }, + { 0xf0, 0xf7, 4 }, + }; + + for (const auto &t: LONELY) { + union { + uint8_t bytes[4]; + char str [4]; + }; + + bool success = true; + + for (auto i = t.first; i <= t.last; ++i) { + std::fill (std::begin (str), std::end (str), ' '); + bytes[0] = i; + + try { + util::utf8::decode (util::make_cview (str)); + success = false; + } + catch (const util::utf8::malformed_error&) + { ; } + catch (...) + { success = false; } + } + + tap.expect (success, "lonely start characters, %! bytes", t.length); + } + + + static const std::vector MISSING[] = { + { 0xC0 }, + { 0xE0, 0x80 }, + { 0xF0, 0x80, 0x80 }, + }; + + for (const auto &t: MISSING) { + util::view data { + reinterpret_cast (&t[0]), + reinterpret_cast (&t[0]) + t.size () + }; + + tap.expect_throw ( + [&data] () { util::utf8::decode (data); }, + "%! byte sequence missing the lastbyte", + t.size () + ); + } + + + static const std::vector IMPOSSIBLE[] = { + { 0xfe }, + { 0xff }, + { 0xfe, 0xfe, 0xff, 0xff } + }; + + for (const auto &t: IMPOSSIBLE) { + util::view data { + reinterpret_cast (&t[0]), + reinterpret_cast (&t[0]) + t.size () + }; + + tap.expect_throw ( + [&data] () { util::utf8::decode (data); }, + "impossible %! byte sequence", + t.size () + ); + } +}; + + +/////////////////////////////////////////////////////////////////////////////// +void +overlong (util::TAP::logger &tap) +{ + static const struct { + std::vector data; + const char *message; + } TESTS[] = { + { { 0xc0, 0xaf }, "simple ANSI" }, + { { 0xe0, 0x80, 0xaf }, "simple ANSI" }, + { { 0xf0, 0x80, 0x80, 0xaf }, "simple ANSI" }, + + { { 0xc1, 0xbf }, "maximum" }, + { { 0xe0, 0x9f, 0xbf }, "maximum" }, + { { 0xf0, 0x8f, 0xbf, 0xbf }, "maximum" }, + + { { 0xc0, 0x80 }, "null" }, + { { 0xe0, 0x80, 0x80 }, "null" }, + { { 0xf0, 0x80, 0x80, 0x80 }, "null" }, + }; + + for (const auto &t: TESTS) { + auto data = util::make_view ( + reinterpret_cast (&t.data[0]), + reinterpret_cast (&t.data[0]) + t.data.size () + ); + + tap.expect_throw ( + [&] () { + util::utf8::decode (data); + }, + "overlong %! byte sequence, %s", + t.data.size (), + t.message + ); + } +}; + + +/////////////////////////////////////////////////////////////////////////////// +void +illegal (util::TAP::logger &tap) +{ + static const std::array SINGLE[] = { + { 0xed, 0xa0, 0x80 }, // U+D800 + { 0xed, 0xad, 0xbf }, // U+DB7F + { 0xed, 0xae, 0x80 }, // U+DB80 + { 0xed, 0xaf, 0xbf }, // U+DBFF + { 0xed, 0xb0, 0x80 }, // U+DC00 + { 0xed, 0xbe, 0x80 }, // U+DF80 + { 0xed, 0xbf, 0xbf }, // U+DFFF + }; + + for (const auto &t: SINGLE) + tap.expect_throw ( + [&t] () { util::utf8::decode (util::make_view (t)); }, + "reject utf16 single surrogate" + ); + + + static const std::array DOUBLE[] = { + { 0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80 }, // U+D800 U+DC00 + { 0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf }, // U+D800 U+DFFF + { 0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80 }, // U+DB7F U+DC00 + { 0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf }, // U+DB7F U+DFFF + { 0xed, 0xae, 0x80, 0xed, 0xb0, 0x80 }, // U+DB80 U+DC00 + { 0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf }, // U+DB80 U+DFFF + { 0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80 }, // U+DBFF U+DC00 + { 0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf }, // U+DBFF U+DFFF + }; + + for (const auto &t: DOUBLE) + tap.expect_throw ( + [&t] () { util::utf8::decode (util::make_view (t)); }, + "reject utf16 paired surrogate" + ); + + + static const std::array OTHER[] = { + { 0xef, 0xbf, 0xbe }, // FFFE + { 0xef, 0xbf, 0xbf }, // FFFF + }; + + + for (const auto &t: OTHER) + tap.expect_throw ( + [&t] () { util::utf8::decode (util::make_view (t)); }, + "reject BOM" + ); + + +}; + + +/////////////////////////////////////////////////////////////////////////////// +int +main() +{ + util::TAP::logger tap; + + simple_valid (tap); + single_boundaries (tap); + malformed (tap); + overlong (tap); + illegal (tap); + + return tap.status (); +}; \ No newline at end of file diff --git a/utf8.cpp b/utf8.cpp new file mode 100644 index 00000000..fa2dc14c --- /dev/null +++ b/utf8.cpp @@ -0,0 +1,148 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright 2017 Danny Robson + */ + +#include "./utf8.hpp" + + +/////////////////////////////////////////////////////////////////////////////// +template +struct test { + constexpr + test (T _mask, T _value, T _shift): + mask (_mask), + value (_value), + shift (_shift) + { ; } + + constexpr bool + operator() (T t) const + { + return (t & mask) == value; + } + + T mask; + T value; + T shift; +}; + + +//----------------------------------------------------------------------------- +static constexpr test +operator"" _test (const char *str, size_t len) +{ + uint32_t mask = 0; + uint32_t value = 0; + uint32_t shift = 0; + + if (str[0] != '0' || str[1] != 'b') + throw std::invalid_argument ("invalid bit test prefix"); + + for (size_t i = 2; i < len; ++i) { + auto c = str[i]; + + mask <<= 1; + value <<= 1; + + switch (c) { + case '0': mask |= 0x1; value |= 0x0; break; + case '1': mask |= 0x1; value |= 0x1; break; + case 'x': mask |= 0x0; value |= 0x0; ++shift; break; + default: + throw std::invalid_argument ("invalid bit test character"); + } + + } + + return { mask, value, shift }; +} + + +/////////////////////////////////////////////////////////////////////////////// +std::vector +util::utf8::decode (view src) +{ + std::vector dst; + dst.reserve (src.size ()); + + static constexpr + test TESTS[] = { + "0b0xxxxxxx"_test, + "0b110xxxxx"_test, + "0b1110xxxx"_test, + "0b11110xxx"_test + }; + + for (auto cursor = src.cbegin (); cursor != src.cend (); ++cursor) { + codepoint_t c = std::to_integer (*cursor); + + int len = TESTS[0] (c) ? 0 : + TESTS[1] (c) ? 1 : + TESTS[2] (c) ? 2 : + TESTS[3] (c) ? 3 : + throw malformed_error {}; + + if (cursor + len >= src.cend ()) + throw malformed_error {}; + + // get the simple ANSI case out of the way + if (!len) { + dst.push_back (c); + continue; + } + + codepoint_t head = codepoint_t { c & ~TESTS[len].mask } << (len * 6); + codepoint_t accum = head; + codepoint_t shift = 0; + + // check every following data byte has the appropriate prefix + for (int i = 1; i <= len; ++i) { + if ((std::to_integer (cursor[i]) & 0b11'000000u) != 0b10'000000u) + throw malformed_error {}; + } + + switch (len) { + case 3: accum |= (std::to_integer (cursor[3]) & 0b00111111u) << (shift++ * 6u); + case 2: accum |= (std::to_integer (cursor[2]) & 0b00111111u) << (shift++ * 6u); + case 1: accum |= (std::to_integer (cursor[1]) & 0b00111111u) << (shift++ * 6u); + } + + // describes the bits required to be present for a valid minimally + // sized codepoint of a given byte length. + static constexpr + codepoint_t LEVEL_MASK[] { + 0b00000000'00000000'01111111, + 0b00000000'00000111'10000000, + 0b00000000'11111000'00000000, + 0b00011111'00000000'00000000 + }; + + if (!(accum & LEVEL_MASK[len])) + throw overlong_error {}; + + // utf16 surrogates should not be present in utf8 + if (accum >= 0xD800 && accum <= 0xDFFF) + throw illegal_codepoint {}; + + // reject the BOM + if (accum == 0xfffe || accum == 0xffff) + throw illegal_codepoint {}; + + dst.push_back (accum); + std::advance (cursor, len); + } + + return dst; +} \ No newline at end of file diff --git a/utf8.hpp b/utf8.hpp new file mode 100644 index 00000000..b89c1b2d --- /dev/null +++ b/utf8.hpp @@ -0,0 +1,74 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright 2017 Danny Robson + */ + + +#ifndef CRUFT_UTIL_UTF8_HPP +#define CRUFT_UTIL_UTF8_HPP + +#include "./view.hpp" + +#include +#include +#include + + +namespace util::utf8 { + /////////////////////////////////////////////////////////////////////////// + using codepoint_t = uint32_t; + + constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF; + + + /////////////////////////////////////////////////////////////////////////// + std::vector + decode (util::view); + + + //------------------------------------------------------------------------- + inline auto + decode (util::view data) + { + return decode ({ + reinterpret_cast (data.cbegin ()), + reinterpret_cast (data.cend ()) + }); + } + + inline auto + decode (util::view data) + { + return decode ({ + reinterpret_cast (data.cbegin ()), + reinterpret_cast (data.cend ()) + }); + } + + + /////////////////////////////////////////////////////////////////////////// + std::vector + encode (util::view); + + + /////////////////////////////////////////////////////////////////////////// + struct error : public std::exception {}; + + struct malformed_error : public error { }; + struct illegal_codepoint : public malformed_error {}; + + struct overlong_error : public error { }; +} + +#endif