From c61fa163168fefb1f7b33629a32b6d82060b028e Mon Sep 17 00:00:00 2001 From: Danny Robson Date: Mon, 2 Oct 2017 16:15:19 +1100 Subject: [PATCH] utf8: use an iterator style interface internally this will be presented to the users shortly --- utf8.cpp | 69 ++++++++++++++++++++++++++++++++------------------------ utf8.hpp | 2 -- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/utf8.cpp b/utf8.cpp index dbdb6a21..9d18c334 100644 --- a/utf8.cpp +++ b/utf8.cpp @@ -74,53 +74,52 @@ operator"" _test (const char *str, size_t len) /////////////////////////////////////////////////////////////////////////////// -std::vector -util::utf8::decode (view src) +template < + typename InputT, + typename OutputT> +static OutputT +decode (util::view src, OutputT dst) { - std::vector dst; - dst.reserve (src.size ()); + using namespace util::utf8; static constexpr - test TESTS[] = { + test PREFIX[] = { "0b0xxxxxxx"_test, "0b110xxxxx"_test, "0b1110xxxx"_test, "0b11110xxx"_test }; - for (auto cursor = src.cbegin (); cursor != src.cend (); ++cursor) { - codepoint_t c = std::to_integer (*cursor); + for (auto cursor = src.cbegin (); cursor != src.cend (); ) { + codepoint_t c = std::to_integer (*cursor++); - int len = TESTS[0].valid (c) ? 0 : - TESTS[1].valid (c) ? 1 : - TESTS[2].valid (c) ? 2 : - TESTS[3].valid (c) ? 3 : + int len = PREFIX[0].valid (c) ? 0 : + PREFIX[1].valid (c) ? 1 : + PREFIX[2].valid (c) ? 2 : + PREFIX[3].valid (c) ? 3 : throw malformed_error {}; - if (cursor + len >= src.cend ()) - throw malformed_error {}; - // get the simple ANSI case out of the way if (!len) { - dst.push_back (c); + *dst++ = c; continue; } - codepoint_t head = codepoint_t { c & ~TESTS[len].mask } << (len * 6); - codepoint_t accum = head; - codepoint_t shift = 0; + codepoint_t accum { PREFIX[len].value (c) }; // check every following data byte has the appropriate prefix - constexpr auto CONTINUATION = "0b10xxxxxx"_test; - for (int i = 1; i <= len; ++i) { - if (!CONTINUATION.valid (std::to_integer (cursor[i]))) - throw malformed_error {}; - } + static constexpr auto CONTINUATION = "0b10xxxxxx"_test; - switch (len) { - case 3: accum |= CONTINUATION.value (std::to_integer (cursor[3])) << (shift++ * 6u); - case 2: accum |= CONTINUATION.value (std::to_integer (cursor[2])) << (shift++ * 6u); - case 1: accum |= CONTINUATION.value (std::to_integer (cursor[1])) << (shift++ * 6u); + for (int i = 1; i <= len; ++i) { + if (cursor == src.cend ()) + throw malformed_error {}; + + codepoint_t now = std::to_integer (*cursor++); + if (!CONTINUATION.valid (now)) + throw malformed_error {}; + + accum <<= 6; + accum |= CONTINUATION.value (now); } // describes the bits required to be present for a valid minimally @@ -144,9 +143,21 @@ util::utf8::decode (view src) if (accum == 0xfffe || accum == 0xffff) throw illegal_codepoint {}; - dst.push_back (accum); - std::advance (cursor, len); + *dst++ = accum; } + + return dst; +} + + +/////////////////////////////////////////////////////////////////////////////// +std::vector +util::utf8::decode (view src) +{ + std::vector dst; + dst.reserve (src.size ()); + + ::decode (src, std::back_inserter (dst)); return dst; } \ No newline at end of file diff --git a/utf8.hpp b/utf8.hpp index b89c1b2d..bf3ea287 100644 --- a/utf8.hpp +++ b/utf8.hpp @@ -29,8 +29,6 @@ namespace util::utf8 { /////////////////////////////////////////////////////////////////////////// using codepoint_t = uint32_t; - constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF; - /////////////////////////////////////////////////////////////////////////// std::vector