utf8: use an iterator-style interface internally

This will be presented to the users shortly.
This commit is contained in:
Danny Robson 2017-10-02 16:15:19 +11:00
parent 35db5f0a7a
commit c61fa16316
2 changed files with 40 additions and 31 deletions

View File

@ -74,53 +74,52 @@ operator"" _test (const char *str, size_t len)
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
std::vector<util::utf8::codepoint_t> template <
util::utf8::decode (view<const std::byte*> src) typename InputT,
typename OutputT>
static OutputT
decode (util::view<InputT> src, OutputT dst)
{ {
std::vector<codepoint_t> dst; using namespace util::utf8;
dst.reserve (src.size ());
static constexpr static constexpr
test<codepoint_t> TESTS[] = { test<codepoint_t> PREFIX[] = {
"0b0xxxxxxx"_test, "0b0xxxxxxx"_test,
"0b110xxxxx"_test, "0b110xxxxx"_test,
"0b1110xxxx"_test, "0b1110xxxx"_test,
"0b11110xxx"_test "0b11110xxx"_test
}; };
for (auto cursor = src.cbegin (); cursor != src.cend (); ++cursor) { for (auto cursor = src.cbegin (); cursor != src.cend (); ) {
codepoint_t c = std::to_integer<codepoint_t> (*cursor); codepoint_t c = std::to_integer<codepoint_t> (*cursor++);
int len = TESTS[0].valid (c) ? 0 : int len = PREFIX[0].valid (c) ? 0 :
TESTS[1].valid (c) ? 1 : PREFIX[1].valid (c) ? 1 :
TESTS[2].valid (c) ? 2 : PREFIX[2].valid (c) ? 2 :
TESTS[3].valid (c) ? 3 : PREFIX[3].valid (c) ? 3 :
throw malformed_error {}; throw malformed_error {};
if (cursor + len >= src.cend ())
throw malformed_error {};
// get the simple ANSI case out of the way // get the simple ANSI case out of the way
if (!len) { if (!len) {
dst.push_back (c); *dst++ = c;
continue; continue;
} }
codepoint_t head = codepoint_t { c & ~TESTS[len].mask } << (len * 6); codepoint_t accum { PREFIX[len].value (c) };
codepoint_t accum = head;
codepoint_t shift = 0;
// check every following data byte has the appropriate prefix // check every following data byte has the appropriate prefix
constexpr auto CONTINUATION = "0b10xxxxxx"_test; static constexpr auto CONTINUATION = "0b10xxxxxx"_test;
for (int i = 1; i <= len; ++i) {
if (!CONTINUATION.valid (std::to_integer<codepoint_t> (cursor[i])))
throw malformed_error {};
}
switch (len) { for (int i = 1; i <= len; ++i) {
case 3: accum |= CONTINUATION.value (std::to_integer<codepoint_t> (cursor[3])) << (shift++ * 6u); if (cursor == src.cend ())
case 2: accum |= CONTINUATION.value (std::to_integer<codepoint_t> (cursor[2])) << (shift++ * 6u); throw malformed_error {};
case 1: accum |= CONTINUATION.value (std::to_integer<codepoint_t> (cursor[1])) << (shift++ * 6u);
codepoint_t now = std::to_integer<codepoint_t> (*cursor++);
if (!CONTINUATION.valid (now))
throw malformed_error {};
accum <<= 6;
accum |= CONTINUATION.value (now);
} }
// describes the bits required to be present for a valid minimally // describes the bits required to be present for a valid minimally
@ -144,9 +143,21 @@ util::utf8::decode (view<const std::byte*> src)
if (accum == 0xfffe || accum == 0xffff) if (accum == 0xfffe || accum == 0xffff)
throw illegal_codepoint {}; throw illegal_codepoint {};
dst.push_back (accum); *dst++ = accum;
std::advance (cursor, len);
} }
return dst;
}
///////////////////////////////////////////////////////////////////////////////
// Public vector-returning entry point: decodes the supplied byte view into
// a freshly allocated vector of codepoints.
//
// All decoding work is delegated to the global-scope `decode` overload
// (invoked as `::decode`), which writes each result through the output
// iterator it is handed; here that iterator is a back_inserter on `dst`.
// Any exception raised by that overload propagates to the caller unchanged.
std::vector<util::utf8::codepoint_t>
util::utf8::decode (view<const std::byte*> src)
{
std::vector<codepoint_t> dst;
// Pre-size using the input length to limit reallocation while appending.
// NOTE(review): presumably an upper bound on the output count, since each
// decoded codepoint appears to consume at least one input element -- confirm.
dst.reserve (src.size ());
::decode (src, std::back_inserter (dst));
return dst; return dst;
} }

View File

@ -29,8 +29,6 @@ namespace util::utf8 {
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
using codepoint_t = uint32_t; using codepoint_t = uint32_t;
constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF;
/////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////
std::vector<codepoint_t> std::vector<codepoint_t>