libcruft-util/utf8.cpp

165 lines
4.4 KiB
C++

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2017 Danny Robson <danny@nerdcruft.net>
*/
#include "./utf8.hpp"
///////////////////////////////////////////////////////////////////////////////
// A masked bit pattern.
//
// `mask` selects which bits of a candidate take part in the comparison and
// `bits` holds the values those selected bits must have.
template <typename T>
struct test {
    T mask;
    T bits;

    constexpr
    test (T _mask, T _bits) noexcept:
        mask { _mask },
        bits { _bits }
    { ; }

    // true if the bits of `t` selected by `mask` are equal to `bits`
    constexpr bool
    valid (T t) const noexcept
    {
        return bits == (mask & t);
    }

    // the portion of `t` that is not covered by `mask`
    constexpr T
    value (T t) const noexcept
    {
        return ~mask & t;
    }
};
//-----------------------------------------------------------------------------
// Build a bit test from a pattern literal such as "0b110xxxxx"_test.
//
// After the mandatory "0b" prefix each character describes one bit, most
// significant bit first:
//   '0'  the bit must be clear
//   '1'  the bit must be set
//   'x'  the bit is excluded from the test
//
// Throws std::invalid_argument if the prefix is missing, if any other
// character is present, or if the pattern describes more bits than fit in
// the 32 bit result. When used in a constant expression these failures are
// reported at compile time.
static constexpr test<uint32_t>
operator""_test (const char *str, size_t len)
{
    // the pattern is accumulated into 32 bit integers; any further bits
    // would silently push the leading bits off the top.
    constexpr size_t MAX_BITS = 32;

    // check the length first so that we never look past the characters we
    // were given.
    if (len < 2 || str[0] != '0' || str[1] != 'b')
        throw std::invalid_argument ("invalid bit test prefix");

    if (len - 2 > MAX_BITS)
        throw std::invalid_argument ("invalid bit test length");

    uint32_t mask = 0;
    uint32_t bits = 0;

    for (size_t i = 2; i < len; ++i) {
        // make room for the next (less significant) bit
        mask <<= 1;
        bits <<= 1;

        switch (str[i]) {
        case '0': mask |= 0x1;              break;
        case '1': mask |= 0x1; bits |= 0x1; break;
        case 'x':                           break;

        default:
            throw std::invalid_argument ("invalid bit test character");
        }
    }

    return { mask, bits };
}
///////////////////////////////////////////////////////////////////////////////
// Decode a view of UTF-8 encoded bytes into codepoints.
//
// Each decoded codepoint is written through `dst`, which is advanced once per
// codepoint. The advanced output iterator is returned.
//
// Throws:
//   malformed_error    a lead byte matches none of the known prefixes, a
//                      continuation byte is not of the form 10xxxxxx, or the
//                      input ends part way through a sequence.
//   overlong_error     a codepoint was encoded with more bytes than it
//                      requires.
//   illegal_codepoint  the decoded value is a UTF-16 surrogate, is U+FFFE or
//                      U+FFFF, or lies above U+10FFFF.
template <
    typename InputT,
    typename OutputT>
static OutputT
decode (util::view<InputT> src, OutputT dst)
{
    using namespace util::utf8;

    // lead byte patterns, indexed by the number of continuation bytes that
    // follow the lead byte.
    static constexpr
    test<codepoint_t> PREFIX[] = {
        "0b0xxxxxxx"_test,
        "0b110xxxxx"_test,
        "0b1110xxxx"_test,
        "0b11110xxx"_test
    };

    static constexpr auto CONTINUATION = "0b10xxxxxx"_test;

    // the bits that are only reachable when a codepoint genuinely needs the
    // given number of continuation bytes. the first entry is never consulted
    // because single byte sequences return early.
    static constexpr
    codepoint_t LEVEL_MASK[] {
        0b00000000'00000000'01111111,
        0b00000000'00000111'10000000,
        0b00000000'11111000'00000000,
        0b00011111'00000000'00000000
    };

    for (auto cursor = src.cbegin (); cursor != src.cend (); ) {
        codepoint_t c = std::to_integer<codepoint_t> (*cursor++);

        // the number of continuation bytes we expect after the lead byte
        int len = PREFIX[0].valid (c) ? 0 :
                  PREFIX[1].valid (c) ? 1 :
                  PREFIX[2].valid (c) ? 2 :
                  PREFIX[3].valid (c) ? 3 :
                  throw malformed_error {};

        // get the simple single byte case out of the way
        if (!len) {
            *dst++ = c;
            continue;
        }

        // start with the payload bits of the lead byte
        codepoint_t accum { PREFIX[len].value (c) };

        // append the six payload bits of each continuation byte
        for (int i = 1; i <= len; ++i) {
            // the sequence was truncated
            if (cursor == src.cend ())
                throw malformed_error {};

            codepoint_t now = std::to_integer<codepoint_t> (*cursor++);
            if (!CONTINUATION.valid (now))
                throw malformed_error {};

            accum <<= 6;
            accum |= CONTINUATION.value (now);
        }

        // check that the codepoint is the right size by seeing if the unique
        // bits present in the decoded size codepoint are actually used.
        //
        // these could theoretically be provided to the user, but they may be
        // misused so we will throw an error instead.
        if (!(accum & LEVEL_MASK[len]))
            throw overlong_error {};

        // a four byte sequence has room for values up to 0x1FFFFF, but
        // nothing above U+10FFFF is a valid codepoint.
        if (accum > 0x10FFFF)
            throw illegal_codepoint {};

        // utf16 surrogates should not be present in utf8
        if (accum >= 0xD800 && accum <= 0xDFFF)
            throw illegal_codepoint {};

        // reject the noncharacters U+FFFE (which looks like a byte swapped
        // BOM) and U+FFFF
        if (accum == 0xfffe || accum == 0xffff)
            throw illegal_codepoint {};

        *dst++ = accum;
    }

    return dst;
}
///////////////////////////////////////////////////////////////////////////////
// Decode a view of UTF-8 bytes into a freshly allocated vector of codepoints.
std::vector<util::utf8::codepoint_t>
util::utf8::decode (view<const std::byte*> src)
{
    std::vector<codepoint_t> codepoints;

    // reserve one slot per input byte up front so the common case does not
    // reallocate while appending.
    codepoints.reserve (src.size ());

    auto sink = std::back_inserter (codepoints);
    ::decode (src, sink);

    return codepoints;
}