/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2017 Danny Robson
 */

#include "./utf8.hpp"

#include <cstddef>
#include <cstdint>
#include <iterator>
#include <stdexcept>
#include <vector>


///////////////////////////////////////////////////////////////////////////////
/// A bit-pattern matcher.
///
/// `mask` selects the bits that take part in the comparison and `bits` holds
/// the values those selected bits must have.
template <typename T>
struct test {
    constexpr
    test (T _mask, T _bits) noexcept:
        mask (_mask),
        bits (_bits)
    { ; }

    /// True if every masked bit of `t` equals the corresponding bit of `bits`.
    constexpr bool
    valid (T t) const noexcept
    {
        return (t & mask) == bits;
    }

    /// The unmasked ("don't care") bits of `t`, i.e. the payload.
    constexpr T
    value (T t) const noexcept
    {
        return t & ~mask;
    }

    T mask;
    T bits;
};


//-----------------------------------------------------------------------------
/// Builds a `test` from a string literal of the form "0b" followed by any
/// number of '0', '1' and 'x' characters, most significant bit first.
///
/// '0' and '1' are bits that must match exactly; 'x' is a payload bit that
/// is ignored by `valid` and returned by `value`.
///
/// Throws std::invalid_argument if the "0b" prefix is missing or a character
/// other than '0', '1' or 'x' is present.
static constexpr
test<uint32_t>
operator""_test (const char *str, std::size_t len)
{
    // check the length first so that the prefix characters are never read
    // from a literal that is too short to contain them.
    if (len < 2 || str[0] != '0' || str[1] != 'b')
        throw std::invalid_argument ("invalid bit test prefix");

    uint32_t mask = 0;
    uint32_t bits = 0;

    for (std::size_t i = 2; i < len; ++i) {
        // make room for the next (less significant) bit
        mask <<= 1;
        bits <<= 1;

        switch (str[i]) {
        case '0': mask |= 0x1;              break;
        case '1': mask |= 0x1; bits |= 0x1; break;
        case 'x':                           break;

        default:
            throw std::invalid_argument ("invalid bit test character");
        }
    }

    return { mask, bits };
}


///////////////////////////////////////////////////////////////////////////////
/// Decodes the UTF-8 bytes of `src`, writing one codepoint per decoded
/// sequence to `dst`, and returns the advanced output iterator.
///
/// Throws:
///  * malformed_error    if a lead byte matches no known prefix, the input
///                       ends in the middle of a sequence, or a continuation
///                       byte does not have the form 10xxxxxx.
///  * overlong_error     if a multi-byte sequence encodes a value that fits
///                       in a shorter sequence.
///  * illegal_codepoint  if the decoded value is above U+10FFFF, is a UTF-16
///                       surrogate, or is U+FFFE / U+FFFF.
template <
    typename InputT,
    typename OutputT
>
static OutputT
decode (util::view<InputT> src, OutputT dst)
{
    using namespace util::utf8;

    // lead byte patterns, indexed by the number of continuation bytes
    static constexpr test<uint32_t> PREFIX[] = {
        "0b0xxxxxxx"_test,
        "0b110xxxxx"_test,
        "0b1110xxxx"_test,
        "0b11110xxx"_test
    };

    static constexpr auto CONTINUATION = "0b10xxxxxx"_test;

    // the bits that are only reachable by a sequence of each length; at
    // least one must be set or a shorter encoding would have sufficed.
    static constexpr codepoint_t LEVEL_MASK[] {
        0b00000000'00000000'01111111,
        0b00000000'00000111'10000000,
        0b00000000'11111000'00000000,
        0b00011111'00000000'00000000
    };

    // the highest value Unicode assigns; a four byte sequence has enough
    // payload bits to encode values up to 0x1FFFFF.
    static constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF;

    for (auto cursor = src.cbegin (); cursor != src.cend (); ) {
        codepoint_t c = std::to_integer<codepoint_t> (*cursor++);

        // the number of continuation bytes that follow the lead byte
        int len = PREFIX[0].valid (c) ? 0 :
                  PREFIX[1].valid (c) ? 1 :
                  PREFIX[2].valid (c) ? 2 :
                  PREFIX[3].valid (c) ? 3 :
                  throw malformed_error {};

        // get the simple single byte case out of the way
        if (!len) {
            *dst++ = c;
            continue;
        }

        codepoint_t accum { PREFIX[len].value (c) };

        // append the payload of each continuation byte to the accumulator
        for (int i = 1; i <= len; ++i) {
            if (cursor == src.cend ())
                throw malformed_error {};

            codepoint_t now = std::to_integer<codepoint_t> (*cursor++);
            if (!CONTINUATION.valid (now))
                throw malformed_error {};

            accum <<= 6;
            accum |= CONTINUATION.value (now);
        }

        // check that the codepoint is the right size by seeing if the unique
        // bits present in the decoded size codepoint are actually used.
        //
        // these could theoretically be provided to the user, but they may be
        // misused so we will throw an error instead.
        if (!(accum & LEVEL_MASK[len]))
            throw overlong_error {};

        // values beyond the end of the Unicode codespace are not codepoints
        if (accum > MAX_CODEPOINT)
            throw illegal_codepoint {};

        // utf16 surrogates should not be present in utf8
        if (accum >= 0xD800 && accum <= 0xDFFF)
            throw illegal_codepoint {};

        // reject the noncharacters U+FFFE and U+FFFF. note that U+FFFE is
        // the byte-swapped form of the BOM; the BOM itself (U+FEFF) is
        // passed through untouched.
        if (accum == 0xfffe || accum == 0xffff)
            throw illegal_codepoint {};

        *dst++ = accum;
    }

    return dst;
}


///////////////////////////////////////////////////////////////////////////////
/// Decodes `src` as UTF-8 and returns the codepoints in order.
///
/// Propagates the exceptions thrown by the file-local `decode` above.
///
// NOTE(review): the template arguments of the return and parameter types
// were reconstructed; confirm they match the declaration in utf8.hpp.
std::vector<util::utf8::codepoint_t>
util::utf8::decode (view<const std::byte*> src)
{
    std::vector<codepoint_t> dst;

    // each codepoint consumes at least one input element, so the input size
    // is an upper bound on the output size (presumably size () counts
    // bytes -- confirm against util::view).
    dst.reserve (src.size ());

    ::decode (src, std::back_inserter (dst));
    return dst;
}