libcruft-util/utf8.cpp

/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2017 Danny Robson <danny@nerdcruft.net>
 */

#include "./utf8.hpp"


///////////////////////////////////////////////////////////////////////////////
/// A predicate over integer-like values that checks whether a selected
/// subset of bits matches an expected pattern.
///
/// `mask` selects the bits that take part in the comparison, `value` holds
/// the pattern those bits must equal, and `shift` is an auxiliary count that
/// is stored alongside them but is not consulted by the predicate itself.
template <typename T>
struct test {
    // bits that participate in the comparison
    T mask;
    // the pattern the masked bits must equal
    T value;
    // auxiliary count carried with the pattern; unused by operator()
    T shift;

    /// Stores the three fields verbatim.
    constexpr
    test (T mask_bits, T expected_bits, T shift_count):
        mask (mask_bits),
        value (expected_bits),
        shift (shift_count)
    { }

    /// Returns true iff the bits of `t` selected by `mask` equal `value`.
    constexpr bool
    operator() (T t) const
    {
        const auto selected = t & mask;
        return selected == value;
    }
};


//-----------------------------------------------------------------------------
/// Builds a bit test from a textual pattern such as "0b110xxxxx".
///
/// The pattern must start with "0b". Every following character describes one
/// bit, most significant first:
///   '0'  the bit is tested and must be clear
///   '1'  the bit is tested and must be set
///   'x'  the bit is ignored by the test; each one increments `shift`
///
/// Throws std::invalid_argument if the prefix is missing or any other
/// character is encountered.
static constexpr test<uint32_t>
operator"" _test (const char *str, size_t len)
{
    // the literal is NUL terminated, so short inputs fail this check safely
    if (str[0] != '0' || str[1] != 'b')
        throw std::invalid_argument ("invalid bit test prefix");

    uint32_t mask = 0;
    uint32_t value = 0;
    uint32_t shift = 0;

    for (size_t i = 2; i < len; ++i) {
        const char c = str[i];

        // make room for the incoming bit in the lowest position
        mask  <<= 1;
        value <<= 1;

        if (c == '0') {
            mask |= 0x1;
        } else if (c == '1') {
            mask  |= 0x1;
            value |= 0x1;
        } else if (c == 'x') {
            ++shift;
        } else {
            throw std::invalid_argument ("invalid bit test character");
        }
    }

    return test<uint32_t> { mask, value, shift };
}


///////////////////////////////////////////////////////////////////////////////
/// Decodes a sequence of UTF-8 encoded bytes into a list of codepoints.
///
/// Sequences of one to four bytes are accepted. The input is validated as it
/// is decoded and the first violation aborts the whole decode.
///
/// \param src  the bytes to decode
/// \return     the decoded codepoints, in the order they were encountered
///
/// \throws malformed_error    a lead byte matches no known pattern, the
///                            input ends part way through a sequence, or a
///                            continuation byte lacks the `10` prefix
/// \throws overlong_error     a multi-byte sequence encodes a value that
///                            fits in a shorter sequence
/// \throws illegal_codepoint  the value is a UTF-16 surrogate, U+FFFE,
///                            U+FFFF, or lies above U+10FFFF
std::vector<util::utf8::codepoint_t>
util::utf8::decode (view<const std::byte*> src)
{
    std::vector<codepoint_t> dst;
    // every codepoint consumes at least one byte, so this is an upper bound
    dst.reserve (src.size ());

    // lead byte patterns, indexed by the number of continuation bytes that
    // follow the lead byte.
    static constexpr
    test<codepoint_t> TESTS[] = {
        "0b0xxxxxxx"_test,
        "0b110xxxxx"_test,
        "0b1110xxxx"_test,
        "0b11110xxx"_test
    };

    // describes the bits required to be present for a valid minimally
    // sized codepoint of a given byte length.
    static constexpr
    codepoint_t LEVEL_MASK[] {
        0b00000000'00000000'01111111,
        0b00000000'00000111'10000000,
        0b00000000'11111000'00000000,
        0b00011111'00000000'00000000
    };

    // the largest value Unicode assigns to a codepoint
    static constexpr codepoint_t MAX_CODEPOINT = 0x10FFFF;

    for (auto cursor = src.cbegin (); cursor != src.cend (); ++cursor) {
        const codepoint_t c = std::to_integer<codepoint_t> (*cursor);

        // the number of continuation bytes that must follow the lead byte
        const int len = TESTS[0] (c) ? 0 :
                        TESTS[1] (c) ? 1 :
                        TESTS[2] (c) ? 2 :
                        TESTS[3] (c) ? 3 :
                        throw malformed_error {};

        // ensure the continuation bytes are actually present. compare
        // distances rather than computing `cursor + len`, which could form a
        // pointer more than one past the end of the buffer.
        if (src.cend () - cursor <= len)
            throw malformed_error {};

        // get the simple ANSI case out of the way
        if (!len) {
            dst.push_back (c);
            continue;
        }

        // start with the payload bits of the lead byte, then append six bits
        // from each continuation byte after checking it has the `10` prefix.
        codepoint_t accum = c & ~TESTS[len].mask;

        for (int i = 1; i <= len; ++i) {
            const auto tail = std::to_integer<codepoint_t> (cursor[i]);

            if ((tail & 0b11'000000u) != 0b10'000000u)
                throw malformed_error {};

            accum = (accum << 6u) | (tail & 0b00'111111u);
        }

        if (!(accum & LEVEL_MASK[len]))
            throw overlong_error {};

        // utf16 surrogates should not be present in utf8
        if (accum >= 0xD800 && accum <= 0xDFFF)
            throw illegal_codepoint {};

        // reject the noncharacters U+FFFE and U+FFFF. note the BOM itself is
        // U+FEFF; U+FFFE is what a byte-swapped BOM reads as.
        if (accum == 0xfffe || accum == 0xffff)
            throw illegal_codepoint {};

        // a four byte sequence can carry 21 bits, but nothing above U+10FFFF
        // is a valid codepoint.
        if (accum > MAX_CODEPOINT)
            throw illegal_codepoint {};

        dst.push_back (accum);

        // skip the continuation bytes; the loop increment skips the lead byte
        std::advance (cursor, len);
    }

    return dst;
}