libcruft-util/kmeans.hpp
Danny Robson e2b55c7ee5 kmeans: use norm as default metric
this avoids relying on 'point' being the coordinate type
2018-04-24 11:53:58 +10:00

83 lines
2.5 KiB
C++

/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Copyright 2018 Danny Robson <danny@nerdcruft.net>
*/
#pragma once
#include "debug.hpp"
#include "iterator.hpp"
#include "point.hpp"

#include <algorithm>
#include <cstddef>
#include <iterator>
#include <utility>
#include <vector>
namespace util {
// a simplistic implementation of Lloyd's algorithm
//
// returns index of the closest output for each input
template <typename OutputT, typename InputT, typename FunctionT>
std::vector<size_t>
kmeans (util::view<InputT> src, util::view<OutputT> dst, FunctionT const &&metric)
{
CHECK_GE (src.size (), dst.size ());
using coord_t = typename std::iterator_traits<InputT>::value_type;
const int iterations = 100;
std::vector<coord_t> means (src.begin (), src.begin () + dst.size ());
std::vector<coord_t> accum (dst.size ());
std::vector<size_t> count (dst.size ());
std::vector<size_t> closest (src.size ());
for (auto i = 0; i < iterations; ++i) {
std::fill (std::begin (accum), std::end (accum), 0);
std::fill (std::begin (count), std::end (count), 0);
for (auto const& [j,p]: util::izip (src)) {
size_t bucket = 0;
for (size_t k = 1; k < dst.size (); ++k) {
if (metric (p, means[k]) < metric (p, means[bucket]))
bucket = k;
}
accum[bucket] += p;
count[bucket] += 1;
closest[j] = bucket;
}
for (size_t j = 0; j < dst.size (); ++j)
means[j] = accum[j] / count[j];
}
std::copy (std::begin (means), std::end (means), std::begin (dst));
return closest;
}
// Convenience overload of kmeans that supplies a default metric: the
// `norm` of the difference between the two values being compared.
//
// `src` and `dst` are forwarded unchanged to the three-argument overload,
// and whatever that overload returns is returned from here.
template <typename OutputT, typename InputT>
auto
kmeans (InputT &&src, OutputT &&dst)
{
    // `norm` is deliberately unqualified so that it is looked up against
    // the type produced by the subtraction.
    auto separation = [] (auto lhs, auto rhs) {
        return norm (lhs - rhs);
    };

    // The metric is handed over as an rvalue, matching how a lambda
    // written directly in the argument list would be passed.
    return kmeans (
        std::forward<InputT>  (src),
        std::forward<OutputT> (dst),
        std::move (separation)
    );
}
}