diff --git a/CMakeLists.txt b/CMakeLists.txt index 0632b365..43aff927 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -303,6 +303,7 @@ list ( json2/personality/rfc7519.hpp json2/tree.cpp json2/tree.hpp + kmeans.hpp library.hpp log.cpp log.hpp @@ -509,6 +510,7 @@ if (TESTS) job/queue json_types json2/event + kmeans maths maths/fast matrix diff --git a/kmeans.hpp b/kmeans.hpp new file mode 100644 index 00000000..bb2d2fed --- /dev/null +++ b/kmeans.hpp @@ -0,0 +1,68 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Copyright 2018 Danny Robson + */ + +#pragma once + +#include "debug.hpp" +#include "iterator.hpp" +#include "point.hpp" + +#include + +namespace util { + // a simplistic implementation of Lloyd's algorithm + // + // returns index of the closest output for each input + template + std::vector + kmeans (util::view src, util::view dst) + { + CHECK_GE (src.size (), dst.size ()); + + using coord_t = typename std::iterator_traits::value_type; + const int iterations = 100; + + std::vector means (src.begin (), src.begin () + dst.size ()); + std::vector accum (dst.size ()); + std::vector count (dst.size ()); + std::vector closest (src.size ()); + + for (auto i = 0; i < iterations; ++i) { + std::fill (std::begin (accum), std::end (accum), 0); + std::fill (std::begin (count), std::end (count), 0); + + for (auto const& [j,p]: util::izip (src)) { + size_t bucket = 0; + + for (size_t k = 1; k < dst.size (); ++k) { + if (norm2 (p - means[k]) < norm2 (p - means[bucket])) + bucket = k; + } + + accum[bucket] += p; + count[bucket] += 1; + closest[j] = bucket; + } + + for (size_t j = 0; j < dst.size (); ++j) + means[j] = accum[j] / count[j]; + } + + std::copy (std::begin (means), std::end (means), std::begin (dst)); + + return closest; + } +} diff --git a/test/kmeans.cpp b/test/kmeans.cpp new file mode 100644 index 00000000..ceff7bac --- /dev/null +++ b/test/kmeans.cpp @@ -0,0 +1,35 @@ +#include "tap.hpp" + +#include "kmeans.hpp" + +#include + + +/////////////////////////////////////////////////////////////////////////////// +int +main () +{ + util::TAP::logger tap; + + // create one point and check it 'converges' to this one point + { + const std::array p { {{1,2,3}} }; + std::array q; + + util::kmeans (util::view{p}, util::view{q}); + tap.expect_eq (p, q, "single point, single k"); + } + + // create two vectors, check if the mean converges to their average + { + const std::array p {{ + {1}, {2} + }}; + std::array q; + + util::kmeans (util::view{p}, util::view{q}); + tap.expect_eq (q[0], (p[0]+p[1])/2, "two point, single k"); + } + + return tap.status (); +} \ No newline at end of file