kmeans: add naive kmeans impl

2018-04-18 21:48:24 +10:00 · 2018-04-18 21:48:24 +10:00 · f31a344912
commit f31a344912
parent dcd789a075
3 changed files with 105 additions and 0 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -303,6 +303,7 @@ list (
    json2/personality/rfc7519.hpp
    json2/tree.cpp
    json2/tree.hpp
+    kmeans.hpp
    library.hpp
    log.cpp
    log.hpp
@ -509,6 +510,7 @@ if (TESTS)
        job/queue
        json_types
        json2/event
+        kmeans
        maths
        maths/fast
        matrix
--- a/kmeans.hpp
+++ b/kmeans.hpp
@ -0,0 +1,68 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Copyright 2018 Danny Robson <danny@nerdcruft.net>
+ */
+
+#pragma once
+
+#include "debug.hpp"
+#include "iterator.hpp"
+#include "point.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <iterator>
+#include <vector>
+
+namespace util {
+    // a simplistic implementation of Lloyd's algorithm
+    //
+    // the number of clusters is `dst.size ()`. the means are seeded with the
+    // first `dst.size ()` elements of `src` and then refined for at most 100
+    // iterations, stopping early once an iteration leaves every assignment
+    // unchanged (further iterations could not alter the result).
+    //
+    // src: the input coordinates to cluster; must hold at least as many
+    //      elements as dst (checked with CHECK_GE).
+    // dst: receives the final cluster means.
+    //
+    // returns index of the closest output for each input. if `dst` is empty
+    // there is no valid index to report and an empty vector is returned.
+    //
+    // a cluster that attracts no inputs during an iteration (eg, because of
+    // duplicated seed points) keeps its previous mean rather than being
+    // divided by a zero count.
+    template <typename OutputT, typename InputT>
+    std::vector<size_t>
+    kmeans (util::view<InputT> src, util::view<OutputT> dst)
+    {
+        CHECK_GE (src.size (), dst.size ());
+
+        // with zero clusters the assignment loop below would index into
+        // empty `means`/`accum`/`count` vectors, so bail out up front.
+        if (dst.size () == 0)
+            return {};
+
+        using coord_t = typename std::iterator_traits<InputT>::value_type;
+        const int iterations = 100;
+
+        std::vector<coord_t> means (src.begin (), src.begin () + dst.size ());
+        std::vector<coord_t> accum (dst.size ());
+        std::vector<size_t> count (dst.size ());
+        std::vector<size_t> closest (src.size ());
+
+        for (auto i = 0; i < iterations; ++i) {
+            std::fill (std::begin (accum), std::end (accum), 0);
+            std::fill (std::begin (count), std::end (count), 0);
+
+            // `closest` only holds placeholder zeros before the first pass,
+            // so the first pass must never be treated as converged.
+            bool changed = i == 0;
+
+            for (auto const& [j,p]: util::izip (src)) {
+                // find the nearest mean, remembering the best distance so
+                // it isn't recomputed for every candidate. ties keep the
+                // lowest index because the comparison is strict.
+                size_t bucket = 0;
+                auto best = norm2 (p - means[0]);
+
+                for (size_t k = 1; k < dst.size (); ++k) {
+                    const auto dist = norm2 (p - means[k]);
+                    if (dist < best) {
+                        best = dist;
+                        bucket = k;
+                    }
+                }
+
+                accum[bucket] += p;
+                count[bucket] += 1;
+
+                if (closest[j] != bucket) {
+                    closest[j] = bucket;
+                    changed = true;
+                }
+            }
+
+            // identical assignments would reproduce identical means, so the
+            // current means are already the fixed point.
+            if (!changed)
+                break;
+
+            for (size_t j = 0; j < dst.size (); ++j) {
+                // leave the mean of an empty cluster untouched; dividing by
+                // a zero count would corrupt it.
+                if (count[j] != 0)
+                    means[j] = accum[j] / count[j];
+            }
+        }
+
+        std::copy (std::begin (means), std::end (means), std::begin (dst));
+
+        return closest;
+    }
+}
--- a/test/kmeans.cpp
+++ b/test/kmeans.cpp
@ -0,0 +1,35 @@
+#include "tap.hpp"
+
+#include "kmeans.hpp"
+
+#include <cruft/util/point.hpp>
+
+
+///////////////////////////////////////////////////////////////////////////////
+int
+main ()
+{
+    util::TAP::logger tap;
+
+    // degenerate case: one sample and one cluster. the resulting mean is
+    // expected to compare equal to the sample itself.
+    {
+        const std::array<util::point3f,1> samples { {{1,2,3}} };
+        std::array<util::point3f,1> centres;
+
+        util::kmeans (util::view{samples}, util::view{centres});
+        tap.expect_eq (samples, centres, "single point, single k");
+    }
+
+    // two samples sharing a single cluster: the resulting mean is expected
+    // to equal the average of the two samples.
+    {
+        const std::array<util::vector3f,2> samples {{
+            {1}, {2}
+        }};
+        std::array<util::vector3f,1> centres;
+
+        util::kmeans (util::view{samples}, util::view{centres});
+
+        const auto expected = (samples[0] + samples[1]) / 2;
+        tap.expect_eq (centres[0], expected, "two point, single k");
+    }
+
+    return tap.status ();
+}