analyse: avoid copying the chunk data for duplicate detection

Danny Robson 2020-12-29 08:37:32 +10:00
parent 00dc05ac8f
commit b5e93c15fb

@@ -220,18 +220,22 @@ int main (int argc, char const **argv)
         return accum + rhs.size ();
     });
+    // WARNING: this is destructive, but suits our purposes for the moment as
+    // we intend to remove the reliance on in-memory storage as much as we can.
+    auto const init_size = src.size ();
+    src.erase(
+        std::unique (
+            src.begin (),
+            src.end (),
+            region_equal
+        ),
+        src.end ()
+    );
-    std::vector<emory::chunk::region> unique;
-    std::unique_copy (
+    auto const unique_bytes = std::accumulate (
         src.begin (),
         src.end (),
-        std::back_inserter (unique),
-        region_equal
-    );
-    auto const unique_bytes = std::accumulate (
-        unique.begin (),
-        unique.end (),
-        0, [] (auto const accum, auto const rhs) { return accum + rhs.size (); }
+        0, [] (auto const &accum, auto const &rhs) { return accum + rhs.size (); }
     );
     auto const duplicated_bytes = total_bytes - unique_bytes;
@@ -244,5 +248,5 @@ int main (int argc, char const **argv)
         100.f * duplicated_fraction
     );
-    fmt::print ("{} duplicates\n", src.size () - unique.size ());
+    fmt::print ("{} duplicates\n", init_size - src.size ());
 }
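
For context, the change swaps a copying std::unique_copy pass for the in-place erase-unique idiom. Below is a minimal, self-contained sketch of that idiom; the region struct (with equality on size ()) is a hypothetical stand-in for emory::chunk::region and its real region_equal predicate, and std::printf replaces fmt::print to avoid the dependency.

    #include <algorithm>  // std::unique
    #include <cstddef>    // std::size_t
    #include <cstdio>     // std::printf
    #include <numeric>    // std::accumulate
    #include <vector>

    // Hypothetical stand-in for emory::chunk::region: only size () matters here.
    struct region {
        std::size_t length;
        std::size_t size () const { return length; }
    };

    int main () {
        // Entries must be ordered so equal regions are adjacent;
        // std::unique only collapses consecutive duplicates.
        std::vector<region> src { {4}, {4}, {8}, {8}, {8}, {16} };
        auto const region_equal = [] (region const &a, region const &b) {
            return a.size () == b.size ();
        };

        auto const init_size = src.size ();

        // In-place variant: std::unique moves the first element of each run
        // of equal entries to the front and returns the new logical end;
        // erase then drops the leftover tail. No second vector is populated.
        src.erase (std::unique (src.begin (), src.end (), region_equal), src.end ());

        auto const unique_bytes = std::accumulate (
            src.begin (), src.end (), std::size_t {0},
            [] (auto const &accum, auto const &rhs) { return accum + rhs.size (); }
        );

        std::printf ("%zu duplicates, %zu unique bytes\n",
                     init_size - src.size (), unique_bytes);
    }

Compared with unique_copy into a side vector, this allocates nothing extra and copies no chunk records, at the cost of mutating src itself, which is the trade-off the WARNING comment in the diff calls out.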