Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cleanup unit_ivf_pq_index.cc #548

Merged
merged 2 commits into from
Oct 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/include/api/ivf_pq_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ class IndexIVFPQ {
}

uint64_t nlist() const override {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we also rename the API-level function from nlist to partitions?

Copy link
Contributor Author

@jparismorgan jparismorgan Oct 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, good call. I did this in a follow-up PR: #551

return impl_index_.nlist();
return impl_index_.partitions();
}

uint32_t num_subspaces() const override {
Expand Down
2 changes: 1 addition & 1 deletion src/include/index/ivf_pq_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -1415,7 +1415,7 @@ class ivf_pq_index {
return reassign_ratio_;
}

uint64_t nlist() const {
uint64_t partitions() const {
return num_partitions_;
}

Expand Down
86 changes: 28 additions & 58 deletions src/include/test/unit_ivf_pq_index.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,17 +65,6 @@ struct dummy_pq_index {
}
};

// Debug helper: dumps the index's flat IVF centroid matrix to stdout.
//
// Emits a "Debug Centroids:" banner, then one output line per row index of
// the matrix returned by index.get_flat_ivf_centroids(), with every value
// followed by a single space, and finally a blank line.
//
// `index` may be any type exposing get_flat_ivf_centroids(), whose result
// provides num_rows(), num_cols() and a two-argument operator().
void debug_flat_ivf_centroids(const auto& index) {
  // Thin accessor so the matrix is fetched afresh on every use; decltype(auto)
  // forwards whatever the getter returns (reference or value) unchanged.
  const auto centroids = [&index]() -> decltype(auto) {
    return index.get_flat_ivf_centroids();
  };

  // Banner plus blank line; std::endl also flushes the stream.
  std::cout << "\nDebug Centroids:\n" << std::endl;

  for (size_t row = 0; row < centroids().num_rows(); ++row) {
    for (size_t col = 0; col < centroids().num_cols(); ++col) {
      std::cout << centroids()(row, col) << " ";
    }
    // Terminate (and flush) the line for this row.
    std::cout << std::endl;
  }

  std::cout << std::endl;
}

TEST_CASE("construct different types", "[ivf_pq_index]") {
ivf_pq_index<int8_t, uint32_t, uint32_t> index1{};
ivf_pq_index<uint8_t, uint32_t, uint32_t> index2{};
Expand Down Expand Up @@ -212,9 +201,6 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") {
1e-4);
index.set_flat_ivf_centroids(sklearn_centroids);
index.train_ivf(training_data, kmeans_init::none);
if (debug) {
debug_flat_ivf_centroids(index);
}
}

SECTION("two iterations") {
Expand All @@ -229,10 +215,6 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") {
1e-4);
index.set_flat_ivf_centroids(sklearn_centroids);
index.train_ivf(training_data, kmeans_init::none);
if (debug) {
debug_flat_ivf_centroids(index);
}
// debug_centroids(index);
}

SECTION("five iterations") {
Expand All @@ -247,10 +229,6 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") {
1e-4);
index.set_flat_ivf_centroids(sklearn_centroids);
index.train_ivf(training_data, kmeans_init::none);
if (debug) {
debug_flat_ivf_centroids(index);
}
// debug_centroids(index);
}

SECTION("five iterations, perturbed") {
Expand All @@ -272,10 +250,6 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") {
1e-4);
index.set_flat_ivf_centroids(sklearn_centroids);
index.train_ivf(training_data, kmeans_init::none);
if (debug) {
debug_flat_ivf_centroids(index);
}
// debug_centroids(index);
}

SECTION("five iterations") {
Expand All @@ -289,16 +263,12 @@ TEST_CASE("debug w/ sk", "[ivf_pq_index]") {
5,
1e-4);
index.train_ivf(training_data, kmeans_init::random);
if (debug) {
debug_flat_ivf_centroids(index);
}
// debug_centroids(index);
}
}

TEST_CASE("ivf_index write and read", "[ivf_pq_index]") {
size_t dimension = 128;
size_t nlist = 100;
size_t partitions = 100;
uint32_t num_subspaces = 16;
uint32_t max_iterations = 4;
size_t nprobe = 10;
Expand All @@ -318,7 +288,7 @@ TEST_CASE("ivf_index write and read", "[ivf_pq_index]") {
std::vector<siftsmall_ids_type> ids(num_vectors(training_set));
std::iota(begin(ids), end(ids), 0);
auto idx = ivf_pq_index<float, uint32_t, uint32_t>(
nlist, num_subspaces, max_iterations);
partitions, num_subspaces, max_iterations);
CHECK(idx.num_vectors() == 0);
idx.train_ivf(training_set, kmeans_init::kmeanspp);
idx.add(training_set, ids);
Expand Down Expand Up @@ -406,16 +376,16 @@ TEMPLATE_TEST_CASE(
hypercube4(j + 9, i) = hypercube1(j, i);
}
}
SECTION("nlist = 1") {
SECTION("partitions = 1") {
size_t k_nn = 6;
size_t nlist = 1;
size_t partitions = 1;

auto ivf_idx2 = ivf_pq_index<TestType, uint32_t, uint32_t>(
/*128,*/ nlist, 2, 4, 1.e-4); // dim nlist maxiter eps nthreads
/*128,*/ partitions, 2, 4, 1.e-4); // dim partitions maxiter eps nthreads
ivf_idx2.train_ivf(hypercube2);
ivf_idx2.add(hypercube2, ids);
auto ivf_idx4 = ivf_pq_index<TestType, uint32_t, uint32_t>(
/*128,*/ nlist, 2, 4, 1.e-4);
/*128,*/ partitions, 2, 4, 1.e-4);
ivf_idx4.train_ivf(hypercube4);
ivf_idx4.add(hypercube4, ids);

Expand Down Expand Up @@ -477,12 +447,12 @@ TEMPLATE_TEST_CASE(

TEST_CASE("build index and infinite query in place", "[ivf_pq_index]") {
tiledb::Context ctx;
// size_t nlist = GENERATE(1, 100);
size_t nlist = 20;
// size_t partitions = GENERATE(1, 100);
size_t partitions = 20;
using s = siftsmall_test_init_defaults;
using index = ivf_pq_index<s::feature_type, s::id_type, s::px_type>;

auto init = siftsmall_test_init<index>(ctx, nlist, 16);
auto init = siftsmall_test_init<index>(ctx, partitions, 16);

auto&& [nprobe, k_nn, nthreads, max_iterations, convergence_tolerance] =
std::tie(
Expand Down Expand Up @@ -581,9 +551,9 @@ TEST_CASE("query empty index", "[ivf_pq_index]") {
tiledb::VFS vfs(ctx);
size_t num_vectors = 0;
uint64_t dimensions = 10;
size_t nlist = 1;
size_t partitions = 1;
auto index = ivf_pq_index<siftsmall_feature_type, siftsmall_ids_type>(
nlist, dimensions / 2);
partitions, dimensions / 2);
auto queries =
ColMajorMatrix<siftsmall_feature_type>{{{1, 1, 1, 1, 1, 1, 1, 1, 1, 1}}};

Expand All @@ -599,7 +569,7 @@ TEST_CASE("query empty index", "[ivf_pq_index]") {
// We can query an empty index.
{
size_t k_nn = 1;
auto&& [scores, ids] = index.query(queries, k_nn, nlist);
auto&& [scores, ids] = index.query(queries, k_nn, partitions);
CHECK(_cpo::num_vectors(scores) == _cpo::num_vectors(queries));
CHECK(_cpo::num_vectors(ids) == _cpo::num_vectors(queries));
CHECK(_cpo::dimensions(scores) == k_nn);
Expand All @@ -625,7 +595,7 @@ TEST_CASE("query empty index", "[ivf_pq_index]") {
ctx, ivf_index_uri);
CHECK(index_infinite.num_vectors() == num_vectors);
size_t k_nn = 1;
auto&& [scores, ids] = index_infinite.query(queries, k_nn, nlist);
auto&& [scores, ids] = index_infinite.query(queries, k_nn, partitions);
CHECK(_cpo::num_vectors(scores) == _cpo::num_vectors(queries));
CHECK(_cpo::num_vectors(ids) == _cpo::num_vectors(queries));
CHECK(_cpo::dimensions(scores) == k_nn);
Expand All @@ -640,7 +610,7 @@ TEST_CASE("query empty index", "[ivf_pq_index]") {
ctx, ivf_index_uri, IndexLoadStrategy::PQ_OOC, upper_bound);
CHECK(index_finite.num_vectors() == num_vectors);
size_t k_nn = 1;
auto&& [scores, ids] = index_finite.query(queries, k_nn, nlist, 9);
auto&& [scores, ids] = index_finite.query(queries, k_nn, partitions, 9);
CHECK(_cpo::num_vectors(scores) == _cpo::num_vectors(queries));
CHECK(_cpo::num_vectors(ids) == _cpo::num_vectors(queries));
CHECK(_cpo::dimensions(scores) == k_nn);
Expand All @@ -656,7 +626,7 @@ TEST_CASE("query simple", "[ivf_pq_index]") {

size_t num_vectors = 4;
uint64_t dimensions = 4;
size_t nlist = 1;
size_t partitions = 1;
uint32_t num_subspaces = 2;
uint32_t max_iterations = 1;
float convergence_tolerance = 0.000025f;
Expand All @@ -665,7 +635,7 @@ TEST_CASE("query simple", "[ivf_pq_index]") {
using feature_type = float;
using id_type = uint32_t;
auto index = ivf_pq_index<feature_type, id_type>(
nlist,
partitions,
num_subspaces,
max_iterations,
convergence_tolerance,
Expand All @@ -676,7 +646,7 @@ TEST_CASE("query simple", "[ivf_pq_index]") {
(std::filesystem::temp_directory_path() / "ivf_index").string();

CHECK(index.num_vectors() == 0);
CHECK(index.nlist() == nlist);
CHECK(index.partitions() == partitions);

// We can train, add, query, and then write the index.
{
Expand All @@ -689,7 +659,7 @@ TEST_CASE("query simple", "[ivf_pq_index]") {
CHECK(index.num_vectors() == ::num_vectors(training));

size_t k_nn = 1;
size_t nprobe = nlist;
size_t nprobe = partitions;
for (int i = 1; i <= 4; ++i) {
auto value = static_cast<feature_type>(i);
auto queries =
Expand Down Expand Up @@ -722,7 +692,7 @@ TEST_CASE("query simple", "[ivf_pq_index]") {
CHECK(index2->num_vectors() == 4);

size_t k_nn = 1;
size_t nprobe = nlist;
size_t nprobe = partitions;
for (int i = 1; i <= 4; ++i) {
auto value = static_cast<feature_type>(i);
auto queries =
Expand All @@ -744,13 +714,13 @@ TEST_CASE("k_factor", "[ivf_pq_index]") {

size_t num_vectors = 500;
uint64_t dimensions = 4;
size_t nlist = 4;
size_t partitions = 4;
uint32_t num_subspaces = 1;
uint32_t max_iterations = 1;
float convergence_tolerance = 0.000025f;
float reassign_ratio = 0.09f;

size_t nprobe = nlist;
size_t nprobe = partitions;
size_t k_nn = 40;
float k_factor = 2.f;
size_t upper_bound = 350;
Expand All @@ -759,7 +729,7 @@ TEST_CASE("k_factor", "[ivf_pq_index]") {
using feature_type = float;
using id_type = uint32_t;
auto index = ivf_pq_index<feature_type, id_type>(
nlist,
partitions,
num_subspaces,
max_iterations,
convergence_tolerance,
Expand All @@ -769,7 +739,7 @@ TEST_CASE("k_factor", "[ivf_pq_index]") {
auto ivf_index_uri =
(std::filesystem::temp_directory_path() / "ivf_index").string();
CHECK(index.num_vectors() == 0);
CHECK(index.nlist() == nlist);
CHECK(index.partitions() == partitions);

// We can train, add, query, and then write the index.
std::vector<id_type> ids(num_vectors);
Expand Down Expand Up @@ -876,7 +846,7 @@ TEST_CASE("ivf_pq_index query index written twice", "[ivf_pq_index]") {
auto id_type = "uint32";
auto partitioning_index_type = "uint32";
uint64_t dimensions = 3;
size_t n_list = 1;
size_t partitions = 1;
uint32_t num_subspaces = 1;
uint32_t max_iterations = 3;

Expand All @@ -885,7 +855,7 @@ TEST_CASE("ivf_pq_index query index written twice", "[ivf_pq_index]") {
auto index = ivf_pq_index<
feature_type_type,
id_type_type,
partitioning_index_type_type>(n_list, dimensions / 2);
partitioning_index_type_type>(partitions, dimensions / 2);
auto data =
ColMajorMatrixWithIds<feature_type_type, id_type_type>(dimensions, 0);
index.train(data, data.raveled_ids());
Expand Down Expand Up @@ -932,7 +902,7 @@ TEST_CASE("ivf_pq_index query index written twice", "[ivf_pq_index]") {
auto queries = ColMajorMatrix<feature_type_type>{
{{1, 1, 1}, {2, 2, 2}, {3, 3, 3}, {4, 4, 4}}};

auto&& [scores, ids] = index->query(queries, 1, n_list, 5);
auto&& [scores, ids] = index->query(queries, 1, partitions, 5);
CHECK(std::equal(
scores.data(),
scores.data() + 4,
Expand All @@ -947,12 +917,12 @@ TEST_CASE("pq encoding has no error with <= 256 vectors", "[ivf_pq_index]") {
using id_type = uint32_t;
using partitioning_index_type = uint32_t;

size_t n_list = 16;
size_t partitions = 16;
size_t dimensions = 300;
uint32_t num_subspaces = dimensions;
for (auto num_vectors : std::vector<size_t>{255, 256, 257}) {
auto index = ivf_pq_index<feature_type, id_type, partitioning_index_type>(
n_list, num_subspaces);
partitions, num_subspaces);
std::vector<std::vector<feature_type>> vectors(num_vectors);
std::vector<id_type> vector_ids(num_vectors);
for (int i = 0; i < num_vectors; ++i) {
Expand Down
Loading