diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index f825f3d40..a92764e71 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04, ubicloud-standard-4-arm, macos-13] - postgres: [12, 13, 14, 15, 16] + postgres: [12, 13, 14, 15, 16, 17] steps: - uses: actions/checkout@v4 with: @@ -53,7 +53,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04, ubicloud-standard-4-arm, macos-13] - postgres: [12, 13, 14, 15, 16] + postgres: [12, 13, 14, 15, 16, 17] steps: - uses: actions/checkout@v4 with: diff --git a/.github/workflows/publish-docker.yaml b/.github/workflows/publish-docker.yaml index 442d0afaa..a089a5f2d 100644 --- a/.github/workflows/publish-docker.yaml +++ b/.github/workflows/publish-docker.yaml @@ -24,6 +24,7 @@ jobs: fail-fast: false matrix: include: + - postgres: 17 - postgres: 16 - postgres: 15 - postgres: 14 diff --git a/.github/workflows/sanitizer-build-and-test.yaml b/.github/workflows/sanitizer-build-and-test.yaml index b10ffb9b8..f66be6a58 100644 --- a/.github/workflows/sanitizer-build-and-test.yaml +++ b/.github/workflows/sanitizer-build-and-test.yaml @@ -56,7 +56,7 @@ jobs: fail-fast: false matrix: os: ["ubuntu-22.04"] - pg: ["12.16", "13.12", "14.9", "15.4", "16.0"] + pg: ["12.16", "13.12", "14.9", "15.4", "16.0", "17.0"] steps: - name: Enable UBSan if this is a release if: ${{ github.event_name == 'release' }} diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 7d00f8bb3..b63455c08 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -22,7 +22,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04, ubicloud-standard-4-arm, macos-13] - postgres: [12, 13, 14, 15, 16] + postgres: [12, 13, 14, 15, 16, 17] steps: - uses: actions/checkout@v4 with: @@ -148,7 +148,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-22.04, ubicloud-standard-4-arm] - postgres: [15] + postgres: [15, 17] steps: - uses: actions/checkout@v4 with: @@ -177,6 +177,7 @@ jobs: run: | cargo install cargo-pgrx --version 0.12.7 cargo pgrx init "--pg$PG_VERSION" /usr/bin/pg_config + sed -i -e "s/default = .*/default=[\"pg${PG_VERSION}\"]/" lantern_extras/Cargo.toml RUSTFLAGS="--cfg profile=\"ci-build\"" cargo pgrx install --sudo --pg-config /usr/bin/pg_config --package lantern_extras env: PG_VERSION: ${{ matrix.postgres }} @@ -186,8 +187,10 @@ jobs: PG_VERSION: ${{ matrix.postgres }} - name: Setup permissions run: | - sudo chmod 777 -R /usr/lib/postgresql/15/lib/ - sudo chmod 777 -R /usr/share/postgresql/15/extension/ + sudo chmod 777 -R "/usr/lib/postgresql/${PG_VERSION}/lib/" + sudo chmod 777 -R "/usr/share/postgresql/${PG_VERSION}/extension/" + env: + PG_VERSION: ${{ matrix.postgres }} - name: Run tests run: cargo llvm-cov --workspace --lcov --output-path lantern-extras-lcov.info env: @@ -196,7 +199,7 @@ jobs: DB_URL: "postgres://postgres@127.0.0.1:5432/postgres" - name: Upload lantern_extras coverage uses: actions/upload-artifact@v4 - if: ${{ startsWith(matrix.os, 'ubuntu') }} + if: ${{ startsWith(matrix.os, 'ubuntu') && matrix.postgres == 15}} with: name: lantern-extras-lcov.info path: ./lantern-extras-lcov.info diff --git a/README.md b/README.md index a6ea99e82..f0e091784 100644 --- a/README.md +++ b/README.md @@ -106,19 +106,9 @@ FROM small_world ORDER BY vector <-> ARRAY[0,0,0] LIMIT 1; ### A note on operators and operator classes -Lantern supports several distance functions in the index and it has 2 modes for operators: +Lantern supports several distance functions in the index -1. `lantern.pgvector_compat=TRUE` (default) - In this mode there are 3 operators available `<->` (l2sq), `<=>` (cosine), `<+>` (hamming). - - Note that in this mode, you need to use right operator in order to trigger an index scan. - -2. `lantern.pgvector_compat=FALSE` - In this mode you only need to specify the distance function used for a column at index creation time. Lantern will automatically infer the distance function to use for search so you always use `` operator in search queries. - - Note that in this mode, the operator `` is intended exclusively for use with index lookups. If you expect to not use the index in a query, use the distance function directly (e.g. `l2sq_dist(v1, v2)`) - -> To switch between modes set `lantern.pgvector_compat` variable to `TRUE` or `FALSE`. +There are 3 operators available `<->` (l2sq), `<=>` (cosine), `<+>` (hamming). There are four defined operator classes that can be employed during index creation: diff --git a/ci/scripts/run-tests-linux.sh b/ci/scripts/run-tests-linux.sh index 0c5d9e3f9..26acff89f 100755 --- a/ci/scripts/run-tests-linux.sh +++ b/ci/scripts/run-tests-linux.sh @@ -20,17 +20,6 @@ function run_pgvector_tests(){ pushd /tmp/pgvector # Add lantern to load-extension in pgregress sed -i '/REGRESS_OPTS \=/ s/$/ --load-extension lantern/' Makefile - - # Set pgvector_compat flag in test files - for file in ./test/sql/*; do - echo 'SET lantern.pgvector_compat=TRUE;' | cat - $file > temp && mv temp $file - done - - # Set pgvector_compat flag in result files - for file in ./test/expected/*.out; do - echo 'SET lantern.pgvector_compat=TRUE;' | cat - $file > temp && mv temp $file - done - # Run tests make installcheck popd diff --git a/ci/scripts/utils.sh b/ci/scripts/utils.sh index 31008b393..de15df758 100644 --- a/ci/scripts/utils.sh +++ b/ci/scripts/utils.sh @@ -9,7 +9,7 @@ function setup_environment() { export GITHUB_OUTPUT=${GITHUB_OUTPUT:-/dev/null} export PGVECTOR_VERSION=0.7.4-lanterncloud #fix pg_cron at the latest commit of the time - export PG_CRON_COMMIT_SHA=7e91e72b1bebc5869bb900d9253cc9e92518b33f + export PG_CRON_COMMIT_SHA=9490f9cc9803f75105f2f7d89839a998f011f8d8 } function setup_rust() { diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 371a2041b..e12b9d0cd 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,7 +1,7 @@ ARG VERSION=15 ARG PGVECTOR_VERSION=0.5.1 #fix pg_cron at the latest commit of the time -ARG PG_CRON_COMMIT_SHA=7e91e72b1bebc5869bb900d9253cc9e92518b33f +ARG PG_CRON_COMMIT_SHA=9490f9cc9803f75105f2f7d89839a998f011f8d8 # If you want to build the base image for different versions use Dockerfile.pg # To use GDB inside container run docker like this: diff --git a/lantern_extras/Cargo.toml b/lantern_extras/Cargo.toml index ccd47c043..df5429d66 100644 --- a/lantern_extras/Cargo.toml +++ b/lantern_extras/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "lantern_extras" -version = "0.4.1" +version = "0.4.2" edition = "2021" [lib] @@ -8,12 +8,13 @@ crate-type = ["cdylib", "lib"] doctest = false [features] -default = ["pg15"] +default = ["pg17"] pg12 = ["pgrx/pg12", "pgrx-tests/pg12"] pg13 = ["pgrx/pg13", "pgrx-tests/pg13"] pg14 = ["pgrx/pg14", "pgrx-tests/pg14"] pg15 = ["pgrx/pg15", "pgrx-tests/pg15"] pg16 = ["pgrx/pg16", "pgrx-tests/pg16"] +pg17 = ["pgrx/pg17", "pgrx-tests/pg17"] pg_test = [] [dependencies] diff --git a/lantern_extras/src/lib.rs b/lantern_extras/src/lib.rs index e613f46ad..5f1af1ec3 100644 --- a/lantern_extras/src/lib.rs +++ b/lantern_extras/src/lib.rs @@ -133,7 +133,7 @@ pub mod pg_test { pub fn postgresql_conf_options() -> Vec<&'static str> { vec![ - "shared_preload_libraries='lantern_extras.so'", + "shared_preload_libraries='lantern_extras'", "lantern_extras.daemon_databases='pgrx_tests'", "lantern_extras.enable_daemon=true", ] diff --git a/lantern_hnsw/CMakeLists.txt b/lantern_hnsw/CMakeLists.txt index 5b0e4ee8a..ca087bef2 100644 --- a/lantern_hnsw/CMakeLists.txt +++ b/lantern_hnsw/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.3) include(CheckSymbolExists) -set(LANTERN_VERSION 0.4.1) +set(LANTERN_VERSION 0.4.2) project( LanternDB @@ -267,6 +267,7 @@ set (_update_files sql/updates/0.3.3--0.3.4.sql sql/updates/0.3.4--0.4.0.sql sql/updates/0.4.0--0.4.1.sql + sql/updates/0.4.1--0.4.2.sql ) # Generate version information for the binary diff --git a/lantern_hnsw/scripts/integration_tests.py b/lantern_hnsw/scripts/integration_tests.py index 35fab0031..5d4911c36 100644 --- a/lantern_hnsw/scripts/integration_tests.py +++ b/lantern_hnsw/scripts/integration_tests.py @@ -46,7 +46,6 @@ def primary(): node.init() node.append_conf("enable_seqscan = off") node.append_conf("maintenance_work_mem = '1GB'") - node.append_conf("lantern.pgvector_compat=FALSE") node.append_conf("checkpoint_timeout = '100min'") node.append_conf("min_wal_size = '1GB'") node.append_conf("checkpoint_completion_target = '0.9'") @@ -158,7 +157,6 @@ def generic_vector_query( dist_with_function = f"{distance_metric}_dist(v, ({query_vector}))" dist_with_concrete_op = f"v {DIST_OPS[distance_metric]} ({query_vector})" - dist_with_generic_op = f"v ({query_vector})" query_generator = ( lambda order_by: f""" @@ -173,8 +171,6 @@ def generic_vector_query( return query_generator(dist_with_function) elif kind == "concrete": return query_generator(dist_with_concrete_op) - elif kind == "generic": - return query_generator(dist_with_generic_op) @pytest.mark.parametrize("distance_metric", ["l2sq", "cos"], scope="session") @@ -197,9 +193,6 @@ def test_selects(db, setup_copy_table_with_index, distance_metric, quant_bits, r concrete_op_query = generic_vector_query( table_name, distance_metric, "concrete", query_vector_id=q_vec_id ) - generic_op_query = generic_vector_query( - table_name, distance_metric, "generic", query_vector_id=q_vec_id - ) exact_explain_query = f"EXPLAIN {exact_query}" exact_plan = primary.execute("testdb", exact_explain_query) @@ -214,7 +207,7 @@ def test_selects(db, setup_copy_table_with_index, distance_metric, quant_bits, r q_vec_id == exact_res[0][0] ), "First result in exact query result should be the query vector" - for query in [generic_op_query, concrete_op_query]: + for query in [concrete_op_query]: explain_query = f"EXPLAIN {query}" plan = primary.execute("testdb", explain_query) assert f"Index Scan using idx_{table_name}" in str( @@ -348,9 +341,6 @@ def test_inserts(setup_copy_table_with_index, distance_metric, quant_bits, reque concrete_op_query = generic_vector_query( table_name, distance_metric, "concrete", query_vector_id=q_vec_id ) - generic_op_query = generic_vector_query( - table_name, distance_metric, "generic", query_vector_id=q_vec_id - ) exact_explain_query = f"EXPLAIN {exact_query}" for db in [primary, replica]: @@ -367,7 +357,7 @@ def test_inserts(setup_copy_table_with_index, distance_metric, quant_bits, reque exact_res[0][0] in inserted_vector_orig_ids[q_vec_id] ), "First result in exact query result should be the query vector" - for query in [generic_op_query, concrete_op_query]: + for query in [concrete_op_query]: explain_query = f"EXPLAIN {query}" plan = db.execute("testdb", explain_query) assert f"Index Scan using idx_{table_name}" in str( diff --git a/lantern_hnsw/scripts/test_updates.py b/lantern_hnsw/scripts/test_updates.py index 36d3e7cf3..fba007065 100644 --- a/lantern_hnsw/scripts/test_updates.py +++ b/lantern_hnsw/scripts/test_updates.py @@ -49,7 +49,8 @@ def __repr__(self): return self.version INCOMPATIBLE_VERSIONS = { - '16': [Version('0.0.4')] + '16': [Version('0.0.4')], + '17': [Version('0.3.0'), Version('0.3.1'), Version('0.3.2'), Version('0.3.3'), Version('0.3.4'), Version('0.4.0'), Version('0.4.1')], } def shell(cmd, exit_on_error=True): diff --git a/lantern_hnsw/sql/lantern.sql b/lantern_hnsw/sql/lantern.sql index ab115bb57..14cdd1625 100644 --- a/lantern_hnsw/sql/lantern.sql +++ b/lantern_hnsw/sql/lantern.sql @@ -924,3 +924,16 @@ BEGIN RETURN jsonb_pretty(_lantern_internal.mask_order_by_in_plan(explain_output)); END $$ LANGUAGE plpgsql; +-- Get vector type oid +CREATE FUNCTION _lantern_internal.get_vector_type_oid() RETURNS OID AS $$ +DECLARE + type_oid OID; +BEGIN + type_oid := (SELECT pg_type.oid FROM pg_type + JOIN pg_depend ON pg_type.oid = pg_depend.objid + JOIN pg_extension ON pg_depend.refobjid = pg_extension.oid + WHERE typname='vector' AND extname='vector' + LIMIT 1); + RETURN COALESCE(type_oid, 0); +END; +$$ LANGUAGE plpgsql; diff --git a/lantern_hnsw/sql/updates/0.4.1--0.4.2.sql b/lantern_hnsw/sql/updates/0.4.1--0.4.2.sql new file mode 100644 index 000000000..814c13723 --- /dev/null +++ b/lantern_hnsw/sql/updates/0.4.1--0.4.2.sql @@ -0,0 +1,13 @@ +-- Get vector type oid +CREATE FUNCTION _lantern_internal.get_vector_type_oid() RETURNS OID AS $$ +DECLARE + type_oid OID; +BEGIN + type_oid := (SELECT pg_type.oid FROM pg_type + JOIN pg_depend ON pg_type.oid = pg_depend.objid + JOIN pg_extension ON pg_depend.refobjid = pg_extension.oid + WHERE typname='vector' AND extname='vector' + LIMIT 1); + RETURN COALESCE(type_oid, 0); +END; +$$ LANGUAGE plpgsql; diff --git a/lantern_hnsw/src/hnsw.c b/lantern_hnsw/src/hnsw.c index 40921e6a8..846b80009 100644 --- a/lantern_hnsw/src/hnsw.c +++ b/lantern_hnsw/src/hnsw.c @@ -347,9 +347,7 @@ static float8 vector_dist(Vector *a, Vector *b, usearch_metric_kind_t metric_kin PGDLLEXPORT PG_FUNCTION_INFO_V1(ldb_generic_dist); Datum ldb_generic_dist(PG_FUNCTION_ARGS) { - if(ldb_pgvector_compat) { - elog(ERROR, "Operator can only be used when lantern.pgvector_compat=FALSE"); - } + elog(ERROR, "Operator is deprecated. Please explicitly use the operator that matches your distance function."); PG_RETURN_NULL(); } @@ -452,7 +450,7 @@ HnswColumnType GetColumnTypeFromOid(Oid oid) if(oid == FLOAT4ARRAYOID) { return REAL_ARRAY; - } else if(oid == TypenameGetTypid("vector")) { + } else if(oid == TypenameGetVectorTypid()) { return VECTOR; } else if(oid == INT4ARRAYOID) { return INT_ARRAY; diff --git a/lantern_hnsw/src/hnsw/build.c b/lantern_hnsw/src/hnsw/build.c index 7c3e75bcf..33672dc40 100644 --- a/lantern_hnsw/src/hnsw/build.c +++ b/lantern_hnsw/src/hnsw/build.c @@ -455,7 +455,7 @@ static void BuildIndexCleanup(ldb_HnswBuildState *buildstate) buildstate->external_socket->close(buildstate->external_socket); } - if(buildstate->index_file_fd != -1) { + if(buildstate->index_file_fd > 0) { // index_file_fd will only exist when we mmap the index file to memory if(!buildstate->external && buildstate->index_buffer) { int munmap_ret = munmap(buildstate->index_buffer, buildstate->index_buffer_size); diff --git a/lantern_hnsw/src/hnsw/options.c b/lantern_hnsw/src/hnsw/options.c index 855f3aabd..be4d346ad 100644 --- a/lantern_hnsw/src/hnsw/options.c +++ b/lantern_hnsw/src/hnsw/options.c @@ -15,9 +15,6 @@ #include // RelationData #include -#include "../hooks/executor_start.h" -#include "../hooks/post_parse.h" - // We import this header file // to access the op class support function pointers #include "../hnsw.h" @@ -54,10 +51,6 @@ int ldb_external_index_port; char *ldb_external_index_host; bool ldb_external_index_secure; -// if this variable is set to true -// our operator rewriting hooks will be disabled -bool ldb_pgvector_compat; - // this variable is only set during testing and controls whether // certain elog() calls are made // see ldb_dlog() definition and callsites for details @@ -229,8 +222,6 @@ static void ldb_wait_for_gdb(int sig) */ void _PG_init(void) { - (void)CheckExtensionVersions(); - if(process_shared_preload_libraries_in_progress) { elog(WARNING, "LanternDB HNSW index extension loaded inside shared_preload_libraries." @@ -368,17 +359,6 @@ void _PG_init(void) NULL, NULL); - DefineCustomBoolVariable("lantern.pgvector_compat", - "Whether or not the operator <-> should automatically detect the right distance function", - "set this to 1 to disable operator rewriting hooks", - &ldb_pgvector_compat, - true, - PGC_USERSET, - 0, - NULL, - NULL, - NULL); - DefineCustomIntVariable("lantern.external_index_port", "Port for external indexing", "Change this value if you run lantern daemon on different port", @@ -419,30 +399,8 @@ void _PG_init(void) MarkGUCPrefixReserved("_lantern_internal"); #endif - original_post_parse_analyze_hook = post_parse_analyze_hook; - original_ExecutorStart_hook = ExecutorStart_hook; - - post_parse_analyze_hook = post_parse_analyze_hook_with_operator_check; - ExecutorStart_hook = ExecutorStart_hook_with_operator_check; - #ifndef NDEBUG signal(SIGSEGV, ldb_wait_for_gdb); signal(SIGABRT, ldb_wait_for_gdb); #endif } - -// Called with extension unload. -void _PG_fini(void) -{ - // Return back the original hook value. - // This check is because there might be case if while we stop the hooks (in pgvector_compat mode) - // Another extension will be loaded and it will overwrite the hooks - // And when lantern extension will be unloaded it will set the hooks to original values - // Overwriting the current changed hooks set by another extension - if(ExecutorStart_hook == ExecutorStart_hook_with_operator_check) { - ExecutorStart_hook = original_ExecutorStart_hook; - } - if(post_parse_analyze_hook == post_parse_analyze_hook_with_operator_check) { - post_parse_analyze_hook = original_post_parse_analyze_hook; - } -} diff --git a/lantern_hnsw/src/hnsw/options.h b/lantern_hnsw/src/hnsw/options.h index edc3613e6..bdd4fc197 100644 --- a/lantern_hnsw/src/hnsw/options.h +++ b/lantern_hnsw/src/hnsw/options.h @@ -78,7 +78,6 @@ bytea* ldb_amoptions(Datum reloptions, bool validate); extern int ldb_hnsw_init_k; extern int ldb_hnsw_ef_search; extern bool ldb_is_test; -extern bool ldb_pgvector_compat; extern int ldb_external_index_port; extern char* ldb_external_index_host; extern bool ldb_external_index_secure; diff --git a/lantern_hnsw/src/hnsw/utils.c b/lantern_hnsw/src/hnsw/utils.c index 637851d45..94f67652f 100644 --- a/lantern_hnsw/src/hnsw/utils.c +++ b/lantern_hnsw/src/hnsw/utils.c @@ -2,7 +2,10 @@ #include "utils.h" +#include #include +#include +#include #include #include #include @@ -10,6 +13,8 @@ #include #include #include +#include +#include #if PG_VERSION_NUM >= 130000 #include @@ -271,3 +276,25 @@ usearch_metric_kind_t GetMetricKindFromStr(char *metric_kind_str) elog(ERROR, "Unsupported metric kind: %s . Should be one of (l2sq, cos, hamming)", metric_kind_str); } + +/* + * We are not using existing TypenameGetTypid because after Postgres 17 + * The maintenance operations have restricted search_path for namepsaces (pg_catalog, pg_temp) + * Thus if the type will be installed in public schema, it will not be able to find the type + * Here we will call SQL function defined in lantern.sql file, which will lookup pg_type relation + */ +Oid TypenameGetVectorTypid() +{ + Oid function_oid = GetSysCacheOid(PROCNAMEARGSNSP, + Anum_pg_proc_oid, + CStringGetDatum("get_vector_type_oid"), + PointerGetDatum(buildoidvector(NULL, 0)), + ObjectIdGetDatum(get_namespace_oid("_lantern_internal", false)), + 0); + + if(!OidIsValid(function_oid)) { + elog(ERROR, "Please update lantern extension"); + } + + return DatumGetObjectId(OidFunctionCall0(function_oid)); +} diff --git a/lantern_hnsw/src/hnsw/utils.h b/lantern_hnsw/src/hnsw/utils.h index 9d8248472..24ee23c96 100644 --- a/lantern_hnsw/src/hnsw/utils.h +++ b/lantern_hnsw/src/hnsw/utils.h @@ -17,6 +17,7 @@ void CheckExtensionVersions(); uint32 EstimateRowCount(Relation heap); int32 GetColumnAttributeNumber(Relation rel, const char *columnName); usearch_metric_kind_t GetMetricKindFromStr(char *metric_kind_str); +Oid TypenameGetVectorTypid(); // hoping to throw the error via an assertion, if those are on, before elog(ERROR)-ing as a last resort // We prefer Assert() because this function is used in contexts where the stack contains non-POD types diff --git a/lantern_hnsw/src/hooks/executor_start.c b/lantern_hnsw/src/hooks/executor_start.c deleted file mode 100644 index 53778a92f..000000000 --- a/lantern_hnsw/src/hooks/executor_start.c +++ /dev/null @@ -1,109 +0,0 @@ -#include - -#include "executor_start.h" - -#include -#include -#include -#include -#include -#include - -#include "../hnsw/options.h" -#include "../hnsw/utils.h" -#include "op_rewrite.h" -#include "plan_tree_walker.h" -#include "utils.h" - -ExecutorStart_hook_type original_ExecutorStart_hook = NULL; - -typedef struct -{ - List *oidList; - bool isIndexScan; -} OperatorUsedCorrectlyContext; - -static bool operator_used_incorrectly_walker(Node *node, void *context) -{ - OperatorUsedCorrectlyContext *context_typed = (OperatorUsedCorrectlyContext *)context; - if(node == NULL) return false; - if(IsA(node, IndexScan)) { - context_typed->isIndexScan = true; - bool status = plan_tree_walker((Plan *)node, operator_used_incorrectly_walker, context); - context_typed->isIndexScan = false; - return status; - } - if(IsA(node, OpExpr)) { - OpExpr *opExpr = (OpExpr *)node; - if(list_member_oid(context_typed->oidList, opExpr->opno) && !context_typed->isIndexScan) { - return true; - } - } - if(IsA(node, List)) { - List *list = (List *)node; - ListCell *lc; - foreach(lc, list) { - if(operator_used_incorrectly_walker(lfirst(lc), context)) return true; - } - return false; - } - - if(is_plan_node(node)) { - return plan_tree_walker((Plan *)node, operator_used_incorrectly_walker, (void *)context); - } else { - return expression_tree_walker(node, operator_used_incorrectly_walker, (void *)context); - } - return false; -} - -static void validate_operator_usage(Plan *plan, List *oidList) -{ - OperatorUsedCorrectlyContext context; - context.oidList = oidList; - context.isIndexScan = false; - if(operator_used_incorrectly_walker((Node *)plan, (void *)&context)) { - elog(ERROR, "Operator can only be used inside of an index"); - } -} - -void ExecutorStart_hook_with_operator_check(QueryDesc *queryDesc, int eflags) -{ - if(ldb_pgvector_compat) { - if(original_ExecutorStart_hook) { - original_ExecutorStart_hook(queryDesc, eflags); - } else { - standard_ExecutorStart(queryDesc, eflags); - } - return; - } - - if(creating_extension) { - // this is true in only CREATE EXTENSION and ALTER EXTENSION UPDATE commands - // these statements are guaranteed to not use our operators and state necessary - // to run our hooks is not ready anyway so it would be wrong to run this - elog(DEBUG2, "Skipping executor start hook for CREATE EXTENSION ... statement"); - standard_ExecutorStart(queryDesc, eflags); - return; - } - - List *oidList = ldb_get_operator_oids(); - if(oidList != NULL) { - // oidList will be NULL if LanternDB extension is not fully initialized - // e.g. in statements executed as a result of CREATE EXTENSION ... statement - ldb_rewrite_ops(queryDesc->plannedstmt->planTree, oidList, queryDesc->plannedstmt->rtable); - validate_operator_usage(queryDesc->plannedstmt->planTree, oidList); - ListCell *lc; - foreach(lc, queryDesc->plannedstmt->subplans) { - Plan *subplan = (Plan *)lfirst(lc); - ldb_rewrite_ops(subplan, oidList, queryDesc->plannedstmt->rtable); - validate_operator_usage(subplan, oidList); - } - list_free(oidList); - } - - if(original_ExecutorStart_hook) { - original_ExecutorStart_hook(queryDesc, eflags); - } else { - standard_ExecutorStart(queryDesc, eflags); - } -} diff --git a/lantern_hnsw/src/hooks/executor_start.h b/lantern_hnsw/src/hooks/executor_start.h deleted file mode 100644 index 73ee5879b..000000000 --- a/lantern_hnsw/src/hooks/executor_start.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef LDB_HOOKS_EXECUTOR_START_H -#define LDB_HOOKS_EXECUTOR_START_H - -#include - -#include - -extern ExecutorStart_hook_type original_ExecutorStart_hook; - -void ExecutorStart_hook_with_operator_check(QueryDesc *queryDesc, int eflags); - -#endif // LDB_HOOKS_EXECUTOR_START_H \ No newline at end of file diff --git a/lantern_hnsw/src/hooks/op_rewrite.c b/lantern_hnsw/src/hooks/op_rewrite.c deleted file mode 100644 index 3ad91c1bf..000000000 --- a/lantern_hnsw/src/hooks/op_rewrite.c +++ /dev/null @@ -1,284 +0,0 @@ -#include - -#include "op_rewrite.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "plan_tree_walker.h" -#include "utils.h" - -#if PG_VERSION_NUM < 120000 -#include -#include -#else -#include -#endif - -static Node *operator_rewriting_mutator(Node *node, void *ctx); - -void base_plan_mutator(Plan *plan, void *context) -{ - plan->lefttree = (Plan *)operator_rewriting_mutator((Node *)plan->lefttree, context); - plan->righttree = (Plan *)operator_rewriting_mutator((Node *)plan->righttree, context); - plan->initPlan = (List *)operator_rewriting_mutator((Node *)plan->initPlan, context); - // checking qual and target list at the end covers some edge cases, if you modify this leave them here - plan->qual = (List *)operator_rewriting_mutator((Node *)plan->qual, context); - plan->targetlist = (List *)operator_rewriting_mutator((Node *)plan->targetlist, context); -} - -// recursively descend the plan tree searching for expressions with the <-> operator that are part of a non-index scan -// src/include/nodes/plannodes.h and src/include/nodes/nodes.h contain relevant definitions -Node *plan_tree_mutator(Plan *plan, void *context) -{ - check_stack_depth(); - - switch(nodeTag(plan)) { - case T_SubqueryScan: - { - SubqueryScan *subqueryscan = (SubqueryScan *)plan; - base_plan_mutator(&(subqueryscan->scan.plan), context); - subqueryscan->subplan = (Plan *)operator_rewriting_mutator((Node *)subqueryscan->subplan, context); - return (Node *)subqueryscan; - } - case T_CteScan: - { - CteScan *ctescan = (CteScan *)plan; - base_plan_mutator(&(ctescan->scan.plan), context); - return (Node *)ctescan; - } -#if PG_VERSION_NUM < 160000 - case T_Join: - { - Join *join = (Join *)plan; - base_plan_mutator(&(join->plan), context); - join->joinqual = (List *)operator_rewriting_mutator((Node *)join->joinqual, context); - return (Node *)join; - } -#endif - case T_NestLoop: - { - NestLoop *nestloop = (NestLoop *)plan; - base_plan_mutator((Plan *)&(nestloop->join), context); - return (Node *)nestloop; - } - case T_Result: - { - Result *result = (Result *)plan; - base_plan_mutator(&(result->plan), context); - result->resconstantqual = operator_rewriting_mutator((Node *)result->resconstantqual, context); - return (Node *)result; - } - case T_Limit: - { - Limit *limit = (Limit *)plan; - base_plan_mutator(&(limit->plan), context); - limit->limitOffset = operator_rewriting_mutator((Node *)limit->limitOffset, context); - limit->limitCount = operator_rewriting_mutator((Node *)limit->limitCount, context); - return (Node *)limit; - } - case T_Append: - { - Append *append = (Append *)plan; - base_plan_mutator(&(append->plan), context); - append->appendplans = (List *)operator_rewriting_mutator((Node *)append->appendplans, context); - return (Node *)append; - } - // case T_IncrementalSort: // We will eventually support this - case T_Agg: - case T_Group: - case T_Sort: - case T_Unique: - case T_SetOp: - case T_Hash: - case T_HashJoin: - case T_WindowAgg: - case T_LockRows: - { - base_plan_mutator(plan, context); - return (Node *)plan; - } - case T_ModifyTable: // No order by when modifying a table (update/delete etc) - case T_BitmapAnd: // We do not provide a bitmap index - case T_BitmapOr: - case T_BitmapHeapScan: - case T_BitmapIndexScan: - case T_FunctionScan: // SELECT * FROM fn(x, y, z) - case T_ValuesScan: // VALUES (1), (2) - case T_Material: // https://stackoverflow.com/questions/31410030/ -#if PG_VERSION_NUM >= 140000 - case T_Memoize: // memoized inner loop must have an index to be memoized -#endif - case T_WorkTableScan: // temporary table, shouldn't have index - case T_ProjectSet: // "execute set returning functions" feels safe to exclude - case T_TableFuncScan: // scan of a function that returns a table, shouldn't have an index - case T_ForeignScan: // if the relation is foreign we can't determine if it has an index - default: - break; - } - return (Node *)plan; -} - -// To write syscache calls look for the 'static const struct cachedesc cacheinfo[]' in utils/cache/syscache.c -// These describe the different caches that will be initialized into SysCache and the keys they support in searches -// The anums tell you the table and the column that the key will be compared to this is afaict the only way to match -// them to SQL for example pg_am.oid -> Anum_pg_am_oid the keys must be in order but they need not all be included the -// comment next to the top label is the name of the #defined cacheid that you should use as your first argument you can -// destructure the tuple int a From_(table_name) with GETSTRUCT to pull individual rows out -static Oid get_func_id_from_index(Relation index) -{ - Oid hnswamoid = get_index_am_oid("lantern_hnsw", false); - if(index->rd_rel->relam != hnswamoid) return InvalidOid; - - // indclass is inaccessible on the form data - // https://www.postgresql.org/docs/current/system-catalog-declarations.html - bool isNull; - Oid idxopclassoid; - Datum classDatum = SysCacheGetAttr(INDEXRELID, index->rd_indextuple, Anum_pg_index_indclass, &isNull); - if(!isNull) { - oidvector *indclass = (oidvector *)DatumGetPointer(classDatum); - assert(indclass->dim1 == 1); - idxopclassoid = indclass->values[ 0 ]; - } else { - index_close(index, AccessShareLock); - elog(ERROR, "Failed to retrieve indclass oid from index class"); - } - - // SELECT * FROM pg_opclass WHERE opcmethod=hnswamoid AND opcname=dist_cos_ops - HeapTuple opclassTuple = SearchSysCache1(CLAOID, ObjectIdGetDatum(idxopclassoid)); - if(!HeapTupleIsValid(opclassTuple)) { - index_close(index, AccessShareLock); - elog(ERROR, "Failed to find operator class for key column"); - } - - Oid opclassOid = ((Form_pg_opclass)GETSTRUCT(opclassTuple))->opcfamily; - ReleaseSysCache(opclassTuple); - - // SELECT * FROM pg_amproc WHERE amprocfamily=opclassOid - // SearchSysCache1 is what we want and in fact it runs fine against release builds. However debug builds assert that - // AMPROCNUM takes only 1 arg which isn't true and so they fail. We therefore have to use SearchSysCacheList1 since - // it doesn't enforce this invariant. Ideally we would call SearchCatCache1 directly but postgres doesn't expose - // necessary constants - CatCList *opList = SearchSysCacheList1(AMPROCNUM, ObjectIdGetDatum(opclassOid)); - HeapTuple opTuple = &opList->members[ 0 ]->tuple; - if(!HeapTupleIsValid(opTuple)) { - index_close(index, AccessShareLock); - elog(ERROR, "Failed to find the function for operator class"); - } - Oid functionId = ((Form_pg_amproc)GETSTRUCT(opTuple))->amproc; - ReleaseCatCacheList(opList); - - return functionId; -} - -static Node *operator_rewriting_mutator(Node *node, void *ctx) -{ - OpRewriterContext *context = (OpRewriterContext *)ctx; - - if(node == NULL) return node; - - if(IsA(node, OpExpr)) { - OpExpr *opExpr = (OpExpr *)node; - if(list_member_oid(context->ldb_ops, opExpr->opno)) { - if(context->indices == NULL) { - return node; - } else { - ListCell *lc; - foreach(lc, context->indices) { - uintptr_t intermediate = (uintptr_t)lfirst(lc); - Oid indexid = (Oid)intermediate; - Relation index = index_open(indexid, AccessShareLock); - Oid indexfunc = get_func_id_from_index(index); - if(OidIsValid(indexfunc)) { - MemoryContext old = MemoryContextSwitchTo(PortalContext); - FuncExpr *fnExpr = makeNode(FuncExpr); - fnExpr->funcresulttype = opExpr->opresulttype; - fnExpr->funcretset = opExpr->opretset; - fnExpr->funccollid = opExpr->opcollid; - fnExpr->inputcollid = opExpr->inputcollid; - fnExpr->args = opExpr->args; - fnExpr->location = opExpr->location; - // operators can't take variadic arguments - fnExpr->funcvariadic = false; - // print it as a function - fnExpr->funcformat = COERCE_EXPLICIT_CALL; - fnExpr->funcid = indexfunc; - MemoryContextSwitchTo(old); - - index_close(index, AccessShareLock); - - return (Node *)fnExpr; - } - index_close(index, AccessShareLock); - } - return node; - } - } - } - - if(IsA(node, IndexScan) || IsA(node, IndexOnlyScan)) { - return node; - } - if(IsA(node, SeqScan) || IsA(node, SampleScan)) { - Scan *scan = (Scan *)node; - Plan *scanPlan = &scan->plan; - Oid rtrelid = scan->scanrelid; - RangeTblEntry *rte = rt_fetch(rtrelid, context->rtable); - Oid relid = rte->relid; - Relation rel = relation_open(relid, AccessShareLock); - if(rel->rd_indexvalid) { - context->indices = RelationGetIndexList(rel); - } - relation_close(rel, AccessShareLock); - - base_plan_mutator(scanPlan, context); - return (Node *)scan; - } - - if(IsA(node, List)) { - MemoryContext old = MemoryContextSwitchTo(PortalContext); - List *list = (List *)node; - List *ret = NIL; - ListCell *lc; - foreach(lc, list) { - ret = lappend(ret, operator_rewriting_mutator((Node *)lfirst(lc), ctx)); - } - MemoryContextSwitchTo(old); - return (Node *)ret; - } - - if(is_plan_node(node)) { - return (Node *)plan_tree_mutator((Plan *)node, ctx); - } else { - return expression_tree_mutator(node, operator_rewriting_mutator, ctx); - } -} - -bool ldb_rewrite_ops(Plan *plan, List *oidList, List *rtable) -{ - Node *node = (Node *)plan; - - OpRewriterContext context; - context.ldb_ops = oidList; - context.indices = NULL; - context.rtable = rtable; - - if(IsA(node, IndexScan) || IsA(node, IndexOnlyScan)) { - return false; - } - - operator_rewriting_mutator(node, (void *)&context); - return true; -} diff --git a/lantern_hnsw/src/hooks/op_rewrite.h b/lantern_hnsw/src/hooks/op_rewrite.h deleted file mode 100644 index 8db3a04e7..000000000 --- a/lantern_hnsw/src/hooks/op_rewrite.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef LDB_HOOKS_OP_REWRITE_H -#define LDB_HOOKS_OP_REWRITE_H - -#include -#include - -typedef struct OpRewriterContext -{ - List *ldb_ops; - List *indices; - List *rtable; -} OpRewriterContext; - -bool ldb_rewrite_ops(Plan *plan, List *oidList, List *rtable); -#endif diff --git a/lantern_hnsw/src/hooks/plan_tree_walker.c b/lantern_hnsw/src/hooks/plan_tree_walker.c deleted file mode 100644 index fabafc624..000000000 --- a/lantern_hnsw/src/hooks/plan_tree_walker.c +++ /dev/null @@ -1,158 +0,0 @@ -#include - -#include "plan_tree_walker.h" - -#include -#include -#include -#include -#include - -bool base_plan_walker(Plan *plan, bool (*walker_func)(Node *plan, void *context), void *context) -{ - /* - If there is a need to debug this function, follow the steps below: - 0. Add the following as the default branch in plan_tree_walker - default: - { - ldb_dlog("plan_tree_walker: unsupported plan node type: %d", nodeTag(plan)); - return false; - } - This will print all nodes that are not explicitly handled by the walker. - Currently there are several such nodes which probably means there are more - latent issues here. - 1. Attach gdb to the postgres process - 2. Set a breakpoint at the function entry - 3. navitate through relevant paths via gdb - 4. debug print Plan* nodes via - p (char*) nodeToString(plan); - - Note: for non-trivial Plan* nodes you may need to run: - set print elements 0 - in gdb to make sure the node string is not truncated. - */ - if(walker_func((Node *)plan->targetlist, context)) return true; - if(walker_func((Node *)plan->qual, context)) return true; - if(walker_func((Node *)plan->lefttree, context)) return true; - if(walker_func((Node *)plan->righttree, context)) return true; - if(walker_func((Node *)plan->initPlan, context)) return true; - return false; -} - -bool plan_tree_walker(Plan *plan, bool (*walker_func)(Node *plan, void *context), void *context) -{ - check_stack_depth(); - - switch(nodeTag(plan)) { - case T_SeqScan: - { - SeqScan *seqscan = (SeqScan *)plan; -#if PG_VERSION_NUM >= 150000 - Plan seqscanplan = seqscan->scan.plan; -#else - Plan seqscanplan = seqscan->plan; -#endif - if(base_plan_walker(&seqscanplan, walker_func, context)) return true; - break; - } - case T_IndexScan: - { - IndexScan *indexscan = (IndexScan *)plan; - if(base_plan_walker(&(indexscan->scan.plan), walker_func, context)) return true; - if(walker_func((Node *)indexscan->indexqual, context)) return true; - if(walker_func((Node *)indexscan->indexorderby, context)) return true; - break; - } - case T_IndexOnlyScan: - { - IndexOnlyScan *indexonlyscan = (IndexOnlyScan *)plan; - if(base_plan_walker(&(indexonlyscan->scan.plan), walker_func, context)) return true; - if(walker_func((Node *)indexonlyscan->indexqual, context)) return true; - if(walker_func((Node *)indexonlyscan->indexorderby, context)) return true; - break; - } - case T_SubqueryScan: - { - SubqueryScan *subqueryscan = (SubqueryScan *)plan; - if(base_plan_walker(&(subqueryscan->scan.plan), walker_func, context)) return true; - if(walker_func((Node *)subqueryscan->subplan, context)) return true; - break; - } - case T_CteScan: - { - CteScan *ctescan = (CteScan *)plan; - if(base_plan_walker(&(ctescan->scan.plan), walker_func, context)) return true; - break; - } -#if PG_VERSION_NUM < 160000 - case T_Join: - { - Join *join = (Join *)plan; - if(base_plan_walker(&(join->plan), walker_func, context)) return true; - if(walker_func((Node *)join->joinqual, context)) return true; - break; - } -#endif - case T_Agg: - { - Agg *agg = (Agg *)plan; - if(base_plan_walker(&(agg->plan), walker_func, context)) return true; - break; - } - case T_Group: - { - Group *group = (Group *)plan; - if(base_plan_walker(&(group->plan), walker_func, context)) return true; - break; - } - case T_Sort: - { - Sort *sort = (Sort *)plan; - if(base_plan_walker(&(sort->plan), walker_func, context)) return true; - break; - } - case T_Unique: - { - Unique *unique = (Unique *)plan; - if(base_plan_walker(&(unique->plan), walker_func, context)) return true; - break; - } - case T_NestLoop: - { - NestLoop *nestloop = (NestLoop *)plan; - if(base_plan_walker((Plan *)&(nestloop->join), walker_func, context)) return true; - break; - } - case T_Result: - { - Result *result = (Result *)plan; - if(base_plan_walker(&(result->plan), walker_func, context)) return true; - if(walker_func((Node *)result->resconstantqual, context)) return true; - break; - } - case T_Limit: - { - Limit *limit = (Limit *)plan; - if(base_plan_walker(&(limit->plan), walker_func, context)) return true; - if(walker_func((Node *)limit->limitOffset, context)) return true; - if(walker_func((Node *)limit->limitCount, context)) return true; - break; - } - case T_Append: - { - Append *append = (Append *)plan; - if(base_plan_walker(&(append->plan), walker_func, context)) return true; - if(walker_func((Node *)append->appendplans, context)) return true; - break; - } - case T_Material: - { - Material *material = (Material *)plan; - if(base_plan_walker(&(material->plan), walker_func, context)) return true; - break; - } - default: - return false; - } - return false; -} diff --git a/lantern_hnsw/src/hooks/plan_tree_walker.h b/lantern_hnsw/src/hooks/plan_tree_walker.h deleted file mode 100644 index 03f885c43..000000000 --- a/lantern_hnsw/src/hooks/plan_tree_walker.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef LDB_HOOKS_PLAN_TREE_WALKER_H -#define LDB_HOOKS_PLAN_TREE_WALKER_H - -#include - -#include -#include - -static inline bool is_plan_node(Node *node) -{ -#if PG_VERSION_NUM >= 160000 - return nodeTag(node) >= T_Result && nodeTag(node) <= T_PlanInvalItem; -#else - return nodeTag(node) >= T_Plan && nodeTag(node) < T_PlanState; -#endif -} - -bool plan_tree_walker(Plan *plan, bool (*walker_func)(Node *node, void *context), void *context); - -#endif // LDB_HOOKS_PLAN_TREE_WALKER_H diff --git a/lantern_hnsw/src/hooks/post_parse.c b/lantern_hnsw/src/hooks/post_parse.c deleted file mode 100644 index 7c27d1ce8..000000000 --- a/lantern_hnsw/src/hooks/post_parse.c +++ /dev/null @@ -1,189 +0,0 @@ -#include - -#include "post_parse.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../hnsw/options.h" -#include "utils.h" - -post_parse_analyze_hook_type original_post_parse_analyze_hook = NULL; - -typedef struct -{ - List *oidList; -} OperatorUsedContext; - -static bool operator_used_walker(Node *node, OperatorUsedContext *context) -{ - if(node == NULL) return false; - if(IsA(node, Query)) return query_tree_walker((Query *)node, operator_used_walker, (void *)context, 0); - if(IsA(node, OpExpr)) { - OpExpr *opExpr = (OpExpr *)node; - if(list_member_oid(context->oidList, opExpr->opno)) { - return true; - } - } - return expression_tree_walker(node, operator_used_walker, (void *)context); -} - -static bool is_operator_used(Node *node, List *oidList) -{ - OperatorUsedContext context; - context.oidList = oidList; - return operator_used_walker(node, &context); -} - -typedef struct -{ - List *sortGroupRefs; -} SortGroupRefContext; - -static bool sort_group_ref_walker(Node *node, SortGroupRefContext *context) -{ - if(node == NULL) return false; - if(IsA(node, Query)) { - Query *query = (Query *)node; - ListCell *lc; - foreach(lc, query->sortClause) { - SortGroupClause *sortGroupClause = (SortGroupClause *)lfirst(lc); - context->sortGroupRefs = lappend_int(context->sortGroupRefs, sortGroupClause->tleSortGroupRef); - } - return query_tree_walker((Query *)node, sort_group_ref_walker, (void *)context, 0); - } - return expression_tree_walker(node, sort_group_ref_walker, (void *)context); -} - -static List *get_sort_group_refs(Node *node) -{ - SortGroupRefContext context; - context.sortGroupRefs = NIL; - sort_group_ref_walker(node, &context); - return context.sortGroupRefs; -} - -typedef struct -{ - List *oidList; - List *sortGroupRefs; - bool usedCorrectly; -} OperatorUsedCorrectlyContext; - -static bool is_var_or_func_of_vars(Node *node) -{ - if(IsA(node, Var)) { - return true; - } else if(IsA(node, FuncExpr)) { - List *args = ((FuncExpr *)node)->args; - ListCell *cell; - foreach(cell, args) { - if(is_var_or_func_of_vars(lfirst(cell))) { - return true; - } - } - } - return false; -} - -static bool operator_used_incorrectly_walker(Node *node, OperatorUsedCorrectlyContext *context) -{ - if(node == NULL) return false; - if(IsA(node, Query)) return query_tree_walker((Query *)node, operator_used_incorrectly_walker, (void *)context, 0); - if(IsA(node, TargetEntry)) { - TargetEntry *te = (TargetEntry *)node; - if(te->resjunk && list_member_int(context->sortGroupRefs, te->ressortgroupref)) { - if(IsA(te->expr, OpExpr)) { - OpExpr *opExpr = (OpExpr *)te->expr; - if(list_member_oid(context->oidList, opExpr->opno)) { - Node *arg1 = (Node *)linitial(opExpr->args); - Node *arg2 = (Node *)lsecond(opExpr->args); - bool isVar1 = IsA(arg1, Var); - bool isVar2 = IsA(arg2, Var); - /* There is a case when operator is used with index - * that was created via expression (CREATE INDEX ON t USING hnsw (func(id)) WITH (M=2)) - * in this case the query may look like this - * SELECT id FROM test ORDER BY func(id) <-> ARRAY[0,0,0] LIMIT 2 - * or like this - * SELECT id FROM test ORDER BY func(id) <-> func(n) LIMIT 2 - * we should check if IsA(arg1, FuncExpr) || IsA(arg2, FuncExpr) - * if true we may go and check the oid of function result to see if it is an array type - * we also can check that the argument of FuncExpr is at least one of the arg1 and arg2 - * will contain column of the table (e.g iterate over list and check IsA(arg, Var)) - * so the function will not be called with constant arguments on both sides - */ - if(isVar1 && isVar2) { - return false; - } else if(isVar1 && !isVar2) { - return operator_used_incorrectly_walker(arg2, context); - } else if(!isVar1 && isVar2) { - return operator_used_incorrectly_walker(arg1, context); - } else { - bool isFuncOfVars1 = is_var_or_func_of_vars(arg1); - bool isFuncOfVars2 = is_var_or_func_of_vars(arg2); - if(!isFuncOfVars1 && !isFuncOfVars2) { - return true; - } else { - return operator_used_incorrectly_walker(arg1, context) - || operator_used_incorrectly_walker(arg2, context); - } - } - } - } - } - } - if(IsA(node, OpExpr)) { - OpExpr *opExpr = (OpExpr *)node; - if(list_member_oid(context->oidList, opExpr->opno)) { - return true; - } - } - - return expression_tree_walker(node, operator_used_incorrectly_walker, (void *)context); -} - -static bool is_operator_used_incorrectly(Node *node, List *oidList, List *sortGroupRefs) -{ - OperatorUsedCorrectlyContext context; - context.oidList = oidList; - context.sortGroupRefs = sortGroupRefs; - return operator_used_incorrectly_walker(node, &context); -} - -void post_parse_analyze_hook_with_operator_check(ParseState *pstate, - Query *query -#if PG_VERSION_NUM >= 140000 - , - JumbleState *jstate -#endif -) -{ - if(original_post_parse_analyze_hook) { -#if PG_VERSION_NUM >= 140000 - original_post_parse_analyze_hook(pstate, query, jstate); -#else - original_post_parse_analyze_hook(pstate, query); -#endif - } - - if(ldb_pgvector_compat || creating_extension) { - return; - } - - List *oidList = ldb_get_operator_oids(); - Node *query_as_node = (Node *)query; - if(is_operator_used(query_as_node, oidList)) { - List *sort_group_refs = get_sort_group_refs(query_as_node); - if(is_operator_used_incorrectly(query_as_node, oidList, sort_group_refs)) { - elog(ERROR, "Operator is invalid outside of ORDER BY context"); - } - list_free(sort_group_refs); - } - list_free(oidList); -} diff --git a/lantern_hnsw/src/hooks/post_parse.h b/lantern_hnsw/src/hooks/post_parse.h deleted file mode 100644 index 67ecfb45d..000000000 --- a/lantern_hnsw/src/hooks/post_parse.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef LDB_HOOKS_POST_PARSE_H -#define LDB_HOOKS_POST_PARSE_H - -#include - -#include -#include - -extern post_parse_analyze_hook_type original_post_parse_analyze_hook; - -void post_parse_analyze_hook_with_operator_check(ParseState *pstate, - Query *query -#if PG_VERSION_NUM >= 140000 - , - JumbleState *jstate -#endif -); - -#endif // LDB_HOOKS_POST_PARSE_H \ No newline at end of file diff --git a/lantern_hnsw/src/hooks/utils.c b/lantern_hnsw/src/hooks/utils.c deleted file mode 100644 index c46b98dc0..000000000 --- a/lantern_hnsw/src/hooks/utils.c +++ /dev/null @@ -1,27 +0,0 @@ -#include - -#include -#include -#include -#include - -List *ldb_get_operator_oids() -{ - List *oidList = NIL; - - List *nameList = lappend(NIL, makeString("")); - - Oid intOperator = LookupOperName(NULL, nameList, INT4ARRAYOID, INT4ARRAYOID, true, -1); - Oid floatOperator = LookupOperName(NULL, nameList, FLOAT4ARRAYOID, FLOAT4ARRAYOID, true, -1); - - if(OidIsValid(intOperator)) { - oidList = lappend_oid(oidList, intOperator); - } - if(OidIsValid(floatOperator)) { - oidList = lappend_oid(oidList, floatOperator); - } - - list_free(nameList); - - return oidList; -} diff --git a/lantern_hnsw/src/hooks/utils.h b/lantern_hnsw/src/hooks/utils.h deleted file mode 100644 index be89baaf4..000000000 --- a/lantern_hnsw/src/hooks/utils.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef LDB_HOOKS_UTILS_H -#define LDB_HOOKS_UTILS_H - -#include - -#include - -List *ldb_get_operator_oids(); - -List *ldb_get_operator_class_oids(Oid amId); - -#endif // LDB_HOOKS_UTILS_H diff --git a/lantern_hnsw/test/expected/async_tasks.out b/lantern_hnsw/test/expected/async_tasks.out index 0b6f78da1..954cef627 100644 --- a/lantern_hnsw/test/expected/async_tasks.out +++ b/lantern_hnsw/test/expected/async_tasks.out @@ -94,20 +94,17 @@ SELECT lantern.async_task($$SELECT pg_sleep(0.1);$$, 'Lantern job name'); async_task ------------ 1 -(1 row) SELECT lantern.async_task($$SELECT pg_sleep(70);$$::text); async_task ------------ 2 -(1 row) -- will fail since the task is not valid SQL SELECT lantern.async_task($$SELECT pg_sleep(haha);$$, 'Lantern job name'); async_task ------------ 3 -(1 row) SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done, status, error_message FROM lantern.tasks; jobid | query | pg_cron_job_name | job_name | is_done | status | error_message @@ -115,13 +112,11 @@ SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done 1 | SELECT pg_sleep(0.1); | async_task_1 | Lantern job name | f | | 2 | SELECT pg_sleep(70); | async_task_2 | | f | | 3 | SELECT pg_sleep(haha); | async_task_3 | Lantern job name | f | | -(3 rows) SELECT pg_sleep(3); pg_sleep ---------- -(1 row) SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done, status, error_message FROM lantern.tasks; jobid | query | pg_cron_job_name | job_name | is_done | status | error_message @@ -132,13 +127,11 @@ SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done | | | | | | ^ + | | | | | | 1 | SELECT pg_sleep(0.1); | async_task_1 | Lantern job name | t | succeeded | -(3 rows) SELECT lantern.cancel_all_async_tasks(); cancel_all_async_tasks ------------------------ -(1 row) -- test async tasks on index creation DROP TABLE IF EXISTS small_world; @@ -156,7 +149,6 @@ SELECT lantern.async_task($$CREATE INDEX idx ON "sift_base1k_UpperCase" USING la async_task ------------ 4 -(1 row) -- blocks DB deletions that is why it is disabled for now -- SELECT lantern.async_task($$CREATE INDEX CONCURRENTLY idx_concurrent ON "sift_base1k_UpperCase" USING lantern_hnsw (v) WITH (dim=128, M=6);$$, 'Indexing Job'); @@ -164,13 +156,11 @@ SELECT pg_sleep(5); pg_sleep ---------- -(1 row) SELECT * FROM ldb_get_indexes('sift_base1k_UpperCase'); indexname | size | indexdef | indisvalid -----------+--------+--------------------------------------------------------------------------------------------+------------ idx | 632 kB | CREATE INDEX idx ON "sift_base1k_UpperCase" USING lantern_hnsw (v) WITH (dim='128', m='6') | t -(1 row) SELECT _lantern_internal.validate_index('idx', false); INFO: validate_index() start for idx @@ -178,7 +168,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done, status, error_message FROM lantern.tasks; jobid | query | pg_cron_job_name | job_name | is_done | status | error_message @@ -190,7 +179,6 @@ SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done 1 | SELECT pg_sleep(0.1); | async_task_1 | Lantern job name | t | succeeded | 2 | SELECT pg_sleep(70); | async_task_2 | | t | canceled | Canceled by user 4 | CREATE INDEX idx ON "sift_base1k_UpperCase" USING lantern_hnsw (v) WITH (dim=128, M=6); | async_task_4 | Indexing Job | t | succeeded | -(4 rows) -- NOTE: the test finishes but the async index creation may still be in progress -- create non superuser and test the function @@ -207,14 +195,12 @@ NOTICE: Job scheduled with pg_cron name: 'async_task_5' async_task ------------ 5 -(1 row) SELECT lantern.async_task($$CREATE INDEX idx2 ON "sift_base1k_UpperCase" USING lantern_hnsw (v) WITH (dim=128, M=6);$$, 'Indexing Job'); NOTICE: Job scheduled with pg_cron name: 'async_task_6' async_task ------------ 6 -(1 row) -- this should fail since test_user does not have permission to drop the table -- sql line for do not stop on error @@ -223,7 +209,6 @@ NOTICE: Job scheduled with pg_cron name: 'async_task_7' async_task ------------ 7 -(1 row) -- lantern.tasks jobid is distinct and independent from cron.jobid, even though they may often overlap -- make sure everything works even when they are out of sync @@ -231,20 +216,17 @@ SELECT nextval('lantern.tasks_jobid_seq'); nextval --------- 8 -(1 row) SELECT lantern.async_task($$SELECT 42$$, 'Life'); NOTICE: Job scheduled with pg_cron name: 'async_task_9' async_task ------------ 9 -(1 row) SELECT pg_sleep(4); pg_sleep ---------- -(1 row) SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done, status, error_message FROM lantern.tasks ORDER BY jobid; jobid | query | pg_cron_job_name | job_name | is_done | status | error_message @@ -255,5 +237,4 @@ SELECT jobid, query, pg_cron_job_name, job_name, duration IS NOT NULL AS is_done 7 | DROP TABLE "sift_base1k_UpperCase"; | async_task_7 | Dropping Table Job | t | failed | ERROR: must be owner of table sift_base1k_UpperCase+ | | | | | | 9 | SELECT 42 | async_task_9 | Life | t | succeeded | -(4 rows) diff --git a/lantern_hnsw/test/expected/ext_relocation.out b/lantern_hnsw/test/expected/ext_relocation.out index 71fb2cd6c..dccc44dd0 100644 --- a/lantern_hnsw/test/expected/ext_relocation.out +++ b/lantern_hnsw/test/expected/ext_relocation.out @@ -38,6 +38,7 @@ ORDER BY 1, 3, 2; schema1 | create_pq_codebook | _lantern_internal schema1 | failure_point_enable | _lantern_internal schema1 | forbid_table_change | _lantern_internal + schema1 | get_vector_type_oid | _lantern_internal schema1 | mask_arrays | _lantern_internal schema1 | mask_order_by_in_plan | _lantern_internal schema1 | quantize_vector | _lantern_internal @@ -62,7 +63,6 @@ ORDER BY 1, 3, 2; schema1 | ldb_pqvec_send | schema1 schema1 | quantize_table | schema1 schema1 | quantize_vector | schema1 -(28 rows) -- show all the extension operators SELECT ne.nspname AS extschema, op.oprname, np.nspname AS proschema @@ -80,7 +80,6 @@ ORDER BY 1, 3; schema1 | <-> | schema1 schema1 | <=> | schema1 schema1 | <+> | schema1 -(5 rows) SET search_path TO public, schema1; -- extension function is accessible @@ -88,7 +87,6 @@ SELECT l2sq_dist(ARRAY[1.0, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]); l2sq_dist ----------- 27 -(1 row) CREATE INDEX hnsw_index ON small_world USING lantern_hnsw(v) WITH (dim=3); INFO: done init usearch index @@ -100,7 +98,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) \set ON_ERROR_STOP off -- lantern does not support relocation. @@ -126,7 +123,6 @@ ORDER BY 1, 3; schema1 | <-> | schema1 schema1 | <=> | schema1 schema1 | <+> | schema1 -(5 rows) SET search_path TO public, schema2; --extension access method is still accessible since access methods are not schema-qualified @@ -140,7 +136,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) \set ON_ERROR_STOP off -- extension function cannot be found without schema-qualification @@ -151,5 +146,4 @@ SELECT schema1.l2sq_dist(ARRAY[1.0, 2.0, 3.0], ARRAY[4.0, 5.0, 6.0]); l2sq_dist ----------- 27 -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_concurrent.out b/lantern_hnsw/test/expected/hnsw_concurrent.out index a47d80c70..ab1673048 100644 --- a/lantern_hnsw/test/expected/hnsw_concurrent.out +++ b/lantern_hnsw/test/expected/hnsw_concurrent.out @@ -15,7 +15,6 @@ SELECT id, ROUND((v <-> :'v444')::numeric, 2) FROM sift_base1k ORDER BY v <-> :' 557 | 91664.00 62 | 93497.00 58 | 93637.00 -(6 rows) CREATE INDEX to_be_reindexed ON sift_base1k USING lantern_hnsw (v) WITH (dim=128, M=8); INFO: done init usearch index @@ -25,7 +24,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -----------------+--------+--------------------------------------------------------------------------------------------+------------ to_be_reindexed | 680 kB | CREATE INDEX to_be_reindexed ON sift_base1k USING lantern_hnsw (v) WITH (dim='128', m='8') | t -(1 row) SELECT _lantern_internal.validate_index('to_be_reindexed', false); INFO: validate_index() start for to_be_reindexed @@ -33,7 +31,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) REINDEX INDEX CONCURRENTLY to_be_reindexed; INFO: done init usearch index @@ -46,13 +43,11 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -----------------+--------+--------------------------------------------------------------------------------------------+------------ to_be_reindexed | 680 kB | CREATE INDEX to_be_reindexed ON sift_base1k USING lantern_hnsw (v) WITH (dim='128', m='8') | t -(1 row) set enable_seqscan=FALSE; -- 6 closest vectors to the vector with ID 444. note all the duplicate results because of bad handling of REINDEX @@ -65,5 +60,4 @@ SELECT id, ROUND((v <-> :'v444')::numeric, 2) FROM sift_base1k ORDER BY v <-> :' 557 | 91664.00 62 | 93497.00 58 | 93637.00 -(6 rows) diff --git a/lantern_hnsw/test/expected/hnsw_config.out b/lantern_hnsw/test/expected/hnsw_config.out index fb8a80d6f..61592413b 100644 --- a/lantern_hnsw/test/expected/hnsw_config.out +++ b/lantern_hnsw/test/expected/hnsw_config.out @@ -27,14 +27,12 @@ SELECT * FROM ldb_get_indexes('small_world'); indexname | size | indexdef | indisvalid -------------------+-------+-------------------------------------------------------------------------------------+------------ small_world_v_idx | 16 kB | CREATE INDEX small_world_v_idx ON small_world USING lantern_hnsw (v) WITH (dim='3') | t -(1 row) -- Verify that lantern_hnsw.init_k exists after index creation SHOW lantern_hnsw.init_k; lantern_hnsw.init_k --------------------- 10 -(1 row) -- Modify lantern_hnsw.init_k and verify that it was modified SET lantern_hnsw.init_k = 45; @@ -42,7 +40,6 @@ SHOW lantern_hnsw.init_k; lantern_hnsw.init_k --------------------- 45 -(1 row) -- Reset all parameters and verify that lantern_hnsw.init_k was reset RESET ALL; @@ -50,7 +47,6 @@ SHOW lantern_hnsw.init_k; lantern_hnsw.init_k --------------------- 10 -(1 row) -- Validate the index data structures SELECT _lantern_internal.validate_index('small_world_v_idx', false); @@ -59,5 +55,4 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_correct.out b/lantern_hnsw/test/expected/hnsw_correct.out index 32bf7a07b..e969ac258 100644 --- a/lantern_hnsw/test/expected/hnsw_correct.out +++ b/lantern_hnsw/test/expected/hnsw_correct.out @@ -13,7 +13,6 @@ INFO: done init usearch index INFO: inserted 4 elements INFO: done saving 4 vectors SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; -- Get the results without the index CREATE TEMP TABLE results_wo_index AS SELECT @@ -25,7 +24,7 @@ FROM -- Get the results with the index CREATE TEMP TABLE results_w_index AS SELECT - ROW_NUMBER() OVER (ORDER BY v '{0,0}') AS row_num, + ROW_NUMBER() OVER (ORDER BY v <-> '{0,0}') AS row_num, id, l2sq_dist(v, '{0,0}') AS dist FROM @@ -46,7 +45,6 @@ WHERE a.id != b.id; row_num | id_with_index | id_without_index | dist_with_index | dist_without_index ---------+---------------+------------------+-----------------+-------------------- -(0 rows) -- Validate the index data structures SELECT _lantern_internal.validate_index('small_world_v_idx', false); @@ -55,5 +53,4 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_cost_estimate.out b/lantern_hnsw/test/expected/hnsw_cost_estimate.out index b0ed80099..cd763cabb 100644 --- a/lantern_hnsw/test/expected/hnsw_cost_estimate.out +++ b/lantern_hnsw/test/expected/hnsw_cost_estimate.out @@ -50,11 +50,10 @@ BEGIN RETURN is_within_error(get_cost_estimate(explain_query), expected_cost, error_margin); END; $$ LANGUAGE plpgsql; -SET lantern.pgvector_compat=FALSE; -- Goal: make sure query cost estimate is accurate -- when index is created with varying costruction parameters. SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -\set explain_query_template 'EXPLAIN SELECT * FROM sift_base10k ORDER BY v ''%s'' LIMIT 10' +\set explain_query_template 'EXPLAIN SELECT * FROM sift_base10k ORDER BY v <-> ''%s'' LIMIT 10' \set enable_seqscan = off; -- Case 0, sanity check. No data. CREATE TABLE empty_table(id SERIAL PRIMARY KEY, v REAL[2]); @@ -63,7 +62,7 @@ INFO: done init usearch index INFO: inserted 0 elements INFO: done saving 0 vectors SET _lantern_internal.is_test = true; -SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v ''{1,2}'' LIMIT 10', 0.47); +SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47); DEBUG: LANTERN - Query cost estimator DEBUG: LANTERN - --------------------- DEBUG: LANTERN - Total cost: 4.225000 @@ -74,7 +73,6 @@ DEBUG: LANTERN - --------------------- is_cost_estimate_within_error ------------------------------- t -(1 row) SELECT _lantern_internal.validate_index('empty_idx', false); INFO: validate_index() start for empty_idx @@ -82,7 +80,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) DROP INDEX empty_idx; -- Case 1, more data in index. @@ -102,7 +99,6 @@ DEBUG: LANTERN - --------------------- is_cost_estimate_within_error ------------------------------- t -(1 row) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -110,7 +106,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) DROP INDEX hnsw_idx; -- Case 2, higher M. @@ -130,7 +125,6 @@ DEBUG: LANTERN - --------------------- is_cost_estimate_within_error ------------------------------- t -(1 row) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -138,7 +132,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) DROP INDEX hnsw_idx; -- Case 3, higher ef. @@ -158,7 +151,6 @@ DEBUG: LANTERN - --------------------- is_cost_estimate_within_error ------------------------------- t -(1 row) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -166,7 +158,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) DROP INDEX hnsw_idx; -- Goal: Test cost estimation when number of pages in index is likely less than number of blockmaps allocated @@ -191,7 +182,6 @@ SELECT COUNT(*) FROM views_vec10k WHERE views < 100; count ------- 58 -(1 row) -- Create partial lantern index with (views < 100) filter CREATE INDEX hnsw_partial_views_100 ON views_vec10k USING lantern_hnsw (vec dist_l2sq_ops) WITH (M=8, dim=6) WHERE views < 100; @@ -212,7 +202,6 @@ DEBUG: LANTERN - --------------------- Limit -> Index Scan using hnsw_partial_views_100 on views_vec10k Order By: (vec <-> '{0,1,2,3,4,5}'::real[]) -(3 rows) -- Goal: Test that the index selectivity being calculated for partial indexes is correct -- note that these boundaries are selected so that mac num_pages and cost values align @@ -226,7 +215,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) CREATE INDEX hnsw_partial_views_2000 ON views_vec10k USING lantern_hnsw (vec dist_l2sq_ops) WITH (M=9, dim=6) WHERE views < 2000; INFO: done init usearch index @@ -238,7 +226,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) CREATE INDEX hnsw_partial_views_3000 ON views_vec10k USING lantern_hnsw (vec dist_l2sq_ops) WITH (M=9, dim=6) WHERE views < 3000; INFO: done init usearch index @@ -250,7 +237,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) CREATE INDEX hnsw_partial_views_4000 ON views_vec10k USING lantern_hnsw (vec dist_l2sq_ops) WITH (M=9, dim=6) WHERE views < 4000; INFO: done init usearch index @@ -262,7 +248,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) CREATE INDEX hnsw_partial_views_6000 ON views_vec10k USING lantern_hnsw (vec dist_l2sq_ops) WITH (M=9, dim=6) WHERE views < 6000; INFO: done init usearch index @@ -274,7 +259,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) CREATE INDEX hnsw_partial_views_8000 ON views_vec10k USING lantern_hnsw (vec dist_l2sq_ops) WITH (M=9, dim=6) WHERE views < 8000; INFO: done init usearch index @@ -286,7 +270,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Trigger each partial index by using its exact filter in a filtered query -- Each indexSelectivity value for a partial index with the filter (views < N) should be around N/20000 @@ -341,5 +324,4 @@ DEBUG: LANTERN - --------------------- Limit -> Index Scan using hnsw_partial_views_1000 on views_vec10k Order By: (vec <-> '{0,1,2,3,4,5}'::real[]) -(3 rows) diff --git a/lantern_hnsw/test/expected/hnsw_create.out b/lantern_hnsw/test/expected/hnsw_create.out index 8ed1fab6e..9c6ea0fed 100644 --- a/lantern_hnsw/test/expected/hnsw_create.out +++ b/lantern_hnsw/test/expected/hnsw_create.out @@ -32,7 +32,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -------------------+--------+----------------------------------------------------------------------------------------------+------------ sift_base1k_v_idx | 680 kB | CREATE INDEX sift_base1k_v_idx ON sift_base1k USING lantern_hnsw (v) WITH (dim='128', m='8') | t -(1 row) SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); INFO: validate_index() start for sift_base1k_v_idx @@ -40,7 +39,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Validate that creating a hamming index works CREATE TABLE sift_base1k_int as SELECT id, v::INT[] FROM sift_base1k; @@ -52,7 +50,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k_int'); indexname | size | indexdef | indisvalid -----------------------+--------+------------------------------------------------------------------------------------------------------------+------------ sift_base1k_int_v_idx | 680 kB | CREATE INDEX sift_base1k_int_v_idx ON sift_base1k_int USING lantern_hnsw (v dist_hamming_ops) WITH (m='8') | t -(1 row) SELECT _lantern_internal.validate_index('sift_base1k_int_v_idx', false); INFO: validate_index() start for sift_base1k_int_v_idx @@ -60,7 +57,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Validate that index creation works with a larger number of vectors \ir utils/sift10k_array.sql @@ -69,19 +65,17 @@ CREATE TABLE IF NOT EXISTS sift_base10k ( v REAL[128] ); \copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv; -SET lantern.pgvector_compat=FALSE; CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); INFO: done init usearch index INFO: inserted 10000 elements INFO: done saving 10000 vectors SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' LIMIT 10; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_idx on sift_base10k - Order By: (v '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) -(3 rows) + Order By: (v <-> '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -89,7 +83,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) --- Validate that M values inside the allowed range [2, 128] do not throw an error CREATE INDEX ON small_world USING lantern_hnsw (v) WITH (M=2); @@ -141,7 +134,6 @@ SELECT * FROM ldb_get_indexes('small_world4'); indexname | size | indexdef | indisvalid -----------------------+-------+----------------------------------------------------------------------------------------------------------------------------+------------ small_world4_hnsw_idx | 16 kB | CREATE INDEX small_world4_hnsw_idx ON small_world4 USING lantern_hnsw (vector) WITH (m='14', ef='22', ef_construction='2') | t -(1 row) -- the index will not allow changing the dimension of a vector element \set ON_ERROR_STOP off @@ -160,7 +152,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- without the index, I can change the dimension of a vector element DROP INDEX small_world4_hnsw_idx; diff --git a/lantern_hnsw/test/expected/hnsw_create_expr.out b/lantern_hnsw/test/expected/hnsw_create_expr.out index 168ccfe36..33e7b2c7e 100644 --- a/lantern_hnsw/test/expected/hnsw_create_expr.out +++ b/lantern_hnsw/test/expected/hnsw_create_expr.out @@ -64,7 +64,6 @@ CREATE TABLE test_table (id INTEGER); INSERT INTO test_table VALUES (0), (1), (7); \set enable_seqscan = off; SET enable_seqscan = false; -SET lantern.pgvector_compat=FALSE; -- This should success CREATE INDEX ON test_table USING lantern_hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2); INFO: done init usearch index @@ -76,7 +75,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) \set ON_ERROR_STOP off -- This should result in an error that dimensions does not match @@ -89,10 +87,9 @@ ERROR: data type text has no default operator class for access method "lantern_ -- This should result in error about multicolumn expressions support CREATE INDEX ON test_table USING lantern_hnsw (int_to_fixed_binary_real_array(id), int_to_dynamic_binary_real_array(id)) WITH (M=2); ERROR: access method "lantern_hnsw" does not support multicolumn indexes -SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) '{0,0,0}'::REAL[] LIMIT 2; +SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <-> '{0,0,0}'::REAL[] LIMIT 2; id ---- 0 1 -(2 rows) diff --git a/lantern_hnsw/test/expected/hnsw_create_unlogged.out b/lantern_hnsw/test/expected/hnsw_create_unlogged.out index 3c62b2788..d9f3c3ed2 100644 --- a/lantern_hnsw/test/expected/hnsw_create_unlogged.out +++ b/lantern_hnsw/test/expected/hnsw_create_unlogged.out @@ -32,7 +32,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -------------------+--------+----------------------------------------------------------------------------------------------+------------ sift_base1k_v_idx | 680 kB | CREATE INDEX sift_base1k_v_idx ON sift_base1k USING lantern_hnsw (v) WITH (dim='128', m='8') | t -(1 row) SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); INFO: validate_index() start for sift_base1k_v_idx @@ -40,7 +39,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Validate that index creation works with a larger number of vectors \ir utils/sift10k_array_unlogged.sql @@ -49,19 +47,17 @@ CREATE UNLOGGED TABLE IF NOT EXISTS sift_base10k ( v REAL[128] ); \copy sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' with csv; -SET lantern.pgvector_compat=FALSE; CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); INFO: done init usearch index INFO: inserted 10000 elements INFO: done saving 10000 vectors SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' LIMIT 10; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_idx on sift_base10k - Order By: (v '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) -(3 rows) + Order By: (v <-> '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -69,7 +65,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) --- Validate that M values inside the allowed range [2, 128] do not throw an error CREATE INDEX ON small_world USING lantern_hnsw (v) WITH (M=2); @@ -121,7 +116,6 @@ SELECT * FROM ldb_get_indexes('small_world4'); indexname | size | indexdef | indisvalid -----------------------+-------+----------------------------------------------------------------------------------------------------------------------------+------------ small_world4_hnsw_idx | 16 kB | CREATE INDEX small_world4_hnsw_idx ON small_world4 USING lantern_hnsw (vector) WITH (m='14', ef='22', ef_construction='2') | t -(1 row) -- the index will not allow changing the dimension of a vector element \set ON_ERROR_STOP off @@ -140,7 +134,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- without the index, I can change the dimension of a vector element DROP INDEX small_world4_hnsw_idx; diff --git a/lantern_hnsw/test/expected/hnsw_delete.out b/lantern_hnsw/test/expected/hnsw_delete.out index e03105fb0..5f467a86d 100644 --- a/lantern_hnsw/test/expected/hnsw_delete.out +++ b/lantern_hnsw/test/expected/hnsw_delete.out @@ -24,7 +24,6 @@ SELECT * FROM small_world; 101 | f | {1,0,1} 110 | f | {1,1,0} 111 | t | {1,1,1} -(8 rows) CREATE INDEX ON small_world USING lantern_hnsw (v) WITH (M=128) WHERE b = FALSE; INFO: done init usearch index @@ -37,14 +36,12 @@ SELECT * FROM small_world WHERE b = FALSE order by v <-> '{1,0,0}' LIMIT 3; 100 | f | {1,0,0} 110 | f | {1,1,0} 101 | f | {1,0,1} -(3 rows) DELETE FROM small_world WHERE v <> '{1,0,0}'; SELECT * FROM small_world WHERE b = FALSE order by v <-> '{1,0,0}' LIMIT 3; id | b | v -----+---+--------- 100 | f | {1,0,0} -(1 row) VACUUM small_world; WARNING: LanternDB: hnsw index deletes are currently not implemented. This is a no-op. No memory will be reclaimed @@ -56,5 +53,4 @@ SELECT * FROM small_world WHERE b = FALSE order by v <-> '{1,0,0}' LIMIT 3; id | b | v -----+---+--------- 100 | f | {1,0,0} -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_dist_func.out b/lantern_hnsw/test/expected/hnsw_dist_func.out index 3414a212d..ebc373238 100644 --- a/lantern_hnsw/test/expected/hnsw_dist_func.out +++ b/lantern_hnsw/test/expected/hnsw_dist_func.out @@ -35,9 +35,8 @@ INSERT INTO small_world_l2 SELECT id, v FROM small_world; INSERT INTO small_world_cos SELECT id, v FROM small_world; INSERT INTO small_world_ham SELECT id, ARRAY[CAST(v[1] AS INTEGER), CAST(v[2] AS INTEGER), CAST(v[3] AS INTEGER)] FROM small_world; SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; -- Verify that the distance functions work (check distances) -SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v '{0,1,0}'; +SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v <-> '{0,1,0}'; round ------- 0.00 @@ -48,9 +47,8 @@ SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v 2.00 2.00 3.00 -(8 rows) -SELECT ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_cos ORDER BY v '{0,1,0}'; +SELECT ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_cos ORDER BY v <=> '{0,1,0}'; round ------- 0.00 @@ -61,9 +59,8 @@ SELECT ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_cos ORDER BY v 1.00 1.00 1.00 -(8 rows) -SELECT ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham ORDER BY v '{0,1,0}'; +SELECT ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham ORDER BY v <+> '{0,1,0}'; round ------- 0.00 @@ -74,7 +71,6 @@ SELECT ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham ORDER 2.00 2.00 3.00 -(8 rows) -- Verify that the distance functions work (check IDs) SELECT ARRAY_AGG(id ORDER BY id), ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 GROUP BY 2 ORDER BY 2; @@ -84,7 +80,6 @@ SELECT ARRAY_AGG(id ORDER BY id), ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FRO {000,011,110} | 1.00 {001,100,111} | 2.00 {101} | 3.00 -(4 rows) SELECT ARRAY_AGG(id ORDER BY id), ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_cos GROUP BY 2 ORDER BY 2; array_agg | round @@ -93,7 +88,6 @@ SELECT ARRAY_AGG(id ORDER BY id), ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM {011,110} | 0.29 {111} | 0.42 {000,001,100,101} | 1.00 -(4 rows) SELECT ARRAY_AGG(id ORDER BY id), ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham GROUP BY 2 ORDER BY 2; array_agg | round @@ -102,37 +96,33 @@ SELECT ARRAY_AGG(id ORDER BY id), ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) {000,011,110} | 1.00 {001,100,111} | 2.00 {101} | 3.00 -(4 rows) -- Verify that the indexes is being used -EXPLAIN (COSTS false) SELECT id FROM small_world_l2 ORDER BY v '{0,1,0}'; +EXPLAIN (COSTS false) SELECT id FROM small_world_l2 ORDER BY v <-> '{0,1,0}'; QUERY PLAN --------------------------------------------------------- Index Scan using small_world_l2_v_idx on small_world_l2 - Order By: (v '{0,1,0}'::real[]) -(2 rows) + Order By: (v <-> '{0,1,0}'::real[]) -EXPLAIN (COSTS false) SELECT id FROM small_world_cos ORDER BY v '{0,1,0}'; +EXPLAIN (COSTS false) SELECT id FROM small_world_cos ORDER BY v <=> '{0,1,0}'; QUERY PLAN ----------------------------------------------------------- Index Scan using small_world_cos_v_idx on small_world_cos - Order By: (v '{0,1,0}'::real[]) -(2 rows) + Order By: (v <=> '{0,1,0}'::real[]) -EXPLAIN (COSTS false) SELECT id FROM small_world_ham ORDER BY v '{0,1,0}'; +EXPLAIN (COSTS false) SELECT id FROM small_world_ham ORDER BY v <+> '{0,1,0}'; QUERY PLAN ----------------------------------------------------------- Index Scan using small_world_ham_v_idx on small_world_ham - Order By: (v '{0,1,0}'::integer[]) -(2 rows) + Order By: (v <+> '{0,1,0}'::integer[]) \set ON_ERROR_STOP off -- Expect errors due to mismatching vector dimensions -SELECT 1 FROM small_world_l2 ORDER BY v '{0,1,0,1}' LIMIT 1; +SELECT 1 FROM small_world_l2 ORDER BY v <-> '{0,1,0,1}' LIMIT 1; ERROR: Expected real array with dimension 3, got 4 -SELECT 1 FROM small_world_cos ORDER BY v '{0,1,0,1}' LIMIT 1; +SELECT 1 FROM small_world_cos ORDER BY v <=> '{0,1,0,1}' LIMIT 1; ERROR: Expected real array with dimension 3, got 4 -SELECT 1 FROM small_world_ham ORDER BY v '{0,1,0,1}' LIMIT 1; +SELECT 1 FROM small_world_ham ORDER BY v <+> '{0,1,0,1}' LIMIT 1; ERROR: Expected int array with dimension 3, got 4 SELECT l2sq_dist('{1,1}'::REAL[], '{0,1,0}'::REAL[]); ERROR: expected equally sized arrays but got arrays with dimensions 2 and 3 @@ -143,13 +133,6 @@ SELECT cos_dist('{1,1}', '{0,1,0}'); ERROR: expected equally sized arrays but got arrays with dimensions 2 and 3 SELECT hamming_dist('{1,1}', '{0,1,0}'); ERROR: expected equally sized arrays but got arrays with dimensions 2 and 3 --- Expect errors due to improper use of the operator outside of its supported context -SELECT ARRAY[1,2,3] ARRAY[3,2,1]; -ERROR: Operator is invalid outside of ORDER BY context -SELECT ROUND((v ARRAY[0,1,0])::numeric, 2) FROM small_world_cos ORDER BY v '{0,1,0}' LIMIT 7; -ERROR: Operator is invalid outside of ORDER BY context -SELECT ROUND((v ARRAY[0,1,0])::numeric, 2) FROM small_world_ham ORDER BY v '{0,1,0}' LIMIT 7; -ERROR: Operator is invalid outside of ORDER BY context \set ON_ERROR_STOP on -- More robust distance operator tests CREATE TABLE test1 (id SERIAL, v REAL[]); @@ -161,100 +144,12 @@ SELECT 0 + 1; ?column? ---------- 1 -(1 row) SELECT 1 FROM test1 WHERE id = 0 + 1; ?column? ---------- 1 -(1 row) -\set ON_ERROR_STOP off --- Expect errors due to incorrect usage -INSERT INTO test1 (v) VALUES (ARRAY['{1,2}'::REAL[] '{4,2}'::REAL[], 0]); -ERROR: Operator is invalid outside of ORDER BY context -SELECT v '{1,2}' FROM test1 ORDER BY v '{1,3}'; -ERROR: Operator is invalid outside of ORDER BY context -SELECT v '{1,2}' FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -WITH temp AS (SELECT v '{1,2}' FROM test1) SELECT 1 FROM temp; -ERROR: Operator is invalid outside of ORDER BY context -SELECT t.res FROM (SELECT v '{1,2}' AS res FROM test1) t; -ERROR: Operator is invalid outside of ORDER BY context -SELECT (SELECT v '{1,2}' FROM test1 LIMIT 1) FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -SELECT COALESCE(v '{1,2}', 0) FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -SELECT EXISTS (SELECT v '{1,2}' FROM test1); -ERROR: Operator is invalid outside of ORDER BY context -SELECT test1.v test2.v FROM test1 JOIN test2 USING (id); -ERROR: Operator is invalid outside of ORDER BY context -SELECT v '{1,2}' FROM test1 UNION SELECT v '{1,3}' FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -(SELECT v '{1,2}' FROM test1 WHERE id < 5) UNION (SELECT v '{1,3}' FROM test1 WHERE id >= 5); -ERROR: Operator is invalid outside of ORDER BY context -SELECT MAX(v '{1,2}') FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -SELECT * FROM test1 JOIN test2 ON test1.v test2.v < 0.5; -ERROR: Operator is invalid outside of ORDER BY context -SELECT test1.v FROM test1 JOIN test2 ON test1.v '{1,2}' = test2.v '{1,3}'; -ERROR: Operator is invalid outside of ORDER BY context -SELECT (v '{1,2}') + (v '{1,3}') FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -SELECT CASE WHEN v '{1,2}' > 1 THEN 'High' ELSE 'Low' END FROM test1; -ERROR: Operator is invalid outside of ORDER BY context -INSERT INTO test1 (v) VALUES ('{2,3}') RETURNING v '{1,2}'; -ERROR: Operator is invalid outside of ORDER BY context -SELECT 1 FROM test1 GROUP BY v '{1,3}'; -ERROR: Operator is invalid outside of ORDER BY context -SELECT 1 FROM test1 ORDER BY (('{1,2}'::real[] '{3,4}'::real[]) - 0); -ERROR: Operator is invalid outside of ORDER BY context -SELECT 1 FROM test1 ORDER BY '{1,2}'::REAL[] '{3,4}'::REAL[]; -ERROR: Operator is invalid outside of ORDER BY context -SELECT 1 FROM test1 ORDER BY v ARRAY[(SELECT '{1,4}'::REAL[] '{4,2}'::REAL[]), 3]; -ERROR: Operator is invalid outside of ORDER BY context --- Expect errors due to index not existing -SELECT id FROM test1 ORDER BY v '{1,2}'; -ERROR: Operator can only be used inside of an index -SELECT 1 FROM test1 ORDER BY v (SELECT '{1,3}'::real[]); -ERROR: Operator can only be used inside of an index -SELECT t2_results.id FROM test1 t1 JOIN LATERAL (SELECT t2.id FROM test2 t2 ORDER BY t1.v t2.v LIMIT 1) t2_results ON TRUE; -ERROR: Operator can only be used inside of an index -WITH t AS (SELECT id FROM test1 ORDER BY v '{1,2}' LIMIT 1) SELECT DISTINCT id FROM t; -ERROR: Operator can only be used inside of an index -WITH t AS (SELECT id FROM test1 ORDER BY v '{1,2}' LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1; -ERROR: Operator can only be used inside of an index -WITH t AS (SELECT id FROM test1 ORDER BY v '{1,2}') SELECT id FROM t UNION SELECT id FROM t; -ERROR: Operator can only be used inside of an index --- issue #227 -SELECT * from test2 JOIN LATERAL (SELECT * FROM (SELECT id FROM test2 ORDER BY v '{1,2}') as forall) haha on TRUE; -ERROR: Operator can only be used inside of an index --- more complex setup of the above -SELECT forall.id, nearest_per_id.* FROM -(SELECT * FROM - test2) AS forall - JOIN LATERAL ( - SELECT - ARRAY_AGG(id ORDER BY id) AS near_ids, - ARRAY_AGG(dist ORDER BY id) AS near_dists - FROM - ( - SELECT - id, - l2sq_dist(v, forall.v) as dist - FROM - test2 - ORDER BY - v forall.v - LIMIT - 5 - ) as __unused_name - ) nearest_per_id on TRUE -ORDER BY - forall.id -LIMIT - 9; -ERROR: Operator can only be used inside of an index \set ON_ERROR_STOP on -- cross-lateral joins work as expected when appropriate index exists -- nearest element for each vector @@ -276,7 +171,7 @@ SELECT forall.id, nearest_per_id.* FROM FROM small_world_l2 ORDER BY - v forall.v + v <-> forall.v LIMIT 4 ) as __unused_name @@ -295,7 +190,6 @@ LIMIT 101 | {101,001,100,111} | {0,1,1,1} 110 | {110,010,100,111} | {0,1,1,1} 111 | {111,011,101,110} | {0,1,1,1} -(8 rows) -- Check that hamming distance query results are sorted correctly CREATE TABLE extra_small_world_ham ( @@ -307,14 +201,13 @@ CREATE INDEX ON extra_small_world_ham USING lantern_hnsw (v dist_hamming_ops) WI INFO: done init usearch index INFO: inserted 4 elements INFO: done saving 4 vectors -SELECT ROUND(hamming_dist(v, '{0,0}')::numeric, 2) FROM extra_small_world_ham ORDER BY v '{0,0}'; +SELECT ROUND(hamming_dist(v, '{0,0}')::numeric, 2) FROM extra_small_world_ham ORDER BY v <+> '{0,0}'; round ------- 0.00 2.00 2.00 4.00 -(4 rows) SELECT _lantern_internal.validate_index('small_world_l2_v_idx', false); INFO: validate_index() start for small_world_l2_v_idx @@ -322,7 +215,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_cos_v_idx', false); INFO: validate_index() start for small_world_cos_v_idx @@ -330,7 +222,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_ham_v_idx', false); INFO: validate_index() start for small_world_ham_v_idx @@ -338,7 +229,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('extra_small_world_ham_v_idx', false); INFO: validate_index() start for extra_small_world_ham_v_idx @@ -346,5 +236,4 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_ef_search.out b/lantern_hnsw/test/expected/hnsw_ef_search.out index aba0026c1..6294135da 100644 --- a/lantern_hnsw/test/expected/hnsw_ef_search.out +++ b/lantern_hnsw/test/expected/hnsw_ef_search.out @@ -31,11 +31,10 @@ ERROR: 401 is outside the valid range for parameter "lantern_hnsw.ef" (0 .. 400 -- Repeat the same query while varying ef parameter -- NOTE: it is not entirely known if the results of these are deterministic SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset -- Queries below have the same result SET lantern_hnsw.ef = 1; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -51,7 +50,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 2; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -67,7 +66,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 4; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -83,7 +82,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 8; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -99,7 +98,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 16; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -116,7 +115,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v -- Queries below have the same result, which is different from above SET lantern_hnsw.ef = 32; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -132,7 +131,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 64; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -148,7 +147,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 128; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -164,7 +163,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 256; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -180,7 +179,7 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v (10 rows) SET lantern_hnsw.ef = 400; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 diff --git a/lantern_hnsw/test/expected/hnsw_extras.out b/lantern_hnsw/test/expected/hnsw_extras.out index ca1d4459b..b3ef3e461 100644 --- a/lantern_hnsw/test/expected/hnsw_extras.out +++ b/lantern_hnsw/test/expected/hnsw_extras.out @@ -28,7 +28,6 @@ SELECT lantern_create_external_index('v', 'sift_base1k'); lantern_create_external_index ------------------------------- -(1 row) SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); INFO: validate_index() start for sift_base1k_v_idx @@ -36,36 +35,22 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset -- Validate that using corresponding operator triggers index scan -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <-> :'v777' LIMIT 10; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using sift_base1k_v_idx on sift_base1k Order By: (v <-> '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) -SET lantern.pgvector_compat=FALSE; -EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v :'v777' LIMIT 10; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - Limit - -> Index Scan using sift_base1k_v_idx on sift_base1k - Order By: (v '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) - -SET lantern.pgvector_compat=TRUE; DROP INDEX sift_base1k_v_idx; -- Create with params SELECT lantern_create_external_index('v', 'sift_base1k', 'public', 'cos', 128, 10, 10, 10, false, 'hnsw_cos_index'); lantern_create_external_index ------------------------------- -(1 row) SELECT _lantern_internal.validate_index('hnsw_cos_index', false); INFO: validate_index() start for hnsw_cos_index @@ -73,33 +58,19 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <=> :'v777' LIMIT 10; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_cos_index on sift_base1k Order By: (v <=> '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) - -SET lantern.pgvector_compat=FALSE; -EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v :'v777' LIMIT 10; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - Limit - -> Index Scan using hnsw_cos_index on sift_base1k - Order By: (v '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) -SET lantern.pgvector_compat=TRUE; -- -- Reindex external index SELECT lantern_reindex_external_index('hnsw_cos_index'); lantern_reindex_external_index -------------------------------- -(1 row) SELECT _lantern_internal.validate_index('hnsw_cos_index', false); INFO: validate_index() start for hnsw_cos_index @@ -107,17 +78,14 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Validate that using corresponding operator triggers index scan -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <=> :'v777' LIMIT 10; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_cos_index on sift_base1k Order By: (v <=> '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) -- Create PQ Index SET client_min_messages=ERROR; @@ -135,13 +103,11 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT lantern_create_external_index('v', 'sift_base1k', 'public', 'cos', 128, 10, 10, 10, true, 'hnsw_cos_index_pq'); lantern_create_external_index ------------------------------- -(1 row) SELECT _lantern_internal.validate_index('hnsw_cos_index_pq', false); INFO: validate_index() start for hnsw_cos_index_pq @@ -149,13 +115,11 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT lantern_reindex_external_index('hnsw_cos_index_pq'); lantern_reindex_external_index -------------------------------- -(1 row) SELECT _lantern_internal.validate_index('hnsw_cos_index_pq', false); INFO: validate_index() start for hnsw_cos_index_pq @@ -163,14 +127,11 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <=> :'v777' LIMIT 10; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_cos_index_pq on sift_base1k Order By: (v <=> '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) diff --git a/lantern_hnsw/test/expected/hnsw_index_from_file.out b/lantern_hnsw/test/expected/hnsw_index_from_file.out index 4e60ad9f3..156ad45fd 100644 --- a/lantern_hnsw/test/expected/hnsw_index_from_file.out +++ b/lantern_hnsw/test/expected/hnsw_index_from_file.out @@ -44,26 +44,22 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid ---------------+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------ hnsw_l2_index | 776 kB | CREATE INDEX hnsw_l2_index ON sift_base1k USING lantern_hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2sq-0.3.0.usearch') | t -(1 row) SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset -EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; QUERY PLAN --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_l2_index on sift_base1k - Order By: (v '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) + Order By: (v <-> '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; round ----------- 0.00 @@ -76,14 +72,13 @@ SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v 130663.00 130863.00 132455.00 -(10 rows) -- Validate that inserting rows on index created from file works as expected INSERT INTO sift_base1k (id, v) VALUES (1001, array_fill(1, ARRAY[128])), (1002, array_fill(2, ARRAY[128])); SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; round ----------- 0.00 @@ -96,7 +91,6 @@ SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v 249589.00 249647.00 249652.00 -(10 rows) -- Drop and recreate table DROP TABLE sift_base1k CASCADE; @@ -106,49 +100,6 @@ CREATE TABLE IF NOT EXISTS sift_base1k ( v REAL[] ); COPY sift_base1k (v) FROM '/tmp/lantern/vector_datasets/sift_base1k_arrays.csv' WITH csv; --- Validate that creating an index from file works with cosine distance function -CREATE INDEX hnsw_cos_index ON sift_base1k USING lantern_hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-cos-0.3.0.usearch'); -INFO: done init usearch index -INFO: done loading usearch index -INFO: done saving 1000 vectors -SELECT _lantern_internal.validate_index('hnsw_cos_index', false); -INFO: validate_index() start for hnsw_cos_index -INFO: validate_index() done, no issues found. - validate_index ----------------- - -(1 row) - -SELECT * FROM ldb_get_indexes('sift_base1k'); - indexname | size | indexdef | indisvalid -----------------+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------+------------ - hnsw_cos_index | 776 kB | CREATE INDEX hnsw_cos_index ON sift_base1k USING lantern_hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-cos-0.3.0.usearch') | t -(1 row) - -SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset -EXPLAIN (COSTS FALSE) SELECT ROUND(cos_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; - QUERY PLAN ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - Limit - -> Index Scan using hnsw_cos_index on sift_base1k - Order By: (v '{97,67,0,0,0,0,0,14,49,107,23,0,0,0,5,24,4,25,48,5,0,1,8,3,0,5,17,3,1,1,3,3,126,126,0,0,0,0,0,27,49,126,49,8,1,4,11,14,0,6,37,39,10,22,25,0,0,0,12,27,7,23,35,3,126,9,1,0,0,0,19,126,28,11,8,7,1,39,126,126,0,1,28,27,3,126,126,0,1,3,7,9,0,52,126,5,13,5,8,0,0,0,33,72,78,19,18,3,0,3,21,126,42,13,64,83,1,9,8,23,1,4,22,68,3,1,4,0}'::real[]) -(3 rows) - -SELECT ROUND(cos_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; - round -------- - 0.00 - 0.19 - 0.21 - 0.22 - 0.23 - 0.25 - 0.25 - 0.25 - 0.25 - 0.26 -(10 rows) - --- Test scenarious --- ----------------------------------------- -- Case: @@ -173,10 +124,9 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- This should not throw error, but the first result will not be 0 as vector 777 is deleted from the table -SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; round ----------- 98486.00 @@ -189,7 +139,6 @@ SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v 130863.00 132455.00 132813.00 -(10 rows) -- Should throw error when lantern_extras is not installed \set ON_ERROR_STOP off diff --git a/lantern_hnsw/test/expected/hnsw_insert.out b/lantern_hnsw/test/expected/hnsw_insert.out index 5bdee5204..e48d9b3b7 100644 --- a/lantern_hnsw/test/expected/hnsw_insert.out +++ b/lantern_hnsw/test/expected/hnsw_insert.out @@ -25,7 +25,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Insert rows with valid vector data INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); @@ -65,7 +64,6 @@ INFO: done init usearch index INFO: inserted 8 elements INFO: done saving 8 vectors SET enable_seqscan = false; -SET lantern.pgvector_compat = false; -- Inserting vectors of the same dimension and nulls should work INSERT INTO small_world (v) VALUES ('{1,1,2}'); INSERT INTO small_world (v) VALUES (NULL); @@ -80,7 +78,7 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}'; + v <-> '{0,0,0}'; round ------- 0.00 @@ -92,14 +90,12 @@ ORDER BY 2.00 3.00 6.00 -(9 rows) -- Ensure the index size remains consistent after inserts SELECT * from ldb_get_indexes('small_world'); indexname | size | indexdef | indisvalid -------------------+-------+-------------------------------------------------------------------------------------+------------ small_world_v_idx | 16 kB | CREATE INDEX small_world_v_idx ON small_world USING lantern_hnsw (v) WITH (dim='3') | t -(1 row) -- Ensure the query plan remains consistent after inserts EXPLAIN (COSTS FALSE) @@ -108,14 +104,13 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}' + v <-> '{0,0,0}' LIMIT 10; QUERY PLAN --------------------------------------------------------- Limit -> Index Scan using small_world_v_idx on small_world - Order By: (v '{0,0,0}'::real[]) -(3 rows) + Order By: (v <-> '{0,0,0}'::real[]) SELECT _lantern_internal.validate_index('small_world_v_idx', false); INFO: validate_index() start for small_world_v_idx @@ -123,7 +118,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Test the index with a larger number of vectors CREATE TABLE sift_base10k ( @@ -136,12 +130,11 @@ INFO: inserted 0 elements INFO: done saving 0 vectors \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444'; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Index Scan using hnsw_idx on sift_base10k - Order By: (v '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) -(2 rows) + Order By: (v <-> '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -149,5 +142,4 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_insert_unlogged.out b/lantern_hnsw/test/expected/hnsw_insert_unlogged.out index 97ce44ddf..0a7f7a0f4 100644 --- a/lantern_hnsw/test/expected/hnsw_insert_unlogged.out +++ b/lantern_hnsw/test/expected/hnsw_insert_unlogged.out @@ -25,7 +25,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Insert rows with valid vector data INSERT INTO small_world (v) VALUES ('{0,0,1}'), ('{0,1,0}'); @@ -65,7 +64,6 @@ INFO: done init usearch index INFO: inserted 8 elements INFO: done saving 8 vectors SET enable_seqscan = false; -SET lantern.pgvector_compat = false; -- Inserting vectors of the same dimension and nulls should work INSERT INTO small_world (v) VALUES ('{1,1,2}'); INSERT INTO small_world (v) VALUES (NULL); @@ -80,7 +78,7 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}'; + v <-> '{0,0,0}'; round ------- 0.00 @@ -92,14 +90,12 @@ ORDER BY 2.00 3.00 6.00 -(9 rows) -- Ensure the index size remains consistent after inserts SELECT * from ldb_get_indexes('small_world'); indexname | size | indexdef | indisvalid -------------------+-------+-------------------------------------------------------------------------------------+------------ small_world_v_idx | 16 kB | CREATE INDEX small_world_v_idx ON small_world USING lantern_hnsw (v) WITH (dim='3') | t -(1 row) -- Ensure the query plan remains consistent after inserts EXPLAIN (COSTS FALSE) @@ -108,14 +104,13 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}' + v <-> '{0,0,0}' LIMIT 10; QUERY PLAN --------------------------------------------------------- Limit -> Index Scan using small_world_v_idx on small_world - Order By: (v '{0,0,0}'::real[]) -(3 rows) + Order By: (v <-> '{0,0,0}'::real[]) SELECT _lantern_internal.validate_index('small_world_v_idx', false); INFO: validate_index() start for small_world_v_idx @@ -123,7 +118,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Test the index with a larger number of vectors CREATE UNLOGGED TABLE sift_base10k ( @@ -136,12 +130,11 @@ INFO: inserted 0 elements INFO: done saving 0 vectors \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444'; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Index Scan using hnsw_idx on sift_base10k - Order By: (v '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) -(2 rows) + Order By: (v <-> '{55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26}'::real[]) SELECT _lantern_internal.validate_index('hnsw_idx', false); INFO: validate_index() start for hnsw_idx @@ -149,5 +142,4 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_logged_unlogged.out b/lantern_hnsw/test/expected/hnsw_logged_unlogged.out index 4530f12f8..7c01989aa 100644 --- a/lantern_hnsw/test/expected/hnsw_logged_unlogged.out +++ b/lantern_hnsw/test/expected/hnsw_logged_unlogged.out @@ -28,7 +28,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Query SET enable_seqscan = false; @@ -43,7 +42,6 @@ SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY 101 | 6 | {1,2,0,1} 110 | 7 | {1,2,1,1} 111 | 12 | {2,2,2,0} -(8 rows) -- Switch table to be unlogged ALTER TABLE small_world SET UNLOGGED; @@ -62,7 +60,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_idx2', false); INFO: validate_index() start for small_world_idx2 @@ -70,7 +67,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Insert INSERT INTO small_world (id, vector) VALUES ('002', '{0,3,1,1}'); @@ -87,7 +83,6 @@ SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY 110 | 7 | {1,2,1,1} 002 | 11 | {0,3,1,1} 111 | 12 | {2,2,2,0} -(9 rows) -- Switch table to be logged again ALTER TABLE small_world SET LOGGED; @@ -109,7 +104,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_idx2', false); INFO: validate_index() start for small_world_idx2 @@ -117,7 +111,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_idx3', false); INFO: validate_index() start for small_world_idx3 @@ -125,7 +118,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Insert INSERT INTO small_world (id, vector) VALUES ('020', '{0,0,4,0}'); @@ -143,7 +135,6 @@ SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY 002 | 11 | {0,3,1,1} 111 | 12 | {2,2,2,0} 020 | 16 | {0,0,4,0} -(10 rows) -- -------------------------- -- Start with unlogged table @@ -175,7 +166,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Query SET enable_seqscan = false; @@ -190,7 +180,6 @@ SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY v 101 | 6 | {1,2,0,1} 110 | 7 | {1,2,1,1} 111 | 12 | {2,2,2,0} -(8 rows) -- Switch table to be logged ALTER TABLE small_world SET LOGGED; @@ -209,7 +198,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_idx2', false); INFO: validate_index() start for small_world_idx2 @@ -217,7 +205,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Insert INSERT INTO small_world (id, vector) VALUES ('002', '{0,3,1,1}'); @@ -234,7 +221,6 @@ SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY v 110 | 7 | {1,2,1,1} 002 | 11 | {0,3,1,1} 111 | 12 | {2,2,2,0} -(9 rows) -- Switch table to be unlogged again ALTER TABLE small_world SET UNLOGGED; @@ -256,7 +242,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_idx2', false); INFO: validate_index() start for small_world_idx2 @@ -264,7 +249,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('small_world_idx3', false); INFO: validate_index() start for small_world_idx3 @@ -272,7 +256,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- Insert INSERT INTO small_world (id, vector) VALUES ('020', '{0,0,4,0}'); @@ -290,5 +273,4 @@ SELECT id, l2sq_dist(vector, '{0, 0, 0, 0}'), vector FROM small_world ORDER BY v 002 | 11 | {0,3,1,1} 111 | 12 | {2,2,2,0} 020 | 16 | {0,0,4,0} -(10 rows) diff --git a/lantern_hnsw/test/expected/hnsw_operators.out b/lantern_hnsw/test/expected/hnsw_operators.out index acc95be8c..949c73df1 100644 --- a/lantern_hnsw/test/expected/hnsw_operators.out +++ b/lantern_hnsw/test/expected/hnsw_operators.out @@ -1,31 +1,19 @@ --- Validate that lantern.pgvector_compat disables the operator rewriting hooks +\set ON_ERROR_STOP off CREATE TABLE op_test (v REAL[]); INSERT INTO op_test (v) VALUES (ARRAY[0,0,0]), (ARRAY[1,1,1]); CREATE INDEX cos_idx ON op_test USING lantern_hnsw(v dist_cos_ops); INFO: done init usearch index INFO: inserted 2 elements INFO: done saving 2 vectors --- should rewrite operator -SET lantern.pgvector_compat=FALSE; -SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; - v ---------- - {1,1,1} - {0,0,0} -(2 rows) - -\set ON_ERROR_STOP off -SET lantern.pgvector_compat=TRUE; --- should throw error -SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; -ERROR: Operator can only be used when lantern.pgvector_compat=FALSE +-- Expect deprecation error due to use of the operator +SELECT ARRAY[1,2,3] ARRAY[3,2,1]; +ERROR: Operator is deprecated. Please explicitly use the operator that matches your distance function. -- should not throw error SELECT * FROM op_test ORDER BY v <=> ARRAY[1,1,1]; v --------- {1,1,1} {0,0,0} -(2 rows) -- should not throw error SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1,1]; @@ -33,7 +21,6 @@ SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1,1]; --------- {1,1,1} {0,0,0} -(2 rows) -- should not throw error SELECT v <-> ARRAY[1,1,1] FROM op_test ORDER BY v <-> ARRAY[1,1,1]; @@ -41,9 +28,7 @@ SELECT v <-> ARRAY[1,1,1] FROM op_test ORDER BY v <-> ARRAY[1,1,1]; ---------- 0 3 -(2 rows) -SET lantern.pgvector_compat=FALSE; SET enable_seqscan=OFF; \set ON_ERROR_STOP on -- one-off vector distance calculations should work with relevant operator @@ -52,66 +37,47 @@ SELECT ARRAY[0,0,0] <-> ARRAY[2,3,-4]; ?column? ---------- 29 -(1 row) -- with float arrays: SELECT ARRAY[0,0,0] <-> ARRAY[2,3,-4]::real[]; ?column? ---------- 29 -(1 row) SELECT ARRAY[0,0,0]::real[] <-> ARRAY[2,3,-4]::real[]; ?column? ---------- 29 -(1 row) SELECT '{1,0,1}' <-> '{0,1,0}'::integer[]; ?column? ---------- 3 -(1 row) SELECT '{1,0,1}' <=> '{0,1,0}'::integer[]; ?column? ---------- 1 -(1 row) SELECT ROUND(num::NUMERIC, 2) FROM (SELECT '{1,1,1}' <=> '{0,1,0}'::INTEGER[] AS num) _sub; round ------- 0.42 -(1 row) SELECT ARRAY[.1,0,0] <=> ARRAY[0,.5,0]; ?column? ---------- 1 -(1 row) SELECT cos_dist(ARRAY[.1,0,0]::real[], ARRAY[0,.5,0]::real[]); cos_dist ---------- 1 -(1 row) SELECT ARRAY[1,0,0] <+> ARRAY[0,1,0]; ?column? ---------- 2 -(1 row) - --- NOW THIS IS TRIGGERING INDEX SCAN AS WELL --- BECAUSE WE ARE REGISTERING FOR ALL OPERATOR CLASSES --- IDEALLY THIS SHOULD NOT TRIGGER INDEX SCAN WHEN lantern.pgvector_compat=TRUE -EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; - QUERY PLAN ---------------------------------------- - Index Scan using cos_idx on op_test - Order By: (v '{1,1,1}'::real[]) -(2 rows) -- should sort with index EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v <=> ARRAY[1,1,1]; @@ -119,7 +85,6 @@ EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v <=> ARRAY[1,1,1]; --------------------------------------- Index Scan using cos_idx on op_test Order By: (v <=> '{1,1,1}'::real[]) -(2 rows) -- should sort without index EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1,1]; @@ -128,7 +93,6 @@ EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1, Sort Sort Key: (((v)::integer[] <+> '{1,1,1}'::integer[])) -> Seq Scan on op_test -(3 rows) -- should not throw error \set ON_ERROR_STOP on @@ -137,7 +101,6 @@ SELECT v <=> ARRAY[1,1,1] FROM op_test ORDER BY v <=> ARRAY[1,1,1]; ---------- 0 1 -(2 rows) -- should not throw error SELECT v::INTEGER[] <+> ARRAY[1,1,1] FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1,1]; @@ -145,7 +108,6 @@ SELECT v::INTEGER[] <+> ARRAY[1,1,1] FROM op_test ORDER BY v::INTEGER[] <+> ARRA ---------- 0 3 -(2 rows) -- should not throw error SELECT v <-> ARRAY[1,1,1] FROM op_test ORDER BY v <-> ARRAY[1,1,1]; @@ -153,20 +115,10 @@ SELECT v <-> ARRAY[1,1,1] FROM op_test ORDER BY v <-> ARRAY[1,1,1]; ---------- 0 3 -(2 rows) RESET ALL; -- Set false twice to verify that no crash is happening -SET lantern.pgvector_compat=FALSE; \set ON_ERROR_STOP off --- should rewrite operator -SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; - v ---------- - {1,1,1} - {0,0,0} -(2 rows) - SET enable_seqscan=OFF; CREATE INDEX hamming_idx ON op_test USING lantern_hnsw(cast(v as INTEGER[]) dist_hamming_ops); INFO: done init usearch index @@ -178,7 +130,6 @@ EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v <=> ARRAY[1,1,1]; --------------------------------------- Index Scan using cos_idx on op_test Order By: (v <=> '{1,1,1}'::real[]) -(2 rows) -- should sort with hamming_idx index EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1,1]; @@ -186,5 +137,4 @@ EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1, ------------------------------------------------------- Index Scan using hamming_idx on op_test Order By: ((v)::integer[] <+> '{1,1,1}'::integer[]) -(2 rows) diff --git a/lantern_hnsw/test/expected/hnsw_pq.out b/lantern_hnsw/test/expected/hnsw_pq.out index 846ef26e9..7ff94a63e 100644 --- a/lantern_hnsw/test/expected/hnsw_pq.out +++ b/lantern_hnsw/test/expected/hnsw_pq.out @@ -77,19 +77,16 @@ SELECT '{84,1,4,128,255}'::pqvec; pqvec ------------------ {84,1,4,128,255} -(1 row) SELECT '{84,1,4,128,255}'::pqvec::INT[]; int4 ------------------ {84,1,4,128,255} -(1 row) SELECT '{84,1,4,128,255}'::INT[]::pqvec; pqvec ------------------ {84,1,4,128,255} -(1 row) \set ON_ERROR_STOP off -- Test PQVec type @@ -124,19 +121,16 @@ SELECT array_length(:'codebook'::REAL[][][], 1); array_length -------------- 1 -(1 row) SELECT array_length(:'codebook'::REAL[][][], 2); array_length -------------- 1 -(1 row) SELECT array_length(:'codebook'::REAL[][][], 3); array_length -------------- 128 -(1 row) -- This should create codebook[1][10][128] SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 1, 'l2sq', 0) as codebook \gset @@ -147,19 +141,16 @@ SELECT array_length(:'codebook'::REAL[][][], 1); array_length -------------- 1 -(1 row) SELECT array_length(:'codebook'::REAL[][][], 2); array_length -------------- 10 -(1 row) SELECT array_length(:'codebook'::REAL[][][], 3); array_length -------------- 128 -(1 row) -- This should create codebook[32][10][4] SELECT _lantern_internal.create_pq_codebook('sift_base1k'::regclass, 'v', 10, 32, 'l2sq', 0) as codebook \gset @@ -170,19 +161,16 @@ SELECT array_length(:'codebook'::REAL[][][], 1); array_length -------------- 32 -(1 row) SELECT array_length(:'codebook'::REAL[][][], 2); array_length -------------- 10 -(1 row) SELECT array_length(:'codebook'::REAL[][][], 3); array_length -------------- 4 -(1 row) -- This should create codebook _lantern_internal.pq_sift_base1k_v and add v_pq column in sift_base1k table with compressed vectors -- The codebook will be codebook[32][50][4], so in the table there should be 32 distinct subvector ids each with 50 centroid ids @@ -194,31 +182,26 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal.pq_sift_base1k_v; count ------- 32 -(1 row) SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal.pq_sift_base1k_v; count ------- 50 -(1 row) SELECT COUNT(*) FROM _lantern_internal.pq_sift_base1k_v; count ------- 1600 -(1 row) SELECT array_length(c, 1) FROM _lantern_internal.pq_sift_base1k_v LIMIT 1; array_length -------------- 4 -(1 row) -- Validate that table is readonly \set ON_ERROR_STOP off @@ -246,13 +229,11 @@ SELECT dequantize_vector(:'v1_pq', '_lantern_internal.pq_sift_base1k_v'::regclas ?column? ---------- 0 -(1 row) SELECT l2sq_dist(:'decompressed_1', :'decompressed_2'); l2sq_dist ----------- 0 -(1 row) -- Test recall for quantized vs non quantized vectors ALTER TABLE sift_base1k ADD COLUMN v_pq_dec REAL[]; @@ -264,14 +245,12 @@ SELECT :'recall_diff' < 0.2 as recall_diff_meets_threshold; recall_diff_meets_threshold ----------------------------- t -(1 row) -- Verify that column triggers for insert and update are working correctly INSERT INTO sift_base1k(id, v) VALUES (1001, random_array(128, 0.0, 5.0)); SELECT id FROM sift_base1k WHERE v_pq IS NULL; id ---- -(0 rows) SELECT v_pq::TEXT as old_pq FROM sift_base1k WHERE id=1001 \gset UPDATE sift_base1k SET v=(SELECT v FROM sift_base1k WHERE id=1) WHERE id=1001; @@ -280,20 +259,17 @@ SELECT :'old_pq' <> :'new_pq' as is_updated; is_updated ------------ t -(1 row) SELECT :'new_pq' = (SELECT v_pq::TEXT FROM sift_base1k WHERE id=1) as is_updated; is_updated ------------ t -(1 row) -- Verify that compressed column size is smaller than regular integer SELECT pg_column_size(v_pq) as compressed_size, pg_column_size(v_pq::int[]) as int_size FROM sift_base1k LIMIT 1; compressed_size | int_size -----------------+---------- 37 | 152 -(1 row) -- Verify that table can have multiple quantized vectors SELECT quantize_table('sift_base1k'::regclass, 'v_pq_dec', 10, 32, 'l2sq'); @@ -304,44 +280,37 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal.pq_sift_base1k_v_pq_dec; count ------- 32 -(1 row) SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal.pq_sift_base1k_v_pq_dec; count ------- 10 -(1 row) SELECT COUNT(*) FROM _lantern_internal.pq_sift_base1k_v_pq_dec; count ------- 320 -(1 row) SELECT array_length(c, 1) FROM _lantern_internal.pq_sift_base1k_v_pq_dec LIMIT 1; array_length -------------- 4 -(1 row) -- Test that resources are being cleared correctly SELECT drop_quantization('sift_base1k'::regclass, 'v'); drop_quantization ------------------- -(1 row) SELECT drop_quantization('sift_base1k'::regclass, 'v_pq_dec'); drop_quantization ------------------- -(1 row) SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' AND table_name = 'sift_base1k'; column_name @@ -349,12 +318,10 @@ SELECT column_name FROM information_schema.columns WHERE table_schema = 'public' id v v_pq_dec -(3 rows) SELECT table_name FROM information_schema.tables WHERE table_schema = '_lantern_internal'; table_name ------------ -(0 rows) -- Test quantization over subset of data SELECT quantize_table('sift_base1k'::regclass, 'v', 10, 32, 'l2sq', 500); @@ -365,31 +332,26 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal.pq_sift_base1k_v; count ------- 32 -(1 row) SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal.pq_sift_base1k_v; count ------- 10 -(1 row) SELECT COUNT(*) FROM _lantern_internal.pq_sift_base1k_v; count ------- 320 -(1 row) SELECT array_length(c, 1) FROM _lantern_internal.pq_sift_base1k_v LIMIT 1; array_length -------------- 4 -(1 row) -- Test quantization with mixed case and schema qualified table name SELECT id, v AS "v_New" into "sift_Base1k_NEW" FROM sift_base1k; @@ -401,7 +363,6 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT array_length( dequantize_vector( @@ -415,11 +376,9 @@ SELECT array_length( array_length -------------- 128 -(1 row) SELECT drop_quantization('"sift_Base1k_NEW"'::regclass, 'v_New'); drop_quantization ------------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_pq_index.out b/lantern_hnsw/test/expected/hnsw_pq_index.out index a3be20de6..651c42abf 100644 --- a/lantern_hnsw/test/expected/hnsw_pq_index.out +++ b/lantern_hnsw/test/expected/hnsw_pq_index.out @@ -103,19 +103,16 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT COUNT(DISTINCT subvector_id) FROM _lantern_internal.pq_small_world_pq_v; count ------- 4 -(1 row) SELECT COUNT(DISTINCT centroid_id) FROM _lantern_internal.pq_small_world_pq_v; count ------- 10 -(1 row) ALTER TABLE small_world_pq ADD COLUMN v_pq_dec REAL[]; UPDATE small_world_pq SET v_pq_dec=dequantize_vector(v_pq, '_lantern_internal.pq_small_world_pq_v'); @@ -132,19 +129,16 @@ EXPLAIN (COSTS FALSE) SELECT id, v, v_pq, v_pq_dec FROM small_world_pq ORDER BY Limit -> Index Scan using hnsw_l2_index on small_world_pq Order By: (v <-> '{0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4}'::real[]) -(3 rows) SELECT id FROM small_world_pq ORDER BY v <-> :'v4' LIMIT 1; id ---- 4 -(1 row) SELECT * FROM ldb_get_indexes('small_world_pq'); indexname | size | indexdef | indisvalid ---------------+-------+---------------------------------------------------------------------------------------+------------ hnsw_l2_index | 16 kB | CREATE INDEX hnsw_l2_index ON small_world_pq USING lantern_hnsw (v) WITH (pq='false') | t -(1 row) DROP INDEX hnsw_l2_index; -- index with pq @@ -158,7 +152,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) EXPLAIN (COSTS FALSE) SELECT id, v, v_pq, v_pq_dec, (v <-> :'v4') as dist, (v_pq_dec <-> :'v4') real_dist FROM small_world_pq ORDER BY dist LIMIT 1; QUERY PLAN @@ -166,13 +159,11 @@ EXPLAIN (COSTS FALSE) SELECT id, v, v_pq, v_pq_dec, (v <-> :'v4') as dist, (v_pq Limit -> Index Scan using hnsw_pq_l2_index on small_world_pq Order By: (v <-> '{0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4}'::real[]) -(3 rows) SELECT id FROM small_world_pq ORDER BY v <-> :'v4' LIMIT 1; id ---- 4 -(1 row) ALTER TABLE small_world_pq DROP COLUMN v_pq; ALTER TABLE small_world_pq DROP COLUMN v_pq_dec; @@ -186,7 +177,6 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) ALTER TABLE small_world_pq ADD COLUMN v_pq_dec REAL[]; -- GENERATED ALWAYS AS (dequantize_vector("v_pq", '_lanternpq_small_world_pq')) STORED; -- << cannot do because genrated columns cannot refer to other generated columns UPDATE small_world_pq SET v_pq_dec=dequantize_vector(v_pq, '_lantern_internal.pq_small_world_pq_v'); @@ -200,7 +190,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) EXPLAIN (COSTS FALSE) SELECT id, v, v_pq, v_pq_dec, (v <-> :'v4') as dist, (v_pq_dec <-> :'v4') real_dist FROM small_world_pq ORDER BY dist LIMIT 1; QUERY PLAN @@ -208,7 +197,6 @@ EXPLAIN (COSTS FALSE) SELECT id, v, v_pq, v_pq_dec, (v <-> :'v4') as dist, (v_pq Limit -> Index Scan using hnsw_pq_l2_index on small_world_pq Order By: (v <-> '{0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4}'::real[]) -(3 rows) -- add another entry with vector v4, and search for it again INSERT INTO small_world_pq(id, v) VALUES (42, :'v4'); @@ -227,7 +215,6 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) ALTER TABLE small_world_pq ADD COLUMN v_pq_dec REAL[]; UPDATE small_world_pq SET v_pq_dec=dequantize_vector(v_pq, '_lantern_internal.pq_small_world_pq_v'); @@ -241,7 +228,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- we had inserted a value with id=42 and vector=:'v4' above, before making the table unlogged -- disable these since they are flaky, depending on the the quality of the codebook @@ -261,7 +247,6 @@ INFO: Compressing vectors... quantize_table ---------------- -(1 row) SELECT v as v1 FROM sift_base1k WHERE id=1 \gset SELECT v_pq as v1_pq FROM sift_base1k WHERE id=1 \gset @@ -296,7 +281,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT calculate_table_recall('sift_base1k', 'sift_query1k', 'sift_truth1k', 'v', 10, 100) as recall_pq_index \gset SELECT (:'recall_pq'::float - :'recall_pq_index'::float)::float as recall_diff \gset @@ -306,7 +290,6 @@ SELECT :recall_diff >= 0 AND :recall_diff <= 0.1 as recall_within_range; recall_within_range --------------------- t -(1 row) -- inserts SELECT v as v2 FROM sift_base1k WHERE id=2 \gset @@ -318,14 +301,12 @@ SELECT SUM(id1002::int) = 1 as contains_id_1002 FROM (SELECT id = 1002 as id1002 contains_id_1002 ------------------ t -(1 row) -- the top two results must be the vectors corresponding to v2 SELECT ARRAY_AGG(id ORDER BY id) FROM (SELECT id FROM sift_base1k ORDER BY v <-> :'v2' LIMIT 2) b; array_agg ----------- {2,1001} -(1 row) -- since codebook are generated each time and are non deterministic, we cannot print them in regression tests. -- run something like the following to view the results diff --git a/lantern_hnsw/test/expected/hnsw_select.out b/lantern_hnsw/test/expected/hnsw_select.out index 09dc1717f..279e1e2de 100644 --- a/lantern_hnsw/test/expected/hnsw_select.out +++ b/lantern_hnsw/test/expected/hnsw_select.out @@ -40,23 +40,20 @@ INFO: done init usearch index INFO: inserted 1 elements INFO: done saving 1 vectors SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; -- Verify that basic queries still work given our query parser and planner hooks SELECT 0 + 1; ?column? ---------- 1 -(1 row) SELECT 1 FROM test1 WHERE id = 0 + 1; ?column? ---------- 1 -(1 row) -- Verify that the index is being used SET _lantern_internal.is_test = true; -EXPLAIN (COSTS FALSE) SELECT * FROM small_world order by v '{1,0,0}' LIMIT 1; +EXPLAIN (COSTS FALSE) SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 1; DEBUG: LANTERN - Query cost estimator DEBUG: LANTERN - --------------------- DEBUG: LANTERN - Total cost: 4.015000 @@ -68,8 +65,7 @@ DEBUG: LANTERN - --------------------- --------------------------------------------------------- Limit -> Index Scan using small_world_v_idx on small_world - Order By: (v '{1,0,0}'::real[]) -(3 rows) + Order By: (v <-> '{1,0,0}'::real[]) -- Verify that this does not use the index EXPLAIN (COSTS FALSE) SELECT 1 FROM small_world WHERE v = '{0,0,0}'; @@ -77,11 +73,10 @@ EXPLAIN (COSTS FALSE) SELECT 1 FROM small_world WHERE v = '{0,0,0}'; ----------------------------------- Seq Scan on small_world Filter: (v = '{0,0,0}'::real[]) -(2 rows) -- Ensure we can query an index for more elements than the value of init_k WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 3 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 3 ) SELECT COUNT(*) from neighbors; DEBUG: LANTERN - Query cost estimator DEBUG: LANTERN - --------------------- @@ -94,10 +89,9 @@ DEBUG: LANTERN querying index for 10 elements count ------- 3 -(1 row) WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 15 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 15 ) SELECT COUNT(*) from neighbors; DEBUG: LANTERN - Query cost estimator DEBUG: LANTERN - --------------------- @@ -110,12 +104,11 @@ DEBUG: LANTERN querying index for 10 elements count ------- 8 -(1 row) -- Change default k and make sure the number of usearch_searchs makes sense SET lantern_hnsw.init_k = 4; WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 3 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 3 ) SELECT COUNT(*) from neighbors; DEBUG: LANTERN - Query cost estimator DEBUG: LANTERN - --------------------- @@ -128,10 +121,9 @@ DEBUG: LANTERN querying index for 4 elements count ------- 3 -(1 row) WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 15 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 15 ) SELECT COUNT(*) from neighbors; DEBUG: LANTERN - Query cost estimator DEBUG: LANTERN - --------------------- @@ -146,259 +138,57 @@ DEBUG: LANTERN - querying index for 8 elements count ------- 8 -(1 row) RESET client_min_messages; SET _lantern_internal.is_test = false; -- Verify where condition works properly and still uses index -SELECT has_index_scan('EXPLAIN SELECT * FROM small_world WHERE b IS TRUE ORDER BY v ''{0,0,0}'''); +SELECT has_index_scan('EXPLAIN SELECT * FROM small_world WHERE b IS TRUE ORDER BY v <-> ''{0,0,0}'''); has_index_scan ---------------- t -(1 row) -- Verify that the index is not being used when there is no order by SELECT NOT has_index_scan('EXPLAIN SELECT COUNT(*) FROM small_world'); ?column? ---------- t -(1 row) -- Verify swapping order doesn't change anything and still uses index -SELECT has_index_scan('EXPLAIN SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] v'); +SELECT has_index_scan('EXPLAIN SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] <-> v'); has_index_scan ---------------- t -(1 row) -- Verify group by works and uses index -SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] v LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1'); +SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] <-> v LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1'); has_index_scan ---------------- t -(1 row) -- Validate distinct works and uses index -SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v ''{1,2}'' LIMIT 1) SELECT DISTINCT id FROM t'); +SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v <-> ''{1,2}'' LIMIT 1) SELECT DISTINCT id FROM t'); has_index_scan ---------------- t -(1 row) -- Validate join lateral works and uses index -SELECT has_index_scan('EXPLAIN SELECT t1_results.id FROM test2 t2 JOIN LATERAL (SELECT t1.id FROM test1 t1 ORDER BY t2.v t1.v LIMIT 1) t1_results ON TRUE'); +SELECT has_index_scan('EXPLAIN SELECT t1_results.id FROM test2 t2 JOIN LATERAL (SELECT t1.id FROM test1 t1 ORDER BY t2.v <-> t1.v LIMIT 1) t1_results ON TRUE'); has_index_scan ---------------- t -(1 row) -- Validate union works and uses index -SELECT has_index_scan('EXPLAIN (SELECT id FROM test1 ORDER BY v ''{1,4}'') UNION (SELECT id FROM test1 ORDER BY v IS NOT NULL LIMIT 1)'); +SELECT has_index_scan('EXPLAIN (SELECT id FROM test1 ORDER BY v <-> ''{1,4}'') UNION (SELECT id FROM test1 ORDER BY v IS NOT NULL LIMIT 1)'); has_index_scan ---------------- t -(1 row) -- Validate CTEs work and still use index -SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v ''{1,4}'') SELECT id FROM t UNION SELECT id FROM t'); +SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v <-> ''{1,4}'') SELECT id FROM t UNION SELECT id FROM t'); has_index_scan ---------------- t -(1 row) --- Validate is replaced with the matching function when an index is present -set enable_seqscan = true; -set enable_indexscan = false; -EXPLAIN (COSTS false) SELECT * from small_world ORDER BY v '{1,1,1}'; - QUERY PLAN ------------------------------------------------ - Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> Seq Scan on small_world -(3 rows) - -SELECT * from small_world ORDER BY v '{1,1,1}'; - id | b | v ------+---+--------- - 111 | t | {1,1,1} - 101 | f | {1,0,1} - 110 | f | {1,1,0} - 011 | t | {0,1,1} - 100 | f | {1,0,0} - 001 | t | {0,0,1} - 010 | f | {0,1,0} - 000 | t | {0,0,0} -(8 rows) - -begin; -INSERT INTO test2 (v) VALUES ('{1,4}'); -INSERT INTO test2 (v) VALUES ('{2,4}'); -CREATE INDEX test2_cos ON test2 USING lantern_hnsw(v dist_cos_ops); -INFO: done init usearch index -INFO: inserted 3 elements -INFO: done saving 3 vectors -EXPLAIN (COSTS false) SELECT * from test2 ORDER BY v '{1,4}'; - QUERY PLAN --------------------------------------------- - Sort - Sort Key: (cos_dist(v, '{1,4}'::real[])) - -> Seq Scan on test2 -(3 rows) - --- Some additional cases that trigger operator rewriting --- SampleScan -EXPLAIN (COSTS false) SELECT * FROM small_world TABLESAMPLE BERNOULLI (20) ORDER BY v '{1,1,1}' ASC; - QUERY PLAN ------------------------------------------------ - Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> Sample Scan on small_world - Sampling: bernoulli ('20'::real) -(4 rows) - --- can't compare direct equality here because it's random -SELECT results_match('EXPLAIN SELECT * FROM small_world TABLESAMPLE BERNOULLI (20) ORDER BY v ''{1,1,1}'' ASC', - 'EXPLAIN SELECT * FROM small_world TABLESAMPLE BERNOULLI (20) ORDER BY l2sq_dist(v, ''{1,1,1}'') ASC'); - results_match ---------------- - t -(1 row) - --- SetOpt/HashSetOp -EXPLAIN (COSTS false) (SELECT * FROM small_world ORDER BY v '{1,0,1}' ASC ) EXCEPT (SELECT * FROM small_world ORDER by v '{1,1,1}' ASC LIMIT 5); - QUERY PLAN -------------------------------------------------------------------------------------- - HashSetOp Except - -> Append - -> Subquery Scan on "*SELECT* 1" - -> Sort - Sort Key: (l2sq_dist(small_world.v, '{1,0,1}'::real[])) - -> Seq Scan on small_world - -> Subquery Scan on "*SELECT* 2" - -> Limit - -> Sort - Sort Key: (l2sq_dist(small_world_1.v, '{1,1,1}'::real[])) - -> Seq Scan on small_world small_world_1 -(11 rows) - -SELECT results_match('(SELECT * FROM small_world ORDER BY v ''{1,0,1}'' ASC ) EXCEPT (SELECT * FROM small_world ORDER by v ''{1,1,1}'' ASC LIMIT 5)', - '(SELECT * FROM small_world ORDER BY l2sq_dist(v, ''{1,0,1}'') ASC ) EXCEPT (SELECT * FROM small_world ORDER by l2sq_dist(v, ''{1,1,1}'') ASC LIMIT 5)'); - results_match ---------------- - t -(1 row) - --- HashAggregate -EXPLAIN (COSTS false) SELECT v, COUNT(*) FROM small_world GROUP BY v ORDER BY v '{1,1,1}'; - QUERY PLAN ------------------------------------------------ - Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> HashAggregate - Group Key: v - -> Seq Scan on small_world -(5 rows) - -SELECT results_match('SELECT v, COUNT(*) FROM small_world GROUP BY v ORDER BY v ''{1,1,1}''', - 'SELECT v, COUNT(*) FROM small_world GROUP BY v ORDER BY l2sq_dist(v, ''{1,1,1}'')'); - results_match ---------------- - t -(1 row) - --- GroupBy this -EXPLAIN (COSTS false) SELECT * FROM small_world GROUP BY id, v, b ORDER BY v '{1,1,1}'; - QUERY PLAN ------------------------------------------------ - Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> HashAggregate - Group Key: id, v, b - -> Seq Scan on small_world -(5 rows) - -SELECT results_match('SELECT * FROM small_world GROUP BY id, v, b ORDER BY v ''{1,1,1}''', - 'SELECT * FROM small_world GROUP BY id, v, b ORDER BY l2sq_dist(v, ''{1,1,1}'')'); - results_match ---------------- - t -(1 row) - --- HashJoin/Hash -CREATE TABLE small_world_2 AS (SELECT * FROM small_world); -EXPLAIN (COSTS false) SELECT * FROM small_world JOIN small_world_2 using (v) ORDER BY v '{1,1,1}'; - QUERY PLAN ------------------------------------------------------------ - Sort - Sort Key: (l2sq_dist(small_world.v, '{1,1,1}'::real[])) - -> Hash Join - Hash Cond: (small_world_2.v = small_world.v) - -> Seq Scan on small_world_2 - -> Hash - -> Seq Scan on small_world -(7 rows) - -SELECT results_match('SELECT * FROM small_world JOIN small_world_2 using (v) ORDER BY v ''{1,1,1}''', - 'SELECT * FROM small_world JOIN small_world_2 using (v) ORDER BY l2sq_dist(v, ''{1,1,1}'')'); - results_match ---------------- - t -(1 row) - --- MixedAggregate (this doesn't require additional logic, but I include it here as an example of generating the path) -EXPLAIN (COSTS false) SELECT v FROM small_world GROUP BY ROLLUP(v) ORDER BY v '{1,1,1}'; - QUERY PLAN ------------------------------------------------ - Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> MixedAggregate - Hash Key: v - Group Key: () - -> Seq Scan on small_world -(6 rows) - -SELECT results_match('SELECT v FROM small_world GROUP BY ROLLUP(v) ORDER BY v ''{1,1,1}''', - 'SELECT v FROM small_world GROUP BY ROLLUP(v) ORDER BY l2sq_dist(v, ''{1,1,1}'')'); - results_match ---------------- - t -(1 row) - --- WindowAgg -EXPLAIN (COSTS false) SELECT v, EVERY(b) OVER () FROM small_world ORDER BY v '{1,1,1}'; - QUERY PLAN ------------------------------------------------ - Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> WindowAgg - -> Seq Scan on small_world -(4 rows) - -SELECT results_match('SELECT v, EVERY(b) OVER () FROM small_world ORDER BY v ''{1,1,1}''', - 'SELECT v, EVERY(b) OVER () FROM small_world ORDER BY l2sq_dist(v, ''{1,1,1}'')'); - results_match ---------------- - t -(1 row) - --- LockRows -EXPLAIN (COSTS false) SELECT * FROM small_world ORDER BY v '{1,1,1}' ASC FOR UPDATE; - QUERY PLAN ------------------------------------------------------ - LockRows - -> Sort - Sort Key: (l2sq_dist(v, '{1,1,1}'::real[])) - -> Seq Scan on small_world -(4 rows) - -SELECT results_match('SELECT * FROM small_world ORDER BY v ''{1,1,1}'' ASC FOR UPDATE', - 'SELECT * FROM small_world ORDER BY l2sq_dist(v, ''{1,1,1}'') ASC FOR UPDATE'); - results_match ---------------- - t -(1 row) - -rollback; set enable_indexscan = true; set enable_seqscan = false; -- test pagination in face of duplicates @@ -434,7 +224,6 @@ SELECT fill_same(); fill_same ----------- -(1 row) CREATE INDEX hnsw_l2_index_repeat ON small_world_repeat USING lantern_hnsw(v); INFO: done init usearch index @@ -454,7 +243,6 @@ explain (costs false) select id, ARRAY_AGG(dist) as dists, count(id) as cnt from -> Limit -> Index Scan using hnsw_l2_index_repeat on small_world_repeat Order By: (v <-> '{0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4}'::real[]) -(8 rows) select case when s.cnt > 1 then 'incorrect' else 'correct' end from ( select id, ARRAY_AGG(dist) as dists, count(id) as cnt from (select id, (v <-> ARRAY[0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4]) as dist FROM small_world_repeat order by dist LIMIT 200) b group by id order by cnt DESC, dists, id limit 10 @@ -471,7 +259,6 @@ explain (costs false) select id, ARRAY_AGG(dist) as dists, count(id) as cnt from correct correct correct -(10 rows) set lantern_hnsw.init_k=200; select id, ARRAY_AGG(dist) as dists, count(id) as cnt from (select id, (v <-> ARRAY[0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4,0.4]) as dist FROM small_world_repeat order by dist LIMIT 200) b group by id order by cnt DESC, dists, id limit 10; @@ -487,7 +274,6 @@ set lantern_hnsw.init_k=200; 1007 | {0} | 1 1008 | {0} | 1 1009 | {0} | 1 -(10 rows) -- todo:: Verify joins work and still use index -- todo:: Verify incremental sorts work @@ -498,7 +284,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); INFO: validate_index() start for sift_base1k_v_idx @@ -506,7 +291,6 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) SELECT _lantern_internal.validate_index('test1_v_idx', false); INFO: validate_index() start for test1_v_idx @@ -514,5 +298,4 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) diff --git a/lantern_hnsw/test/expected/hnsw_sq.out b/lantern_hnsw/test/expected/hnsw_sq.out index f7ea4198d..67cc68d44 100644 --- a/lantern_hnsw/test/expected/hnsw_sq.out +++ b/lantern_hnsw/test/expected/hnsw_sq.out @@ -47,7 +47,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); -----------+--------+---------------------------------------------------------------------------------------------------+------------ ind16 | 400 kB | CREATE INDEX ind16 ON sift_base1k USING lantern_hnsw (v) WITH (dim='128', m='8', quant_bits='16') | t ind32 | 680 kB | CREATE INDEX ind32 ON sift_base1k USING lantern_hnsw (v) WITH (dim='128', m='8', quant_bits='32') | t -(2 rows) SELECT v as v42 from sift_base1k WHERE id = 42 \gset BEGIN; @@ -59,7 +58,6 @@ EXPLAIN (COSTS FALSE) SELECT id, ROUND((v <-> :'v42')::numeric, 1) as dist FROM Limit -> Index Scan using ind32 on sift_base1k Order By: (v <-> '{1,0,0,0,0,0,21,35,1,0,0,0,0,77,51,42,66,2,0,0,0,86,140,71,52,1,0,0,0,0,23,70,2,0,0,0,0,64,73,50,11,0,0,0,0,140,97,18,140,64,0,0,0,99,51,65,78,11,0,0,0,0,41,76,0,0,0,0,0,124,82,2,48,1,0,0,0,118,31,5,140,21,0,0,0,4,12,78,12,0,0,0,0,0,58,117,1,0,0,0,2,25,7,2,46,2,0,0,1,12,4,8,140,9,0,0,0,1,8,16,3,0,0,0,0,0,21,34}'::real[]) -(3 rows) SELECT id, ROUND((v <-> :'v42')::numeric, 1) as dist, l2sq_dist(v, :'v42') FROM sift_base1k ORDER BY v <-> :'v42' LIMIT 10; id | dist | l2sq_dist @@ -74,7 +72,6 @@ EXPLAIN (COSTS FALSE) SELECT id, ROUND((v <-> :'v42')::numeric, 1) as dist FROM 340 | 87261.0 | 87261 331 | 87796.0 | 87796 682 | 94988.0 | 94988 -(10 rows) ROLLBACK; DROP INDEX ind32, ind16; @@ -94,7 +91,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -----------+--------+-------------------------------------------------------------------------------------------------------------+------------ ind8 | 272 kB | CREATE INDEX ind8 ON sift_base1k USING lantern_hnsw (v_transformed) WITH (dim='128', m='8', quant_bits='8') | t -(1 row) EXPLAIN SELECT id, ROUND((v_transformed <-> :'v_transformed')::numeric, 1) as dist FROM sift_base1k ORDER BY v_transformed <-> :'v_transformed' LIMIT 10; QUERY PLAN @@ -102,7 +98,6 @@ EXPLAIN SELECT id, ROUND((v_transformed <-> :'v_transformed')::numeric, 1) as di Limit (cost=0.00..9.26 rows=10 width=40) -> Index Scan using ind8 on sift_base1k (cost=0.00..926.29 rows=1000 width=40) Order By: (v_transformed <-> '{-0.49,-0.5,-0.5,-0.5,-0.5,-0.5,-0.29,-0.15,-0.49,-0.5,-0.5,-0.5,-0.5,0.27,0.01,-0.08,0.16,-0.48,-0.5,-0.5,-0.5,0.36,0.9,0.21,0.02,-0.49,-0.5,-0.5,-0.5,-0.5,-0.27,0.2,-0.48,-0.5,-0.5,-0.5,-0.5,0.14,0.23,0,-0.39,-0.5,-0.5,-0.5,-0.5,0.9,0.47,-0.32,0.9,0.14,-0.5,-0.5,-0.5,0.49,0.01,0.15,0.28,-0.39,-0.5,-0.5,-0.5,-0.5,-0.09,0.26,-0.5,-0.5,-0.5,-0.5,-0.5,0.74,0.32,-0.48,-0.02,-0.49,-0.5,-0.5,-0.5,0.68,-0.19,-0.45,0.9,-0.29,-0.5,-0.5,-0.5,-0.46,-0.38,0.28,-0.38,-0.5,-0.5,-0.5,-0.5,-0.5,0.08,0.67,-0.49,-0.5,-0.5,-0.5,-0.48,-0.25,-0.43,-0.48,-0.04,-0.48,-0.5,-0.5,-0.49,-0.38,-0.46,-0.42,0.9,-0.41,-0.5,-0.5,-0.5,-0.49,-0.42,-0.34,-0.47,-0.5,-0.5,-0.5,-0.5,-0.5,-0.29,-0.16}'::real[]) -(3 rows) SELECT id, ROUND((v_transformed <-> :'v_transformed')::numeric, 1) as dist FROM sift_base1k ORDER BY v_transformed <-> :'v_transformed' LIMIT 10; id | dist @@ -117,13 +112,11 @@ EXPLAIN SELECT id, ROUND((v_transformed <-> :'v_transformed')::numeric, 1) as di 340 | 8.7 331 | 8.8 682 | 9.5 -(10 rows) DROP INDEX ind8; SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -----------+------+----------+------------ -(0 rows) SELECT v_transformed as v_transformed42 from sift_base1k WHERE id = 42 \gset CREATE INDEX ind1 ON sift_base1k USING lantern_hnsw (v_transformed) WITH (dim=128, M=8, quant_bits=1); @@ -134,7 +127,6 @@ SELECT * FROM ldb_get_indexes('sift_base1k'); indexname | size | indexdef | indisvalid -----------+--------+-------------------------------------------------------------------------------------------------------------+------------ ind1 | 160 kB | CREATE INDEX ind1 ON sift_base1k USING lantern_hnsw (v_transformed) WITH (dim='128', m='8', quant_bits='1') | t -(1 row) EXPLAIN SELECT id, ROUND((v_transformed <-> :'v_transformed42')::numeric, 1) as dist FROM sift_base1k ORDER BY v_transformed <-> :'v_transformed42' LIMIT 4; QUERY PLAN @@ -142,7 +134,6 @@ EXPLAIN SELECT id, ROUND((v_transformed <-> :'v_transformed42')::numeric, 1) as Limit (cost=0.00..3.64 rows=4 width=40) -> Index Scan using ind1 on sift_base1k (cost=0.00..910.50 rows=1000 width=40) Order By: (v_transformed <-> '{-0.49,-0.5,-0.5,-0.5,-0.5,-0.5,-0.29,-0.15,-0.49,-0.5,-0.5,-0.5,-0.5,0.27,0.01,-0.08,0.16,-0.48,-0.5,-0.5,-0.5,0.36,0.9,0.21,0.02,-0.49,-0.5,-0.5,-0.5,-0.5,-0.27,0.2,-0.48,-0.5,-0.5,-0.5,-0.5,0.14,0.23,0,-0.39,-0.5,-0.5,-0.5,-0.5,0.9,0.47,-0.32,0.9,0.14,-0.5,-0.5,-0.5,0.49,0.01,0.15,0.28,-0.39,-0.5,-0.5,-0.5,-0.5,-0.09,0.26,-0.5,-0.5,-0.5,-0.5,-0.5,0.74,0.32,-0.48,-0.02,-0.49,-0.5,-0.5,-0.5,0.68,-0.19,-0.45,0.9,-0.29,-0.5,-0.5,-0.5,-0.46,-0.38,0.28,-0.38,-0.5,-0.5,-0.5,-0.5,-0.5,0.08,0.67,-0.49,-0.5,-0.5,-0.5,-0.48,-0.25,-0.43,-0.48,-0.04,-0.48,-0.5,-0.5,-0.49,-0.38,-0.46,-0.42,0.9,-0.41,-0.5,-0.5,-0.5,-0.49,-0.42,-0.34,-0.47,-0.5,-0.5,-0.5,-0.5,-0.5,-0.29,-0.16}'::real[]) -(3 rows) SELECT id, ROUND((v_transformed <-> :'v_transformed42')::numeric, 1) as dist FROM sift_base1k ORDER BY v_transformed <-> :'v_transformed42' LIMIT 4; id | dist @@ -151,6 +142,5 @@ EXPLAIN SELECT id, ROUND((v_transformed <-> :'v_transformed42')::numeric, 1) as 36 | 1.1 886 | 7.2 340 | 8.7 -(4 rows) -- test on 2000+ dim vectors diff --git a/lantern_hnsw/test/expected/hnsw_todo.out b/lantern_hnsw/test/expected/hnsw_todo.out index 8b701d53e..6e7248d5b 100644 --- a/lantern_hnsw/test/expected/hnsw_todo.out +++ b/lantern_hnsw/test/expected/hnsw_todo.out @@ -15,7 +15,6 @@ INSERT INTO small_world_l2 (id, vector) VALUES ('110', '{1,1,0}'), ('111', '{1,1,1}'); SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; \set ON_ERROR_STOP off CREATE INDEX ON small_world_l2 USING lantern_hnsw (vector dist_l2sq_ops); INFO: done init usearch index @@ -27,27 +26,12 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- this should be supported CREATE INDEX ON small_world_l2 USING lantern_hnsw (vector_int dist_l2sq_int_ops); ERROR: operator class "dist_l2sq_int_ops" does not exist for access method "lantern_hnsw" SELECT _lantern_internal.validate_index('small_world_l2_vector_int_idx', false); ERROR: relation "small_world_l2_vector_int_idx" does not exist at character 41 --- this should use index -EXPLAIN (COSTS FALSE) -SELECT id, ROUND(l2sq_dist(vector_int, array[0,1,0])::numeric, 2) as dist -FROM small_world_l2 -ORDER BY vector_int array[0,1,0] LIMIT 7; - QUERY PLAN ------------------------------------------------------------------------ - Limit - -> Result - -> Sort - Sort Key: (l2sq_dist(vector_int, '{0,1,0}'::integer[])) - -> Seq Scan on small_world_l2 -(5 rows) - --- Test scenarious --- ----------------------------------------- -- Case: @@ -77,14 +61,12 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- The 1001 and 1002 vectors will be ignored in search, so the first row will not be 0 in result -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 1; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 1; round ----------- 249249.00 -(1 row) -- Case: -- Index is created externally @@ -108,42 +90,15 @@ INFO: validate_index() done, no issues found. validate_index ---------------- -(1 row) -- The first row will not be 0 now as the vector under id=777 was updated to 1,1,1,1... but it was indexed with different vector -- So the usearch index can not find 1,1,1,1,1.. vector in the index and wrong results will be returned -- This is an expected behaviour for now -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 1; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 1; round ----------- 249249.00 -(1 row) ----- Query on expression based index is failing to check correct operator usage -------- -CREATE OR REPLACE FUNCTION int_to_fixed_binary_real_array(n INT) RETURNS REAL[] AS $$ -DECLARE - binary_string TEXT; - real_array REAL[] := '{}'; - i INT; -BEGIN - binary_string := lpad(CAST(n::BIT(3) AS TEXT), 3, '0'); - FOR i IN 1..length(binary_string) - LOOP - real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL)); - END LOOP; - RETURN real_array; -END; -$$ LANGUAGE plpgsql IMMUTABLE; CREATE TABLE test_table (id INTEGER); INSERT INTO test_table VALUES (0), (1), (7); \set enable_seqscan = off; --- This currently results in an error about using the operator outside of index --- This case should be fixed -SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) '{0,0,0}'::REAL[] LIMIT 2; -ERROR: Operator can only be used inside of an index --- =========== THIS CAUSES SERVER CRASH =============== - --- create extension lantern_extras; --- select v as v777 from sift_base1k where id = 777 \gset --- set lantern.pgvector_compat=false; --- select lantern_create_external_index('v', 'sift_base1k', 'public', 'cos', 128, 10, 10, 10, 'hnsw_cos_index'); --- ===================================================== - diff --git a/lantern_hnsw/test/expected/hnsw_vector.out b/lantern_hnsw/test/expected/hnsw_vector.out index 5c2b925ef..45fdb879d 100644 --- a/lantern_hnsw/test/expected/hnsw_vector.out +++ b/lantern_hnsw/test/expected/hnsw_vector.out @@ -7,13 +7,11 @@ DROP EXTENSION IF EXISTS lantern; CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION lantern; RESET client_min_messages; -SET lantern.pgvector_compat=FALSE; -- Verify basic functionality of pgvector SELECT '[1,2,3]'::vector; vector --------- [1,2,3] -(1 row) -- Test index creation x2 on empty table and subsequent inserts CREATE TABLE items (id SERIAL PRIMARY KEY, trait_ai VECTOR(3)); @@ -28,13 +26,12 @@ INFO: done init usearch index INFO: inserted 3 elements INFO: done saving 3 vectors INSERT INTO items (trait_ai) VALUES ('[10,10,10]'), (NULL); -SELECT * FROM items ORDER BY trait_ai '[0,0,0]' LIMIT 3; +SELECT * FROM items ORDER BY trait_ai <-> '[0,0,0]' LIMIT 3; id | trait_ai ----+---------- 1 | [1,2,3] 2 | [4,5,6] 3 | [6,7,8] -(3 rows) SELECT * FROM ldb_get_indexes('items'); indexname | size | indexdef | indisvalid @@ -42,7 +39,6 @@ SELECT * FROM ldb_get_indexes('items'); items_pkey | 16 kB | CREATE UNIQUE INDEX items_pkey ON items USING btree (id) | t items_trait_ai_idx | 16 kB | CREATE INDEX items_trait_ai_idx ON items USING lantern_hnsw (trait_ai) WITH (dim='3', m='2') | t items_trait_ai_idx1 | 16 kB | CREATE INDEX items_trait_ai_idx1 ON items USING lantern_hnsw (trait_ai) WITH (dim='3', m='4') | t -(3 rows) -- Test index creation on table with existing data \ir utils/small_world_vector.sql @@ -69,13 +65,12 @@ SELECT * FROM ldb_get_indexes('small_world'); indexname | size | indexdef | indisvalid -------------------+-------+---------------------------------------------------------------------------------------------------------------------------+------------ small_world_v_idx | 16 kB | CREATE INDEX small_world_v_idx ON small_world USING lantern_hnsw (v) WITH (dim='3', m='5', ef='20', ef_construction='20') | t -(1 row) INSERT INTO small_world (v) VALUES ('[99,99,2]'); INSERT INTO small_world (v) VALUES (NULL); -- Distance functions SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; dist ------ 0.00 @@ -85,19 +80,17 @@ FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; 2.00 2.00 2.00 -(7 rows) EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; QUERY PLAN --------------------------------------------------------- Limit -> Index Scan using small_world_v_idx on small_world - Order By: (v '[0,1,0]'::vector) -(3 rows) + Order By: (v <-> '[0,1,0]'::vector) SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; dist ------ 0.00 @@ -107,16 +100,14 @@ FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; 2.00 2.00 2.00 -(7 rows) EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; QUERY PLAN --------------------------------------------------------- Limit -> Index Scan using small_world_v_idx on small_world - Order By: (v '[0,1,0]'::vector) -(3 rows) + Order By: (v <-> '[0,1,0]'::vector) -- Verify that index creation on a large vector produces an error CREATE TABLE large_vector (v VECTOR(2001)); @@ -135,39 +126,33 @@ INFO: done init usearch index INFO: inserted 10000 elements INFO: done saving 10000 vectors SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k ORDER BY v :'v4444' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k ORDER BY v <-> :'v4444' LIMIT 10; QUERY PLAN ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- Limit -> Index Scan using hnsw_idx on sift_base10k - Order By: (v '[55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26]'::vector) -(3 rows) + Order By: (v <-> '[55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26]'::vector) -- Ensure we can query an index for more elements than the value of init_k SET lantern_hnsw.init_k = 4; WITH neighbors AS ( - SELECT * FROM small_world order by v '[1,0,0]' LIMIT 3 + SELECT * FROM small_world order by v <-> '[1,0,0]' LIMIT 3 ) SELECT COUNT(*) from neighbors; count ------- 3 -(1 row) WITH neighbors AS ( - SELECT * FROM small_world order by v '[1,0,0]' LIMIT 15 + SELECT * FROM small_world order by v <-> '[1,0,0]' LIMIT 15 ) SELECT COUNT(*) from neighbors; count ------- 9 -(1 row) RESET client_min_messages; \set ON_ERROR_STOP off --- Expect error due to improper use of the operator outside of its supported context -SELECT ARRAY[1,2,3] ARRAY[3,2,1]; -ERROR: Operator is invalid outside of ORDER BY context -- Expect error due to mismatching vector dimensions -SELECT 1 FROM small_world ORDER BY v '[0,1,0,1]' LIMIT 1; +SELECT 1 FROM small_world ORDER BY v <-> '[0,1,0,1]' LIMIT 1; ERROR: Expected vector with dimension 3, got 4 SELECT l2sq_dist('[1,1]'::vector, '[0,1,0]'::vector); ERROR: expected equally sized vectors but got vectors with dimensions 2 and 3 @@ -185,7 +170,7 @@ BEGIN LOOP real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL)); END LOOP; - RETURN real_array::vector; + RETURN real_array::public.vector; END; $$ LANGUAGE plpgsql IMMUTABLE; CREATE INDEX ON test_table USING lantern_hnsw (int_to_fixed_binary_vector(id)) WITH (M=2); @@ -199,48 +184,45 @@ CREATE INDEX l2_idx ON small_world_arr USING lantern_hnsw(v) WITH (dim=3, m=2); INFO: done init usearch index INFO: inserted 3 elements INFO: done saving 3 vectors -EXPLAIN (COSTS FALSE) SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +EXPLAIN (COSTS FALSE) SELECT id FROM small_world_arr ORDER BY v <-> ARRAY[0,0,0]; QUERY PLAN -------------------------------------------- Index Scan using l2_idx on small_world_arr - Order By: (v '{0,0,0}'::real[]) -(2 rows) + Order By: (v <-> '{0,0,0}'::real[]) -SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +SELECT id FROM small_world_arr ORDER BY v <-> ARRAY[0,0,0]; id ---- 1 2 3 -(3 rows) DROP INDEX l2_idx; -CREATE INDEX cos_idx ON small_world_arr USING lantern_hnsw(v) WITH (m=2); +CREATE INDEX cos_idx ON small_world_arr USING lantern_hnsw(v dist_cos_ops) WITH (m=2); INFO: done init usearch index INFO: inserted 3 elements INFO: done saving 3 vectors -SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +SELECT id FROM small_world_arr ORDER BY v <=> ARRAY[0,0,0]; id ---- 1 - 2 3 -(3 rows) + 2 DROP INDEX cos_idx; -CREATE INDEX ham_idx ON small_world_arr USING lantern_hnsw(v) WITH (m=3); +ALTER TABLE small_world_arr ADD COLUMN v_int INT[]; +UPDATE small_world_arr SET v_int=v::INT[]; +CREATE INDEX ham_idx ON small_world_arr USING lantern_hnsw(v_int dist_hamming_ops) WITH (m=3); INFO: done init usearch index INFO: inserted 3 elements INFO: done saving 3 vectors -SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +SELECT id FROM small_world_arr ORDER BY v_int <+> ARRAY[0,0,0]; id ---- 1 - 2 3 -(3 rows) + 2 --- Test pgvector in lantern.pgvector_compat=TRUE mode DROP TABLE small_world; \ir utils/small_world_vector.sql CREATE TABLE small_world ( @@ -258,7 +240,6 @@ INSERT INTO small_world (id, b, v) VALUES ('110', FALSE, '[1,1,0]'), ('111', TRUE, '[1,1,1]'); -- Distance functions -SET lantern.pgvector_compat=TRUE; SET enable_seqscan=OFF; -- Note: -- For l2sqs and cosine distances in SELECT statement @@ -280,7 +261,6 @@ FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; 2.00 2.00 2.00 -(7 rows) EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; @@ -289,7 +269,6 @@ FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; Limit -> Index Scan using l2_idx on small_world Order By: (v <-> '[0,1,0]'::vector) -(3 rows) -- cosine index CREATE INDEX cos_idx ON small_world USING lantern_hnsw (v dist_vec_cos_ops); @@ -307,7 +286,6 @@ FROM small_world ORDER BY v <=> '[0,1,0]'::VECTOR LIMIT 7; 1.00 1.00 1.00 -(7 rows) EXPLAIN (COSTS FALSE) SELECT ROUND(cos_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist FROM small_world ORDER BY v <=> '[0,1,0]'::VECTOR LIMIT 7; @@ -316,5 +294,4 @@ FROM small_world ORDER BY v <=> '[0,1,0]'::VECTOR LIMIT 7; Limit -> Index Scan using cos_idx on small_world Order By: (v <=> '[0,1,0]'::vector) -(3 rows) diff --git a/lantern_hnsw/test/expected/missing_outer_snapshot_portal.out b/lantern_hnsw/test/expected/missing_outer_snapshot_portal.out index f43e8c2ec..5cf949713 100644 --- a/lantern_hnsw/test/expected/missing_outer_snapshot_portal.out +++ b/lantern_hnsw/test/expected/missing_outer_snapshot_portal.out @@ -26,11 +26,9 @@ EXPLAIN (COSTS false) SELECT COUNT(*) FROM ourtable; Workers Planned: 4 -> Partial Aggregate -> Parallel Seq Scan on ourtable -(5 rows) SELECT COUNT(*) FROM ourtable; count ------- 10000 -(1 row) diff --git a/lantern_hnsw/test/expected/weighted_search.out b/lantern_hnsw/test/expected/weighted_search.out index a58c7142d..b9d9f6551 100644 --- a/lantern_hnsw/test/expected/weighted_search.out +++ b/lantern_hnsw/test/expected/weighted_search.out @@ -29,7 +29,6 @@ SELECT id, ROUND((v <-> :'v4')::numeric, 2) as dist FROM sift_Base1k ORDER BY v 183 | 259.18 254 | 263.45 116 | 264.64 -(10 rows) SELECT id, ROUND((v <-> :'v44')::numeric, 2) as dist FROM sift_Base1k ORDER BY v <-> :'v44' LIMIT 10; id | dist @@ -44,14 +43,12 @@ SELECT id, ROUND((v <-> :'v44')::numeric, 2) as dist FROM sift_Base1k ORDER BY v 950 | 338.17 744 | 343.25 539 | 344.02 -(10 rows) -- Make sure the function does not modify the global hnsw.ef_search SHOW hnsw.ef_search; hnsw.ef_search ---------------- 40 -(1 row) SELECT id, ROUND((v <-> :'v4')::numeric, 2) as v4_dist, ROUND((v <-> :'v44')::numeric, 2) v44_dist FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), max_dist => 750., debug_output => true, exact => false, @@ -170,7 +167,6 @@ WARNING: weighted vector search explain: [ 44 | 634.43 | 0.00 2 | 122.45 | 611.24 15 | 141.39 | 607.78 -(4 rows) SELECT id, ROUND((v <-> :'v4')::numeric, 2) as v4_dist, ROUND((v <-> :'v44')::numeric, 2) v44_dist FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), max_dist => 750., debug_output => true, exact => true, @@ -205,14 +201,12 @@ WARNING: weighted vector search explain(exact=true): [ 44 | 634.43 | 0.00 2 | 122.45 | 611.24 15 | 141.39 | 607.78 -(4 rows) -- Make sure the function does not modify the global hnsw.ef_search SHOW hnsw.ef_search; hnsw.ef_search ---------------- 40 -(1 row) SELECT *, 0.03 * v4_dist + 0.45 * v44_dist + 0.52 * v444_dist as weighted_dist FROM (SELECT id, ROUND((v <-> :'v4')::numeric, 2) as v4_dist, ROUND((v <-> :'v44')::numeric, 2) v44_dist, ROUND((v <-> :'v444')::numeric, 2) v444_dist @@ -361,7 +355,6 @@ WARNING: weighted vector search explain: [ 830 | 437.72 | 586.69 | 214.16 | 388.5053 77 | 615.89 | 218.82 | 595.58 | 426.6473 76 | 316.78 | 569.97 | 313.14 | 428.8227 -(5 rows) SELECT *, 0.03 * v4_dist + 0.45 * v44_dist + 0.52 * v444_dist as weighted_dist FROM (SELECT id, ROUND((v <-> :'v4')::numeric, 2) as v4_dist, ROUND((v <-> :'v44')::numeric, 2) v44_dist, ROUND((v <-> :'v444')::numeric, 2) v444_dist @@ -400,7 +393,6 @@ WARNING: weighted vector search explain(exact=true): [ 830 | 437.72 | 586.69 | 214.16 | 388.5053 77 | 615.89 | 218.82 | 595.58 | 426.6473 76 | 316.78 | 569.97 | 313.14 | 428.8227 -(5 rows) -- when max_dist is not specified, number of returned values dicreases with smaller ef SELECT count(*) @@ -433,7 +425,6 @@ WARNING: weighted vector search explain(exact=true): [ count ------- 1000 -(1 row) SELECT count(*) FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), exact => false, ef => 100, -- default @@ -444,7 +435,6 @@ SELECT count(*) count ------- 272 -(1 row) SELECT count(*) FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), exact => false, ef => 10, @@ -455,7 +445,6 @@ SELECT count(*) count ------- 30 -(1 row) SELECT count(*) FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), exact => false, ef => 5, @@ -466,7 +455,6 @@ SELECT count(*) count ------- 15 -(1 row) CREATE INDEX ON sift_base1k USING hnsw (v vector_cosine_ops) WITH (M=5, ef_construction=128); SELECT count(*) @@ -478,7 +466,6 @@ SELECT count(*) count ------- 15 -(1 row) -- test the API-shortcut helper (should be same as the one above) SELECT count(*) @@ -490,7 +477,6 @@ SELECT count(*) count ------- 15 -(1 row) -- Make sure API still works when the table stores real[] for vectors ALTER TABLE sift_base1k ADD COLUMN v_real real[]; @@ -509,7 +495,6 @@ SELECT count(*) count ------- 15 -(1 row) SELECT count(*) FROM lantern.weighted_vector_search_cos(CAST(NULL as "sift_base1k"), exact => false, ef => 5, @@ -520,7 +505,6 @@ SELECT count(*) count ------- 15 -(1 row) -- create non superuser and test the function SET client_min_messages = WARNING; @@ -540,5 +524,4 @@ SELECT count(*) count ------- 15 -(1 row) diff --git a/lantern_hnsw/test/parallel/expected/begin.out b/lantern_hnsw/test/parallel/expected/begin.out index 87c9640f4..560fc4c67 100644 --- a/lantern_hnsw/test/parallel/expected/begin.out +++ b/lantern_hnsw/test/parallel/expected/begin.out @@ -4,7 +4,6 @@ SELECT pg_reload_conf(); pg_reload_conf ---------------- t -(1 row) \ir utils/sift10k_array.sql CREATE TABLE IF NOT EXISTS sift_base10k ( @@ -34,7 +33,6 @@ SELECT id, ROUND((v <-> :'v444')::numeric, 2) FROM sift_base10k ORDER BY v <-> : 1336 | 73429.00 2654 | 78240.00 7642 | 78451.00 -(6 rows) CREATE INDEX to_be_reindexed ON sift_base10k USING lantern_hnsw (v) WITH (M=7, ef=20, ef_construction=20); INFO: done init usearch index @@ -49,7 +47,6 @@ SELECT id, ROUND((v <-> :'v444')::numeric, 2) FROM sift_base10k ORDER BY v <-> : 1336 | 73429.00 2654 | 78240.00 9185 | 78983.00 -(6 rows) REINDEX INDEX CONCURRENTLY to_be_reindexed; INFO: done init usearch index diff --git a/lantern_hnsw/test/parallel/expected/end.out b/lantern_hnsw/test/parallel/expected/end.out index 59130eafc..85a959b71 100644 --- a/lantern_hnsw/test/parallel/expected/end.out +++ b/lantern_hnsw/test/parallel/expected/end.out @@ -3,11 +3,9 @@ SELECT COUNT(*) FROM sift_base10k; count ------- 10030 -(1 row) SELECT * from sift_base10k WHERE id=4444; id | v ------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 4444 | {55,61,11,4,5,2,13,24,65,49,13,9,23,37,94,38,54,11,14,14,40,31,50,44,53,4,0,0,27,17,8,34,12,10,4,4,22,52,68,53,9,2,0,0,2,116,119,64,119,2,0,0,2,30,119,119,116,5,0,8,47,9,5,60,7,7,10,23,56,50,23,5,28,68,6,18,24,65,50,9,119,75,3,0,1,8,12,85,119,11,4,6,8,9,5,74,25,11,8,20,18,12,2,21,11,90,25,32,33,15,2,9,84,67,8,4,22,31,11,33,119,30,3,6,0,0,0,26} -(1 row) diff --git a/lantern_hnsw/test/parallel/expected/select.out b/lantern_hnsw/test/parallel/expected/select.out index c807d51e5..26d898359 100644 --- a/lantern_hnsw/test/parallel/expected/select.out +++ b/lantern_hnsw/test/parallel/expected/select.out @@ -10,7 +10,6 @@ EXPLAIN (COSTS false) SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC Limit -> Index Scan using to_be_reindexed on sift_base10k Order By: (v <-> '{21,24,5,0,0,26,22,6,16,16,10,9,0,18,114,19,13,13,9,1,2,53,111,19,39,32,5,0,4,9,10,13,6,10,8,0,2,130,77,4,2,0,0,0,3,130,130,11,130,0,0,0,0,37,130,84,130,5,0,1,17,11,4,28,17,39,3,3,30,77,28,3,20,0,0,1,49,125,13,7,130,6,0,0,0,5,11,61,130,2,0,1,12,84,48,73,1,12,2,0,31,57,9,2,16,12,1,0,32,36,0,1,63,6,3,1,0,0,24,51,9,0,0,0,0,44,88,48}'::real[]) -(3 rows) -- Do the queries -- Make sure the new delete hook works to fix concurrent builds in 0.2.5->0.2.6 @@ -27,29 +26,24 @@ SELECT id, ROUND((v <-> :'v444')::numeric, 2) FROM sift_base10k ORDER BY v <-> : 1336 | 73429.00 2654 | 78240.00 9185 | 78983.00 -(6 rows) SELECT id FROM sift_base10k ORDER BY v <-> :'v1111' ASC LIMIT 1; id ------ 1111 -(1 row) SELECT id FROM sift_base10k ORDER BY v <-> :'v2222' ASC LIMIT 1; id ------ 2222 -(1 row) SELECT id FROM sift_base10k ORDER BY v <-> :'v3333' ASC LIMIT 1; id ------ 3333 -(1 row) SELECT id FROM sift_base10k ORDER BY v <-> :'v4444' ASC LIMIT 1; id ------ 4444 -(1 row) diff --git a/lantern_hnsw/test/sql/hnsw_correct.sql b/lantern_hnsw/test/sql/hnsw_correct.sql index 53a99e22f..b2298f19e 100644 --- a/lantern_hnsw/test/sql/hnsw_correct.sql +++ b/lantern_hnsw/test/sql/hnsw_correct.sql @@ -12,7 +12,6 @@ INSERT INTO small_world (v) VALUES ('{0,0}'), ('{1,1}'), ('{2,2}'), ('{3,3}'); -- Create index CREATE INDEX ON small_world USING lantern_hnsw (v dist_l2sq_ops) WITH (dim=2, M=4); SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; -- Get the results without the index @@ -27,7 +26,7 @@ FROM -- Get the results with the index CREATE TEMP TABLE results_w_index AS SELECT - ROW_NUMBER() OVER (ORDER BY v '{0,0}') AS row_num, + ROW_NUMBER() OVER (ORDER BY v <-> '{0,0}') AS row_num, id, l2sq_dist(v, '{0,0}') AS dist FROM diff --git a/lantern_hnsw/test/sql/hnsw_cost_estimate.sql b/lantern_hnsw/test/sql/hnsw_cost_estimate.sql index c9c0d85f2..d11d66b4b 100644 --- a/lantern_hnsw/test/sql/hnsw_cost_estimate.sql +++ b/lantern_hnsw/test/sql/hnsw_cost_estimate.sql @@ -50,19 +50,17 @@ BEGIN END; $$ LANGUAGE plpgsql; -SET lantern.pgvector_compat=FALSE; - -- Goal: make sure query cost estimate is accurate -- when index is created with varying costruction parameters. SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -\set explain_query_template 'EXPLAIN SELECT * FROM sift_base10k ORDER BY v ''%s'' LIMIT 10' +\set explain_query_template 'EXPLAIN SELECT * FROM sift_base10k ORDER BY v <-> ''%s'' LIMIT 10' \set enable_seqscan = off; -- Case 0, sanity check. No data. CREATE TABLE empty_table(id SERIAL PRIMARY KEY, v REAL[2]); CREATE INDEX empty_idx ON empty_table USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=2, dim=2); SET _lantern_internal.is_test = true; -SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v ''{1,2}'' LIMIT 10', 0.47); +SELECT is_cost_estimate_within_error('EXPLAIN SELECT * FROM empty_table ORDER BY v <-> ''{1,2}'' LIMIT 10', 0.47); SELECT _lantern_internal.validate_index('empty_idx', false); DROP INDEX empty_idx; diff --git a/lantern_hnsw/test/sql/hnsw_create.sql b/lantern_hnsw/test/sql/hnsw_create.sql index a4bd6d5ea..40f26fecc 100644 --- a/lantern_hnsw/test/sql/hnsw_create.sql +++ b/lantern_hnsw/test/sql/hnsw_create.sql @@ -19,11 +19,10 @@ SELECT _lantern_internal.validate_index('sift_base1k_int_v_idx', false); -- Validate that index creation works with a larger number of vectors \ir utils/sift10k_array.sql -SET lantern.pgvector_compat=FALSE; CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' LIMIT 10; SELECT _lantern_internal.validate_index('hnsw_idx', false); --- Validate that M values inside the allowed range [2, 128] do not throw an error diff --git a/lantern_hnsw/test/sql/hnsw_create_expr.sql b/lantern_hnsw/test/sql/hnsw_create_expr.sql index c51bf4015..20b408124 100644 --- a/lantern_hnsw/test/sql/hnsw_create_expr.sql +++ b/lantern_hnsw/test/sql/hnsw_create_expr.sql @@ -69,8 +69,6 @@ INSERT INTO test_table VALUES (0), (1), (7); \set enable_seqscan = off; SET enable_seqscan = false; -SET lantern.pgvector_compat=FALSE; - -- This should success CREATE INDEX ON test_table USING lantern_hnsw (int_to_fixed_binary_real_array(id)) WITH (M=2); @@ -86,4 +84,4 @@ CREATE INDEX ON test_table USING lantern_hnsw (int_to_string(id)) WITH (M=2); -- This should result in error about multicolumn expressions support CREATE INDEX ON test_table USING lantern_hnsw (int_to_fixed_binary_real_array(id), int_to_dynamic_binary_real_array(id)) WITH (M=2); -SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) '{0,0,0}'::REAL[] LIMIT 2; +SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) <-> '{0,0,0}'::REAL[] LIMIT 2; diff --git a/lantern_hnsw/test/sql/hnsw_create_unlogged.sql b/lantern_hnsw/test/sql/hnsw_create_unlogged.sql index 0e56c9ff6..09b1f763e 100644 --- a/lantern_hnsw/test/sql/hnsw_create_unlogged.sql +++ b/lantern_hnsw/test/sql/hnsw_create_unlogged.sql @@ -13,11 +13,10 @@ SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); -- Validate that index creation works with a larger number of vectors \ir utils/sift10k_array_unlogged.sql -SET lantern.pgvector_compat=FALSE; CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444' LIMIT 10; SELECT _lantern_internal.validate_index('hnsw_idx', false); --- Validate that M values inside the allowed range [2, 128] do not throw an error diff --git a/lantern_hnsw/test/sql/hnsw_dist_func.sql b/lantern_hnsw/test/sql/hnsw_dist_func.sql index d47981915..8d8474345 100644 --- a/lantern_hnsw/test/sql/hnsw_dist_func.sql +++ b/lantern_hnsw/test/sql/hnsw_dist_func.sql @@ -17,12 +17,11 @@ INSERT INTO small_world_cos SELECT id, v FROM small_world; INSERT INTO small_world_ham SELECT id, ARRAY[CAST(v[1] AS INTEGER), CAST(v[2] AS INTEGER), CAST(v[3] AS INTEGER)] FROM small_world; SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; -- Verify that the distance functions work (check distances) -SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v '{0,1,0}'; -SELECT ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_cos ORDER BY v '{0,1,0}'; -SELECT ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham ORDER BY v '{0,1,0}'; +SELECT ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 ORDER BY v <-> '{0,1,0}'; +SELECT ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_cos ORDER BY v <=> '{0,1,0}'; +SELECT ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham ORDER BY v <+> '{0,1,0}'; -- Verify that the distance functions work (check IDs) SELECT ARRAY_AGG(id ORDER BY id), ROUND(l2sq_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_l2 GROUP BY 2 ORDER BY 2; @@ -30,26 +29,22 @@ SELECT ARRAY_AGG(id ORDER BY id), ROUND(cos_dist(v, '{0,1,0}')::numeric, 2) FROM SELECT ARRAY_AGG(id ORDER BY id), ROUND(hamming_dist(v, '{0,1,0}')::numeric, 2) FROM small_world_ham GROUP BY 2 ORDER BY 2; -- Verify that the indexes is being used -EXPLAIN (COSTS false) SELECT id FROM small_world_l2 ORDER BY v '{0,1,0}'; -EXPLAIN (COSTS false) SELECT id FROM small_world_cos ORDER BY v '{0,1,0}'; -EXPLAIN (COSTS false) SELECT id FROM small_world_ham ORDER BY v '{0,1,0}'; +EXPLAIN (COSTS false) SELECT id FROM small_world_l2 ORDER BY v <-> '{0,1,0}'; +EXPLAIN (COSTS false) SELECT id FROM small_world_cos ORDER BY v <=> '{0,1,0}'; +EXPLAIN (COSTS false) SELECT id FROM small_world_ham ORDER BY v <+> '{0,1,0}'; \set ON_ERROR_STOP off -- Expect errors due to mismatching vector dimensions -SELECT 1 FROM small_world_l2 ORDER BY v '{0,1,0,1}' LIMIT 1; -SELECT 1 FROM small_world_cos ORDER BY v '{0,1,0,1}' LIMIT 1; -SELECT 1 FROM small_world_ham ORDER BY v '{0,1,0,1}' LIMIT 1; +SELECT 1 FROM small_world_l2 ORDER BY v <-> '{0,1,0,1}' LIMIT 1; +SELECT 1 FROM small_world_cos ORDER BY v <=> '{0,1,0,1}' LIMIT 1; +SELECT 1 FROM small_world_ham ORDER BY v <+> '{0,1,0,1}' LIMIT 1; SELECT l2sq_dist('{1,1}'::REAL[], '{0,1,0}'::REAL[]); SELECT cos_dist('{1,1}'::real[], '{0,1,0}'::real[]); -- the one below is umbiguous if pgvector's vector type is present SELECT cos_dist('{1,1}', '{0,1,0}'); SELECT hamming_dist('{1,1}', '{0,1,0}'); --- Expect errors due to improper use of the operator outside of its supported context -SELECT ARRAY[1,2,3] ARRAY[3,2,1]; -SELECT ROUND((v ARRAY[0,1,0])::numeric, 2) FROM small_world_cos ORDER BY v '{0,1,0}' LIMIT 7; -SELECT ROUND((v ARRAY[0,1,0])::numeric, 2) FROM small_world_ham ORDER BY v '{0,1,0}' LIMIT 7; \set ON_ERROR_STOP on @@ -63,67 +58,6 @@ INSERT INTO test2 (v) VALUES ('{5,4}'); SELECT 0 + 1; SELECT 1 FROM test1 WHERE id = 0 + 1; -\set ON_ERROR_STOP off - --- Expect errors due to incorrect usage -INSERT INTO test1 (v) VALUES (ARRAY['{1,2}'::REAL[] '{4,2}'::REAL[], 0]); -SELECT v '{1,2}' FROM test1 ORDER BY v '{1,3}'; -SELECT v '{1,2}' FROM test1; -WITH temp AS (SELECT v '{1,2}' FROM test1) SELECT 1 FROM temp; -SELECT t.res FROM (SELECT v '{1,2}' AS res FROM test1) t; -SELECT (SELECT v '{1,2}' FROM test1 LIMIT 1) FROM test1; -SELECT COALESCE(v '{1,2}', 0) FROM test1; -SELECT EXISTS (SELECT v '{1,2}' FROM test1); -SELECT test1.v test2.v FROM test1 JOIN test2 USING (id); -SELECT v '{1,2}' FROM test1 UNION SELECT v '{1,3}' FROM test1; -(SELECT v '{1,2}' FROM test1 WHERE id < 5) UNION (SELECT v '{1,3}' FROM test1 WHERE id >= 5); -SELECT MAX(v '{1,2}') FROM test1; -SELECT * FROM test1 JOIN test2 ON test1.v test2.v < 0.5; -SELECT test1.v FROM test1 JOIN test2 ON test1.v '{1,2}' = test2.v '{1,3}'; -SELECT (v '{1,2}') + (v '{1,3}') FROM test1; -SELECT CASE WHEN v '{1,2}' > 1 THEN 'High' ELSE 'Low' END FROM test1; -INSERT INTO test1 (v) VALUES ('{2,3}') RETURNING v '{1,2}'; -SELECT 1 FROM test1 GROUP BY v '{1,3}'; -SELECT 1 FROM test1 ORDER BY (('{1,2}'::real[] '{3,4}'::real[]) - 0); -SELECT 1 FROM test1 ORDER BY '{1,2}'::REAL[] '{3,4}'::REAL[]; -SELECT 1 FROM test1 ORDER BY v ARRAY[(SELECT '{1,4}'::REAL[] '{4,2}'::REAL[]), 3]; - --- Expect errors due to index not existing -SELECT id FROM test1 ORDER BY v '{1,2}'; -SELECT 1 FROM test1 ORDER BY v (SELECT '{1,3}'::real[]); -SELECT t2_results.id FROM test1 t1 JOIN LATERAL (SELECT t2.id FROM test2 t2 ORDER BY t1.v t2.v LIMIT 1) t2_results ON TRUE; -WITH t AS (SELECT id FROM test1 ORDER BY v '{1,2}' LIMIT 1) SELECT DISTINCT id FROM t; -WITH t AS (SELECT id FROM test1 ORDER BY v '{1,2}' LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1; -WITH t AS (SELECT id FROM test1 ORDER BY v '{1,2}') SELECT id FROM t UNION SELECT id FROM t; - --- issue #227 -SELECT * from test2 JOIN LATERAL (SELECT * FROM (SELECT id FROM test2 ORDER BY v '{1,2}') as forall) haha on TRUE; --- more complex setup of the above -SELECT forall.id, nearest_per_id.* FROM -(SELECT * FROM - test2) AS forall - JOIN LATERAL ( - SELECT - ARRAY_AGG(id ORDER BY id) AS near_ids, - ARRAY_AGG(dist ORDER BY id) AS near_dists - FROM - ( - SELECT - id, - l2sq_dist(v, forall.v) as dist - FROM - test2 - ORDER BY - v forall.v - LIMIT - 5 - ) as __unused_name - ) nearest_per_id on TRUE -ORDER BY - forall.id -LIMIT - 9; - \set ON_ERROR_STOP on -- cross-lateral joins work as expected when appropriate index exists -- nearest element for each vector @@ -145,7 +79,7 @@ SELECT forall.id, nearest_per_id.* FROM FROM small_world_l2 ORDER BY - v forall.v + v <-> forall.v LIMIT 4 ) as __unused_name @@ -162,7 +96,7 @@ CREATE TABLE extra_small_world_ham ( ); INSERT INTO extra_small_world_ham (v) VALUES ('{0,0}'), ('{1,1}'), ('{2,2}'), ('{3,3}'); CREATE INDEX ON extra_small_world_ham USING lantern_hnsw (v dist_hamming_ops) WITH (dim=2); -SELECT ROUND(hamming_dist(v, '{0,0}')::numeric, 2) FROM extra_small_world_ham ORDER BY v '{0,0}'; +SELECT ROUND(hamming_dist(v, '{0,0}')::numeric, 2) FROM extra_small_world_ham ORDER BY v <+> '{0,0}'; SELECT _lantern_internal.validate_index('small_world_l2_v_idx', false); SELECT _lantern_internal.validate_index('small_world_cos_v_idx', false); diff --git a/lantern_hnsw/test/sql/hnsw_ef_search.sql b/lantern_hnsw/test/sql/hnsw_ef_search.sql index 57b563037..d5196f11d 100644 --- a/lantern_hnsw/test/sql/hnsw_ef_search.sql +++ b/lantern_hnsw/test/sql/hnsw_ef_search.sql @@ -21,37 +21,36 @@ SET lantern_hnsw.ef = 401; -- Repeat the same query while varying ef parameter -- NOTE: it is not entirely known if the results of these are deterministic SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset -- Queries below have the same result SET lantern_hnsw.ef = 1; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 2; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 4; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 8; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 16; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; -- Queries below have the same result, which is different from above SET lantern_hnsw.ef = 32; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 64; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 128; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 256; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; SET lantern_hnsw.ef = 400; -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; diff --git a/lantern_hnsw/test/sql/hnsw_extras.sql b/lantern_hnsw/test/sql/hnsw_extras.sql index 62e510e64..6a52a70f3 100644 --- a/lantern_hnsw/test/sql/hnsw_extras.sql +++ b/lantern_hnsw/test/sql/hnsw_extras.sql @@ -23,30 +23,19 @@ SELECT _lantern_internal.validate_index('sift_base1k_v_idx', false); SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset -- Validate that using corresponding operator triggers index scan -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <-> :'v777' LIMIT 10; - -SET lantern.pgvector_compat=FALSE; -EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v :'v777' LIMIT 10; -SET lantern.pgvector_compat=TRUE; DROP INDEX sift_base1k_v_idx; -- Create with params SELECT lantern_create_external_index('v', 'sift_base1k', 'public', 'cos', 128, 10, 10, 10, false, 'hnsw_cos_index'); SELECT _lantern_internal.validate_index('hnsw_cos_index', false); -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <=> :'v777' LIMIT 10; -SET lantern.pgvector_compat=FALSE; -EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v :'v777' LIMIT 10; -SET lantern.pgvector_compat=TRUE; - -- -- Reindex external index SELECT lantern_reindex_external_index('hnsw_cos_index'); SELECT _lantern_internal.validate_index('hnsw_cos_index', false); -- Validate that using corresponding operator triggers index scan -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <=> :'v777' LIMIT 10; -- Create PQ Index @@ -61,5 +50,4 @@ SELECT lantern_create_external_index('v', 'sift_base1k', 'public', 'cos', 128, 1 SELECT _lantern_internal.validate_index('hnsw_cos_index_pq', false); SELECT lantern_reindex_external_index('hnsw_cos_index_pq'); SELECT _lantern_internal.validate_index('hnsw_cos_index_pq', false); -SET lantern.pgvector_compat=TRUE; EXPLAIN (COSTS FALSE) SELECT id FROM sift_base1k order by v <=> :'v777' LIMIT 10; diff --git a/lantern_hnsw/test/sql/hnsw_index_from_file.sql b/lantern_hnsw/test/sql/hnsw_index_from_file.sql index 16c0e129c..7f2924164 100644 --- a/lantern_hnsw/test/sql/hnsw_index_from_file.sql +++ b/lantern_hnsw/test/sql/hnsw_index_from_file.sql @@ -26,31 +26,21 @@ SELECT _lantern_internal.validate_index('hnsw_l2_index', false); SELECT * FROM ldb_get_indexes('sift_base1k'); SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset -EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; -SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; -- Validate that inserting rows on index created from file works as expected INSERT INTO sift_base1k (id, v) VALUES (1001, array_fill(1, ARRAY[128])), (1002, array_fill(2, ARRAY[128])); SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 10; -- Drop and recreate table DROP TABLE sift_base1k CASCADE; \ir utils/sift1k_array.sql --- Validate that creating an index from file works with cosine distance function -CREATE INDEX hnsw_cos_index ON sift_base1k USING lantern_hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-cos-0.3.0.usearch'); -SELECT _lantern_internal.validate_index('hnsw_cos_index', false); -SELECT * FROM ldb_get_indexes('sift_base1k'); - -SELECT v AS v777 FROM sift_base1k WHERE id = 777 \gset -EXPLAIN (COSTS FALSE) SELECT ROUND(cos_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; -SELECT ROUND(cos_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; - --- Test scenarious --- ----------------------------------------- @@ -64,7 +54,7 @@ DELETE FROM sift_base1k WHERE id=777; CREATE INDEX hnsw_l2_index ON sift_base1k USING lantern_hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2sq-0.3.0.usearch'); SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- This should not throw error, but the first result will not be 0 as vector 777 is deleted from the table -SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v :'v777' LIMIT 10; +SELECT ROUND(l2sq_dist(v, :'v777')::numeric, 2) FROM sift_base1k order by v <-> :'v777' LIMIT 10; -- Should throw error when lantern_extras is not installed \set ON_ERROR_STOP off diff --git a/lantern_hnsw/test/sql/hnsw_insert.sql b/lantern_hnsw/test/sql/hnsw_insert.sql index 99bcf0bee..2e014c119 100644 --- a/lantern_hnsw/test/sql/hnsw_insert.sql +++ b/lantern_hnsw/test/sql/hnsw_insert.sql @@ -47,7 +47,6 @@ set work_mem = '10MB'; CREATE INDEX ON small_world USING lantern_hnsw (v) WITH (dim=3); SET enable_seqscan = false; -SET lantern.pgvector_compat = false; -- Inserting vectors of the same dimension and nulls should work INSERT INTO small_world (v) VALUES ('{1,1,2}'); @@ -64,7 +63,7 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}'; + v <-> '{0,0,0}'; -- Ensure the index size remains consistent after inserts SELECT * from ldb_get_indexes('small_world'); @@ -76,7 +75,7 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}' + v <-> '{0,0,0}' LIMIT 10; SELECT _lantern_internal.validate_index('small_world_v_idx', false); @@ -89,6 +88,6 @@ CREATE TABLE sift_base10k ( CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444'; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'; SELECT _lantern_internal.validate_index('hnsw_idx', false); diff --git a/lantern_hnsw/test/sql/hnsw_insert_unlogged.sql b/lantern_hnsw/test/sql/hnsw_insert_unlogged.sql index ec7dff67b..f682297bc 100644 --- a/lantern_hnsw/test/sql/hnsw_insert_unlogged.sql +++ b/lantern_hnsw/test/sql/hnsw_insert_unlogged.sql @@ -47,7 +47,6 @@ set work_mem = '10MB'; CREATE INDEX ON small_world USING lantern_hnsw (v) WITH (dim=3); SET enable_seqscan = false; -SET lantern.pgvector_compat = false; -- Inserting vectors of the same dimension and nulls should work INSERT INTO small_world (v) VALUES ('{1,1,2}'); @@ -64,7 +63,7 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}'; + v <-> '{0,0,0}'; -- Ensure the index size remains consistent after inserts SELECT * from ldb_get_indexes('small_world'); @@ -76,7 +75,7 @@ SELECT FROM small_world ORDER BY - v '{0,0,0}' + v <-> '{0,0,0}' LIMIT 10; SELECT _lantern_internal.validate_index('small_world_v_idx', false); @@ -89,6 +88,6 @@ CREATE UNLOGGED TABLE sift_base10k ( CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v dist_l2sq_ops) WITH (M=2, ef_construction=10, ef=4, dim=128); \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base_arrays.csv' WITH CSV; SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v :'v4444'; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k order by v <-> :'v4444'; SELECT _lantern_internal.validate_index('hnsw_idx', false); diff --git a/lantern_hnsw/test/sql/hnsw_operators.sql b/lantern_hnsw/test/sql/hnsw_operators.sql index 8134d3b0f..7f4150a17 100644 --- a/lantern_hnsw/test/sql/hnsw_operators.sql +++ b/lantern_hnsw/test/sql/hnsw_operators.sql @@ -1,15 +1,11 @@ --- Validate that lantern.pgvector_compat disables the operator rewriting hooks +\set ON_ERROR_STOP off CREATE TABLE op_test (v REAL[]); INSERT INTO op_test (v) VALUES (ARRAY[0,0,0]), (ARRAY[1,1,1]); CREATE INDEX cos_idx ON op_test USING lantern_hnsw(v dist_cos_ops); --- should rewrite operator -SET lantern.pgvector_compat=FALSE; -SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; -\set ON_ERROR_STOP off -SET lantern.pgvector_compat=TRUE; --- should throw error -SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; +-- Expect deprecation error due to use of the operator +SELECT ARRAY[1,2,3] ARRAY[3,2,1]; + -- should not throw error SELECT * FROM op_test ORDER BY v <=> ARRAY[1,1,1]; @@ -19,7 +15,6 @@ SELECT * FROM op_test ORDER BY v::INTEGER[] <+> ARRAY[1,1,1]; -- should not throw error SELECT v <-> ARRAY[1,1,1] FROM op_test ORDER BY v <-> ARRAY[1,1,1]; -SET lantern.pgvector_compat=FALSE; SET enable_seqscan=OFF; \set ON_ERROR_STOP on @@ -36,11 +31,6 @@ SELECT ARRAY[.1,0,0] <=> ARRAY[0,.5,0]; SELECT cos_dist(ARRAY[.1,0,0]::real[], ARRAY[0,.5,0]::real[]); SELECT ARRAY[1,0,0] <+> ARRAY[0,1,0]; --- NOW THIS IS TRIGGERING INDEX SCAN AS WELL --- BECAUSE WE ARE REGISTERING FOR ALL OPERATOR CLASSES --- IDEALLY THIS SHOULD NOT TRIGGER INDEX SCAN WHEN lantern.pgvector_compat=TRUE -EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; - -- should sort with index EXPLAIN (COSTS FALSE) SELECT * FROM op_test ORDER BY v <=> ARRAY[1,1,1]; @@ -60,11 +50,7 @@ SELECT v <-> ARRAY[1,1,1] FROM op_test ORDER BY v <-> ARRAY[1,1,1]; RESET ALL; -- Set false twice to verify that no crash is happening -SET lantern.pgvector_compat=FALSE; \set ON_ERROR_STOP off --- should rewrite operator -SELECT * FROM op_test ORDER BY v ARRAY[1,1,1]; - SET enable_seqscan=OFF; CREATE INDEX hamming_idx ON op_test USING lantern_hnsw(cast(v as INTEGER[]) dist_hamming_ops); diff --git a/lantern_hnsw/test/sql/hnsw_select.sql b/lantern_hnsw/test/sql/hnsw_select.sql index 4f5a3be58..fbbe023ba 100644 --- a/lantern_hnsw/test/sql/hnsw_select.sql +++ b/lantern_hnsw/test/sql/hnsw_select.sql @@ -16,7 +16,6 @@ INSERT INTO test2 (v) VALUES ('{5,4}'); CREATE INDEX ON test1 USING lantern_hnsw (v); SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; -- Verify that basic queries still work given our query parser and planner hooks SELECT 0 + 1; @@ -24,101 +23,54 @@ SELECT 1 FROM test1 WHERE id = 0 + 1; -- Verify that the index is being used SET _lantern_internal.is_test = true; -EXPLAIN (COSTS FALSE) SELECT * FROM small_world order by v '{1,0,0}' LIMIT 1; +EXPLAIN (COSTS FALSE) SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 1; -- Verify that this does not use the index EXPLAIN (COSTS FALSE) SELECT 1 FROM small_world WHERE v = '{0,0,0}'; -- Ensure we can query an index for more elements than the value of init_k WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 3 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 3 ) SELECT COUNT(*) from neighbors; WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 15 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 15 ) SELECT COUNT(*) from neighbors; -- Change default k and make sure the number of usearch_searchs makes sense SET lantern_hnsw.init_k = 4; WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 3 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 3 ) SELECT COUNT(*) from neighbors; WITH neighbors AS ( - SELECT * FROM small_world order by v '{1,0,0}' LIMIT 15 + SELECT * FROM small_world order by v <-> '{1,0,0}' LIMIT 15 ) SELECT COUNT(*) from neighbors; RESET client_min_messages; SET _lantern_internal.is_test = false; -- Verify where condition works properly and still uses index -SELECT has_index_scan('EXPLAIN SELECT * FROM small_world WHERE b IS TRUE ORDER BY v ''{0,0,0}'''); +SELECT has_index_scan('EXPLAIN SELECT * FROM small_world WHERE b IS TRUE ORDER BY v <-> ''{0,0,0}'''); -- Verify that the index is not being used when there is no order by SELECT NOT has_index_scan('EXPLAIN SELECT COUNT(*) FROM small_world'); -- Verify swapping order doesn't change anything and still uses index -SELECT has_index_scan('EXPLAIN SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] v'); +SELECT has_index_scan('EXPLAIN SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] <-> v'); -- Verify group by works and uses index -SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] v LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1'); +SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY ''{1,2}''::REAL[] <-> v LIMIT 1) SELECT id, COUNT(*) FROM t GROUP BY 1'); -- Validate distinct works and uses index -SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v ''{1,2}'' LIMIT 1) SELECT DISTINCT id FROM t'); +SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v <-> ''{1,2}'' LIMIT 1) SELECT DISTINCT id FROM t'); -- Validate join lateral works and uses index -SELECT has_index_scan('EXPLAIN SELECT t1_results.id FROM test2 t2 JOIN LATERAL (SELECT t1.id FROM test1 t1 ORDER BY t2.v t1.v LIMIT 1) t1_results ON TRUE'); +SELECT has_index_scan('EXPLAIN SELECT t1_results.id FROM test2 t2 JOIN LATERAL (SELECT t1.id FROM test1 t1 ORDER BY t2.v <-> t1.v LIMIT 1) t1_results ON TRUE'); -- Validate union works and uses index -SELECT has_index_scan('EXPLAIN (SELECT id FROM test1 ORDER BY v ''{1,4}'') UNION (SELECT id FROM test1 ORDER BY v IS NOT NULL LIMIT 1)'); +SELECT has_index_scan('EXPLAIN (SELECT id FROM test1 ORDER BY v <-> ''{1,4}'') UNION (SELECT id FROM test1 ORDER BY v IS NOT NULL LIMIT 1)'); -- Validate CTEs work and still use index -SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v ''{1,4}'') SELECT id FROM t UNION SELECT id FROM t'); - --- Validate is replaced with the matching function when an index is present -set enable_seqscan = true; -set enable_indexscan = false; -EXPLAIN (COSTS false) SELECT * from small_world ORDER BY v '{1,1,1}'; -SELECT * from small_world ORDER BY v '{1,1,1}'; -begin; -INSERT INTO test2 (v) VALUES ('{1,4}'); -INSERT INTO test2 (v) VALUES ('{2,4}'); -CREATE INDEX test2_cos ON test2 USING lantern_hnsw(v dist_cos_ops); -EXPLAIN (COSTS false) SELECT * from test2 ORDER BY v '{1,4}'; --- Some additional cases that trigger operator rewriting --- SampleScan -EXPLAIN (COSTS false) SELECT * FROM small_world TABLESAMPLE BERNOULLI (20) ORDER BY v '{1,1,1}' ASC; --- can't compare direct equality here because it's random -SELECT results_match('EXPLAIN SELECT * FROM small_world TABLESAMPLE BERNOULLI (20) ORDER BY v ''{1,1,1}'' ASC', - 'EXPLAIN SELECT * FROM small_world TABLESAMPLE BERNOULLI (20) ORDER BY l2sq_dist(v, ''{1,1,1}'') ASC'); --- SetOpt/HashSetOp -EXPLAIN (COSTS false) (SELECT * FROM small_world ORDER BY v '{1,0,1}' ASC ) EXCEPT (SELECT * FROM small_world ORDER by v '{1,1,1}' ASC LIMIT 5); -SELECT results_match('(SELECT * FROM small_world ORDER BY v ''{1,0,1}'' ASC ) EXCEPT (SELECT * FROM small_world ORDER by v ''{1,1,1}'' ASC LIMIT 5)', - '(SELECT * FROM small_world ORDER BY l2sq_dist(v, ''{1,0,1}'') ASC ) EXCEPT (SELECT * FROM small_world ORDER by l2sq_dist(v, ''{1,1,1}'') ASC LIMIT 5)'); --- HashAggregate -EXPLAIN (COSTS false) SELECT v, COUNT(*) FROM small_world GROUP BY v ORDER BY v '{1,1,1}'; -SELECT results_match('SELECT v, COUNT(*) FROM small_world GROUP BY v ORDER BY v ''{1,1,1}''', - 'SELECT v, COUNT(*) FROM small_world GROUP BY v ORDER BY l2sq_dist(v, ''{1,1,1}'')'); --- GroupBy this -EXPLAIN (COSTS false) SELECT * FROM small_world GROUP BY id, v, b ORDER BY v '{1,1,1}'; -SELECT results_match('SELECT * FROM small_world GROUP BY id, v, b ORDER BY v ''{1,1,1}''', - 'SELECT * FROM small_world GROUP BY id, v, b ORDER BY l2sq_dist(v, ''{1,1,1}'')'); --- HashJoin/Hash -CREATE TABLE small_world_2 AS (SELECT * FROM small_world); -EXPLAIN (COSTS false) SELECT * FROM small_world JOIN small_world_2 using (v) ORDER BY v '{1,1,1}'; -SELECT results_match('SELECT * FROM small_world JOIN small_world_2 using (v) ORDER BY v ''{1,1,1}''', - 'SELECT * FROM small_world JOIN small_world_2 using (v) ORDER BY l2sq_dist(v, ''{1,1,1}'')'); --- MixedAggregate (this doesn't require additional logic, but I include it here as an example of generating the path) -EXPLAIN (COSTS false) SELECT v FROM small_world GROUP BY ROLLUP(v) ORDER BY v '{1,1,1}'; -SELECT results_match('SELECT v FROM small_world GROUP BY ROLLUP(v) ORDER BY v ''{1,1,1}''', - 'SELECT v FROM small_world GROUP BY ROLLUP(v) ORDER BY l2sq_dist(v, ''{1,1,1}'')'); --- WindowAgg -EXPLAIN (COSTS false) SELECT v, EVERY(b) OVER () FROM small_world ORDER BY v '{1,1,1}'; -SELECT results_match('SELECT v, EVERY(b) OVER () FROM small_world ORDER BY v ''{1,1,1}''', - 'SELECT v, EVERY(b) OVER () FROM small_world ORDER BY l2sq_dist(v, ''{1,1,1}'')'); --- LockRows -EXPLAIN (COSTS false) SELECT * FROM small_world ORDER BY v '{1,1,1}' ASC FOR UPDATE; -SELECT results_match('SELECT * FROM small_world ORDER BY v ''{1,1,1}'' ASC FOR UPDATE', - 'SELECT * FROM small_world ORDER BY l2sq_dist(v, ''{1,1,1}'') ASC FOR UPDATE'); - -rollback; +SELECT has_index_scan('EXPLAIN WITH t AS (SELECT id FROM test1 ORDER BY v <-> ''{1,4}'') SELECT id FROM t UNION SELECT id FROM t'); + set enable_indexscan = true; set enable_seqscan = false; diff --git a/lantern_hnsw/test/sql/hnsw_todo.sql b/lantern_hnsw/test/sql/hnsw_todo.sql index fcf84324f..d1076c04f 100644 --- a/lantern_hnsw/test/sql/hnsw_todo.sql +++ b/lantern_hnsw/test/sql/hnsw_todo.sql @@ -18,8 +18,6 @@ INSERT INTO small_world_l2 (id, vector) VALUES ('111', '{1,1,1}'); SET enable_seqscan=FALSE; -SET lantern.pgvector_compat=FALSE; - \set ON_ERROR_STOP off CREATE INDEX ON small_world_l2 USING lantern_hnsw (vector dist_l2sq_ops); @@ -29,12 +27,6 @@ SELECT _lantern_internal.validate_index('small_world_l2_vector_idx', false); CREATE INDEX ON small_world_l2 USING lantern_hnsw (vector_int dist_l2sq_int_ops); SELECT _lantern_internal.validate_index('small_world_l2_vector_int_idx', false); --- this should use index -EXPLAIN (COSTS FALSE) -SELECT id, ROUND(l2sq_dist(vector_int, array[0,1,0])::numeric, 2) as dist -FROM small_world_l2 -ORDER BY vector_int array[0,1,0] LIMIT 7; - --- Test scenarious --- ----------------------------------------- -- Case: @@ -53,7 +45,7 @@ SELECT v AS v1001 FROM sift_base1k WHERE id = 1001 \gset CREATE INDEX hnsw_l2_index ON sift_base1k USING lantern_hnsw (v) WITH (_experimental_index_path='/tmp/lantern/files/index-sift1k-l2sq-0.3.0.usearch'); SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- The 1001 and 1002 vectors will be ignored in search, so the first row will not be 0 in result -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 1; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 1; -- Case: -- Index is created externally @@ -67,36 +59,9 @@ SELECT _lantern_internal.validate_index('hnsw_l2_index', false); -- The first row will not be 0 now as the vector under id=777 was updated to 1,1,1,1... but it was indexed with different vector -- So the usearch index can not find 1,1,1,1,1.. vector in the index and wrong results will be returned -- This is an expected behaviour for now -SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v :'v1001' LIMIT 1; - ----- Query on expression based index is failing to check correct operator usage -------- -CREATE OR REPLACE FUNCTION int_to_fixed_binary_real_array(n INT) RETURNS REAL[] AS $$ -DECLARE - binary_string TEXT; - real_array REAL[] := '{}'; - i INT; -BEGIN - binary_string := lpad(CAST(n::BIT(3) AS TEXT), 3, '0'); - FOR i IN 1..length(binary_string) - LOOP - real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL)); - END LOOP; - RETURN real_array; -END; -$$ LANGUAGE plpgsql IMMUTABLE; +SELECT ROUND(l2sq_dist(v, :'v1001')::numeric, 2) FROM sift_base1k order by v <-> :'v1001' LIMIT 1; CREATE TABLE test_table (id INTEGER); INSERT INTO test_table VALUES (0), (1), (7); \set enable_seqscan = off; --- This currently results in an error about using the operator outside of index --- This case should be fixed -SELECT id FROM test_table ORDER BY int_to_fixed_binary_real_array(id) '{0,0,0}'::REAL[] LIMIT 2; - --- =========== THIS CAUSES SERVER CRASH =============== - --- create extension lantern_extras; --- select v as v777 from sift_base1k where id = 777 \gset --- set lantern.pgvector_compat=false; --- select lantern_create_external_index('v', 'sift_base1k', 'public', 'cos', 128, 10, 10, 10, 'hnsw_cos_index'); --- ===================================================== - - diff --git a/lantern_hnsw/test/sql/hnsw_vector.sql b/lantern_hnsw/test/sql/hnsw_vector.sql index 3704dd396..c0ab3c106 100644 --- a/lantern_hnsw/test/sql/hnsw_vector.sql +++ b/lantern_hnsw/test/sql/hnsw_vector.sql @@ -8,7 +8,6 @@ DROP EXTENSION IF EXISTS lantern; CREATE EXTENSION IF NOT EXISTS vector; CREATE EXTENSION lantern; RESET client_min_messages; -SET lantern.pgvector_compat=FALSE; -- Verify basic functionality of pgvector SELECT '[1,2,3]'::vector; @@ -20,7 +19,7 @@ CREATE INDEX ON items USING lantern_hnsw (trait_ai dist_vec_l2sq_ops) WITH (dim= INSERT INTO items (trait_ai) VALUES ('[6,7,8]'); CREATE INDEX ON items USING lantern_hnsw (trait_ai dist_vec_l2sq_ops) WITH (dim=3, M=4); INSERT INTO items (trait_ai) VALUES ('[10,10,10]'), (NULL); -SELECT * FROM items ORDER BY trait_ai '[0,0,0]' LIMIT 3; +SELECT * FROM items ORDER BY trait_ai <-> '[0,0,0]' LIMIT 3; SELECT * FROM ldb_get_indexes('items'); -- Test index creation on table with existing data @@ -33,14 +32,14 @@ INSERT INTO small_world (v) VALUES (NULL); -- Distance functions SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; EXPLAIN (COSTS FALSE) SELECT ROUND(l2sq_dist(v, '[0,1,0]'::VECTOR)::numeric, 2) as dist -FROM small_world ORDER BY v '[0,1,0]'::VECTOR LIMIT 7; +FROM small_world ORDER BY v <-> '[0,1,0]'::VECTOR LIMIT 7; -- Verify that index creation on a large vector produces an error CREATE TABLE large_vector (v VECTOR(2001)); @@ -56,25 +55,22 @@ CREATE TABLE sift_base10k ( \COPY sift_base10k (v) FROM '/tmp/lantern/vector_datasets/siftsmall_base.csv' WITH CSV; CREATE INDEX hnsw_idx ON sift_base10k USING lantern_hnsw (v); SELECT v AS v4444 FROM sift_base10k WHERE id = 4444 \gset -EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k ORDER BY v :'v4444' LIMIT 10; +EXPLAIN (COSTS FALSE) SELECT * FROM sift_base10k ORDER BY v <-> :'v4444' LIMIT 10; -- Ensure we can query an index for more elements than the value of init_k SET lantern_hnsw.init_k = 4; WITH neighbors AS ( - SELECT * FROM small_world order by v '[1,0,0]' LIMIT 3 + SELECT * FROM small_world order by v <-> '[1,0,0]' LIMIT 3 ) SELECT COUNT(*) from neighbors; WITH neighbors AS ( - SELECT * FROM small_world order by v '[1,0,0]' LIMIT 15 + SELECT * FROM small_world order by v <-> '[1,0,0]' LIMIT 15 ) SELECT COUNT(*) from neighbors; RESET client_min_messages; \set ON_ERROR_STOP off --- Expect error due to improper use of the operator outside of its supported context -SELECT ARRAY[1,2,3] ARRAY[3,2,1]; - -- Expect error due to mismatching vector dimensions -SELECT 1 FROM small_world ORDER BY v '[0,1,0,1]' LIMIT 1; +SELECT 1 FROM small_world ORDER BY v <-> '[0,1,0,1]' LIMIT 1; SELECT l2sq_dist('[1,1]'::vector, '[0,1,0]'::vector); -- Test creating index with expression @@ -92,7 +88,7 @@ BEGIN LOOP real_array := array_append(real_array, CAST(substring(binary_string, i, 1) AS REAL)); END LOOP; - RETURN real_array::vector; + RETURN real_array::public.vector; END; $$ LANGUAGE plpgsql IMMUTABLE; @@ -102,21 +98,21 @@ CREATE INDEX ON test_table USING lantern_hnsw (int_to_fixed_binary_vector(id)) W CREATE TABLE small_world_arr (id SERIAL PRIMARY KEY, v REAL[]); INSERT INTO small_world_arr (v) VALUES ('{0,0,0}'), ('{0,0,1}'), ('{0,0,2}'); CREATE INDEX l2_idx ON small_world_arr USING lantern_hnsw(v) WITH (dim=3, m=2); -EXPLAIN (COSTS FALSE) SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; -SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +EXPLAIN (COSTS FALSE) SELECT id FROM small_world_arr ORDER BY v <-> ARRAY[0,0,0]; +SELECT id FROM small_world_arr ORDER BY v <-> ARRAY[0,0,0]; DROP INDEX l2_idx; -CREATE INDEX cos_idx ON small_world_arr USING lantern_hnsw(v) WITH (m=2); -SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +CREATE INDEX cos_idx ON small_world_arr USING lantern_hnsw(v dist_cos_ops) WITH (m=2); +SELECT id FROM small_world_arr ORDER BY v <=> ARRAY[0,0,0]; DROP INDEX cos_idx; -CREATE INDEX ham_idx ON small_world_arr USING lantern_hnsw(v) WITH (m=3); -SELECT id FROM small_world_arr ORDER BY v ARRAY[0,0,0]; +ALTER TABLE small_world_arr ADD COLUMN v_int INT[]; +UPDATE small_world_arr SET v_int=v::INT[]; +CREATE INDEX ham_idx ON small_world_arr USING lantern_hnsw(v_int dist_hamming_ops) WITH (m=3); +SELECT id FROM small_world_arr ORDER BY v_int <+> ARRAY[0,0,0]; --- Test pgvector in lantern.pgvector_compat=TRUE mode DROP TABLE small_world; \ir utils/small_world_vector.sql -- Distance functions -SET lantern.pgvector_compat=TRUE; SET enable_seqscan=OFF; -- Note: diff --git a/lantern_hnsw/test/test_runner.sh b/lantern_hnsw/test/test_runner.sh index 792d3d770..5f62c0c09 100755 --- a/lantern_hnsw/test/test_runner.sh +++ b/lantern_hnsw/test/test_runner.sh @@ -37,6 +37,7 @@ function run_regression_test { # Exclude debug/inconsistent output from psql # So tests will always have the same output psql -U ${DB_USER} \ + -P footer=off \ -v ON_ERROR_STOP=1 \ -v VERBOSITY=terse \ -v ECHO=all \ @@ -46,6 +47,8 @@ function run_regression_test { -e 's! Average Peak Memory: [0-9]\{1,\}kB!!' \ -e 's! time=[0-9]\+\.[0-9]\+\.\.[0-9]\+\.[0-9]\+!!' | \ grep -v 'DEBUG: rehashing catalog cache id' | \ + grep -Ev '^[[:space:]]*Disabled:' | \ + grep -Gv '"Disabled": \(true\|false\),' | \ grep -Gv '^ Planning Time:' | \ grep -Gv '^ Execution Time:' | \ # ignore lines in explain(format json) output that differ among pg12-pg16