From b689c2d36e04a4d49949237a01c38bee1a437bef Mon Sep 17 00:00:00 2001 From: Di Qi Date: Thu, 15 Aug 2024 23:55:40 -0700 Subject: [PATCH 1/7] Support sparsevec in weighted vector search function --- docker/Dockerfile.dev | 8 +- sql/lantern.sql | 755 ++++++++++++++++++-- sql/updates/0.3.1--0.3.2.sql | 855 +++++++++++++++++++++++ test/expected/weighted_search.out | 23 + test/sql/utils/small_world_sparsevec.sql | 16 + test/sql/weighted_search.sql | 26 +- 6 files changed, 1612 insertions(+), 71 deletions(-) create mode 100644 test/sql/utils/small_world_sparsevec.sql diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index 97295f863..df06055bb 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,5 +1,5 @@ ARG VERSION=15 -ARG PGVECTOR_VERSION=0.5.1 +ARG PGVECTOR_VERSION=0.6.1 #fix pg_cron at the latest commit of the time ARG PG_CRON_COMMIT_SHA=7e91e72b1bebc5869bb900d9253cc9e92518b33f @@ -31,7 +31,7 @@ RUN gem install pg -- --with-pg-include=/usr/local/pgsql/include/ --with-pg-lib= # hack to make sure postgres user has write access to externally mounted volumes RUN mkdir /lantern_shared && chown postgres:postgres /lantern_shared -RUN cd /root/postgresql-15.5/contrib && make install -j +RUN cd /root/postgresql-15.5/contrib && make install # allow non-root users to install in the container to make it easier to run update-tests RUN chmod -R 777 /usr/local/pgsql/lib/ /usr/local/pgsql/share/extension/ /usr/local/pgsql/include/server/ @@ -55,7 +55,7 @@ COPY . . RUN sudo rm -rf build \ && mkdir build \ && cd build \ - && cmake -DCMAKE_BUILD_TYPE=Debug .. \ + && cmake .. 
\ && make -j install # Install benchmarking tools in build folder @@ -65,7 +65,7 @@ RUN git clone https://github.com/lanterndata/benchmark build/benchmark \ && pip install -r external/requirements.txt # Install perf -RUN sudo apt update && sudo apt install -y linux-tools-common linux-tools-generic linux-tools-`uname -r` +RUN sudo apt update && sudo apt install -y linux-tools-common linux-tools-generic # in host, enable perf_event paranoid via # echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid diff --git a/sql/lantern.sql b/sql/lantern.sql index ab115bb57..9982cdc03 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -140,7 +140,7 @@ BEGIN COMMUTATOR = '' ); - -- pgvecor's vector type requires floats and we cannot define hamming distance for floats + -- pgvector's vector type requires floats and we cannot define hamming distance for floats CREATE OPERATOR CLASS dist_vec_l2sq_ops DEFAULT FOR TYPE vector USING lantern_hnsw AS OPERATOR 1 (vector, vector) FOR ORDER BY float_ops, @@ -667,6 +667,7 @@ CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search( $weighted_vector_search$ DECLARE pgvector_exists boolean; + pgvector_sparsevec_exists boolean; BEGIN -- Check if the vector type from pgvector exists SELECT EXISTS ( @@ -680,33 +681,32 @@ BEGIN RETURN; END IF; - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, + -- Check if the sparsevec type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' + ) INTO pgvector_sparsevec_exists; + + CREATE OR REPLACE FUNCTION _lantern_internal.weighted_vector_search_helper( + table_name regtype, w1 numeric, col1 text, - vec1 vector, - w2 numeric= 0, + vec1 text, + w2 numeric = 0, col2 text = NULL, - vec2 vector = NULL, + vec2 text = NULL, w3 numeric = 0, col3 text = NULL, - vec3 vector = NULL, + vec3 text = NULL, ef integer = 100, max_dist numeric = NULL, - -- set l2 (pgvector) and l2sq (lantern) as default, as we do for 
lantern index. distance_operator text = '<->', id_col text = 'id', exact boolean = false, debug_output boolean = false, analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS - $$ + ) RETURNS TEXT AS $$ DECLARE joint_condition text; query_base text; @@ -714,10 +714,15 @@ BEGIN query1 text; query2 text; query3 text; - -- variables for weighted columns + parsed_schema_name text; + parsed_table_name text; wc1 text = NULL; wc2 text = NULL; wc3 text = NULL; + is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; + vec1_string text = NULL; + vec2_string text = NULL; + vec3_string text = NULL; cte_query text; maybe_unions_query text; final_query text; @@ -727,32 +732,80 @@ BEGIN debug_count integer; maybe_analyze text = ''; BEGIN - -- TODO:: better validate inputs to throw nicer errors in case of wrong input: - -- 1. only allow valid distance_operator stirngs (<->, <=>, but not abracadabra) - -- 2. only allow valid column names - -- 3. throw an error on negative weights - -- 4. check that id_col column exists before proceeding + + -- Start: Validate inputs + + -- 1. only allow valid distance_operator strings (<->, <=>, but not abracadabra) + IF distance_operator NOT IN ('<->', '<=>', '<#>', '<+>') THEN + RAISE EXCEPTION 'Invalid distance operator: %', distance_operator; + END IF; + + -- 2. 
only allow valid column names, i.e., column names that exist in the table + SELECT n.nspname, c.relname INTO parsed_schema_name, parsed_table_name FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.reltype = table_name::oid; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = id_col) THEN + RAISE EXCEPTION 'Invalid column name: %', id_col; + END IF; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col1) THEN + RAISE EXCEPTION 'Invalid column name: %', col1; + END IF; + IF col2 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col2) THEN + RAISE EXCEPTION 'Invalid column name: %', col2; + END IF; + IF col3 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col3) THEN + RAISE EXCEPTION 'Invalid column name: %', col3; + END IF; + + -- 3. 
throw an error on negative weights + IF w1 < 0 OR w2 < 0 OR w3 < 0 THEN + RAISE EXCEPTION 'Invalid weight: %', w1; + END IF; + + -- End: Validate inputs IF analyze_output THEN - maybe_analyze := 'ANALYZE, BUFFERS,'; + maybe_analyze := 'ANALYZE, BUFFERS,'; END IF; - -- Joint similarity metric condition - -- the cast ::vector is necessary for cases when the column is not of type vector + + -- Generate vector strings + -- the cast is necessary for cases when the column is not of type vector -- and for some reason in those cases cast does not happen automatically - wc1 := format('(%s * (%I %s %L::vector))', w1, col1, distance_operator, vec1); + IF vec1 IS NOT NULL THEN + IF vec1 ~ is_sparsevec_regex THEN + vec1_string := vec1 || '::sparsevec'; + ELSE + vec1_string := vec1 || '::vector'; + END IF; + END IF; + IF vec2 IS NOT NULL THEN + IF vec2 ~ is_sparsevec_regex THEN + vec2_string := vec2 || '::sparsevec'; + ELSE + vec2_string := vec2 || '::vector'; + END IF; + END IF; + IF vec3 IS NOT NULL THEN + IF vec3 ~ is_sparsevec_regex THEN + vec3_string := vec3 || '::sparsevec'; + ELSE + vec3_string := vec3 || '::vector'; + END IF; + END IF; + + -- Joint similarity metric condition + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - wc2 := format(' (%s * (%I %s %L::vector))', w2, col2, distance_operator, vec2); + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); END IF; IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - wc3 := format(' (%s * (%I %s %L::vector))', w3, col3, distance_operator, vec3); + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); END IF; joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); -- Base query with joint similarity metric - query_base := format('SELECT * FROM %s ', pg_typeof(relation_type)); + query_base := format('SELECT * FROM %s ', table_name); IF max_dist IS NOT NULL 
THEN - query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); + query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); END IF; IF exact THEN @@ -766,9 +819,8 @@ BEGIN explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output); END IF; - RETURN QUERY EXECUTE final_query; - -- the empty return below is crucial, to make sure the rest of the function is not executed after the return query above - RETURN; + + RETURN final_query; END IF; EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef); @@ -776,7 +828,7 @@ BEGIN maybe_unions_query := ''; -- Query 1: Order by first condition's weighted similarity - query1 := format('%s ORDER BY %I %s %L::vector LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1, ef); + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); IF debug_output THEN EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; @@ -787,7 +839,7 @@ BEGIN -- Query 2: Order by other conditions' weighted similarity, if applicable IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - query2 := format('%s ORDER BY %I %s %L::vector LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2, ef); + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); cte_query := cte_query || format(', query2 AS (%s)', query2); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); IF debug_output THEN @@ -796,8 +848,9 @@ BEGIN END IF; END IF; + -- Query 3: Order by third condition's weighted similarity, if applicable IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - query3 := format('%s ORDER BY %I %s %L::vector LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3, ef); + query3 := 
format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); cte_query := cte_query || format(', query3 AS (%s)', query3); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); IF debug_output THEN @@ -815,26 +868,28 @@ BEGIN tt %s ORDER BY %s$final_cte_query$, id_col, maybe_unions_query, query_final_where, joint_condition); - IF debug_output THEN - explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); - EXECUTE explain_query INTO explain_output; + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; - RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); - explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); - RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); - END IF; - RETURN QUERY EXECUTE final_query; - END + explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + + END $$ LANGUAGE plpgsql; --- setup API shortcuts - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( relation_type anyelement, w1 numeric, col1 text, vec1 vector, - w2 numeric= 0, + w2 numeric = 0, col2 text = NULL, vec2 vector = NULL, w3 numeric = 0, @@ -842,6 +897,8 @@ BEGIN vec3 vector = NULL, ef integer = 100, max_dist numeric = NULL, + -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. + distance_operator text = '<->', id_col text = 'id', exact boolean = false, debug_output boolean = false, @@ -850,20 +907,241 @@ BEGIN -- N.B. 
Something seems strange about PL/pgSQL functions that return table with anyelement -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place + -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS $$ + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; -BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); -END $$ LANGUAGE plpgsql; + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern weighted vector search setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + distance_operator 
text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, 
format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + END IF; - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + -- setup Cosine API shortcuts + 
+ -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( relation_type anyelement, w1 numeric, col1 text, vec1 vector, - w2 numeric= 0, + w2 numeric = 0, col2 text = NULL, vec2 vector = NULL, w3 numeric = 0, @@ -875,18 +1153,365 @@ END $$ LANGUAGE plpgsql; exact boolean = false, debug_output boolean = false, analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS $$ + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; -BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); -END $$ LANGUAGE plpgsql; + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_cos setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, 
debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, 
ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + -- setup L2SQ API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_l2sq setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, 
exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, 
col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; END $weighted_vector_search$ LANGUAGE plpgsql; diff --git a/sql/updates/0.3.1--0.3.2.sql b/sql/updates/0.3.1--0.3.2.sql index e69de29bb..a4f2846c9 100644 --- a/sql/updates/0.3.1--0.3.2.sql +++ b/sql/updates/0.3.1--0.3.2.sql @@ -0,0 +1,855 @@ +CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search() RETURNS VOID AS +$weighted_vector_search$ +DECLARE + pgvector_exists boolean; + pgvector_sparsevec_exists boolean; +BEGIN + -- Check if the vector type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'vector' + ) INTO pgvector_exists; + + IF NOT pgvector_exists THEN + RAISE NOTICE 'pgvector extension not found. 
Skipping lantern weighted vector search setup'; + RETURN; + END IF; + + -- Check if the sparsevec type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' + ) INTO pgvector_sparsevec_exists; + + CREATE OR REPLACE FUNCTION _lantern_internal.weighted_vector_search_helper( + table_name regtype, + w1 numeric, + col1 text, + vec1 text, + w2 numeric = 0, + col2 text = NULL, + vec2 text = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 text = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TEXT AS $$ + DECLARE + joint_condition text; + query_base text; + query_final_where text = ''; + query1 text; + query2 text; + query3 text; + parsed_schema_name text; + parsed_table_name text; + wc1 text = NULL; + wc2 text = NULL; + wc3 text = NULL; + is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; + vec1_string text = NULL; + vec2_string text = NULL; + vec3_string text = NULL; + cte_query text; + maybe_unions_query text; + final_query text; + explain_query text; + explain_output jsonb; + old_hnsw_ef_search numeric; + debug_count integer; + maybe_analyze text = ''; + BEGIN + + -- Start: Validate inputs + + -- 1. only allow valid distance_operator strings (<->, <=>, but not abracadabra) + IF distance_operator NOT IN ('<->', '<=>', '<#>', '<+>') THEN + RAISE EXCEPTION 'Invalid distance operator: %', distance_operator; + END IF; + + -- 2. 
only allow valid column names, i.e., column names that exist in the table + SELECT n.nspname, c.relname INTO parsed_schema_name, parsed_table_name FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.reltype = table_name::oid; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = id_col) THEN + RAISE EXCEPTION 'Invalid column name: %', id_col; + END IF; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col1) THEN + RAISE EXCEPTION 'Invalid column name: %', col1; + END IF; + IF col2 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col2) THEN + RAISE EXCEPTION 'Invalid column name: %', col2; + END IF; + IF col3 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col3) THEN + RAISE EXCEPTION 'Invalid column name: %', col3; + END IF; + + -- 3. 
throw an error on negative weights + IF w1 < 0 OR w2 < 0 OR w3 < 0 THEN + RAISE EXCEPTION 'Invalid weight: %', w1; + END IF; + + -- End: Validate inputs + + IF analyze_output THEN + maybe_analyze := 'ANALYZE, BUFFERS,'; + END IF; + + -- Generate vector strings + -- the cast is necessary for cases when the column is not of type vector + -- and for some reason in those cases cast does not happen automatically + IF vec1 IS NOT NULL THEN + IF vec1 ~ is_sparsevec_regex THEN + vec1_string := vec1 || '::sparsevec'; + ELSE + vec1_string := vec1 || '::vector'; + END IF; + END IF; + IF vec2 IS NOT NULL THEN + IF vec2 ~ is_sparsevec_regex THEN + vec2_string := vec2 || '::sparsevec'; + ELSE + vec2_string := vec2 || '::vector'; + END IF; + END IF; + IF vec3 IS NOT NULL THEN + IF vec3 ~ is_sparsevec_regex THEN + vec3_string := vec3 || '::sparsevec'; + ELSE + vec3_string := vec3 || '::vector'; + END IF; + END IF; + + -- Joint similarity metric condition + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); + IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); + END IF; + IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); + END IF; + + joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); + + -- Base query with joint similarity metric + query_base := format('SELECT * FROM %s ', table_name); + IF max_dist IS NOT NULL THEN + query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); + END IF; + + IF exact THEN + final_query := query_base || query_final_where || format(' ORDER BY %s', joint_condition); + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; + + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + + 
explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + END IF; + + EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef); + -- UNION ALL.. part of the final query that aggregates results from individual vector search queries + maybe_unions_query := ''; + + -- Query 1: Order by first condition's weighted similarity + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); + + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; + RAISE WARNING 'col1 yielded % rows', debug_count; + END IF; + + cte_query = format('WITH query1 AS (%s) ', query1); + + -- Query 2: Order by other conditions' weighted similarity, if applicable + IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); + cte_query := cte_query || format(', query2 AS (%s)', query2); + maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query2) INTO debug_count; + RAISE WARNING 'col2 yielded % rows', debug_count; + END IF; + END IF; + + -- Query 3: Order by third condition's weighted similarity, if applicable + IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN + query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); + cte_query := cte_query || format(', query3 AS (%s)', query3); + maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query3) INTO debug_count; + RAISE WARNING 'col3 yielded % rows', debug_count; + END IF; + END IF; + + 
final_query := cte_query || format($final_cte_query$SELECT * FROM ( + SELECT DISTINCT ON (%I) * FROM ( + (SELECT * FROM query1) + %s + ) t + ) + tt %s ORDER BY %s$final_cte_query$, + id_col, maybe_unions_query, query_final_where, joint_condition); + + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; + + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + + explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + + END + $$ LANGUAGE plpgsql; + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement + -- when there is single "anylement column" being returned (e.g. 
returns table ("row" anylement)) + -- then that single "column" is properly spread with source table's column names + -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place + -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + 
id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), 
ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = 
NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + END IF; + + -- setup Cosine API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_cos setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, 
debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, 
ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + + -- setup L2SQ API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_l2sq setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, 
exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, 
col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + +END +$weighted_vector_search$ LANGUAGE plpgsql; + +SELECT _lantern_internal.maybe_setup_weighted_vector_search(); +DROP FUNCTION _lantern_internal.maybe_setup_weighted_vector_search; \ No newline at end of file diff --git a/test/expected/weighted_search.out b/test/expected/weighted_search.out index a58c7142d..16e48d5ce 100644 --- a/test/expected/weighted_search.out +++ b/test/expected/weighted_search.out @@ -522,6 +522,29 @@ SELECT count(*) 15 (1 row) +-- Check if the type 'sparsevec' exists and store the result in a variable +SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' +) AS exists_sparsevec \gset +-- Conditional execution based on the variable +\if :exists_sparsevec + \echo 'The sparsevec type exists. Running commands...' 
+ \ir utils/small_world_sparsevec.sql + SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset + SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset + SELECT + id, + 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist + FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, + w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, + w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector + ); +\else + \echo 'The sparsevec type does not exist. Skipping commands...' +The sparsevec type does not exist. Skipping commands... +\endif -- create non superuser and test the function SET client_min_messages = WARNING; -- suppress NOTICE: role "test_user" does not exist, skipping diff --git a/test/sql/utils/small_world_sparsevec.sql b/test/sql/utils/small_world_sparsevec.sql new file mode 100644 index 000000000..e9ea24b82 --- /dev/null +++ b/test/sql/utils/small_world_sparsevec.sql @@ -0,0 +1,16 @@ +CREATE TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v VECTOR(3), + s SPARSEVEC(3) +); + +INSERT INTO small_world (id, b, v, s) VALUES + ('000', TRUE, '[0,0,0]', '{}/3'), + ('001', TRUE, '[0,0,1]', '{3:1}/3'), + ('010', FALSE, '[0,1,0]' , '{2:1}/3'), + ('011', TRUE, '[0,1,1]', '{2:1,3:1}/3'), + ('100', FALSE, '[1,0,0]', '{1:1}/3'), + ('101', FALSE, '[1,0,1]', '{1:1,3:1}/3'), + ('110', FALSE, '[1,1,0]', '{1:1,2:1}/3'), + ('111', TRUE, '[1,1,1]', '{1:1,2:1,3:1}/3'); \ No newline at end of file diff --git a/test/sql/weighted_search.sql b/test/sql/weighted_search.sql index 9902219a1..f860118eb 100644 --- a/test/sql/weighted_search.sql +++ b/test/sql/weighted_search.sql @@ -53,7 +53,6 @@ LIMIT 100) t; -- when max_dist is not specified, number of returned values dicreases with smaller ef - SELECT count(*) FROM lantern.weighted_vector_search(CAST(NULL as "sift_base1k"), debug_output => true, exact => true, w1=> 0.03, col1=>'v'::text, vec1=>:'v4'::vector, @@ -121,6 +120,30 @@ SELECT count(*) w3=> 0.52, col3=>'v_real'::text, vec3=>:'v444'::vector ); +-- Check if the type 
'sparsevec' exists and store the result in a variable +SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' +) AS exists_sparsevec \gset + +-- Conditional execution based on the variable +\if :exists_sparsevec + \echo 'The sparsevec type exists. Running commands...' + \ir utils/small_world_sparsevec.sql + SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset + SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset + SELECT + id, + 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist + FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, + w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, + w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector + ); +\else + \echo 'The sparsevec type does not exist. Skipping commands...' +\endif + -- create non superuser and test the function SET client_min_messages = WARNING; -- suppress NOTICE: role "test_user" does not exist, skipping @@ -138,4 +161,3 @@ SELECT count(*) w2=> 0.45, col2=>'v_real'::text, vec2=>:'v44'::vector, w3=> 0.52, col3=>'v_real'::text, vec3=>:'v444'::vector ); - From ec924ebc042b13071419ffe0142bf07a41ef9be5 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Tue, 20 Aug 2024 13:49:00 +0400 Subject: [PATCH 2/7] use pgvector v0.7.3-lanterncloud in CI and development dockerfile --- ci/scripts/build.sh | 4 ++-- docker/Dockerfile.dev | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/scripts/build.sh b/ci/scripts/build.sh index 7bc740f82..fe456f732 100755 --- a/ci/scripts/build.sh +++ b/ci/scripts/build.sh @@ -19,7 +19,7 @@ function setup_environment() { export DEBIAN_FRONTEND=noninteractive export PG_VERSION=${PG_VERSION:-15} export GITHUB_OUTPUT=${GITHUB_OUTPUT:-/dev/null} - export PGVECTOR_VERSION=0.6.1 + export PGVECTOR_VERSION=0.7.3-lanterncloud #fix pg_cron at the latest commit of the time export PG_CRON_COMMIT_SHA=7e91e72b1bebc5869bb900d9253cc9e92518b33f } @@ -41,7 +41,7 @@ function clone_or_use_source() { function install_external_dependencies() 
{ # Install pgvector pushd /tmp - wget --quiet -O pgvector.tar.gz https://github.com/pgvector/pgvector/archive/refs/tags/v${PGVECTOR_VERSION}.tar.gz + wget --quiet -O pgvector.tar.gz https://github.com/lanterndata/pgvector/archive/refs/tags/v${PGVECTOR_VERSION}.tar.gz tar xzf pgvector.tar.gz rm -rf pgvector || true mv pgvector-${PGVECTOR_VERSION} pgvector diff --git a/docker/Dockerfile.dev b/docker/Dockerfile.dev index df06055bb..71977aa93 100644 --- a/docker/Dockerfile.dev +++ b/docker/Dockerfile.dev @@ -1,5 +1,5 @@ ARG VERSION=15 -ARG PGVECTOR_VERSION=0.6.1 +ARG PGVECTOR_VERSION=0.7.3-lanterncloud #fix pg_cron at the latest commit of the time ARG PG_CRON_COMMIT_SHA=7e91e72b1bebc5869bb900d9253cc9e92518b33f @@ -41,7 +41,7 @@ USER postgres RUN pip install GitPython libtmux # Build & Install pgvector -RUN wget --quiet -O pgvector.tar.gz https://github.com/pgvector/pgvector/archive/refs/tags/v${PGVECTOR_VERSION}.tar.gz && \ +RUN wget --quiet -O pgvector.tar.gz https://github.com/lanterndata/pgvector/archive/refs/tags/v${PGVECTOR_VERSION}.tar.gz && \ tar xzf pgvector.tar.gz && \ (cd pgvector-${PGVECTOR_VERSION} && make -j && make install) From decf20854d56b145d927768bd5633d5043a74660 Mon Sep 17 00:00:00 2001 From: Varik Matevosyan Date: Mon, 19 Aug 2024 09:41:03 +0400 Subject: [PATCH 3/7] remove unnecessary assert for external_socket_fd --- src/hnsw/build.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/hnsw/build.c b/src/hnsw/build.c index 2402f8fcc..f28360db7 100644 --- a/src/hnsw/build.c +++ b/src/hnsw/build.c @@ -526,7 +526,6 @@ static void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, ldb_ &opts, buildstate, estimated_row_count); - assert(buildstate->external_client_fd > 0); } else { usearch_reserve(buildstate->usearch_index, estimated_row_count, &error); } From c661feadecf9cfb9d854def358d3c445073a7969 Mon Sep 17 00:00:00 2001 From: Di Qi Date: Tue, 20 Aug 2024 15:20:31 -0700 Subject: [PATCH 4/7] Change version to 0.3.3, update tests given 
pgvector 0.7+ --- CMakeLists.txt | 3 +- sql/updates/0.3.1--0.3.2.sql | 855 ------------------------------ sql/updates/0.3.2--0.3.3.sql | 855 ++++++++++++++++++++++++++++++ test/expected/weighted_search.out | 59 ++- test/sql/weighted_search.sql | 34 +- 5 files changed, 904 insertions(+), 902 deletions(-) create mode 100644 sql/updates/0.3.2--0.3.3.sql diff --git a/CMakeLists.txt b/CMakeLists.txt index 193161aef..9f82858e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.3) include(CheckSymbolExists) -set(LANTERN_VERSION 0.3.2) +set(LANTERN_VERSION 0.3.3) project( LanternDB @@ -263,6 +263,7 @@ endif() set (_update_files sql/updates/0.3.0--0.3.1.sql sql/updates/0.3.1--0.3.2.sql + sql/updates/0.3.2--0.3.3.sql ) # Generate version information for the binary diff --git a/sql/updates/0.3.1--0.3.2.sql b/sql/updates/0.3.1--0.3.2.sql index a4f2846c9..e69de29bb 100644 --- a/sql/updates/0.3.1--0.3.2.sql +++ b/sql/updates/0.3.1--0.3.2.sql @@ -1,855 +0,0 @@ -CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search() RETURNS VOID AS -$weighted_vector_search$ -DECLARE - pgvector_exists boolean; - pgvector_sparsevec_exists boolean; -BEGIN - -- Check if the vector type from pgvector exists - SELECT EXISTS ( - SELECT 1 - FROM pg_type - WHERE typname = 'vector' - ) INTO pgvector_exists; - - IF NOT pgvector_exists THEN - RAISE NOTICE 'pgvector extension not found. 
Skipping lantern weighted vector search setup'; - RETURN; - END IF; - - -- Check if the sparsevec type from pgvector exists - SELECT EXISTS ( - SELECT 1 - FROM pg_type - WHERE typname = 'sparsevec' - ) INTO pgvector_sparsevec_exists; - - CREATE OR REPLACE FUNCTION _lantern_internal.weighted_vector_search_helper( - table_name regtype, - w1 numeric, - col1 text, - vec1 text, - w2 numeric = 0, - col2 text = NULL, - vec2 text = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 text = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TEXT AS $$ - DECLARE - joint_condition text; - query_base text; - query_final_where text = ''; - query1 text; - query2 text; - query3 text; - parsed_schema_name text; - parsed_table_name text; - wc1 text = NULL; - wc2 text = NULL; - wc3 text = NULL; - is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; - vec1_string text = NULL; - vec2_string text = NULL; - vec3_string text = NULL; - cte_query text; - maybe_unions_query text; - final_query text; - explain_query text; - explain_output jsonb; - old_hnsw_ef_search numeric; - debug_count integer; - maybe_analyze text = ''; - BEGIN - - -- Start: Validate inputs - - -- 1. only allow valid distance_operator strings (<->, <=>, but not abracadabra) - IF distance_operator NOT IN ('<->', '<=>', '<#>', '<+>') THEN - RAISE EXCEPTION 'Invalid distance operator: %', distance_operator; - END IF; - - -- 2. 
only allow valid column names, i.e., column names that exist in the table - SELECT n.nspname, c.relname INTO parsed_schema_name, parsed_table_name FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.reltype = table_name::oid; - IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = id_col) THEN - RAISE EXCEPTION 'Invalid column name: %', id_col; - END IF; - IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col1) THEN - RAISE EXCEPTION 'Invalid column name: %', col1; - END IF; - IF col2 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col2) THEN - RAISE EXCEPTION 'Invalid column name: %', col2; - END IF; - IF col3 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col3) THEN - RAISE EXCEPTION 'Invalid column name: %', col3; - END IF; - - -- 3. 
throw an error on negative weights - IF w1 < 0 OR w2 < 0 OR w3 < 0 THEN - RAISE EXCEPTION 'Invalid weight: %', w1; - END IF; - - -- End: Validate inputs - - IF analyze_output THEN - maybe_analyze := 'ANALYZE, BUFFERS,'; - END IF; - - -- Generate vector strings - -- the cast is necessary for cases when the column is not of type vector - -- and for some reason in those cases cast does not happen automatically - IF vec1 IS NOT NULL THEN - IF vec1 ~ is_sparsevec_regex THEN - vec1_string := vec1 || '::sparsevec'; - ELSE - vec1_string := vec1 || '::vector'; - END IF; - END IF; - IF vec2 IS NOT NULL THEN - IF vec2 ~ is_sparsevec_regex THEN - vec2_string := vec2 || '::sparsevec'; - ELSE - vec2_string := vec2 || '::vector'; - END IF; - END IF; - IF vec3 IS NOT NULL THEN - IF vec3 ~ is_sparsevec_regex THEN - vec3_string := vec3 || '::sparsevec'; - ELSE - vec3_string := vec3 || '::vector'; - END IF; - END IF; - - -- Joint similarity metric condition - wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); - IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); - END IF; - IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); - END IF; - - joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); - - -- Base query with joint similarity metric - query_base := format('SELECT * FROM %s ', table_name); - IF max_dist IS NOT NULL THEN - query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); - END IF; - - IF exact THEN - final_query := query_base || query_final_where || format(' ORDER BY %s', joint_condition); - IF debug_output THEN - explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); - EXECUTE explain_query INTO explain_output; - - RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); - - 
explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); - RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output); - END IF; - - RETURN final_query; - END IF; - - EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef); - -- UNION ALL.. part of the final query that aggregates results from individual vector search queries - maybe_unions_query := ''; - - -- Query 1: Order by first condition's weighted similarity - query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); - - IF debug_output THEN - EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; - RAISE WARNING 'col1 yielded % rows', debug_count; - END IF; - - cte_query = format('WITH query1 AS (%s) ', query1); - - -- Query 2: Order by other conditions' weighted similarity, if applicable - IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); - cte_query := cte_query || format(', query2 AS (%s)', query2); - maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); - IF debug_output THEN - EXECUTE format('SELECT count(*) FROM (%s) t', query2) INTO debug_count; - RAISE WARNING 'col2 yielded % rows', debug_count; - END IF; - END IF; - - -- Query 3: Order by third condition's weighted similarity, if applicable - IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); - cte_query := cte_query || format(', query3 AS (%s)', query3); - maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); - IF debug_output THEN - EXECUTE format('SELECT count(*) FROM (%s) t', query3) INTO debug_count; - RAISE WARNING 'col3 yielded % rows', debug_count; - END IF; - END IF; - - 
final_query := cte_query || format($final_cte_query$SELECT * FROM ( - SELECT DISTINCT ON (%I) * FROM ( - (SELECT * FROM query1) - %s - ) t - ) - tt %s ORDER BY %s$final_cte_query$, - id_col, maybe_unions_query, query_final_where, joint_condition); - - IF debug_output THEN - explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); - EXECUTE explain_query INTO explain_output; - - RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); - - explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); - RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); - END IF; - - RETURN final_query; - - END - $$ LANGUAGE plpgsql; - - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. 
returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - 
id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), 
ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = 
NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - END IF; - - -- setup Cosine API shortcuts - - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_cos setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, 
debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, 
ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - END IF; - - -- setup L2SQ API shortcuts - - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_l2sq setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, 
exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, 
col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - END IF; - -END -$weighted_vector_search$ LANGUAGE plpgsql; - -SELECT _lantern_internal.maybe_setup_weighted_vector_search(); -DROP FUNCTION _lantern_internal.maybe_setup_weighted_vector_search; \ No newline at end of file diff --git a/sql/updates/0.3.2--0.3.3.sql b/sql/updates/0.3.2--0.3.3.sql new file mode 100644 index 000000000..a4f2846c9 --- /dev/null +++ b/sql/updates/0.3.2--0.3.3.sql @@ -0,0 +1,855 @@ +CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search() RETURNS VOID AS +$weighted_vector_search$ +DECLARE + pgvector_exists boolean; + pgvector_sparsevec_exists boolean; +BEGIN + -- Check if the vector type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'vector' + ) INTO pgvector_exists; + + IF NOT pgvector_exists THEN + RAISE NOTICE 'pgvector extension not found. 
Skipping lantern weighted vector search setup'; + RETURN; + END IF; + + -- Check if the sparsevec type from pgvector exists + SELECT EXISTS ( + SELECT 1 + FROM pg_type + WHERE typname = 'sparsevec' + ) INTO pgvector_sparsevec_exists; + + CREATE OR REPLACE FUNCTION _lantern_internal.weighted_vector_search_helper( + table_name regtype, + w1 numeric, + col1 text, + vec1 text, + w2 numeric = 0, + col2 text = NULL, + vec2 text = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 text = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TEXT AS $$ + DECLARE + joint_condition text; + query_base text; + query_final_where text = ''; + query1 text; + query2 text; + query3 text; + parsed_schema_name text; + parsed_table_name text; + wc1 text = NULL; + wc2 text = NULL; + wc3 text = NULL; + is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; + vec1_string text = NULL; + vec2_string text = NULL; + vec3_string text = NULL; + cte_query text; + maybe_unions_query text; + final_query text; + explain_query text; + explain_output jsonb; + old_hnsw_ef_search numeric; + debug_count integer; + maybe_analyze text = ''; + BEGIN + + -- Start: Validate inputs + + -- 1. only allow valid distance_operator strings (<->, <=>, but not abracadabra) + IF distance_operator NOT IN ('<->', '<=>', '<#>', '<+>') THEN + RAISE EXCEPTION 'Invalid distance operator: %', distance_operator; + END IF; + + -- 2. 
only allow valid column names, i.e., column names that exist in the table + SELECT n.nspname, c.relname INTO parsed_schema_name, parsed_table_name FROM pg_class c JOIN pg_namespace n ON n.oid = c.relnamespace WHERE c.reltype = table_name::oid; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = id_col) THEN + RAISE EXCEPTION 'Invalid column name: %', id_col; + END IF; + IF NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col1) THEN + RAISE EXCEPTION 'Invalid column name: %', col1; + END IF; + IF col2 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col2) THEN + RAISE EXCEPTION 'Invalid column name: %', col2; + END IF; + IF col3 IS NOT NULL AND NOT EXISTS (SELECT 1 FROM information_schema.columns c WHERE c.table_name = parsed_table_name AND table_schema = parsed_schema_name AND column_name = col3) THEN + RAISE EXCEPTION 'Invalid column name: %', col3; + END IF; + + -- 3. 
throw an error on negative weights + IF w1 < 0 OR w2 < 0 OR w3 < 0 THEN + RAISE EXCEPTION 'Invalid weight: %', w1; + END IF; + + -- End: Validate inputs + + IF analyze_output THEN + maybe_analyze := 'ANALYZE, BUFFERS,'; + END IF; + + -- Generate vector strings + -- the cast is necessary for cases when the column is not of type vector + -- and for some reason in those cases cast does not happen automatically + IF vec1 IS NOT NULL THEN + IF vec1 ~ is_sparsevec_regex THEN + vec1_string := vec1 || '::sparsevec'; + ELSE + vec1_string := vec1 || '::vector'; + END IF; + END IF; + IF vec2 IS NOT NULL THEN + IF vec2 ~ is_sparsevec_regex THEN + vec2_string := vec2 || '::sparsevec'; + ELSE + vec2_string := vec2 || '::vector'; + END IF; + END IF; + IF vec3 IS NOT NULL THEN + IF vec3 ~ is_sparsevec_regex THEN + vec3_string := vec3 || '::sparsevec'; + ELSE + vec3_string := vec3 || '::vector'; + END IF; + END IF; + + -- Joint similarity metric condition + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); + IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); + END IF; + IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); + END IF; + + joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); + + -- Base query with joint similarity metric + query_base := format('SELECT * FROM %s ', table_name); + IF max_dist IS NOT NULL THEN + query_final_where := format(' WHERE %s < %L', joint_condition, max_dist); + END IF; + + IF exact THEN + final_query := query_base || query_final_where || format(' ORDER BY %s', joint_condition); + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; + + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + + 
explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + END IF; + + EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef); + -- UNION ALL.. part of the final query that aggregates results from individual vector search queries + maybe_unions_query := ''; + + -- Query 1: Order by first condition's weighted similarity + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); + + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; + RAISE WARNING 'col1 yielded % rows', debug_count; + END IF; + + cte_query = format('WITH query1 AS (%s) ', query1); + + -- Query 2: Order by other conditions' weighted similarity, if applicable + IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); + cte_query := cte_query || format(', query2 AS (%s)', query2); + maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query2) INTO debug_count; + RAISE WARNING 'col2 yielded % rows', debug_count; + END IF; + END IF; + + -- Query 3: Order by third condition's weighted similarity, if applicable + IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN + query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); + cte_query := cte_query || format(', query3 AS (%s)', query3); + maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); + IF debug_output THEN + EXECUTE format('SELECT count(*) FROM (%s) t', query3) INTO debug_count; + RAISE WARNING 'col3 yielded % rows', debug_count; + END IF; + END IF; + + 
final_query := cte_query || format($final_cte_query$SELECT * FROM ( + SELECT DISTINCT ON (%I) * FROM ( + (SELECT * FROM query1) + %s + ) t + ) + tt %s ORDER BY %s$final_cte_query$, + id_col, maybe_unions_query, query_final_where, joint_condition); + + IF debug_output THEN + explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query); + EXECUTE explain_query INTO explain_output; + + RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query); + + explain_output := _lantern_internal.mask_order_by_in_plan(explain_output); + RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output); + END IF; + + RETURN final_query; + + END + $$ LANGUAGE plpgsql; + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement + -- when there is single "anylement column" being returned (e.g. 
returns table ("row" anylement)) + -- then that single "column" is properly spread with source table's column names + -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place + -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + 
id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), 
ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = 
NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = '<->', + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + END IF; + + -- setup Cosine API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_cos setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, 
debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, 
ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + + -- setup L2SQ API shortcuts + + -- v (v) (v) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric = 0, + col2 text = NULL, + vec2 vector = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 vector = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + IF NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_l2sq setup for sparsevec'; + ELSE + -- s v v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v v s + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric, + col3 text, + vec3 sparsevec, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, 
exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s s v + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric, + col3 text, + vec3 vector, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s v (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric, + col2 text, + vec2 vector, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- v s (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 vector, + w2 numeric, + col2 text, + vec2 sparsevec, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, 
col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + + -- s (s) (s) + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 sparsevec, + w2 numeric = 0, + col2 text = NULL, + vec2 sparsevec = NULL, + w3 numeric = 0, + col3 text = NULL, + vec3 sparsevec = NULL, + ef integer = 100, + max_dist numeric = NULL, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) RETURNS TABLE ("row" anyelement) AS $$ + BEGIN + RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); + END $$ LANGUAGE plpgsql; + END IF; + +END +$weighted_vector_search$ LANGUAGE plpgsql; + +SELECT _lantern_internal.maybe_setup_weighted_vector_search(); +DROP FUNCTION _lantern_internal.maybe_setup_weighted_vector_search; \ No newline at end of file diff --git a/test/expected/weighted_search.out b/test/expected/weighted_search.out index 16e48d5ce..dfd8ea5f5 100644 --- a/test/expected/weighted_search.out +++ b/test/expected/weighted_search.out @@ -522,29 +522,42 @@ SELECT count(*) 15 (1 row) --- Check if the type 'sparsevec' exists and store the result in a variable -SELECT EXISTS ( - SELECT 1 - FROM pg_type - WHERE typname = 'sparsevec' -) AS exists_sparsevec \gset --- Conditional execution based on the variable -\if :exists_sparsevec - \echo 'The sparsevec type exists. Running commands...' 
- \ir utils/small_world_sparsevec.sql - SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset - SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset - SELECT - id, - 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist - FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, - w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, - w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector - ); -\else - \echo 'The sparsevec type does not exist. Skipping commands...' -The sparsevec type does not exist. Skipping commands... -\endif +-- test sparsevec +\ir utils/small_world_sparsevec.sql +CREATE TABLE small_world ( + id VARCHAR(3), + b BOOLEAN, + v VECTOR(3), + s SPARSEVEC(3) +); +INSERT INTO small_world (id, b, v, s) VALUES + ('000', TRUE, '[0,0,0]', '{}/3'), + ('001', TRUE, '[0,0,1]', '{3:1}/3'), + ('010', FALSE, '[0,1,0]' , '{2:1}/3'), + ('011', TRUE, '[0,1,1]', '{2:1,3:1}/3'), + ('100', FALSE, '[1,0,0]', '{1:1}/3'), + ('101', FALSE, '[1,0,1]', '{1:1,3:1}/3'), + ('110', FALSE, '[1,1,0]', '{1:1,2:1}/3'), + ('111', TRUE, '[1,1,1]', '{1:1,2:1,3:1}/3'); +SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset +SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset +SELECT + id, + 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist +FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, + w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, + w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector +); + id | dist +-----+-------------------- + 000 | 0.5438256229963183 + 100 | 0.7832971001877798 + 010 | 0.8720951452234501 + 001 | 0.9887007171012123 + 110 | 1.0374727590416999 + 011 | 1.1996858765930867 +(6 rows) + -- create non superuser and test the function SET client_min_messages = WARNING; -- suppress NOTICE: role "test_user" does not exist, skipping diff --git a/test/sql/weighted_search.sql b/test/sql/weighted_search.sql index f860118eb..408703a6a 100644 --- a/test/sql/weighted_search.sql +++ b/test/sql/weighted_search.sql @@ -120,29 +120,17 @@ 
SELECT count(*) w3=> 0.52, col3=>'v_real'::text, vec3=>:'v444'::vector ); --- Check if the type 'sparsevec' exists and store the result in a variable -SELECT EXISTS ( - SELECT 1 - FROM pg_type - WHERE typname = 'sparsevec' -) AS exists_sparsevec \gset - --- Conditional execution based on the variable -\if :exists_sparsevec - \echo 'The sparsevec type exists. Running commands...' - \ir utils/small_world_sparsevec.sql - SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset - SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset - SELECT - id, - 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist - FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, - w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, - w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector - ); -\else - \echo 'The sparsevec type does not exist. Skipping commands...' -\endif +-- test sparsevec +\ir utils/small_world_sparsevec.sql +SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset +SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset +SELECT + id, + 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist +FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, + w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, + w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector +); -- create non superuser and test the function SET client_min_messages = WARNING; From 4a20f395f959cae6463584d9ede5c7bcff755fa1 Mon Sep 17 00:00:00 2001 From: Di Qi Date: Sat, 24 Aug 2024 00:29:16 -0700 Subject: [PATCH 5/7] Fix test difference due to CPU architecture inconsistency --- test/expected/weighted_search.out | 18 +++++++++--------- test/sql/weighted_search.sql | 2 +- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/test/expected/weighted_search.out b/test/expected/weighted_search.out index dfd8ea5f5..6e1e39bb3 100644 --- a/test/expected/weighted_search.out +++ b/test/expected/weighted_search.out @@ -543,19 +543,19 @@ SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset SELECT 
'[-0.5,-0.1,-0.3]' AS v3 \gset SELECT id, - 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist + round(cast(0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as numeric), 2) as dist FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector ); - id | dist ------+-------------------- - 000 | 0.5438256229963183 - 100 | 0.7832971001877798 - 010 | 0.8720951452234501 - 001 | 0.9887007171012123 - 110 | 1.0374727590416999 - 011 | 1.1996858765930867 + id | dist +-----+------ + 000 | 0.54 + 100 | 0.78 + 010 | 0.87 + 001 | 0.99 + 110 | 1.04 + 011 | 1.20 (6 rows) -- create non superuser and test the function diff --git a/test/sql/weighted_search.sql b/test/sql/weighted_search.sql index 408703a6a..3cbd01e5b 100644 --- a/test/sql/weighted_search.sql +++ b/test/sql/weighted_search.sql @@ -126,7 +126,7 @@ SELECT '{1:0.4,2:0.3,3:0.2}/3' AS s3 \gset SELECT '[-0.5,-0.1,-0.3]' AS v3 \gset SELECT id, - 0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as dist + round(cast(0.9 * (s <-> :'s3'::sparsevec) + 0.1 * (v <-> :'v3'::vector) as numeric), 2) as dist FROM lantern.weighted_vector_search(CAST(NULL as "small_world"), exact => false, ef => 5, w1=> 0.9, col1=>'s'::text, vec1=>:'s3'::sparsevec, w2=> 0.1, col2=>'v'::text, vec2=>:'v3'::vector From 2eff4f16b8f6078f94cfb3ee48b6a7fcdac25d49 Mon Sep 17 00:00:00 2001 From: Di Qi Date: Mon, 26 Aug 2024 14:48:09 -0700 Subject: [PATCH 6/7] Simplify code --- sql/lantern.sql | 747 ++++------------------------------- sql/updates/0.3.2--0.3.3.sql | 747 ++++------------------------------- 2 files changed, 166 insertions(+), 1328 deletions(-) diff --git a/sql/lantern.sql b/sql/lantern.sql index 9982cdc03..f49f0ec80 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -668,6 +668,25 @@ $weighted_vector_search$ DECLARE pgvector_exists boolean; pgvector_sparsevec_exists boolean; + + -- 
required type exist, v1 input type, v2 input type with defaults, v3 input type with defaults + search_inputs text[4][] := ARRAY[ + ARRAY['vector', 'vector', 'vector = NULL', 'vector = NULL'], + ARRAY['sparsevec', 'sparsevec', 'vector', 'vector'], + ARRAY['sparsevec', 'vector', 'sparsevec', 'vector'], + ARRAY['sparsevec', 'vector', 'vector', 'sparsevec'], + ARRAY['sparsevec', 'sparsevec', 'sparsevec', 'vector'], + ARRAY['sparsevec', 'sparsevec', 'vector', 'sparsevec = NULL'], + ARRAY['sparsevec', 'vector', 'sparsevec', 'sparsevec = NULL'], + ARRAY['sparsevec', 'sparsevec', 'sparsevec = NULL', 'sparsevec = NULL'] + ]; + + -- function suffix, function default operator + utility_functions text[2][] := ARRAY[ + ARRAY['', '<->'], + ARRAY['_cos', '<=>'], + ARRAY['_l2sq', '<->'] + ]; BEGIN -- Check if the vector type from pgvector exists SELECT EXISTS ( @@ -719,10 +738,6 @@ BEGIN wc1 text = NULL; wc2 text = NULL; wc3 text = NULL; - is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+'; - vec1_string text = NULL; - vec2_string text = NULL; - vec3_string text = NULL; cte_query text; maybe_unions_query text; final_query text; @@ -766,38 +781,13 @@ BEGIN maybe_analyze := 'ANALYZE, BUFFERS,'; END IF; - -- Generate vector strings - -- the cast is necessary for cases when the column is not of type vector - -- and for some reason in those cases cast does not happen automatically - IF vec1 IS NOT NULL THEN - IF vec1 ~ is_sparsevec_regex THEN - vec1_string := vec1 || '::sparsevec'; - ELSE - vec1_string := vec1 || '::vector'; - END IF; - END IF; - IF vec2 IS NOT NULL THEN - IF vec2 ~ is_sparsevec_regex THEN - vec2_string := vec2 || '::sparsevec'; - ELSE - vec2_string := vec2 || '::vector'; - END IF; - END IF; - IF vec3 IS NOT NULL THEN - IF vec3 ~ is_sparsevec_regex THEN - vec3_string := vec3 || '::sparsevec'; - ELSE - vec3_string := vec3 || '::vector'; - END IF; - END IF; - -- Joint similarity metric condition - wc1 := 
format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1); IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2); END IF; IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3); END IF; joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); @@ -828,7 +818,7 @@ BEGIN maybe_unions_query := ''; -- Query 1: Order by first condition's weighted similarity - query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1, ef); IF debug_output THEN EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; @@ -839,7 +829,7 @@ BEGIN -- Query 2: Order by other conditions' weighted similarity, if applicable IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2, ef); cte_query := cte_query || format(', query2 AS (%s)', query2); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); IF debug_output THEN @@ -850,7 +840,7 @@ BEGIN -- Query 3: Order by third condition's weighted similarity, if applicable IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); + query3 := 
format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3, ef); cte_query := cte_query || format(', query3 AS (%s)', query3); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); IF debug_output THEN @@ -883,636 +873,65 @@ BEGIN END $$ LANGUAGE plpgsql; - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern weighted vector search setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - distance_operator 
text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, 
format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - END IF; - - -- setup Cosine API shortcuts - - -- v (v) (v) - CREATE OR REPLACE FUNCTION 
lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern.weighted_vector_search_cos setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, 
w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM 
lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - END IF; - - -- setup L2SQ API shortcuts - - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - 
debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern.weighted_vector_search_l2sq setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - 
vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 
numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - END IF; + FOR i IN 1 .. array_length(search_inputs, 1) LOOP + FOR j IN 1 .. array_length(utility_functions, 1) LOOP + IF search_inputs[i][1] = 'sparsevec' AND NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern weighted vector search setup for sparsevec';
+        CONTINUE;
+      END IF;
+      EXECUTE format($create_weighted_vector_search_functions$
+        CREATE OR REPLACE FUNCTION lantern.weighted_vector_search%s(
+          relation_type anyelement,
+          w1 numeric,
+          col1 text,
+          vec1 %s,
+          w2 numeric %s,
+          col2 text %s,
+          vec2 %s,
+          w3 numeric %s,
+          col3 text %s,
+          vec3 %s,
+          ef integer = 100,
+          max_dist numeric = NULL,
+          distance_operator text = %L,
+          id_col text = 'id',
+          exact boolean = false,
+          debug_output boolean = false,
+          analyze_output boolean = false
+        )
+        -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement
+        -- when there is single "anylement column" being returned (e.g. returns table ("row" anylement))
+        -- then that single "column" is properly spread with source table's column names
+        -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place
+        -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol"
+        RETURNS TABLE ("row" anyelement) AS
+        $$
+        DECLARE
+          query text;
+          vec1_string text = CASE WHEN vec1 IS NULL THEN '' ELSE format('%%L::%s', vec1) END;
+          vec2_string text = CASE WHEN vec2 IS NULL THEN '' ELSE format('%%L::%s', vec2) END;
+          vec3_string text = CASE WHEN vec3 IS NULL THEN '' ELSE format('%%L::%s', vec3) END;
+        BEGIN
+          query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, vec1_string, w2, col2, vec2_string, w3, col3, vec3_string, ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output);
+          RETURN QUERY EXECUTE query;
+        END
+        $$ LANGUAGE plpgsql;
+      $create_weighted_vector_search_functions$,
+      utility_functions[j][1], -- function name suffix
+      search_inputs[i][2], -- vec1 type
+      CASE WHEN search_inputs[i][3] LIKE '%NULL' THEN ' = 0' ELSE '' END, -- w2 default, present only when vec2 is optional
+      CASE WHEN search_inputs[i][3] LIKE '%NULL' THEN ' = NULL' ELSE '' END, -- col2 default, present only when vec2 is optional
+      search_inputs[i][3], -- vec2 type (with its default, if any)
+      CASE WHEN search_inputs[i][4] LIKE '%NULL' THEN ' = 0' ELSE '' END, -- w3 default, present only when vec3 is optional
+      CASE WHEN search_inputs[i][4] LIKE '%NULL' THEN ' = NULL' ELSE '' END, -- col3 default, present only when vec3 is optional
+      search_inputs[i][4], -- vec3 type (with its default, if any)
+      utility_functions[j][2], -- default distance operator
+      CASE WHEN search_inputs[i][2] LIKE 'sparsevec%%' THEN 'sparsevec' ELSE 'vector' END,
+      CASE WHEN search_inputs[i][3] LIKE 'sparsevec%%' THEN 'sparsevec' ELSE 'vector' END,
+      CASE WHEN search_inputs[i][4] LIKE 'sparsevec%%' THEN 'sparsevec' ELSE 'vector' END);
+    END LOOP;
+  END LOOP;
 END
 $weighted_vector_search$ LANGUAGE plpgsql;
diff --git a/sql/updates/0.3.2--0.3.3.sql b/sql/updates/0.3.2--0.3.3.sql
index a4f2846c9..e52c6c0ab 100644
--- a/sql/updates/0.3.2--0.3.3.sql
+++ b/sql/updates/0.3.2--0.3.3.sql
@@ -3,6 +3,25 @@ $weighted_vector_search$
 DECLARE
   pgvector_exists boolean;
   pgvector_sparsevec_exists boolean;
+
+  -- required type, vec1 input type, vec2 input type (with optional default), vec3 input type (with optional default)
+  search_inputs text[4][] := ARRAY[
+    ARRAY['vector', 'vector', 'vector = NULL', 'vector = NULL'],
+    ARRAY['sparsevec', 'sparsevec', 'vector', 'vector'],
+    ARRAY['sparsevec', 'vector', 'sparsevec', 'vector'],
+    ARRAY['sparsevec', 'vector', 'vector', 'sparsevec'],
+    ARRAY['sparsevec', 'sparsevec', 'sparsevec', 'vector'],
+    ARRAY['sparsevec', 'sparsevec', 'vector', 'sparsevec = NULL'],
+    ARRAY['sparsevec', 'vector', 'sparsevec', 'sparsevec = NULL'],
+    ARRAY['sparsevec', 'sparsevec', 'sparsevec = NULL', 'sparsevec = NULL']
+  ];
+
+  -- function suffix, default distance operator (pgvector: <-> is L2 distance, <=> is cosine distance)
+  utility_functions text[2][] := ARRAY[
+    ARRAY['', '<->'],
+    ARRAY['_cos', '<=>'],
+    ARRAY['_l2sq', '<->']
+  ];
 BEGIN
   -- Check if the vector type from pgvector exists
   SELECT EXISTS (
@@ -54,10 +73,6 @@ BEGIN
     wc1 text = NULL;
     wc2 text = NULL;
     wc3 text = NULL;
-    is_sparsevec_regex text = '\{\d+:\d+(\.\d+)?(,\d+:\d+(\.\d+)?)*\}/\d+';
-    vec1_string text = NULL;
-    vec2_string text = NULL;
-    vec3_string text = NULL;
     cte_query text;
     maybe_unions_query text;
     final_query text;
@@ -101,38 +116,13 @@ BEGIN
       maybe_analyze := 'ANALYZE,
BUFFERS,'; END IF; - -- Generate vector strings - -- the cast is necessary for cases when the column is not of type vector - -- and for some reason in those cases cast does not happen automatically - IF vec1 IS NOT NULL THEN - IF vec1 ~ is_sparsevec_regex THEN - vec1_string := vec1 || '::sparsevec'; - ELSE - vec1_string := vec1 || '::vector'; - END IF; - END IF; - IF vec2 IS NOT NULL THEN - IF vec2 ~ is_sparsevec_regex THEN - vec2_string := vec2 || '::sparsevec'; - ELSE - vec2_string := vec2 || '::vector'; - END IF; - END IF; - IF vec3 IS NOT NULL THEN - IF vec3 ~ is_sparsevec_regex THEN - vec3_string := vec3 || '::sparsevec'; - ELSE - vec3_string := vec3 || '::vector'; - END IF; - END IF; - -- Joint similarity metric condition - wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1_string); + wc1 := format('(%s * (%I %s %s))', w1, col1, distance_operator, vec1); IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN - wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2_string); + wc2 := format(' (%s * (%I %s %s))', w2, col2, distance_operator, vec2); END IF; IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3_string); + wc3 := format(' (%s * (%I %s %s))', w3, col3, distance_operator, vec3); END IF; joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, ''); @@ -163,7 +153,7 @@ BEGIN maybe_unions_query := ''; -- Query 1: Order by first condition's weighted similarity - query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1_string, ef); + query1 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1, ef); IF debug_output THEN EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count; @@ -174,7 +164,7 @@ BEGIN -- Query 2: Order by other conditions' weighted similarity, if applicable IF w2 > 0 AND col2 IS NOT NULL AND 
vec2 IS NOT NULL THEN - query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2_string, ef); + query2 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2, ef); cte_query := cte_query || format(', query2 AS (%s)', query2); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) '); IF debug_output THEN @@ -185,7 +175,7 @@ BEGIN -- Query 3: Order by third condition's weighted similarity, if applicable IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN - query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3_string, ef); + query3 := format('%s ORDER BY %I %s %s LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3, ef); cte_query := cte_query || format(', query3 AS (%s)', query3); maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) '); IF debug_output THEN @@ -218,636 +208,65 @@ BEGIN END $$ LANGUAGE plpgsql; - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - -- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index. - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement - -- when there is single "anylement column" being returned (e.g. 
returns table ("row" anylement)) - -- then that single "column" is properly spread with source table's column names - -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place - -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - 
id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), 
ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = 
NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - distance_operator text = '<->', - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) - RETURNS TABLE ("row" anyelement) AS - $$ - DECLARE - query text; - BEGIN - query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, format('%L', vec1), w2, col2, format('%L', vec2), w3, col3, format('%L', vec3), ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); - RETURN QUERY EXECUTE query; - END - $$ LANGUAGE plpgsql; - END IF; - - -- setup Cosine API shortcuts - - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_cos setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, 
debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, 
ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - END IF; - - -- setup L2SQ API shortcuts - - -- v (v) (v) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric = 0, - col2 text = NULL, - vec2 vector = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 vector = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - IF NOT pgvector_sparsevec_exists THEN - RAISE NOTICE 'pgvector sparsevec type not found. 
Skipping lantern.weighted_vector_search_l2sq setup for sparsevec'; - ELSE - -- s v v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v v s - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric, - col3 text, - vec3 sparsevec, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, 
exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s s v - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric, - col3 text, - vec3 vector, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s v (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric, - col2 text, - vec2 vector, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- v s (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 vector, - w2 numeric, - col2 text, - vec2 sparsevec, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, 
col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - - -- s (s) (s) - CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq( - relation_type anyelement, - w1 numeric, - col1 text, - vec1 sparsevec, - w2 numeric = 0, - col2 text = NULL, - vec2 sparsevec = NULL, - w3 numeric = 0, - col3 text = NULL, - vec3 sparsevec = NULL, - ef integer = 100, - max_dist numeric = NULL, - id_col text = 'id', - exact boolean = false, - debug_output boolean = false, - analyze_output boolean = false - ) RETURNS TABLE ("row" anyelement) AS $$ - BEGIN - RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output, analyze_output); - END $$ LANGUAGE plpgsql; - END IF; + FOR i IN 1 .. array_length(search_inputs, 1) LOOP + FOR j IN 1 .. array_length(utility_functions, 1) LOOP + IF search_inputs[i][1] = 'sparsevec' AND NOT pgvector_sparsevec_exists THEN + RAISE NOTICE 'pgvector sparsevec type not found. Skipping lantern weighted vector search setup for sparsevec'; + CONTINUE; + END IF; + EXECUTE format($create_weighted_vector_search_functions$ + CREATE OR REPLACE FUNCTION lantern.weighted_vector_search%s( + relation_type anyelement, + w1 numeric, + col1 text, + vec1 %s, + w2 numeric %s, + col2 text %s, + vec2 %s, + w3 numeric %s, + col3 text %s, + vec3 %s, + ef integer = 100, + max_dist numeric = NULL, + distance_operator text = %L, + id_col text = 'id', + exact boolean = false, + debug_output boolean = false, + analyze_output boolean = false + ) + -- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement + -- when there is single "anylement column" being returned (e.g. 
returns table ("row" anylement)) + -- then that single "column" is properly spread with source table's column names + -- but, when returning ("row" anyelement, "anothercol" integer), things fall all over the place + -- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol" + RETURNS TABLE ("row" anyelement) AS + $$ + DECLARE + query text; + vec1_string text = CASE WHEN vec1 IS NULL THEN '' ELSE format('%%L::%s', vec1) END; + vec2_string text = CASE WHEN vec2 IS NULL THEN '' ELSE format('%%L::%s', vec2) END; + vec3_string text = CASE WHEN vec3 IS NULL THEN '' ELSE format('%%L::%s', vec3) END; + BEGIN + query := _lantern_internal.weighted_vector_search_helper(pg_typeof(relation_type), w1, col1, vec1_string, w2, col2, vec2_string, w3, col3, vec3_string, ef, max_dist, distance_operator, id_col, exact, debug_output, analyze_output); + RETURN QUERY EXECUTE query; + END + $$ LANGUAGE plpgsql; + $create_weighted_vector_search_functions$, + utility_functions[j][1], + search_inputs[i][2], + CASE WHEN search_inputs[i][2] LIKE '%NULL' THEN ' = 0' ELSE '' END, + CASE WHEN search_inputs[i][2] LIKE '%NULL' THEN ' = NULL' ELSE '' END, + search_inputs[i][3], + CASE WHEN search_inputs[i][3] LIKE '%NULL' THEN ' = 0' ELSE '' END, + CASE WHEN search_inputs[i][3] LIKE '%NULL' THEN ' = NULL' ELSE '' END, + search_inputs[i][4], + utility_functions[j][2], + CASE WHEN search_inputs[i][2] LIKE 'sparsevec%%' THEN 'sparsevec' ELSE 'vector' END, + CASE WHEN search_inputs[i][3] LIKE 'sparsevec%%' THEN 'sparsevec' ELSE 'vector' END, + CASE WHEN search_inputs[i][4] LIKE 'sparsevec%%' THEN 'sparsevec' ELSE 'vector' END); + END LOOP; + END LOOP; END $weighted_vector_search$ LANGUAGE plpgsql; From 17bc1f5117b0251a18c1439aec5c37358886a371 Mon Sep 17 00:00:00 2001 From: Di Qi Date: Mon, 2 Sep 2024 18:21:26 -0700 Subject: [PATCH 7/7] Add loop test --- scripts/integration_tests.py | 1 + scripts/test_weighted_search.py | 67 +++++++++++++++++++++++++++++++++ 
sql/lantern.sql | 4 +- sql/updates/0.3.2--0.3.3.sql | 4 +- 4 files changed, 94 insertions(+), 4 deletions(-) create mode 100644 scripts/test_weighted_search.py diff --git a/scripts/integration_tests.py b/scripts/integration_tests.py index be1cd41c4..db34cae17 100644 --- a/scripts/integration_tests.py +++ b/scripts/integration_tests.py @@ -650,6 +650,7 @@ def test_vector_search_with_filter(primary, source_table): row[2] == filter_val ), f"Expected all results to have random_bool == {filter_val}" + # fixture to handle external index server setup @pytest.fixture def external_index(request): diff --git a/scripts/test_weighted_search.py b/scripts/test_weighted_search.py new file mode 100644 index 000000000..8691fb5ba --- /dev/null +++ b/scripts/test_weighted_search.py @@ -0,0 +1,89 @@
+"""Loop test for lantern.weighted_vector_search and its _cos / _l2sq shortcuts.
+
+Mixes a sparsevec column with a vector column and checks the three nearest rows
+returned by every function variant. Requires a local PostgreSQL server with the
+pgvector (with sparsevec support) and lantern extensions available.
+"""
+import psycopg2
+
+# Database connection parameters
+db_params = {
+    'database': 'postgres',
+    'user': 'postgres',  # Update with your username if different
+    'password': '',      # Update with your password if required
+    'host': 'localhost',
+    'port': '5432'
+}
+
+# Recreate the lantern extension and load the 8 corners of the unit cube,
+# each stored both as a dense vector (v) and as a sparsevec (s)
+setup_sql = """
+DROP EXTENSION IF EXISTS lantern;
+CREATE EXTENSION IF NOT EXISTS vector;
+CREATE EXTENSION IF NOT EXISTS lantern;
+
+CREATE TABLE IF NOT EXISTS small_world_weighted_search (
+    id VARCHAR(3) PRIMARY KEY,
+    b BOOLEAN,
+    v VECTOR(3),
+    s SPARSEVEC(3)
+);
+
+INSERT INTO small_world_weighted_search (id, b, v, s) VALUES
+    ('000', TRUE, '[0,0,0]', '{}/3'),
+    ('001', TRUE, '[0,0,1]', '{3:1}/3'),
+    ('010', FALSE, '[0,1,0]', '{2:1}/3'),
+    ('011', TRUE, '[0,1,1]', '{2:1,3:1}/3'),
+    ('100', FALSE, '[1,0,0]', '{1:1}/3'),
+    ('101', FALSE, '[1,0,1]', '{1:1,3:1}/3'),
+    ('110', FALSE, '[1,1,0]', '{1:1,2:1}/3'),
+    ('111', TRUE, '[1,1,1]', '{1:1,2:1,3:1}/3')
+ON CONFLICT DO NOTHING;
+"""
+
+# Distance operator behind each metric name; 'hamming' is not exercised below
+operators = {'l2sq': '<->', 'cos': '<=>', 'hamming': '<+>'}
+
+query_s = "{1:0.4,2:0.3,3:0.2}/3"
+query_v = "[-0.5,-0.1,-0.3]"
+
+# Expected top-3 (id, weighted distance) rows, keyed by distance operator
+expected_results = {
+    '<=>': [('111', 0.22), ('110', 0.24), ('101', 0.39)],
+    '<->': [('000', 0.54), ('100', 0.78), ('010', 0.87)],
+}
+
+
+def test_weighted_search():
+    """Check the three nearest rows for each weighted_vector_search variant."""
+    conn = psycopg2.connect(**db_params)
+    try:
+        conn.autocommit = True
+        cur = conn.cursor()
+        cur.execute(setup_sql)
+
+        # '' selects the un-suffixed function, which is checked against l2sq
+        for distance_metric in ["", "cos", "l2sq"]:
+            operator = operators[distance_metric or 'l2sq']
+            function = f'weighted_vector_search_{distance_metric}' if distance_metric else 'weighted_vector_search'
+            query = f"""
+            SELECT
+                id,
+                round(cast(0.9 * (s {operator} '{query_s}'::sparsevec) + 0.1 * (v {operator} '{query_v}'::vector) as numeric), 2) as dist
+            FROM lantern.{function}(CAST(NULL as "small_world_weighted_search"), distance_operator=>'{operator}',
+                w1=> 0.9, col1=>'s'::text, vec1=>'{query_s}'::sparsevec,
+                w2=> 0.1, col2=>'v'::text, vec2=>'{query_v}'::vector
+            )
+            LIMIT 3;
+            """
+            cur.execute(query)
+            res = [(key, float(value)) for key, value in cur.fetchall()]
+            assert res == expected_results[operator], \
+                f"{function}: expected {expected_results[operator]}, got {res}"
+    finally:
+        # Closing the connection also invalidates the cursor created on it
+        conn.close()
+
+
+if __name__ == "__main__":
+    test_weighted_search()
diff --git a/sql/lantern.sql b/sql/lantern.sql index f49f0ec80..a1b65ce9b 100644 --- a/sql/lantern.sql +++ b/sql/lantern.sql @@ -684,8 +684,8 @@ DECLARE -- function suffix, function default operator utility_functions text[2][] := ARRAY[ ARRAY['', '<->'], - ARRAY['_cos', '<->'], - ARRAY['_l2sq', '<=>'] + ARRAY['_cos', '<=>'], + ARRAY['_l2sq', '<->'] ]; BEGIN -- Check if the vector type from pgvector exists diff --git a/sql/updates/0.3.2--0.3.3.sql b/sql/updates/0.3.2--0.3.3.sql index e52c6c0ab..131cecddf 100644 --- a/sql/updates/0.3.2--0.3.3.sql +++ b/sql/updates/0.3.2--0.3.3.sql @@ -19,8 +19,8 @@ DECLARE -- function suffix, function default operator utility_functions text[2][] := ARRAY[ ARRAY['', '<->'], - ARRAY['_cos', '<->'], - ARRAY['_l2sq', '<=>'] + ARRAY['_cos', '<=>'], + ARRAY['_l2sq', '<->'] ]; BEGIN -- Check if the vector type from pgvector exists