Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Narek/bm25 bloom #343

Merged
merged 27 commits into from
Nov 15, 2024
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
6623034
Add bloom filter type and casts from postgres arrays
Ngalstyan4 Oct 14, 2024
29bb7bb
Rename cargo to cargo.toml in .cargo
Ngalstyan4 Oct 15, 2024
5a96375
Add rust and tsvector stemmers API
Ngalstyan4 Oct 16, 2024
a2e95a3
Add bm25 table creation API
Ngalstyan4 Oct 16, 2024
68efb59
Implement bm25_agg acreator
Ngalstyan4 Oct 16, 2024
994e79c
Fix typo in text_to_stem_array function name
Ngalstyan4 Oct 16, 2024
d45768c
Explicitly order the output of unnest, since there are no guarantees …
Ngalstyan4 Oct 16, 2024
cc09f9c
Fix cast issue
Ngalstyan4 Oct 16, 2024
10b7edd
Add debug print possibility to bm25_agg test
Ngalstyan4 Oct 16, 2024
28e7a99
Fix bm25/idf algo implementation bug
Ngalstyan4 Oct 16, 2024
91d0f09
Add option to recreate bm25 table
Ngalstyan4 Oct 16, 2024
25a6d5f
Improve error messages when set_user_stopwords fails to write to file
Ngalstyan4 Oct 16, 2024
07caa49
Add search_bm25 API
Ngalstyan4 Oct 17, 2024
f510be4
Make bm25 aux table logged
Ngalstyan4 Oct 22, 2024
bcf592c
Store table stats in _bm25 table to avoid expensive aggregates at que…
Ngalstyan4 Oct 28, 2024
865d47e
Make content_join optional in search
Ngalstyan4 Oct 28, 2024
54c0028
Fix lifetime issues in tests with pgrx 0.12
Ngalstyan4 Oct 29, 2024
18e0338
Add bm25 approximation threshold guc
Ngalstyan4 Nov 6, 2024
ae23df4
Add bm25 hyperparameter GUCs
Ngalstyan4 Nov 6, 2024
c014921
fix syntax error
Ngalstyan4 Nov 15, 2024
28d6a52
Add bm25_score function and tests
Ngalstyan4 Nov 6, 2024
de92e0d
Add bm25 aux table invariant tests
Ngalstyan4 Nov 6, 2024
a692e88
Fix typo
Ngalstyan4 Nov 15, 2024
ee1cb43
Add lantern_extras mod to make sure corresponding SQL schema is gener…
Ngalstyan4 Nov 15, 2024
c943719
Fix tsearch permission
Ngalstyan4 Nov 15, 2024
a0a7c42
Change write permissions on tsearch directory
Ngalstyan4 Nov 15, 2024
e918320
Quote table name identifiers in bm25 queries
Ngalstyan4 Nov 15, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
File renamed without changes.
3 changes: 3 additions & 0 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ jobs:
run: |
sudo chmod 777 -R "/usr/lib/postgresql/${PG_VERSION}/lib/"
sudo chmod 777 -R "/usr/share/postgresql/${PG_VERSION}/extension/"
# make sure the rust function exposing custom dict API can change the dictionary files
sudo chown -R postgres:postgres /usr/share/postgresql/${PG_VERSION}/tsearch_data/
sudo chmod 777 -R /usr/share/postgresql/${PG_VERSION}/tsearch_data/
env:
PG_VERSION: ${{ matrix.postgres }}
- name: Run tests
Expand Down
5 changes: 5 additions & 0 deletions lantern_extras/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ rand = "0.8.5"
serde_json = "1.0.132"
tokio-util = "0.7.12"
tokio = { version = "1.41.0", features = ["rt-multi-thread"] }
serde = { version = "1.0", features = ["derive"] }
serde_bytes = "0.11.3"
binary-heap-plus = "0.5.0"
fastbloom = "0.7.1"
rust-stemmers = { git = "https://github.com/Ngalstyan4/rust-stemmers.git", branch = "narek/drop-unused-dependency" }

[dev-dependencies]
pgrx-tests = "=0.12.7"
Expand Down
90 changes: 90 additions & 0 deletions lantern_extras/src/bloom.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
use fastbloom::BloomFilter;
use pgrx::prelude::*;
use pgrx::PostgresType;
use serde::{Deserialize, Serialize};

// Fixed hasher seed used both when a filter is built from an array and when a
// filter is reconstructed from a stored `Bloom`, so that both sides are
// configured with the same seed.
const BLOOM_HASHER_SEED: u128 = 42;

// Serializable representation of a `fastbloom::BloomFilter`, exposed as a
// Postgres type through the `PostgresType` derive.
//
// this is called "Bloom" to make sure postgres type has the name 'bloom'
#[derive(Clone, Debug, Serialize, Deserialize, PostgresType)]
pub struct Bloom {
    // Raw bytes of the filter's underlying `u64` bit vector (8 bytes per word);
    // (de)serialized as a byte string via `serde_bytes`.
    #[serde(with = "serde_bytes")]
    bitmap: Vec<u8>,
    // Number of hashes the originating filter reported via `num_hashes()`;
    // passed back to the builder when the filter is reconstructed.
    num_hashes: u32,
}

impl From<BloomFilter> for Bloom {
    /// Converts a live `BloomFilter` into its serializable `Bloom` form.
    ///
    /// The filter's `u64` words are copied into a byte vector in native byte
    /// order, which yields exactly the bytes a raw reinterpretation of the
    /// word buffer would expose, so the stored format is unchanged.
    fn from(bloom_filter: BloomFilter) -> Self {
        // Copy instead of re-labelling the allocation: a `Vec<u8>` must be
        // freed with the same alignment it was allocated with, and a buffer
        // allocated for `u64` does not satisfy that for `u8`.
        let bitmap: Vec<u8> = bloom_filter
            .as_slice()
            .iter()
            .flat_map(|word| word.to_ne_bytes())
            .collect();
        Bloom {
            bitmap,
            num_hashes: bloom_filter.num_hashes(),
        }
    }
}

impl From<Bloom> for BloomFilter {
    /// Rebuilds a `BloomFilter` from its serialized `Bloom` form.
    ///
    /// Every complete group of 8 bytes in `bitmap` is decoded as one
    /// native-endian `u64` word; any trailing bytes that do not fill a whole
    /// word are ignored. The filter is configured with the file-wide hasher
    /// seed and the stored number of hashes.
    #[inline(never)]
    fn from(bloom: Bloom) -> Self {
        // Decode into a freshly allocated `Vec<u64>` rather than casting the
        // byte buffer's pointer: a `Vec<u8>` allocation carries no 8-byte
        // alignment guarantee and its capacity need not be a multiple of 8.
        let words: Vec<u64> = bloom
            .bitmap
            .chunks_exact(8)
            .map(|chunk| {
                let mut word = [0u8; 8];
                word.copy_from_slice(chunk);
                u64::from_ne_bytes(word)
            })
            .collect();
        BloomFilter::from_vec(words)
            .seed(&BLOOM_HASHER_SEED)
            .hashes(bloom.num_hashes)
    }
}

fn array_to_bloom<T: std::hash::Hash>(arr: Vec<T>) -> Bloom {
let mut bloom = BloomFilter::with_false_pos(0.01)
.seed(&BLOOM_HASHER_SEED)
.expected_items(arr.len());
for i in arr {
bloom.insert(&i);
}
return bloom.into();
}

// `array_to_bloom` overload for 16-bit integer arrays.
#[pg_extern(immutable, parallel_safe, name = "array_to_bloom")]
fn array_to_bloom_smallint(arr: Vec<i16>) -> Bloom {
    array_to_bloom::<i16>(arr)
}

// `array_to_bloom` overload for 32-bit integer arrays.
#[pg_extern(immutable, parallel_safe, name = "array_to_bloom")]
fn array_to_bloom_integer(arr: Vec<i32>) -> Bloom {
    array_to_bloom::<i32>(arr)
}

// `array_to_bloom` overload for 64-bit integer arrays.
#[pg_extern(immutable, parallel_safe, name = "array_to_bloom")]
fn array_to_bloom_bigint(arr: Vec<i64>) -> Bloom {
    array_to_bloom::<i64>(arr)
}

#[pg_extern(requires = [Bloom])]
fn elem_in_bloom(elem: i32, bloom: Bloom) -> bool {
let bloom: BloomFilter = bloom.into();
bloom.contains(&elem)
}

// Registers SQL casts from smallint[], integer[] and bigint[] to the `bloom`
// type, each backed by the matching `array_to_bloom` overload.
// NOTE(review): `requires` presumably makes pgrx emit this SQL after the type
// and the three functions it references — confirm against pgrx docs.
extension_sql!(
    r#"
CREATE CAST (smallint[] AS bloom) WITH FUNCTION array_to_bloom(smallint[]);
CREATE CAST (integer[] AS bloom) WITH FUNCTION array_to_bloom(integer[]);
CREATE CAST (bigint[] AS bloom) WITH FUNCTION array_to_bloom(bigint[]);
"#,
    name = "bloom_type_casts",
    requires = [
        Bloom,
        array_to_bloom_smallint,
        array_to_bloom_integer,
        array_to_bloom_bigint,
    ]
);
Loading
Loading