-
Notifications
You must be signed in to change notification settings - Fork 59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Integer array type conversion in array_dist to compute hamming distance #191
Changes from 1 commit
7225b4c
3dba819
5054cf9
b000930
099d476
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -291,10 +291,36 @@ static float4 array_dist(ArrayType *a, ArrayType *b, usearch_metric_kind_t metri | |
elog(ERROR, "expected equally sized arrays but got arrays with dimensions %d and %d", a_dim, b_dim); | ||
} | ||
|
||
float4 *ax = (float4 *)ARR_DATA_PTR(a); | ||
float4 *bx = (float4 *)ARR_DATA_PTR(b); | ||
float4 *ax; | ||
float4 *bx; | ||
|
||
return usearch_dist(ax, bx, metric_kind, a_dim, usearch_scalar_f32_k); | ||
bool convert_to_int = (metric_kind == usearch_metric_hamming_k); | ||
|
||
if(convert_to_int) { | ||
int32 *ax_int = (int32*) ARR_DATA_PTR(a); | ||
int32 *bx_int = (int32*) ARR_DATA_PTR(b); | ||
|
||
ax = (float4*) palloc(a_dim * sizeof(float4)); | ||
bx = (float4*) palloc(b_dim * sizeof(float4)); | ||
|
||
for (int i = 0; i < a_dim; i++) { | ||
ax[i] = (float4) ax_int[i]; | ||
bx[i] = (float4) bx_int[i]; | ||
} | ||
} | ||
else { | ||
ax = (float4*)ARR_DATA_PTR(a); | ||
bx = (float4*)ARR_DATA_PTR(b); | ||
} | ||
|
||
float4 result = usearch_dist(ax, bx, metric_kind, a_dim, usearch_scalar_f32_k); | ||
|
||
if(convert_to_int) { | ||
pfree(ax); | ||
pfree(bx); | ||
} | ||
|
||
return result; | ||
} | ||
|
||
static float8 vector_dist(Vector *a, Vector *b, usearch_metric_kind_t metric_kind) | ||
|
@@ -330,7 +356,7 @@ Datum hamming_dist(PG_FUNCTION_ARGS) | |
{ | ||
ArrayType *a = PG_GETARG_ARRAYTYPE_P(0); | ||
ArrayType *b = PG_GETARG_ARRAYTYPE_P(1); | ||
PG_RETURN_INT32(array_dist(a, b, usearch_metric_hamming_k)); | ||
PG_RETURN_INT32((int32)array_dist(a, b, usearch_metric_hamming_k)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is returning the distance between vectors and not the vectors themselves. Why should this distance be an integer? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hamming distance is always an integer and we also pass it to PG_RETURN_INT32 which returns an integer in postgres. So I added this explicit cast since |
||
} | ||
|
||
PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_l2sq_dist); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I do not think this really solves the issue. We should not be measuring hamming distance of float arrays. We just get lucky here that when we cast ints to floats, they end up having the same byte arrangement and hamming distance calculation (an inherently, bitwise operation) results in the same number it would result in if we directly passed int arrays.
the right approach is to have
usearch_dist
support integer or byte typed arrays.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. I can modify
usearch_dist
to take an integer typed array and compute hamming distance appropriately. Let me know