Added jaccard_distance function

This commit is contained in:
Andrew Kane
2024-03-25 22:35:53 -07:00
parent e7a7936bb2
commit 791fc2436f
7 changed files with 76 additions and 0 deletions

View File

@@ -2,6 +2,7 @@
- Added support for binary vectors to HNSW
- Added `hamming_distance` function
- Added `jaccard_distance` function
- Added `quantize_binary` function
## 0.6.2 (2024-03-18)

View File

@@ -728,6 +728,7 @@ Operator | Description | Added
Function | Description | Added
--- | --- | ---
hamming_distance(bit, bit) → double precision | Hamming distance | 0.7.0
jaccard_distance(bit, bit) → double precision | Jaccard distance | 0.7.0
## Installation Notes - Linux and Mac

View File

@@ -7,6 +7,9 @@ CREATE FUNCTION quantize_binary(vector) RETURNS bit
-- Hamming distance between two bit strings.
-- Implemented in C inside the extension's shared library; STRICT means a
-- NULL argument yields NULL without the C code being called.
CREATE FUNCTION hamming_distance(bit, bit)
	RETURNS float8
	LANGUAGE C
	IMMUTABLE STRICT PARALLEL SAFE
	AS 'MODULE_PATHNAME';
-- Jaccard distance between two bit strings.
-- Implemented in C inside the extension's shared library; STRICT means a
-- NULL argument yields NULL without the C code being called.
CREATE FUNCTION jaccard_distance(bit, bit)
	RETURNS float8
	LANGUAGE C
	IMMUTABLE STRICT PARALLEL SAFE
	AS 'MODULE_PATHNAME';
CREATE OPERATOR <~> (
LEFTARG = bit, RIGHTARG = bit, PROCEDURE = hamming_distance,
COMMUTATOR = '<~>'

View File

@@ -296,6 +296,9 @@ CREATE OPERATOR CLASS vector_cosine_ops
-- Hamming distance between two bit strings.
-- Implemented in C inside the extension's shared library; STRICT means a
-- NULL argument yields NULL without the C code being called.
CREATE FUNCTION hamming_distance(bit, bit)
	RETURNS float8
	LANGUAGE C
	IMMUTABLE STRICT PARALLEL SAFE
	AS 'MODULE_PATHNAME';
-- Jaccard distance between two bit strings.
-- Implemented in C inside the extension's shared library; STRICT means a
-- NULL argument yields NULL without the C code being called.
CREATE FUNCTION jaccard_distance(bit, bit)
	RETURNS float8
	LANGUAGE C
	IMMUTABLE STRICT PARALLEL SAFE
	AS 'MODULE_PATHNAME';
CREATE OPERATOR <~> (
LEFTARG = bit, RIGHTARG = bit, PROCEDURE = hamming_distance,
COMMUTATOR = '<~>'

View File

@@ -58,3 +58,32 @@ hamming_distance(PG_FUNCTION_ARGS)
PG_RETURN_FLOAT8((double) distance);
}
/*
 * Jaccard distance between two bit strings:
 *
 *     1 - popcount(a AND b) / popcount(a OR b)
 *
 * popcount(a OR b) is derived as popcount(a) + popcount(b) - popcount(a AND b).
 * Both arguments are passed through CheckBitLengths before any bits are read.
 * When the inputs have no set bit in common (which includes two all-zero
 * strings, where the ratio would be 0/0) the result is defined as 1.
 */
PGDLLEXPORT PG_FUNCTION_INFO_V1(jaccard_distance);
Datum
jaccard_distance(PG_FUNCTION_ARGS)
{
	VarBit	   *a = PG_GETARG_VARBIT_P(0);
	VarBit	   *b = PG_GETARG_VARBIT_P(1);
	unsigned char *a_bytes = VARBITS(a);
	unsigned char *b_bytes = VARBITS(b);
	uint64		shared = 0;		/* bits set in both a and b */
	uint64		either;			/* bits set in a, b, or both */

	CheckBitLengths(VARBITLEN(a), VARBITLEN(b));

	/* TODO Improve performance */

	/* Count the intersection one byte at a time using the lookup table */
	for (uint32 i = 0; i < VARBITBYTES(a); i++)
	{
		unsigned char common = a_bytes[i] & b_bytes[i];

		shared += pg_number_of_ones[common];
	}

	/* Nothing in common: maximal distance, and no division by zero below */
	if (shared == 0)
		PG_RETURN_FLOAT8(1);

	/* Inclusion-exclusion gives the size of the union */
	either = pg_popcount((char *) a_bytes, VARBITBYTES(a));
	either += pg_popcount((char *) b_bytes, VARBITBYTES(b));
	either -= shared;

	PG_RETURN_FLOAT8(1 - (shared / ((double) either)));
}

View File

@@ -234,6 +234,38 @@ SELECT hamming_distance(B'111', B'000');
SELECT hamming_distance(B'111', B'00');
ERROR: different bit lengths 3 and 2
SELECT jaccard_distance(B'1111', B'1111');
jaccard_distance
------------------
0
(1 row)
SELECT jaccard_distance(B'1111', B'1110');
jaccard_distance
------------------
0.25
(1 row)
SELECT jaccard_distance(B'1111', B'1100');
jaccard_distance
------------------
0.5
(1 row)
SELECT jaccard_distance(B'1111', B'1000');
jaccard_distance
------------------
0.75
(1 row)
SELECT jaccard_distance(B'1111', B'0000');
jaccard_distance
------------------
1
(1 row)
SELECT jaccard_distance(B'1111', B'000');
ERROR: different bit lengths 4 and 3
SELECT quantize_binary('[1,0,-1]');
quantize_binary
-----------------

View File

@@ -54,6 +54,13 @@ SELECT hamming_distance(B'111', B'100');
SELECT hamming_distance(B'111', B'000');
SELECT hamming_distance(B'111', B'00');
SELECT jaccard_distance(B'1111', B'1111');
SELECT jaccard_distance(B'1111', B'1110');
SELECT jaccard_distance(B'1111', B'1100');
SELECT jaccard_distance(B'1111', B'1000');
SELECT jaccard_distance(B'1111', B'0000');
SELECT jaccard_distance(B'1111', B'000');
SELECT quantize_binary('[1,0,-1]');
SELECT quantize_binary('[0,0.1,-0.2,-0.3,0.4,0.5,0.6,-0.7,0.8,-0.9,1]');