From 02c4f4884cca01c90e20286254905d27e042e9a7 Mon Sep 17 00:00:00 2001
From: Andrew Kane
Date: Mon, 25 Mar 2024 22:44:51 -0700
Subject: [PATCH] Added support for indexing Jaccard distance

---
Reviewer notes (free text in this position is not part of the commit message
and is skipped when the patch is applied):

* Adds the <%> operator on (bit, bit), implemented by jaccard_distance and
  declared as its own commutator, plus the bit_jaccard_ops operator class for
  the hnsw access method. Both are added identically to the fresh-install
  script (sql/vector.sql) and the 0.6.2 -> 0.7.0 upgrade script.
* Documents <%> in the README operator table and adds a regression test that
  builds an hnsw index with bit_jaccard_ops, inserts a row after index
  creation, and checks ORDER BY val <%> ... with a literal and a NULL operand.
* NOTE(review): jaccard_distance(bit, bit) is referenced but not created by
  this patch; presumably it is already defined earlier in both scripts.
  Confirm before applying.
* NOTE(review): line structure and whitespace were restored from a collapsed
  copy. Tab indentation of the SQL continuation lines and the trailing padding
  of the psql header cells in hnsw_jaccard.out are assumptions; verify them
  against the repository.

 README.md                      |  1 +
 sql/vector--0.6.2--0.7.0.sql   | 10 ++++++++++
 sql/vector.sql                 | 10 ++++++++++
 test/expected/hnsw_jaccard.out | 21 +++++++++++++++++++++
 test/sql/hnsw_jaccard.sql      | 12 ++++++++++++
 5 files changed, 54 insertions(+)
 create mode 100644 test/expected/hnsw_jaccard.out
 create mode 100644 test/sql/hnsw_jaccard.sql

diff --git a/README.md b/README.md
index 1364297..84dc7be 100644
--- a/README.md
+++ b/README.md
@@ -722,6 +722,7 @@ sum(vector) → vector | sum | 0.5.0
 Operator | Description | Added
 --- | --- | ---
 <~> | Hamming distance | 0.7.0
+<%> | Jaccard distance | 0.7.0
 
 ### Bit Functions
 
diff --git a/sql/vector--0.6.2--0.7.0.sql b/sql/vector--0.6.2--0.7.0.sql
index ba4504d..68409d3 100644
--- a/sql/vector--0.6.2--0.7.0.sql
+++ b/sql/vector--0.6.2--0.7.0.sql
@@ -15,7 +15,17 @@ CREATE OPERATOR <~> (
 	COMMUTATOR = '<~>'
 );
 
+CREATE OPERATOR <%> (
+	LEFTARG = bit, RIGHTARG = bit, PROCEDURE = jaccard_distance,
+	COMMUTATOR = '<%>'
+);
+
 CREATE OPERATOR CLASS bit_hamming_ops
 	FOR TYPE bit USING hnsw AS
 	OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
 	FUNCTION 1 hamming_distance(bit, bit);
+
+CREATE OPERATOR CLASS bit_jaccard_ops
+	FOR TYPE bit USING hnsw AS
+	OPERATOR 1 <%> (bit, bit) FOR ORDER BY float_ops,
+	FUNCTION 1 jaccard_distance(bit, bit);
diff --git a/sql/vector.sql b/sql/vector.sql
index 6edea66..fc253c6 100644
--- a/sql/vector.sql
+++ b/sql/vector.sql
@@ -304,7 +304,17 @@ CREATE OPERATOR <~> (
 	COMMUTATOR = '<~>'
 );
 
+CREATE OPERATOR <%> (
+	LEFTARG = bit, RIGHTARG = bit, PROCEDURE = jaccard_distance,
+	COMMUTATOR = '<%>'
+);
+
 CREATE OPERATOR CLASS bit_hamming_ops
 	FOR TYPE bit USING hnsw AS
 	OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
 	FUNCTION 1 hamming_distance(bit, bit);
+
+CREATE OPERATOR CLASS bit_jaccard_ops
+	FOR TYPE bit USING hnsw AS
+	OPERATOR 1 <%> (bit, bit) FOR ORDER BY float_ops,
+	FUNCTION 1 jaccard_distance(bit, bit);
diff --git a/test/expected/hnsw_jaccard.out b/test/expected/hnsw_jaccard.out
new file mode 100644
index 0000000..6524f00
--- /dev/null
+++ b/test/expected/hnsw_jaccard.out
@@ -0,0 +1,21 @@
+SET enable_seqscan = off;
+CREATE TABLE t (val bit(4));
+INSERT INTO t (val) VALUES (B'0000'), (B'1100'), (B'1111'), (NULL);
+CREATE INDEX ON t USING hnsw (val bit_jaccard_ops);
+INSERT INTO t (val) VALUES (B'1110');
+SELECT * FROM t ORDER BY val <%> B'1111';
+ val  
+------
+ 1111
+ 1110
+ 1100
+ 0000
+(4 rows)
+
+SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <%> (SELECT NULL::bit)) t2;
+ count 
+-------
+     4
+(1 row)
+
+DROP TABLE t;
diff --git a/test/sql/hnsw_jaccard.sql b/test/sql/hnsw_jaccard.sql
new file mode 100644
index 0000000..ca61c53
--- /dev/null
+++ b/test/sql/hnsw_jaccard.sql
@@ -0,0 +1,12 @@
+SET enable_seqscan = off;
+
+CREATE TABLE t (val bit(4));
+INSERT INTO t (val) VALUES (B'0000'), (B'1100'), (B'1111'), (NULL);
+CREATE INDEX ON t USING hnsw (val bit_jaccard_ops);
+
+INSERT INTO t (val) VALUES (B'1110');
+
+SELECT * FROM t ORDER BY val <%> B'1111';
+SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <%> (SELECT NULL::bit)) t2;
+
+DROP TABLE t;