Removed support for L1 distance and Jaccard distance from ivfflat due to non-optimal clustering

This commit is contained in:
Andrew Kane
2024-04-22 14:11:29 -07:00
parent 881fbc15ef
commit b2f7dad8a7
15 changed files with 13 additions and 170 deletions

View File

@@ -3,7 +3,7 @@
- Added `halfvec` type
- Added `sparsevec` type
- Added support for indexing `bit` type
- Added support for indexing L1 distance
- Added support for indexing L1 distance with HNSW
- Added `binary_quantize` function
- Added `hamming_distance` function
- Added `jaccard_distance` function

View File

@@ -356,24 +356,12 @@ Cosine distance
CREATE INDEX ON items USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
```
L1 distance - unreleased
```sql
CREATE INDEX ON items USING ivfflat (embedding vector_l1_ops) WITH (lists = 100);
```
Hamming distance - unreleased
```sql
CREATE INDEX ON items USING ivfflat (embedding bit_hamming_ops) WITH (lists = 100);
```
Jaccard distance - unreleased
```sql
CREATE INDEX ON items USING ivfflat (embedding bit_jaccard_ops) WITH (lists = 100);
```
Supported types are:
- `vector` - up to 2,000 dimensions

View File

@@ -22,12 +22,6 @@ CREATE OPERATOR || (
LEFTARG = vector, RIGHTARG = vector, PROCEDURE = vector_concat
);
CREATE OPERATOR CLASS vector_l1_ops
FOR TYPE vector USING ivfflat AS
OPERATOR 1 <+> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 l1_distance(vector, vector),
FUNCTION 3 l1_distance(vector, vector);
CREATE OPERATOR CLASS vector_l1_ops
FOR TYPE vector USING hnsw AS
OPERATOR 1 <+> (vector, vector) FOR ORDER BY float_ops,
@@ -55,12 +49,6 @@ CREATE OPERATOR CLASS bit_hamming_ops
FUNCTION 1 hamming_distance(bit, bit),
FUNCTION 3 hamming_distance(bit, bit);
CREATE OPERATOR CLASS bit_jaccard_ops
FOR TYPE bit USING ivfflat AS
OPERATOR 1 <%> (bit, bit) FOR ORDER BY float_ops,
FUNCTION 1 jaccard_distance(bit, bit),
FUNCTION 3 jaccard_distance(bit, bit);
CREATE OPERATOR CLASS bit_hamming_ops
FOR TYPE bit USING hnsw AS
OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
@@ -340,12 +328,6 @@ CREATE OPERATOR CLASS halfvec_cosine_ops
FUNCTION 3 halfvec_spherical_distance(halfvec, halfvec),
FUNCTION 4 l2_norm(halfvec);
CREATE OPERATOR CLASS halfvec_l1_ops
FOR TYPE halfvec USING ivfflat AS
OPERATOR 1 <+> (halfvec, halfvec) FOR ORDER BY float_ops,
FUNCTION 1 l1_distance(halfvec, halfvec),
FUNCTION 3 l1_distance(halfvec, halfvec);
CREATE OPERATOR CLASS halfvec_l2_ops
FOR TYPE halfvec USING hnsw AS
OPERATOR 1 <-> (halfvec, halfvec) FOR ORDER BY float_ops,

View File

@@ -293,12 +293,6 @@ CREATE OPERATOR CLASS vector_cosine_ops
FUNCTION 3 vector_spherical_distance(vector, vector),
FUNCTION 4 vector_norm(vector);
CREATE OPERATOR CLASS vector_l1_ops
FOR TYPE vector USING ivfflat AS
OPERATOR 1 <+> (vector, vector) FOR ORDER BY float_ops,
FUNCTION 1 l1_distance(vector, vector),
FUNCTION 3 l1_distance(vector, vector);
CREATE OPERATOR CLASS vector_l2_ops
FOR TYPE vector USING hnsw AS
OPERATOR 1 <-> (vector, vector) FOR ORDER BY float_ops,
@@ -348,12 +342,6 @@ CREATE OPERATOR CLASS bit_hamming_ops
FUNCTION 1 hamming_distance(bit, bit),
FUNCTION 3 hamming_distance(bit, bit);
CREATE OPERATOR CLASS bit_jaccard_ops
FOR TYPE bit USING ivfflat AS
OPERATOR 1 <%> (bit, bit) FOR ORDER BY float_ops,
FUNCTION 1 jaccard_distance(bit, bit),
FUNCTION 3 jaccard_distance(bit, bit);
CREATE OPERATOR CLASS bit_hamming_ops
FOR TYPE bit USING hnsw AS
OPERATOR 1 <~> (bit, bit) FOR ORDER BY float_ops,
@@ -649,12 +637,6 @@ CREATE OPERATOR CLASS halfvec_cosine_ops
FUNCTION 3 halfvec_spherical_distance(halfvec, halfvec),
FUNCTION 4 l2_norm(halfvec);
CREATE OPERATOR CLASS halfvec_l1_ops
FOR TYPE halfvec USING ivfflat AS
OPERATOR 1 <+> (halfvec, halfvec) FOR ORDER BY float_ops,
FUNCTION 1 l1_distance(halfvec, halfvec),
FUNCTION 3 l1_distance(halfvec, halfvec);
CREATE OPERATOR CLASS halfvec_l2_ops
FOR TYPE halfvec USING hnsw AS
OPERATOR 1 <-> (halfvec, halfvec) FOR ORDER BY float_ops,

View File

@@ -1,21 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val bit(4));
INSERT INTO t (val) VALUES (B'0000'), (B'1100'), (B'1111'), (NULL);
CREATE INDEX ON t USING ivfflat (val bit_jaccard_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES (B'1110');
SELECT * FROM t ORDER BY val <%> B'1111';
val
------
1111
1110
1100
0000
(4 rows)
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <%> (SELECT NULL::bit)) t2;
count
-------
4
(1 row)
DROP TABLE t;

View File

@@ -1,21 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val halfvec(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING ivfflat (val halfvec_l1_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <+> '[3,3,3]';
val
---------
[1,2,3]
[1,2,4]
[1,1,1]
[0,0,0]
(4 rows)
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <+> (SELECT NULL::halfvec)) t2;
count
-------
4
(1 row)
DROP TABLE t;

View File

@@ -1,21 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING ivfflat (val vector_l1_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <+> '[3,3,3]';
val
---------
[1,2,3]
[1,2,4]
[1,1,1]
[0,0,0]
(4 rows)
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <+> (SELECT NULL::vector)) t2;
count
-------
4
(1 row)
DROP TABLE t;

View File

@@ -1,12 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val bit(4));
INSERT INTO t (val) VALUES (B'0000'), (B'1100'), (B'1111'), (NULL);
CREATE INDEX ON t USING ivfflat (val bit_jaccard_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES (B'1110');
SELECT * FROM t ORDER BY val <%> B'1111';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <%> (SELECT NULL::bit)) t2;
DROP TABLE t;

View File

@@ -1,12 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val halfvec(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING ivfflat (val halfvec_l1_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <+> '[3,3,3]';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <+> (SELECT NULL::halfvec)) t2;
DROP TABLE t;

View File

@@ -1,12 +0,0 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING ivfflat (val vector_l1_ops) WITH (lists = 1);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <+> '[3,3,3]';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <+> (SELECT NULL::vector)) t2;
DROP TABLE t;

View File

@@ -70,8 +70,8 @@ for (1 .. 20)
}
# Check each index type
my @operators = ("<->", "<#>", "<=>", "<+>");
my @opclasses = ("vector_l2_ops", "vector_ip_ops", "vector_cosine_ops", "vector_l1_ops");
my @operators = ("<->", "<#>", "<=>");
my @opclasses = ("vector_l2_ops", "vector_ip_ops", "vector_cosine_ops");
for my $i (0 .. $#operators)
{

View File

@@ -17,8 +17,8 @@ $node->safe_psql("postgres",
);
# Check each index type
my @operators = ("<->", "<#>", "<=>", "<+>");
my @opclasses = ("vector_l2_ops", "vector_ip_ops", "vector_cosine_ops", "vector_l1_ops");
my @operators = ("<->", "<#>", "<=>");
my @opclasses = ("vector_l2_ops", "vector_ip_ops", "vector_cosine_ops");
for my $i (0 .. $#operators)
{

View File

@@ -66,8 +66,8 @@ for (1 .. 20)
}
# Check each index type
my @operators = ("<->", "<#>", "<=>", "<+>");
my @opclasses = ("vector_l2_ops", "vector_ip_ops", "vector_cosine_ops", "vector_l1_ops");
my @operators = ("<->", "<#>", "<=>");
my @opclasses = ("vector_l2_ops", "vector_ip_ops", "vector_cosine_ops");
for my $i (0 .. $#operators)
{

View File

@@ -74,8 +74,8 @@ for (1 .. 20)
}
# Check each index type
my @operators = ("<->", "<#>", "<=>", "<+>");
my @opclasses = ("halfvec_l2_ops", "halfvec_ip_ops", "halfvec_cosine_ops", "halfvec_l1_ops");
my @operators = ("<->", "<#>", "<=>");
my @opclasses = ("halfvec_l2_ops", "halfvec_ip_ops", "halfvec_cosine_ops");
for my $i (0 .. $#operators)
{
@@ -102,12 +102,7 @@ for my $i (0 .. $#operators)
));
# Test approximate results
if ($operator eq "<+>")
{
test_recall(1, 0.30, $operator);
test_recall(10, 0.90, $operator);
}
elsif ($operator ne "<#>")
if ($operator ne "<#>")
{
# TODO Fix test (uniform random vectors all have similar inner product)
test_recall(1, 0.34, $operator);
@@ -136,12 +131,7 @@ for my $i (0 .. $#operators)
like($stderr, qr/using \d+ parallel workers/);
# Test approximate results
if ($operator eq "<+>")
{
test_recall(1, 0.30, $operator);
test_recall(10, 0.90, $operator);
}
elsif ($operator ne "<#>")
if ($operator ne "<#>")
{
# TODO Fix test (uniform random vectors all have similar inner product)
test_recall(1, 0.34, $operator);

View File

@@ -70,8 +70,8 @@ for (1 .. 20)
}
# Check each index type
my @operators = ("<~>", "<\%>");
my @opclasses = ("bit_hamming_ops", "bit_jaccard_ops");
my @operators = ("<~>");
my @opclasses = ("bit_hamming_ops");
for my $i (0 .. $#operators)
{