mirror of
https://github.com/pgvector/pgvector.git
synced 2026-07-04 11:40:57 +08:00
Added support for halfvec to IVFFlat
This commit is contained in:
26
test/expected/ivfflat_halfvec_cosine.out
Normal file
26
test/expected/ivfflat_halfvec_cosine.out
Normal file
@@ -0,0 +1,26 @@
|
||||
SET enable_seqscan = off;
|
||||
CREATE TABLE t (val halfvec(3));
|
||||
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
|
||||
CREATE INDEX ON t USING ivfflat (val halfvec_cosine_ops) WITH (lists = 1);
|
||||
INSERT INTO t (val) VALUES ('[1,2,4]');
|
||||
SELECT * FROM t ORDER BY val <=> '[3,3,3]';
|
||||
val
|
||||
---------
|
||||
[1,1,1]
|
||||
[1,2,3]
|
||||
[1,2,4]
|
||||
(3 rows)
|
||||
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2;
|
||||
count
|
||||
-------
|
||||
3
|
||||
(1 row)
|
||||
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::halfvec)) t2;
|
||||
count
|
||||
-------
|
||||
3
|
||||
(1 row)
|
||||
|
||||
DROP TABLE t;
|
||||
21
test/expected/ivfflat_halfvec_ip.out
Normal file
21
test/expected/ivfflat_halfvec_ip.out
Normal file
@@ -0,0 +1,21 @@
|
||||
SET enable_seqscan = off;
|
||||
CREATE TABLE t (val halfvec(3));
|
||||
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
|
||||
CREATE INDEX ON t USING ivfflat (val halfvec_ip_ops) WITH (lists = 1);
|
||||
INSERT INTO t (val) VALUES ('[1,2,4]');
|
||||
SELECT * FROM t ORDER BY val <#> '[3,3,3]';
|
||||
val
|
||||
---------
|
||||
[1,2,4]
|
||||
[1,2,3]
|
||||
[1,1,1]
|
||||
[0,0,0]
|
||||
(4 rows)
|
||||
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::halfvec)) t2;
|
||||
count
|
||||
-------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
DROP TABLE t;
|
||||
36
test/expected/ivfflat_halfvec_l2.out
Normal file
36
test/expected/ivfflat_halfvec_l2.out
Normal file
@@ -0,0 +1,36 @@
|
||||
SET enable_seqscan = off;
|
||||
CREATE TABLE t (val halfvec(3));
|
||||
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
|
||||
CREATE INDEX ON t USING ivfflat (val halfvec_l2_ops) WITH (lists = 1);
|
||||
INSERT INTO t (val) VALUES ('[1,2,4]');
|
||||
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
|
||||
val
|
||||
---------
|
||||
[1,2,3]
|
||||
[1,2,4]
|
||||
[1,1,1]
|
||||
[0,0,0]
|
||||
(4 rows)
|
||||
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::halfvec)) t2;
|
||||
count
|
||||
-------
|
||||
4
|
||||
(1 row)
|
||||
|
||||
SELECT COUNT(*) FROM t;
|
||||
count
|
||||
-------
|
||||
5
|
||||
(1 row)
|
||||
|
||||
TRUNCATE t;
|
||||
NOTICE: ivfflat index created with little data
|
||||
DETAIL: This will cause low recall.
|
||||
HINT: Drop the index until the table has more data.
|
||||
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
|
||||
val
|
||||
-----
|
||||
(0 rows)
|
||||
|
||||
DROP TABLE t;
|
||||
13
test/sql/ivfflat_halfvec_cosine.sql
Normal file
13
test/sql/ivfflat_halfvec_cosine.sql
Normal file
@@ -0,0 +1,13 @@
|
||||
SET enable_seqscan = off;
|
||||
|
||||
CREATE TABLE t (val halfvec(3));
|
||||
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
|
||||
CREATE INDEX ON t USING ivfflat (val halfvec_cosine_ops) WITH (lists = 1);
|
||||
|
||||
INSERT INTO t (val) VALUES ('[1,2,4]');
|
||||
|
||||
SELECT * FROM t ORDER BY val <=> '[3,3,3]';
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2;
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::halfvec)) t2;
|
||||
|
||||
DROP TABLE t;
|
||||
12
test/sql/ivfflat_halfvec_ip.sql
Normal file
12
test/sql/ivfflat_halfvec_ip.sql
Normal file
@@ -0,0 +1,12 @@
|
||||
SET enable_seqscan = off;
|
||||
|
||||
CREATE TABLE t (val halfvec(3));
|
||||
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
|
||||
CREATE INDEX ON t USING ivfflat (val halfvec_ip_ops) WITH (lists = 1);
|
||||
|
||||
INSERT INTO t (val) VALUES ('[1,2,4]');
|
||||
|
||||
SELECT * FROM t ORDER BY val <#> '[3,3,3]';
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::halfvec)) t2;
|
||||
|
||||
DROP TABLE t;
|
||||
16
test/sql/ivfflat_halfvec_l2.sql
Normal file
16
test/sql/ivfflat_halfvec_l2.sql
Normal file
@@ -0,0 +1,16 @@
|
||||
SET enable_seqscan = off;
|
||||
|
||||
CREATE TABLE t (val halfvec(3));
|
||||
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
|
||||
CREATE INDEX ON t USING ivfflat (val halfvec_l2_ops) WITH (lists = 1);
|
||||
|
||||
INSERT INTO t (val) VALUES ('[1,2,4]');
|
||||
|
||||
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
|
||||
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::halfvec)) t2;
|
||||
SELECT COUNT(*) FROM t;
|
||||
|
||||
TRUNCATE t;
|
||||
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
|
||||
|
||||
DROP TABLE t;
|
||||
132
test/t/032_ivfflat_halfvec_build_recall.pl
Normal file
132
test/t/032_ivfflat_halfvec_build_recall.pl
Normal file
@@ -0,0 +1,132 @@
|
||||
use strict;
|
||||
use warnings;
|
||||
use PostgresNode;
|
||||
use TestLib;
|
||||
use Test::More;
|
||||
|
||||
my $node;
|
||||
my @queries = ();
|
||||
my @expected;
|
||||
my $limit = 20;
|
||||
my $dim = 10;
|
||||
my $array_sql = join(",", ('random()') x $dim);
|
||||
|
||||
sub test_recall
|
||||
{
|
||||
my ($probes, $min, $operator) = @_;
|
||||
my $correct = 0;
|
||||
my $total = 0;
|
||||
|
||||
my $explain = $node->safe_psql("postgres", qq(
|
||||
SET enable_seqscan = off;
|
||||
SET ivfflat.probes = $probes;
|
||||
EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v $operator '$queries[0]' LIMIT $limit;
|
||||
));
|
||||
like($explain, qr/Index Scan using idx on tst/);
|
||||
|
||||
for my $i (0 .. $#queries)
|
||||
{
|
||||
my $actual = $node->safe_psql("postgres", qq(
|
||||
SET enable_seqscan = off;
|
||||
SET ivfflat.probes = $probes;
|
||||
SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit;
|
||||
));
|
||||
my @actual_ids = split("\n", $actual);
|
||||
my %actual_set = map { $_ => 1 } @actual_ids;
|
||||
|
||||
my @expected_ids = split("\n", $expected[$i]);
|
||||
|
||||
foreach (@expected_ids)
|
||||
{
|
||||
if (exists($actual_set{$_}))
|
||||
{
|
||||
$correct++;
|
||||
}
|
||||
$total++;
|
||||
}
|
||||
}
|
||||
|
||||
cmp_ok($correct / $total, ">=", $min, $operator);
|
||||
}
|
||||
|
||||
# Initialize node
|
||||
$node = get_new_node('node');
|
||||
$node->init;
|
||||
$node->start;
|
||||
|
||||
# Create table
|
||||
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
|
||||
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v halfvec($dim));");
|
||||
$node->safe_psql("postgres",
|
||||
"INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 100000) i;"
|
||||
);
|
||||
|
||||
# Generate queries
|
||||
for (1 .. 20)
|
||||
{
|
||||
my @r = ();
|
||||
for (1 .. $dim)
|
||||
{
|
||||
push(@r, rand());
|
||||
}
|
||||
push(@queries, "[" . join(",", @r) . "]");
|
||||
}
|
||||
|
||||
# Check each index type
|
||||
my @operators = ("<->", "<#>", "<=>");
|
||||
my @opclasses = ("halfvec_l2_ops", "halfvec_ip_ops", "halfvec_cosine_ops");
|
||||
|
||||
for my $i (0 .. $#operators)
|
||||
{
|
||||
my $operator = $operators[$i];
|
||||
my $opclass = $opclasses[$i];
|
||||
|
||||
# Get exact results
|
||||
@expected = ();
|
||||
foreach (@queries)
|
||||
{
|
||||
my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v $operator '$_' LIMIT $limit;");
|
||||
push(@expected, $res);
|
||||
}
|
||||
|
||||
# Build index serially
|
||||
$node->safe_psql("postgres", qq(
|
||||
SET max_parallel_maintenance_workers = 0;
|
||||
CREATE INDEX idx ON tst USING ivfflat (v $opclass);
|
||||
));
|
||||
|
||||
# Test approximate results
|
||||
if ($operator ne "<#>")
|
||||
{
|
||||
# TODO Fix test (uniform random vectors all have similar inner product)
|
||||
test_recall(1, 0.4, $operator);
|
||||
test_recall(10, 0.95, $operator);
|
||||
}
|
||||
# Account for equal distances
|
||||
test_recall(100, 0.9925, $operator);
|
||||
|
||||
$node->safe_psql("postgres", "DROP INDEX idx;");
|
||||
|
||||
# Build index in parallel
|
||||
my ($ret, $stdout, $stderr) = $node->psql("postgres", qq(
|
||||
SET client_min_messages = DEBUG;
|
||||
SET min_parallel_table_scan_size = 1;
|
||||
CREATE INDEX idx ON tst USING ivfflat (v $opclass);
|
||||
));
|
||||
is($ret, 0, $stderr);
|
||||
like($stderr, qr/using \d+ parallel workers/);
|
||||
|
||||
# Test approximate results
|
||||
if ($operator ne "<#>")
|
||||
{
|
||||
# TODO Fix test (uniform random vectors all have similar inner product)
|
||||
test_recall(1, 0.4, $operator);
|
||||
test_recall(10, 0.95, $operator);
|
||||
}
|
||||
# Account for equal distances
|
||||
test_recall(100, 0.9925, $operator);
|
||||
|
||||
$node->safe_psql("postgres", "DROP INDEX idx;");
|
||||
}
|
||||
|
||||
done_testing();
|
||||
Reference in New Issue
Block a user