Added HNSW index type - #181

This commit is contained in:
Andrew Kane
2023-08-08 16:42:47 -07:00
parent 19a6c81367
commit 51d292c93d
29 changed files with 3927 additions and 7 deletions

View File

@@ -0,0 +1,26 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_cosine_ops);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <=> '[3,3,3]';
val
---------
[1,1,1]
[1,2,3]
[1,2,4]
(3 rows)
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2;
count
-------
3
(1 row)
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::vector)) t2;
count
-------
3
(1 row)
DROP TABLE t;

21
test/expected/hnsw_ip.out Normal file
View File

@@ -0,0 +1,21 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_ip_ops);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <#> '[3,3,3]';
val
---------
[1,2,4]
[1,2,3]
[1,1,1]
[0,0,0]
(4 rows)
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::vector)) t2;
count
-------
4
(1 row)
DROP TABLE t;

30
test/expected/hnsw_l2.out Normal file
View File

@@ -0,0 +1,30 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_l2_ops);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
val
---------
[1,2,3]
[1,2,4]
[1,1,1]
[0,0,0]
(4 rows)
SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector);
val
---------
[0,0,0]
[1,1,1]
[1,2,3]
[1,2,4]
(4 rows)
SELECT COUNT(*) FROM t;
count
-------
5
(1 row)
DROP TABLE t;

View File

@@ -0,0 +1,25 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (m = 3);
ERROR: value 3 out of bounds for option "m"
DETAIL: Valid values are between "4" and "100".
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (m = 101);
ERROR: value 101 out of bounds for option "m"
DETAIL: Valid values are between "4" and "100".
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (ef_construction = 9);
ERROR: value 9 out of bounds for option "ef_construction"
DETAIL: Valid values are between "10" and "1000".
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (ef_construction = 1001);
ERROR: value 1001 out of bounds for option "ef_construction"
DETAIL: Valid values are between "10" and "1000".
SHOW hnsw.ef_search;
hnsw.ef_search
----------------
40
(1 row)
SET hnsw.ef_search = 9;
ERROR: 9 is outside the valid range for parameter "hnsw.ef_search" (10 .. 1000)
SET hnsw.ef_search = 1001;
ERROR: 1001 is outside the valid range for parameter "hnsw.ef_search" (10 .. 1000)
DROP TABLE t;

View File

@@ -0,0 +1,13 @@
SET enable_seqscan = off;
CREATE UNLOGGED TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_l2_ops);
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
val
---------
[1,2,3]
[1,1,1]
[0,0,0]
(3 rows)
DROP TABLE t;

13
test/sql/hnsw_cosine.sql Normal file
View File

@@ -0,0 +1,13 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_cosine_ops);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <=> '[3,3,3]';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2;
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::vector)) t2;
DROP TABLE t;

12
test/sql/hnsw_ip.sql Normal file
View File

@@ -0,0 +1,12 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_ip_ops);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <#> '[3,3,3]';
SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::vector)) t2;
DROP TABLE t;

13
test/sql/hnsw_l2.sql Normal file
View File

@@ -0,0 +1,13 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_l2_ops);
INSERT INTO t (val) VALUES ('[1,2,4]');
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
SELECT * FROM t ORDER BY val <-> (SELECT NULL::vector);
SELECT COUNT(*) FROM t;
DROP TABLE t;

14
test/sql/hnsw_options.sql Normal file
View File

@@ -0,0 +1,14 @@
SET enable_seqscan = off;
CREATE TABLE t (val vector(3));
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (m = 3);
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (m = 101);
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (ef_construction = 9);
CREATE INDEX ON t USING hnsw (val vector_l2_ops) WITH (ef_construction = 1001);
SHOW hnsw.ef_search;
SET hnsw.ef_search = 9;
SET hnsw.ef_search = 1001;
DROP TABLE t;

View File

@@ -0,0 +1,9 @@
SET enable_seqscan = off;
CREATE UNLOGGED TABLE t (val vector(3));
INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL);
CREATE INDEX ON t USING hnsw (val vector_l2_ops);
SELECT * FROM t ORDER BY val <-> '[3,3,3]';
DROP TABLE t;

99
test/t/010_hnsw_wal.pl Normal file
View File

@@ -0,0 +1,99 @@
# Based on postgres/contrib/bloom/t/001_wal.pl
# Test that generic xlog records work for hnsw index replication.
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More;
# Number of dimensions of the test vectors; also decides which extra server
# settings are appended to postgresql.conf further down.
my $dim = 32;
# Node handles are file-scoped so that test_index_replay can query both servers.
my $node_primary;
my $node_replica;
# Run a nearest-neighbour query on both the primary and the replica and check
# that the two servers return the same result.
#
# $test_name - label used as the prefix of the reported test description.
#
# Reads the file-level $node_primary, $node_replica and $dim.
# Dies if poll_query_until gives up before the replica has caught up.
sub test_index_replay
{
    my ($test_name) = @_;

    # Wait for replica to catch up
    my $applname = $node_replica->name;
    my $caughtup_query =
      "SELECT pg_current_wal_lsn() <= replay_lsn FROM pg_stat_replication WHERE application_name = '$applname';";
    $node_primary->poll_query_until('postgres', $caughtup_query)
      or die "Timed out while waiting for replica 1 to catch up";

    # Fresh random query vector with $dim components for every call
    my $sql = join(",", map { rand() } 1 .. $dim);

    my $queries = qq(
SET enable_seqscan = off;
SELECT * FROM tst ORDER BY v <-> '[$sql]' LIMIT 10;
);

    # Run test queries and compare their result
    my $primary_result = $node_primary->safe_psql("postgres", $queries);
    my $replica_result = $node_replica->safe_psql("postgres", $queries);
    is($primary_result, $replica_result, "$test_name: query result matches");

    return;
}
# Build the row expression as ARRAY[random(), random(), ...] rather than
# SELECT array_agg(random()) FROM generate_series(1, $dim)
# so that every inserted row gets different values
my $array_sql = join(",", map { 'random()' } 1 .. $dim);

# Initialize primary node
$node_primary = get_new_node('primary');
$node_primary->init(allows_streaming => 1);

# Larger dimensions get extra server settings before the primary starts
my @extra_settings;
# TODO use wal_keep_segments for Postgres < 13
push(@extra_settings, qq(wal_keep_size = 1GB)) if $dim > 32;
push(@extra_settings, qq(maintenance_work_mem = 128MB)) if $dim > 1500;
foreach my $setting (@extra_settings) {
    $node_primary->append_conf('postgresql.conf', $setting);
}
$node_primary->start;

# Take backup
my $backup = 'my_backup';
$node_primary->backup($backup);

# Create streaming replica linking to primary
$node_replica = get_new_node('replica');
$node_replica->init_from_backup($node_primary, $backup, has_streaming => 1);
$node_replica->start;

# Create hnsw index on primary
my @setup_statements = (
    "CREATE EXTENSION vector;",
    "CREATE TABLE tst (i int4, v vector($dim));",
    "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 1000) i;",
    "CREATE INDEX ON tst USING hnsw (v vector_l2_ops);",
);
foreach my $statement (@setup_statements) {
    $node_primary->safe_psql("postgres", $statement);
}

# Test that queries give same result
test_index_replay('initial');

# Run 10 cycles of table modification. Run test queries after each modification.
foreach my $cycle (1 .. 10) {
    $node_primary->safe_psql("postgres", "DELETE FROM tst WHERE i = $cycle;");
    test_index_replay("delete $cycle");

    $node_primary->safe_psql("postgres", "VACUUM tst;");
    test_index_replay("vacuum $cycle");

    # Each cycle inserts the next batch of 100 ids after the initial 1000
    my $last_id = 1000 + $cycle * 100;
    my $first_id = $last_id - 99;
    $node_primary->safe_psql("postgres",
        "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series($first_id, $last_id) i;"
    );
    test_index_replay("insert $cycle");
}

done_testing();

43
test/t/011_hnsw_vacuum.pl Normal file
View File

@@ -0,0 +1,43 @@
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More;
# Every vector component is "i % <modulus>", with a random modulus in 1 .. 1000
# chosen once per component, so both INSERTs below load the same data.
my $dim = 3;
my $array_sql = join(", ",
    map { my $modulus = int(rand(1000)) + 1; "i % $modulus" } 1 .. $dim);

# Statements that are issued more than once
my $insert_sql =
  "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 10000) i;";
my $size_sql = "SELECT pg_total_relation_size('tst_v_idx');";

# Initialize node
my $node = get_new_node('node');
$node->init;
$node->start;

# Create table and index
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");
$node->safe_psql("postgres", $insert_sql);
$node->safe_psql("postgres", "CREATE INDEX ON tst USING hnsw (v vector_l2_ops);");

# Get size
my $size_before = $node->safe_psql("postgres", $size_sql);

# Delete all, vacuum, and insert same data
$node->safe_psql("postgres", "DELETE FROM tst;");
$node->safe_psql("postgres", "VACUUM tst;");
$node->safe_psql("postgres", $insert_sql);

# Check size: the index may grow by at most 1% after the delete/vacuum/insert cycle
my $size_after = $node->safe_psql("postgres", $size_sql);
cmp_ok($size_after, "<=", $size_before * 1.01, "size does not increase too much");

done_testing();

View File

@@ -0,0 +1,96 @@
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More;
# Node handle; file-scoped so that test_recall can query the server.
my $node;
# Query vectors as text literals of the form "[x,y,z]".
my @queries = ();
# Exact results: one newline-separated id list per entry of @queries, in the same order.
my @expected;
# Number of nearest neighbours requested by every query.
my $limit = 20;
# Measure the recall of index scans against the exact results stored in
# @expected and assert that it is at least $min.
#
# $min      - minimum acceptable fraction (0 .. 1) of expected ids that the
#             index scan must return.
# $operator - distance operator to order by; also used as the test name.
#
# Reads the file-level $node, @queries, @expected and $limit.
# Returns the result of the final check.
sub test_recall
{
    my ($min, $operator) = @_;
    my $correct = 0;
    my $total = 0;

    # Make sure the planner picks an index scan before measuring recall
    my $explain = $node->safe_psql("postgres", qq(
SET enable_seqscan = off;
EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v $operator '$queries[0]' LIMIT $limit;
));
    like($explain, qr/Index Scan/, "$operator uses index scan");

    for my $i (0 .. $#queries) {
        my $actual = $node->safe_psql("postgres", qq(
SET enable_seqscan = off;
SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit;
));
        my @actual_ids = split("\n", $actual);
        my %actual_set = map { $_ => 1 } @actual_ids;
        my @expected_ids = split("\n", $expected[$i]);

        # Count how many of the exact neighbours the index scan also returned
        $correct += grep { exists($actual_set{$_}) } @expected_ids;
        $total += @expected_ids;
    }

    # With no expected rows there is nothing to measure; report a failed test
    # instead of dying on a division by zero.
    if ($total == 0) {
        return fail("$operator: no expected results to compare against");
    }

    return cmp_ok($correct / $total, ">=", $min, $operator);
}
# Initialize node
$node = get_new_node('node');
$node->init;
$node->start;

# Create table
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));");
$node->safe_psql("postgres",
    "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 10000) i;"
);

# Generate 20 random three-dimensional query vectors
foreach my $n (1 .. 20) {
    my @components = map { rand() } 1 .. 3;
    push(@queries, "[" . join(",", @components) . "]");
}

# Check each index type: distance operator paired with its operator class
my @index_types = (
    ["<->", "vector_l2_ops"],
    ["<#>", "vector_ip_ops"],
    ["<=>", "vector_cosine_ops"],
);
foreach my $index_type (@index_types) {
    my ($operator, $opclass) = @$index_type;

    # Get exact results before the index for this operator exists
    @expected = ();
    foreach my $query (@queries) {
        my $exact = $node->safe_psql("postgres",
            "SELECT i FROM tst ORDER BY v $operator '$query' LIMIT $limit;");
        push(@expected, $exact);
    }

    # Add index
    $node->safe_psql("postgres", "CREATE INDEX ON tst USING hnsw (v $opclass);");

    # Inner product is held to a lower recall threshold than the other operators
    my $min_recall = $operator eq "<#>" ? 0.80 : 0.99;
    test_recall($min_recall, $operator);
}

done_testing();

View File

@@ -0,0 +1,103 @@
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More;
# Node handle; file-scoped so that test_recall can query the server.
my $node;
# Query vectors as text literals of the form "[x,y,z]".
my @queries = ();
# Exact results: one newline-separated id list per entry of @queries, in the same order.
my @expected;
# Number of nearest neighbours requested by every query.
my $limit = 20;
# Measure the recall of index scans against the exact results stored in
# @expected and assert that it is at least $min.
#
# $min      - minimum acceptable fraction (0 .. 1) of expected ids that the
#             index scan must return.
# $operator - distance operator to order by; also used as the test name.
#
# Reads the file-level $node, @queries, @expected and $limit.
# Returns the result of the final check.
sub test_recall
{
    my ($min, $operator) = @_;
    my $correct = 0;
    my $total = 0;

    # Make sure the planner picks an index scan before measuring recall
    my $explain = $node->safe_psql("postgres", qq(
SET enable_seqscan = off;
EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v $operator '$queries[0]' LIMIT $limit;
));
    like($explain, qr/Index Scan/, "$operator uses index scan");

    for my $i (0 .. $#queries) {
        my $actual = $node->safe_psql("postgres", qq(
SET enable_seqscan = off;
SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit;
));
        my @actual_ids = split("\n", $actual);
        my %actual_set = map { $_ => 1 } @actual_ids;
        my @expected_ids = split("\n", $expected[$i]);

        # Count how many of the exact neighbours the index scan also returned
        $correct += grep { exists($actual_set{$_}) } @expected_ids;
        $total += @expected_ids;
    }

    # With no expected rows there is nothing to measure; report a failed test
    # instead of dying on a division by zero.
    if ($total == 0) {
        return fail("$operator: no expected results to compare against");
    }

    return cmp_ok($correct / $total, ">=", $min, $operator);
}
# Initialize node
$node = get_new_node('node');
$node->init;
$node->start;

# Create table
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));");

# Generate 20 random three-dimensional query vectors
foreach my $n (1 .. 20) {
    my @components = map { rand() } 1 .. 3;
    push(@queries, "[" . join(",", @components) . "]");
}

# Check each index type: distance operator paired with its operator class
my @index_types = (
    ["<->", "vector_l2_ops"],
    ["<#>", "vector_ip_ops"],
    ["<=>", "vector_cosine_ops"],
);
foreach my $index_type (@index_types) {
    my ($operator, $opclass) = @$index_type;

    # Add index on the empty table, then load the rows through it
    $node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v $opclass);");
    $node->safe_psql("postgres",
        "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 10000) i;"
    );

    # Get exact results with index scans disabled
    @expected = ();
    foreach my $query (@queries) {
        my $exact = $node->safe_psql("postgres", qq(
SET enable_indexscan = off;
SELECT i FROM tst ORDER BY v $operator '$query' LIMIT $limit;
));
        push(@expected, $exact);
    }

    # Inner product is held to a lower recall threshold than the other operators
    my $min_recall = $operator eq "<#>" ? 0.80 : 0.99;
    test_recall($min_recall, $operator);

    # Reset for the next operator
    $node->safe_psql("postgres", "DROP INDEX idx;");
    $node->safe_psql("postgres", "TRUNCATE tst;");
}

done_testing();

View File

@@ -0,0 +1,58 @@
use strict;
use warnings;
use PostgresNode;
use TestLib;
use Test::More;
# Ensures elements and neighbors on both same and different pages
my $dim = 1900;
my $array_sql = join(",", map { 'random()' } 1 .. $dim);

# Initialize node
my $node = get_new_node('node');
$node->init;
$node->start;

# Create table and index
my @setup_statements = (
    "CREATE EXTENSION vector;",
    "CREATE TABLE tst (v vector($dim));",
    "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 100) i;",
    "CREATE INDEX ON tst USING hnsw (v vector_l2_ops);",
);
foreach my $statement (@setup_statements) {
    $node->safe_psql("postgres", $statement);
}

# Run the insert script from 5 pgbench clients, 100 transactions each
my %pgbench_scripts = (
    "007_inserts" => "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10) i;",
);
my @expected_stdout = (qr{actually processed});
my @expected_stderr = (qr{^$});
$node->pgbench(
    "--no-vacuum --client=5 --transactions=100",
    0,
    \@expected_stdout,
    \@expected_stderr,
    "concurrent INSERTs",
    \%pgbench_scripts
);
# Return the idx_scan counter that pg_stat_user_indexes reports for tst_v_idx.
# Uses the file-level $node and sleeps one second before querying.
sub idx_scan
{
    # Stats do not update instantaneously
    # https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-STATS-VIEWS
    sleep(1);

    my $stats_query =
      "SELECT idx_scan FROM pg_stat_user_indexes WHERE indexrelid = 'tst_v_idx'::regclass;";
    return $node->safe_psql("postgres", $stats_query);
}
# 100 preloaded rows plus 5 clients x 100 transactions x 10 rows per transaction
my $expected = 100 + 5 * 100 * 10;

my $count = $node->safe_psql("postgres", "SELECT COUNT(*) FROM tst;");
is($count, $expected, "all concurrently inserted rows are present");
is(idx_scan(), 0, "no index scan recorded before the first query");

# Order by distance to an existing row so that the query goes through the index
$count = $node->safe_psql("postgres", qq(
SET enable_seqscan = off;
SET hnsw.ef_search = 400;
SELECT COUNT(*) FROM (SELECT v FROM tst ORDER BY v <-> (SELECT v FROM tst LIMIT 1)) t;
));
is($count, 400, "query returns 400 rows with hnsw.ef_search = 400");
is(idx_scan(), 1, "exactly one index scan recorded after the query");

done_testing();