Merge branch 'master' into hnsw-streaming

This commit is contained in:
Andrew Kane
2024-09-25 16:09:09 -07:00
9 changed files with 162 additions and 54 deletions

View File

@@ -1,6 +1,7 @@
## 0.8.0 (unreleased)
- Added casts for arrays to `sparsevec`
- Improved cost estimation
- Reduced memory usage for HNSW index scans
- Dropped support for Postgres 12

View File

@@ -159,24 +159,6 @@ CheckStateArray(ArrayType *statearray, const char *caller)
return (float8 *) ARR_DATA_PTR(statearray);
}
#if PG_VERSION_NUM < 120003
static pg_noinline void
float_overflow_error(void)
{
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("value out of range: overflow")));
}
static pg_noinline void
float_underflow_error(void)
{
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("value out of range: underflow")));
}
#endif
/*
* Convert textual representation to internal representation
*/

View File

@@ -12,6 +12,7 @@
#include "utils/float.h"
#include "utils/guc.h"
#include "utils/selfuncs.h"
#include "utils/spccache.h"
#if PG_VERSION_NUM < 150000
#define MarkGUCPrefixReserved(x) EmitWarningsOnPlaceholders(x)
@@ -112,6 +113,10 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
GenericCosts costs;
int m;
int entryLevel;
int layer0TuplesMax;
double layer0Selectivity;
double scalingFactor = 0.55;
double spc_seq_page_cost;
Relation index;
/* Never use index without order */
@@ -131,17 +136,57 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
HnswGetMetaPageInfo(index, &m, NULL);
index_close(index, NoLock);
/* Approximate entry level */
entryLevel = (int) -log(1.0 / path->indexinfo->tuples) * HnswGetMl(m);
/*
* HNSW cost estimation follows a formula that accounts for the total
* number of tuples indexed combined with the parameters that most
* influence the duration of the index scan, namely m (the number of
* tuples that are scanned in each step of the HNSW graph traversal) and
* ef_search (which influences the total number of steps taken at layer 0).
*
* The source of the vector data can impact how many steps it takes to
* converge on the set of vectors to return to the executor. Currently, we
* use a hardcoded scaling factor (scalingFactor) to help
* influence that, but this could later become a configurable parameter
* based on the cost estimations.
*
* The tuple estimator formula is below:
*
* numIndexTuples = entryLevel * m + layer0TuplesMax * layer0Selectivity
*
* "entryLevel * m" represents the floor of tuples we need to scan to get
* to layer 0 (L0).
*
* "layer0TuplesMax" is the estimated total number of tuples we'd scan at
* L0 if we weren't discarding already visited tuples as part of the scan.
*
* "layer0Selectivity" estimates the percentage of tuples that are scanned
* at L0, accounting for previously visited tuples, multiplied by the
* "scalingFactor" (currently hardcoded).
*/
entryLevel = (int) (log(path->indexinfo->tuples + 1) * HnswGetMl(m));
layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search;
layer0Selectivity = (scalingFactor * log(path->indexinfo->tuples + 1)) /
(log(m) * (1 + log(hnsw_ef_search)));
/* TODO Improve estimate of visited tuples (currently underestimates) */
/* Account for number of tuples (or entry level), m, and ef_search */
costs.numIndexTuples = (entryLevel + 2) * m;
costs.numIndexTuples = (entryLevel * m) +
(layer0TuplesMax * layer0Selectivity);
/* TODO Adjust for selectivity for iterative scans */
genericcostestimate(root, path, loop_count, &costs);
get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost);
/* Adjust cost if needed since TOAST not included in seq scan cost */
if (costs.numIndexPages > path->indexinfo->rel->pages)
{
/* Change all page cost from random to sequential */
costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost);
/* Remove cost of extra pages */
costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost;
}
/* Use total cost since most work happens before first tuple is returned */
*indexStartupCost = costs.indexTotalCost;
*indexTotalCost = costs.indexTotalCost;

View File

@@ -120,13 +120,6 @@ ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
costs.indexTotalCost -= 0.5 * costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost);
}
/*
* If the list selectivity is lower than what is returned from the generic
* cost estimator, use that.
*/
if (ratio < costs.indexSelectivity)
costs.indexSelectivity = ratio;
/* Use total cost since most work happens before first tuple is returned */
*indexStartupCost = costs.indexTotalCost;
*indexTotalCost = costs.indexTotalCost;

View File

@@ -155,24 +155,6 @@ CheckStateArray(ArrayType *statearray, const char *caller)
return (float8 *) ARR_DATA_PTR(statearray);
}
#if PG_VERSION_NUM < 120003
static pg_noinline void
float_overflow_error(void)
{
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("value out of range: overflow")));
}
static pg_noinline void
float_underflow_error(void)
{
ereport(ERROR,
(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
errmsg("value out of range: underflow")));
}
#endif
/*
* Convert textual representation to internal representation
*/

View File

@@ -94,8 +94,7 @@ like($explain, qr/Seq Scan/);
$explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT i FROM tst WHERE v <-> '$query' < 1 ORDER BY v <-> '$query';
));
# TODO Do not use index
like($explain, qr/Index Scan using idx/);
like($explain, qr/Seq Scan/);
# Test attribute index
$node->safe_psql("postgres", "CREATE INDEX attribute_idx ON tst (c);");
@@ -110,7 +109,6 @@ $node->safe_psql("postgres", "CREATE INDEX partial_idx ON tst USING ivfflat (v v
$explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit;
));
# TODO Use partial index
like($explain, qr/Index Scan using idx/);
like($explain, qr/Index Scan using partial_idx/);
done_testing();

View File

@@ -18,9 +18,13 @@ $node->start;
# Create table and index
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim), c int4, t text);");
$node->safe_psql("postgres", "CREATE TABLE cat (i int4 PRIMARY KEY, t text, b boolean);");
$node->safe_psql("postgres",
"INSERT INTO tst SELECT i, ARRAY[$array_sql], i % $nc, 'test ' || i FROM generate_series(1, 10000) i;"
);
$node->safe_psql("postgres",
"INSERT INTO cat SELECT i, 'cat ' || i, i % 5 = 0 FROM generate_series(1, $nc) i;"
);
$node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);");
$node->safe_psql("postgres", "ANALYZE tst;");
@@ -96,13 +100,25 @@ $explain = $node->safe_psql("postgres", qq(
));
like($explain, qr/Seq Scan/);
# Test join
$explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT cat.t FROM cat INNER JOIN tst ON cat.i = tst.c ORDER BY v <-> '$query' LIMIT $limit;
));
like($explain, qr/Index Scan using idx/);
# Test join with attribute filtering
$explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT cat.t FROM cat INNER JOIN tst ON cat.i = tst.c WHERE cat.b = 't' ORDER BY v <-> '$query' LIMIT $limit;
));
like($explain, qr/Index Scan using idx/);
# Test attribute index
$node->safe_psql("postgres", "CREATE INDEX attribute_idx ON tst (c);");
$explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit;
));
# TODO Use attribute index
like($explain, qr/Index Scan using idx/);
# Use attribute index
like($explain, qr/Bitmap Index Scan on attribute_idx/);
# Test partial index
$node->safe_psql("postgres", "CREATE INDEX partial_idx ON tst USING hnsw (v vector_l2_ops) WHERE (c = $c);");

46
test/t/039_hnsw_cost.pl Normal file
View File

@@ -0,0 +1,46 @@
use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Checks that the planner picks the HNSW index for an
# "ORDER BY v <-> query LIMIT n" query at each vector dimension in @dims.

my @dims = (384, 1536);
my $limit = 10;

# Initialize node
my $node = PostgreSQL::Test::Cluster->new('node');
$node->init;
$node->start;

$node->safe_psql("postgres", "CREATE EXTENSION vector;");

for my $dim (@dims)
{
	# SQL expression list with one random() call per dimension
	my $array_sql = join(",", ('random()') x $dim);

	# Row count: 2000 for 384 dimensions, 1000 for anything else
	my $n = $dim == 384 ? 2000 : 1000;

	# Create table and index
	$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");
	$node->safe_psql("postgres",
		"INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, $n) i;"
	);
	$node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);");
	$node->safe_psql("postgres", "ANALYZE tst;");

	# Generate query: a vector literal with $dim random components
	my $query = "[" . join(",", map { rand() } 1 .. $dim) . "]";

	my $explain = $node->safe_psql("postgres", qq(
		EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v <-> '$query' LIMIT $limit;
	));

	# Name the assertion after the dimension: every iteration reports the
	# same source line, so an unnamed failure would be ambiguous.
	like($explain, qr/Index Scan using idx/, "uses HNSW index with $dim dimensions");

	# Drop the table so the next iteration can recreate it
	$node->safe_psql("postgres", "DROP TABLE tst;");
}

done_testing();

View File

@@ -0,0 +1,45 @@
use strict;
use warnings FATAL => 'all';
use PostgreSQL::Test::Cluster;
use PostgreSQL::Test::Utils;
use Test::More;

# Checks that the planner picks the IVFFlat index for an
# "ORDER BY v <-> query LIMIT n" query at each vector dimension in @dims.

my @dims = (384, 1536);
my $limit = 10;

# Initialize node
my $node = PostgreSQL::Test::Cluster->new('node');
$node->init;
$node->start;

$node->safe_psql("postgres", "CREATE EXTENSION vector;");

for my $dim (@dims)
{
	# SQL expression list with one random() call per dimension
	my $array_sql = join(",", ('random()') x $dim);

	# Create table and index
	$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");
	$node->safe_psql("postgres",
		"INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 5000) i;"
	);
	$node->safe_psql("postgres", "CREATE INDEX idx ON tst USING ivfflat (v vector_l2_ops) WITH (lists = 5);");
	$node->safe_psql("postgres", "ANALYZE tst;");

	# Generate query: a vector literal with $dim random components
	my $query = "[" . join(",", map { rand() } 1 .. $dim) . "]";

	my $explain = $node->safe_psql("postgres", qq(
		EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v <-> '$query' LIMIT $limit;
	));

	# Name the assertion after the dimension: every iteration reports the
	# same source line, so an unnamed failure would be ambiguous.
	like($explain, qr/Index Scan using idx/, "uses IVFFlat index with $dim dimensions");

	# Drop the table so the next iteration can recreate it
	$node->safe_psql("postgres", "DROP TABLE tst;");
}

done_testing();