Improved cost estimation [skip ci]

This commit is contained in:
Andrew Kane
2024-09-28 16:04:07 -07:00
parent ba8e29600b
commit e1c2d03dba
3 changed files with 26 additions and 49 deletions

View File

@@ -102,33 +102,6 @@ hnswbuildphasename(int64 phasenum)
}
}
/*
 * Estimate ef needed for iterative scans
 *
 * Divides the planner's tuple limit by the combined selectivity of the
 * restriction clauses attached to the index, so that more candidates are
 * requested when the filters are expected to discard more rows.
 *
 * Returns 0 when the query has no limit (root->limit_tuples < 0), since no
 * estimate can be made in that case. The result is saturated rather than
 * converted blindly, because the quotient can exceed the range of int.
 */
static int
EstimateEf(PlannerInfo *root, IndexPath *path)
{
	/*
	 * Upper bound for the returned estimate. Assumes int is 32 bits wide --
	 * TODO confirm, or replace with INT_MAX if <limits.h> is in scope.
	 */
	const double maxEf = 2147483647.0;
	double		selectivity = 1;
	double		ef;
	ListCell   *lc;

	/* Cannot estimate without limit */
	/* limit_tuples includes offset */
	if (root->limit_tuples < 0)
		return 0;

	/* Get the selectivity of non-index conditions */
	foreach(lc, path->indexinfo->indrestrictinfo)
	{
		RestrictInfo *rinfo = lfirst(lc);

		/*
		 * Only use selectivities inside [0, 1]; anything else is treated as
		 * not usable. Skip DEFAULT_INEQ_SEL since it may be a distance
		 * filter.
		 */
		if (rinfo->norm_selec >= 0 && rinfo->norm_selec <= 1 && rinfo->norm_selec != (Selectivity) DEFAULT_INEQ_SEL)
			selectivity *= rinfo->norm_selec;
	}

	/* Floor the selectivity so the division cannot blow up on tiny values */
	ef = root->limit_tuples / Max(selectivity, 0.00001);

	/*
	 * Converting an out-of-range double to int is undefined behavior, and a
	 * large limit combined with the selectivity floor can easily exceed the
	 * int range, so saturate instead.
	 */
	if (ef >= maxEf)
		return (int) maxEf;

	return (int) ef;
}
/*
* Estimate the cost of an index scan
*/
@@ -140,12 +113,13 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
{
GenericCosts costs;
int m;
int ef;
int entryLevel;
int layer0TuplesMax;
double layer0Selectivity;
double scalingFactor = 0.55;
double ratio;
double spc_seq_page_cost;
double startupPages;
Relation index;
/* Never use index without order */
@@ -161,13 +135,12 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
MemSet(&costs, 0, sizeof(costs));
genericcostestimate(root, path, loop_count, &costs);
index = index_open(path->indexinfo->indexoid, NoLock);
HnswGetMetaPageInfo(index, &m, NULL);
index_close(index, NoLock);
/* TODO Separate startup and total cost */
ef = hnsw_streaming ? Max(hnsw_ef_search, EstimateEf(root, path)) : hnsw_ef_search;
/*
* HNSW cost estimation follows a formula that accounts for the total
* number of tuples indexed combined with the parameters that most
@@ -195,34 +168,40 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
* at L0, accounting for previously visited tuples, multiplied by the
* "scalingFactor" (currently hardcoded).
*/
entryLevel = (int) (log(path->indexinfo->tuples + 1) * HnswGetMl(m));
layer0TuplesMax = HnswGetLayerM(m, 0) * ef;
layer0Selectivity = (scalingFactor * log(path->indexinfo->tuples + 1)) /
(log(m) * (1 + log(ef)));
if (path->indexinfo->tuples > 0)
{
entryLevel = (int) (log(path->indexinfo->tuples) * HnswGetMl(m));
layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search;
layer0Selectivity = scalingFactor * log(path->indexinfo->tuples) / (log(m) * (1 + log(hnsw_ef_search)));
ratio = (entryLevel * m + layer0TuplesMax * layer0Selectivity) / path->indexinfo->tuples;
}
else
ratio = 1;
costs.numIndexTuples = (entryLevel * m) +
(layer0TuplesMax * layer0Selectivity);
genericcostestimate(root, path, loop_count, &costs);
/* Set startup cost since this work happens before first tuple is returned */
costs.indexStartupCost = costs.indexTotalCost * ratio;
startupPages = costs.numIndexPages * ratio;
get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost);
/* Adjust cost if needed since TOAST not included in seq scan cost */
if (costs.numIndexPages > path->indexinfo->rel->pages && costs.numIndexTuples / (path->indexinfo->tuples + 1) < 0.5)
if (startupPages > path->indexinfo->rel->pages && ratio < 0.5)
{
/* Change all page cost from random to sequential */
costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost);
costs.indexStartupCost -= startupPages * (costs.spc_random_page_cost - spc_seq_page_cost);
/* Remove cost of extra pages */
costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost;
costs.indexStartupCost -= (startupPages - path->indexinfo->rel->pages) * spc_seq_page_cost;
}
/* Use total cost since most work happens before first tuple is returned */
*indexStartupCost = costs.indexTotalCost;
*indexStartupCost = costs.indexStartupCost;
*indexTotalCost = costs.indexTotalCost;
*indexSelectivity = costs.indexSelectivity;
*indexCorrelation = costs.indexCorrelation;
*indexPages = costs.numIndexPages;
Assert(*indexStartupCost > 0);
Assert(*indexTotalCost > *indexStartupCost);
}
/*

View File

@@ -41,8 +41,7 @@ my $c = int(rand() * $nc);
my $explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit;
));
# TODO Do not use index
like($explain, qr/Index Scan using idx/);
like($explain, qr/Seq Scan/);
# Test attribute filtering with few rows removed
$explain = $node->safe_psql("postgres", qq(
@@ -60,8 +59,7 @@ like($explain, qr/Index Scan using idx/);
$explain = $node->safe_psql("postgres", qq(
EXPLAIN ANALYZE SELECT i FROM tst WHERE c < 1 ORDER BY v <-> '$query' LIMIT $limit;
));
# TODO Do not use index
like($explain, qr/Index Scan using idx/);
like($explain, qr/Seq Scan/);
# Test attribute filtering with few rows removed like
$explain = $node->safe_psql("postgres", qq(

View File

@@ -17,7 +17,7 @@ $node->safe_psql("postgres", "CREATE EXTENSION vector;");
for my $dim (@dims)
{
my $array_sql = join(",", ('random()') x $dim);
my $n = $dim == 384 ? 2000 : 1000;
my $n = 2000;
# Create table and index
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");