From 2df9f24aadbfa4c52af01d271e6246671d00abdf Mon Sep 17 00:00:00 2001 From: "Jonathan S. Katz" Date: Wed, 25 Sep 2024 17:01:33 -0400 Subject: [PATCH] Update HNSW cost estimation to utilize search and index info (#682) Previously, the cost estimation formula for an HNSW index scan utilized a methodology that only factored in the entry level for an HNSW scan and the "m" index parameter, which reflects the number of tuples (or vectors) to scan at each step of an HNSW graph traversal. While this would bias the PostgreSQL query planner to choose an HNSW index scan over other available paths, this could lead to potentially suboptimal index selection, for example, choosing to use an HNSW index instead of an available B-tree index that has better selectivity. The number of tuples scanned during HNSW graph traversal is principally influenced by these factors: * The number of tuples stored in the index * `m` - the number of tuples that are scanned in each step of the graph traversal * `hnsw.ef_search` - which influences the total number of steps it takes for the scan to converge on the approximated nearest neighbors Through testing different source models for vectors, we also observed that the correlation of vectors in models would impact this convergence. For this first iteration, we've opted to hardcode a constant scaling factor and set it to `0.55`, though a future commit may turn this into a configurable parameter. The high-level formula for estimating the cost of an HNSW index scan is as follows: ``` (entryLevel * m) + (layer0TuplesMax * layer0Selectivity) ``` where - `(entryLevel * m)` is the lower bound of tuples to scan, as it accounts for the graph traversal to layer 0 (L0). (L1 and above have ef=1) - `layer0TuplesMax` is an estimate of the maximum number of tuples to scan at L0. This accounts for tuples that may end up being discarded due to them already being visited. 
Testing shows that the number of steps until convergence is similar to the value of `hnsw.ef_search`, thus we can estimate tuples max at `hnsw.ef_search * m * 2` - `layer0Selectivity` - estimates the percentage of tuples that will actually be scanned during the index traversal, multiplied by the scaling factor In addition to the `m` build parameter and `hnsw.ef_search`, cost estimates can be influenced by standard PostgreSQL costing parameters, though adjusting those (e.g. `random_page_cost`) should be done with care. Co-authored-by: @ankane --- src/hnsw.c | 45 +++++++++++++++++++++++++++++++----- test/t/017_hnsw_filtering.pl | 4 ++-- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/hnsw.c b/src/hnsw.c index a7b1e5f..765392e 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -99,7 +99,10 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, { GenericCosts costs; int m; - int entryLevel; + int entryLevel; + int layer0TuplesMax; + double layer0Selectivity; + double scalingFactor = 0.55; Relation index; /* Never use index without order */ @@ -119,12 +122,42 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, HnswGetMetaPageInfo(index, &m, NULL); index_close(index, NoLock); - /* Approximate entry level */ - entryLevel = (int) -log(1.0 / path->indexinfo->tuples) * HnswGetMl(m); + /* + * HNSW cost estimation follows a formula that accounts for the total + * number of tuples indexed combined with the parameters that most influence + * the duration of the index scan, namely: + * m - the number of tuples that are scanned in each step of the HNSW + * graph traversal + * ef_search - which influences the total number of steps taken at layer 0 + * + * The source of the vector data can impact how many steps it takes to + * converge on the set of vectors to return to the executor. 
Currently, + * we use a hardcoded scaling factor (HNSWScanScalingFactor) to help + * influence that, but this could later become a configurable parameter + * based on the cost estimations. + * + * The tuple estimator formula is below: + * + * numIndexTuples = (entryLevel * m) + + * (layer0TuplesMax * layer0Selectivity) + * + * "entryLevel * m" represents the floor of tuples we need to scan to get + * to layer 0 (L0). + * + * "layer0TuplesMax" is the estimated total number of tuples we'd scan at + * L0 if we weren't discarding already visited tuples as part of the scan. + * + * "layer0Selectivity" estimates the percentage of tuples that are scanned + * at L0, accounting for previously visited tuples, multiplied by the + * "scalingFactor" (currently hardcoded). + */ + entryLevel = (int) floor(log(path->indexinfo->tuples + 1) * HnswGetMl(m)); + layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search; + layer0Selectivity = (scalingFactor * log(path->indexinfo->tuples + 1)) / + (log(m) * (1 + log(hnsw_ef_search))); - /* TODO Improve estimate of visited tuples (currently underestimates) */ - /* Account for number of tuples (or entry level), m, and ef_search */ - costs.numIndexTuples = (entryLevel + 2) * m; + costs.numIndexTuples = (entryLevel * m) + + (layer0TuplesMax * layer0Selectivity); genericcostestimate(root, path, loop_count, &costs); diff --git a/test/t/017_hnsw_filtering.pl b/test/t/017_hnsw_filtering.pl index 0896d32..9dbdcf3 100644 --- a/test/t/017_hnsw_filtering.pl +++ b/test/t/017_hnsw_filtering.pl @@ -117,8 +117,8 @@ $node->safe_psql("postgres", "CREATE INDEX attribute_idx ON tst (c);"); $explain = $node->safe_psql("postgres", qq( EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit; )); -# TODO Use attribute index -like($explain, qr/Index Scan using idx/); +# Use attribute index +like($explain, qr/Bitmap Index Scan on attribute_idx/); # Test partial index $node->safe_psql("postgres", "CREATE INDEX partial_idx ON tst USING hnsw 
(v vector_l2_ops) WHERE (c = $c);");