From 2df9f24aadbfa4c52af01d271e6246671d00abdf Mon Sep 17 00:00:00 2001
From: "Jonathan S. Katz" <jkatz@users.noreply.github.com>
Date: Wed, 25 Sep 2024 17:01:33 -0400
Subject: [PATCH] Update HNSW cost estimation to utilize search and index
 info (#682)

Previously, the cost estimation formula for a HNSW index scan utilized
a methodology that only factored in the entry level for an HNSW scan
and the "m" index parameter, which reflects the number of tuples (or
vectors) to scan at each step of a HNSW graph traversal. While this
would bias the PostgreSQL query planner to choose an HNSW index scan
over other available paths, this could lead to potentially suboptimal
index selection, for example, choosing to use a HNSW index instead of
an available B-tree index that has better selectivity.

The number of tuples scanned during HNSW graph traversal is principally
influenced by these factors:

 * The number of tuples stored in the index
 * `m` - the number of tuples that are scanned in each step of the graph
   traversal
 * `hnsw.ef_search` - which influences the total number of steps it
   takes for the scan to converge on the approximated nearest neighbors

Through testing different source models for vectors, we also observed
that the correlation of vectors in models would impact this convergence.
For this first iteration, we've opted to hardcode a constant scaling
factor and set it to `0.55`, though a future commit may turn this into
a configurable parameter.

The high-level formula for estimating the cost of a HNSW index scan is
as such:

```
(entryLevel * m) + (layer0TuplesMax * layer0Selectivity)
```

where

- `(entryLevel * m)` is the lower bound of tuples to scan, as it
accounts for the graph traversal to layer 0 (L0). (L1 and above have ef=1.)
- `layer0TuplesMax` is an estimate of the maximum number of tuples to
scan at L0. This accounts for tuples that may end up being discarded due
to them already being visited. Testing shows that the number of steps
until convergence is similar to the value of `hnsw.ef_search`, thus we can
estimate tuples max at `hnsw.ef_search * m * 2`
- `layer0Selectivity` - estimates the percentage of tuples that will
actually be scanned during the index traversal, multiplied by the scaling
factor

In addition to the `m` build parameter and `hnsw.ef_search`, cost
estimates can be influenced by standard PostgreSQL costing parameters,
though adjusting those (e.g. `random_page_cost`) should be done with
care.

Co-authored-by: @ankane
---
 src/hnsw.c                   | 45 +++++++++++++++++++++++++++++++-----
 test/t/017_hnsw_filtering.pl |  4 ++--
 2 files changed, 41 insertions(+), 8 deletions(-)

diff --git a/src/hnsw.c b/src/hnsw.c
index a7b1e5f..765392e 100644
--- a/src/hnsw.c
+++ b/src/hnsw.c
@@ -99,7 +99,10 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 {
 	GenericCosts costs;
 	int			m;
-	int			entryLevel;
+	int		entryLevel;
+	int		layer0TuplesMax;
+	double		layer0Selectivity;
+	double		scalingFactor = 0.55;
 	Relation	index;
 
 	/* Never use index without order */
@@ -119,12 +122,42 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	HnswGetMetaPageInfo(index, &m, NULL);
 	index_close(index, NoLock);
 
-	/* Approximate entry level */
-	entryLevel = (int) -log(1.0 / path->indexinfo->tuples) * HnswGetMl(m);
+	/*
+	 * HNSW cost estimation follows a formula that accounts for the total
+	 * number of tuples indexed combined with the parameters that most influence
+	 * the duration of the index scan, namely:
+	 *   m - the number of tuples that are scanned in each step of the HNSW
+	 *       graph traversal
+	 *   ef_search - which influences the total number of steps taken at layer 0
+	 *
+	 *  The source of the vector data can impact how many steps it takes to
+	 *  converge on the set of vectors to return to the executor. Currently,
+	 *  we use a hardcoded scaling factor (scalingFactor) to help
+	 *  influence that, but this could later become a configurable parameter
+	 *  based on the cost estimations.
+	 *
+	 * The tuple estimator formula is below:
+	 *
+	 * numIndexTuples = (entryLevel * m) +
+	 * 					(layer0TuplesMax * layer0Selectivity)
+	 *
+	 * "entryLevel * m" represents the floor of tuples we need to scan to get
+	 * to layer 0 (L0).
+	 *
+	 * "layer0TuplesMax" is the estimated total number of tuples we'd scan at
+	 * L0 if we weren't discarding already visited tuples as part of the scan.
+	 *
+	 * "layer0Selectivity" estimates the percentage of tuples that are scanned
+	 * at L0, accounting for previously visited tuples, multiplied by the
+	 * "scalingFactor" (currently hardcoded).
+	 */
+	entryLevel = (int) floor(log(path->indexinfo->tuples + 1) * HnswGetMl(m));
+	layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search;
+	layer0Selectivity =  (scalingFactor * log(path->indexinfo->tuples + 1)) /
+		(log(m) * (1 + log(hnsw_ef_search)));
 
-	/* TODO Improve estimate of visited tuples (currently underestimates) */
-	/* Account for number of tuples (or entry level), m, and ef_search */
-	costs.numIndexTuples = (entryLevel + 2) * m;
+	costs.numIndexTuples = (entryLevel * m) +
+		(layer0TuplesMax * layer0Selectivity);
 
 	genericcostestimate(root, path, loop_count, &costs);
 
diff --git a/test/t/017_hnsw_filtering.pl b/test/t/017_hnsw_filtering.pl
index 0896d32..9dbdcf3 100644
--- a/test/t/017_hnsw_filtering.pl
+++ b/test/t/017_hnsw_filtering.pl
@@ -117,8 +117,8 @@ $node->safe_psql("postgres", "CREATE INDEX attribute_idx ON tst (c);");
 $explain = $node->safe_psql("postgres", qq(
 	EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit;
 ));
-# TODO Use attribute index
-like($explain, qr/Index Scan using idx/);
+# Use attribute index
+like($explain, qr/Bitmap Index Scan on attribute_idx/);
 
 # Test partial index
 $node->safe_psql("postgres", "CREATE INDEX partial_idx ON tst USING hnsw (v vector_l2_ops) WHERE (c = $c);");