Merge branch 'master' into hnsw-streaming

2026-07-02 18:50:56 +08:00 · 2024-09-25 16:09:09 -07:00
parent 4e35c6abe3 46de265a24
commit 38207f5640
9 changed files with 162 additions and 54 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,7 @@
 ## 0.8.0 (unreleased)

 - Added casts for arrays to `sparsevec`
+- Improved cost estimation
 - Reduced memory usage for HNSW index scans
 - Dropped support for Postgres 12

--- a/src/halfvec.c
+++ b/src/halfvec.c
@@ -159,24 +159,6 @@ CheckStateArray(ArrayType *statearray, const char *caller)
 	return (float8 *) ARR_DATA_PTR(statearray);
 }

-#if PG_VERSION_NUM < 120003
-static pg_noinline void
-float_overflow_error(void)
-{
-	ereport(ERROR,
-			(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-			 errmsg("value out of range: overflow")));
-}
-
-static pg_noinline void
-float_underflow_error(void)
-{
-	ereport(ERROR,
-			(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-			 errmsg("value out of range: underflow")));
-}
-#endif
-
 /*
 * Convert textual representation to internal representation
 */
--- a/src/hnsw.c
+++ b/src/hnsw.c
@@ -12,6 +12,7 @@
 #include "utils/float.h"
 #include "utils/guc.h"
 #include "utils/selfuncs.h"
+#include "utils/spccache.h"

 #if PG_VERSION_NUM < 150000
 #define MarkGUCPrefixReserved(x) EmitWarningsOnPlaceholders(x)
@@ -112,6 +113,10 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	GenericCosts costs;
 	int			m;
 	int			entryLevel;
+	int			layer0TuplesMax;
+	double		layer0Selectivity;
+	double		scalingFactor = 0.55;
+	double		spc_seq_page_cost;
 	Relation	index;

 	/* Never use index without order */
@@ -131,17 +136,57 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 	HnswGetMetaPageInfo(index, &m, NULL);
 	index_close(index, NoLock);

-	/* Approximate entry level */
-	entryLevel = (int) -log(1.0 / path->indexinfo->tuples) * HnswGetMl(m);
+	/*
+	 * HNSW cost estimation follows a formula that accounts for the total
+	 * number of tuples indexed combined with the parameters that most
+	 * influence the duration of the index scan, namely "m" (the number of
+	 * tuples that are scanned in each step of the HNSW graph traversal) and
+	 * "ef_search" (which influences the total number of steps at layer 0).
+	 *
+	 * The source of the vector data can impact how many steps it takes to
+	 * converge on the set of vectors to return to the executor. Currently, we
+	 * use a hardcoded scaling factor (scalingFactor) to help
+	 * influence that, but this could later become a configurable parameter
+	 * based on the cost estimations.
+	 *
+	 * The tuple estimator formula is below:
+	 *
+	 * numIndexTuples = entryLevel * m + layer0TuplesMax * layer0Selectivity
+	 *
+	 * "entryLevel * m" represents the floor of tuples we need to scan to get
+	 * to layer 0 (L0).
+	 *
+	 * "layer0TuplesMax" is the estimated total number of tuples we'd scan at
+	 * L0 if we weren't discarding already visited tuples as part of the scan.
+	 *
+	 * "layer0Selectivity" estimates the percentage of tuples that are scanned
+	 * at L0, accounting for previously visited tuples, multiplied by the
+	 * "scalingFactor" (currently hardcoded).
+	 */
+	entryLevel = (int) (log(path->indexinfo->tuples + 1) * HnswGetMl(m));
+	layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search;
+	layer0Selectivity = (scalingFactor * log(path->indexinfo->tuples + 1)) /
+		(log(m) * (1 + log(hnsw_ef_search)));

-	/* TODO Improve estimate of visited tuples (currently underestimates) */
-	/* Account for number of tuples (or entry level), m, and ef_search */
-	costs.numIndexTuples = (entryLevel + 2) * m;
+	costs.numIndexTuples = (entryLevel * m) +
+		(layer0TuplesMax * layer0Selectivity);

 	/* TODO Adjust for selectivity for iterative scans */

 	genericcostestimate(root, path, loop_count, &costs);

+	get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost);
+
+	/* Adjust cost if needed since TOAST not included in seq scan cost */
+	if (costs.numIndexPages > path->indexinfo->rel->pages)
+	{
+		/* Change all page cost from random to sequential */
+		costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost);
+
+		/* Remove cost of extra pages */
+		costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost;
+	}
+
 	/* Use total cost since most work happens before first tuple is returned */
 	*indexStartupCost = costs.indexTotalCost;
 	*indexTotalCost = costs.indexTotalCost;
--- a/src/ivfflat.c
+++ b/src/ivfflat.c
@@ -120,13 +120,6 @@ ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count,
 		costs.indexTotalCost -= 0.5 * costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost);
 	}

-	/*
-	 * If the list selectivity is lower than what is returned from the generic
-	 * cost estimator, use that.
-	 */
-	if (ratio < costs.indexSelectivity)
-		costs.indexSelectivity = ratio;
-
 	/* Use total cost since most work happens before first tuple is returned */
 	*indexStartupCost = costs.indexTotalCost;
 	*indexTotalCost = costs.indexTotalCost;
--- a/src/vector.c
+++ b/src/vector.c
@@ -155,24 +155,6 @@ CheckStateArray(ArrayType *statearray, const char *caller)
 	return (float8 *) ARR_DATA_PTR(statearray);
 }

-#if PG_VERSION_NUM < 120003
-static pg_noinline void
-float_overflow_error(void)
-{
-	ereport(ERROR,
-			(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-			 errmsg("value out of range: overflow")));
-}
-
-static pg_noinline void
-float_underflow_error(void)
-{
-	ereport(ERROR,
-			(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
-			 errmsg("value out of range: underflow")));
-}
-#endif
-
 /*
 * Convert textual representation to internal representation
 */
--- a/test/t/009_ivfflat_filtering.pl
+++ b/test/t/009_ivfflat_filtering.pl
@@ -94,8 +94,7 @@ like($explain, qr/Seq Scan/);
 $explain = $node->safe_psql("postgres", qq(
 	EXPLAIN ANALYZE SELECT i FROM tst WHERE v <-> '$query' < 1 ORDER BY v <-> '$query';
 ));
-# TODO Do not use index
-like($explain, qr/Index Scan using idx/);
+like($explain, qr/Seq Scan/);

 # Test attribute index
 $node->safe_psql("postgres", "CREATE INDEX attribute_idx ON tst (c);");
@@ -110,7 +109,6 @@ $node->safe_psql("postgres", "CREATE INDEX partial_idx ON tst USING ivfflat (v v
 $explain = $node->safe_psql("postgres", qq(
 	EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit;
 ));
-# TODO Use partial index
-like($explain, qr/Index Scan using idx/);
+like($explain, qr/Index Scan using partial_idx/);

 done_testing();
--- a/test/t/017_hnsw_filtering.pl
+++ b/test/t/017_hnsw_filtering.pl
@@ -18,9 +18,13 @@ $node->start;
 # Create table and index
 $node->safe_psql("postgres", "CREATE EXTENSION vector;");
 $node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim), c int4, t text);");
+$node->safe_psql("postgres", "CREATE TABLE cat (i int4 PRIMARY KEY, t text, b boolean);");
 $node->safe_psql("postgres",
 	"INSERT INTO tst SELECT i, ARRAY[$array_sql], i % $nc, 'test ' || i FROM generate_series(1, 10000) i;"
 );
+$node->safe_psql("postgres",
+	"INSERT INTO cat SELECT i, 'cat ' || i, i % 5 = 0 FROM generate_series(1, $nc) i;"
+);
 $node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);");
 $node->safe_psql("postgres", "ANALYZE tst;");

@@ -96,13 +100,25 @@ $explain = $node->safe_psql("postgres", qq(
 ));
 like($explain, qr/Seq Scan/);

+# Test join
+$explain = $node->safe_psql("postgres", qq(
+	EXPLAIN ANALYZE SELECT cat.t FROM cat INNER JOIN tst ON cat.i = tst.c ORDER BY v <-> '$query' LIMIT $limit;
+));
+like($explain, qr/Index Scan using idx/);
+
+# Test join with attribute filtering
+$explain = $node->safe_psql("postgres", qq(
+	EXPLAIN ANALYZE SELECT cat.t FROM cat INNER JOIN tst ON cat.i = tst.c WHERE cat.b = 't' ORDER BY v <-> '$query' LIMIT $limit;
+));
+like($explain, qr/Index Scan using idx/);
+
 # Test attribute index
 $node->safe_psql("postgres", "CREATE INDEX attribute_idx ON tst (c);");
 $explain = $node->safe_psql("postgres", qq(
 	EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit;
 ));
-# TODO Use attribute index
-like($explain, qr/Index Scan using idx/);
+# Use attribute index
+like($explain, qr/Bitmap Index Scan on attribute_idx/);

 # Test partial index
 $node->safe_psql("postgres", "CREATE INDEX partial_idx ON tst USING hnsw (v vector_l2_ops) WHERE (c = $c);");
--- a/test/t/039_hnsw_cost.pl
+++ b/test/t/039_hnsw_cost.pl
@@ -0,0 +1,46 @@
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my @dims = (384, 1536);
+my $limit = 10;
+
+# Initialize node
+my $node = PostgreSQL::Test::Cluster->new('node');
+$node->init;
+$node->start;
+
+$node->safe_psql("postgres", "CREATE EXTENSION vector;");
+
+for my $dim (@dims)
+{
+	my $array_sql = join(",", ('random()') x $dim);
+	my $n = $dim == 384 ? 2000 : 1000;
+
+	# Create table and index
+	$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");
+	$node->safe_psql("postgres",
+		"INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, $n) i;"
+	);
+	$node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);");
+	$node->safe_psql("postgres", "ANALYZE tst;");
+
+	# Generate query
+	my @r = ();
+	for (1 .. $dim)
+	{
+		push(@r, rand());
+	}
+	my $query = "[" . join(",", @r) . "]";
+
+	my $explain = $node->safe_psql("postgres", qq(
+		EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v <-> '$query' LIMIT $limit;
+	));
+	like($explain, qr/Index Scan using idx/);
+
+	$node->safe_psql("postgres", "DROP TABLE tst;");
+}
+
+done_testing();
--- a/test/t/040_ivfflat_cost.pl
+++ b/test/t/040_ivfflat_cost.pl
@@ -0,0 +1,45 @@
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+my @dims = (384, 1536);
+my $limit = 10;
+
+# Initialize node
+my $node = PostgreSQL::Test::Cluster->new('node');
+$node->init;
+$node->start;
+
+$node->safe_psql("postgres", "CREATE EXTENSION vector;");
+
+for my $dim (@dims)
+{
+	my $array_sql = join(",", ('random()') x $dim);
+
+	# Create table and index
+	$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");
+	$node->safe_psql("postgres",
+		"INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 5000) i;"
+	);
+	$node->safe_psql("postgres", "CREATE INDEX idx ON tst USING ivfflat (v vector_l2_ops) WITH (lists = 5);");
+	$node->safe_psql("postgres", "ANALYZE tst;");
+
+	# Generate query
+	my @r = ();
+	for (1 .. $dim)
+	{
+		push(@r, rand());
+	}
+	my $query = "[" . join(",", @r) . "]";
+
+	my $explain = $node->safe_psql("postgres", qq(
+		EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v <-> '$query' LIMIT $limit;
+	));
+	like($explain, qr/Index Scan using idx/);
+
+	$node->safe_psql("postgres", "DROP TABLE tst;");
+}
+
+done_testing();