From 77688b43093c3461ed97f7232e268c8c3b638600 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Oct 2024 12:42:03 -0700 Subject: [PATCH] Improve total cost for cost estimation (#686) --- src/hnsw.c | 40 +++++++++++++++++++++--------------- src/ivfflat.c | 36 +++++++++++++++----------------- test/t/017_hnsw_filtering.pl | 6 ++---- test/t/039_hnsw_cost.pl | 13 ++++++++++-- 4 files changed, 53 insertions(+), 42 deletions(-) diff --git a/src/hnsw.c b/src/hnsw.c index 0b6640a..c2579c1 100644 --- a/src/hnsw.c +++ b/src/hnsw.c @@ -100,10 +100,8 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, { GenericCosts costs; int m; - int entryLevel; - int layer0TuplesMax; - double layer0Selectivity; - double scalingFactor = 0.55; + double ratio; + double startupPages; double spc_seq_page_cost; Relation index; @@ -120,6 +118,8 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, MemSet(&costs, 0, sizeof(costs)); + genericcostestimate(root, path, loop_count, &costs); + index = index_open(path->indexinfo->indexoid, NoLock); HnswGetMetaPageInfo(index, &m, NULL); index_close(index, NoLock); @@ -151,30 +151,38 @@ hnswcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, * at L0, accounting for previously visited tuples, multiplied by the * "scalingFactor" (currently hardcoded). */ - entryLevel = (int) (log(path->indexinfo->tuples + 1) * HnswGetMl(m)); - layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search; - layer0Selectivity = (scalingFactor * log(path->indexinfo->tuples + 1)) / - (log(m) * (1 + log(hnsw_ef_search))); + if (path->indexinfo->tuples > 0) + { + double scalingFactor = 0.55; + int entryLevel = (int) (log(path->indexinfo->tuples) * HnswGetMl(m)); + int layer0TuplesMax = HnswGetLayerM(m, 0) * hnsw_ef_search; + double layer0Selectivity = scalingFactor * log(path->indexinfo->tuples) / (log(m) * (1 + log(hnsw_ef_search))); - costs.numIndexTuples = (entryLevel * m) + - (layer0TuplesMax * layer0Selectivity); + ratio = (entryLevel * m + layer0TuplesMax * layer0Selectivity) / path->indexinfo->tuples; - genericcostestimate(root, path, loop_count, &costs); + if (ratio > 1) + ratio = 1; + } + else + ratio = 1; get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost); + /* Startup cost is cost before returning the first row */ + costs.indexStartupCost = costs.indexTotalCost * ratio; + /* Adjust cost if needed since TOAST not included in seq scan cost */ - if (costs.numIndexPages > path->indexinfo->rel->pages) + startupPages = costs.numIndexPages * ratio; + if (startupPages > path->indexinfo->rel->pages && ratio < 0.5) { /* Change all page cost from random to sequential */ - costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); + costs.indexStartupCost -= startupPages * (costs.spc_random_page_cost - spc_seq_page_cost); /* Remove cost of extra pages */ - costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost; + costs.indexStartupCost -= (startupPages - path->indexinfo->rel->pages) * spc_seq_page_cost; } - /* Use total cost since most work happens before first tuple is returned */ - *indexStartupCost = costs.indexTotalCost; + *indexStartupCost = costs.indexStartupCost; *indexTotalCost = costs.indexTotalCost; *indexSelectivity = costs.indexSelectivity; *indexCorrelation = costs.indexCorrelation; diff --git a/src/ivfflat.c b/src/ivfflat.c index 986e19d..395040d 100644 --- a/src/ivfflat.c +++ b/src/ivfflat.c @@ -69,6 +69,8 @@ ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, GenericCosts costs; int lists; double ratio; + double sequentialRatio = 0.5; + double startupPages; double spc_seq_page_cost; Relation index; @@ -85,6 +87,8 @@ ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, MemSet(&costs, 0, sizeof(costs)); + genericcostestimate(root, path, loop_count, &costs); + index = index_open(path->indexinfo->indexoid, NoLock); IvfflatGetMetaPageInfo(index, &lists, NULL); index_close(index, NoLock); @@ -94,34 +98,26 @@ ivfflatcostestimate(PlannerInfo *root, IndexPath *path, double loop_count, if (ratio > 1.0) ratio = 1.0; - /* - * This gives us the subset of tuples to visit. This value is passed into - * the generic cost estimator to determine the number of pages to visit - * during the index scan. - */ - costs.numIndexTuples = path->indexinfo->tuples * ratio; - - genericcostestimate(root, path, loop_count, &costs); - get_tablespace_page_costs(path->indexinfo->reltablespace, NULL, &spc_seq_page_cost); + /* Change some page cost from random to sequential */ + costs.indexTotalCost -= sequentialRatio * costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); + + /* Startup cost is cost before returning the first row */ + costs.indexStartupCost = costs.indexTotalCost * ratio; + /* Adjust cost if needed since TOAST not included in seq scan cost */ - if (costs.numIndexPages > path->indexinfo->rel->pages && ratio < 0.5) + startupPages = costs.numIndexPages * ratio; + if (startupPages > path->indexinfo->rel->pages && ratio < 0.5) { - /* Change all page cost from random to sequential */ - costs.indexTotalCost -= costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); + /* Change rest of page cost from random to sequential */ + costs.indexStartupCost -= (1 - sequentialRatio) * startupPages * (costs.spc_random_page_cost - spc_seq_page_cost); /* Remove cost of extra pages */ - costs.indexTotalCost -= (costs.numIndexPages - path->indexinfo->rel->pages) * spc_seq_page_cost; - } - else - { - /* Change some page cost from random to sequential */ - costs.indexTotalCost -= 0.5 * costs.numIndexPages * (costs.spc_random_page_cost - spc_seq_page_cost); + costs.indexStartupCost -= (startupPages - path->indexinfo->rel->pages) * spc_seq_page_cost; } - /* Use total cost since most work happens before first tuple is returned */ - *indexStartupCost = costs.indexTotalCost; + *indexStartupCost = costs.indexStartupCost; *indexTotalCost = costs.indexTotalCost; *indexSelectivity = costs.indexSelectivity; *indexCorrelation = costs.indexCorrelation; diff --git a/test/t/017_hnsw_filtering.pl b/test/t/017_hnsw_filtering.pl index 9dbdcf3..afa2a1c 100644 --- a/test/t/017_hnsw_filtering.pl +++ b/test/t/017_hnsw_filtering.pl @@ -41,8 +41,7 @@ my $c = int(rand() * $nc); my $explain = $node->safe_psql("postgres", qq( EXPLAIN ANALYZE SELECT i FROM tst WHERE c = $c ORDER BY v <-> '$query' LIMIT $limit; )); -# TODO Do not use index -like($explain, qr/Index Scan using idx/); +like($explain, qr/Seq Scan/); # Test attribute filtering with few rows removed $explain = $node->safe_psql("postgres", qq( @@ -60,8 +59,7 @@ like($explain, qr/Index Scan using idx/); $explain = $node->safe_psql("postgres", qq( EXPLAIN ANALYZE SELECT i FROM tst WHERE c < 1 ORDER BY v <-> '$query' LIMIT $limit; )); -# TODO Do not use index -like($explain, qr/Index Scan using idx/); +like($explain, qr/Seq Scan/); # Test attribute filtering with few rows removed like $explain = $node->safe_psql("postgres", qq( diff --git a/test/t/039_hnsw_cost.pl b/test/t/039_hnsw_cost.pl index a26c09a..97ea5e7 100644 --- a/test/t/039_hnsw_cost.pl +++ b/test/t/039_hnsw_cost.pl @@ -17,12 +17,11 @@ $node->safe_psql("postgres", "CREATE EXTENSION vector;"); for my $dim (@dims) { my $array_sql = join(",", ('random()') x $dim); - my $n = $dim == 384 ? 3000 : 1000; # Create table and index $node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));"); $node->safe_psql("postgres", - "INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, $n) i;" + "INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 2000) i;" ); $node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);"); $node->safe_psql("postgres", "ANALYZE tst;"); @@ -40,6 +39,16 @@ for my $dim (@dims) )); like($explain, qr/Index Scan using idx/); + # 3x the rows are needed for distance filters + # since the planner uses DEFAULT_INEQ_SEL for the selectivity (should be 1) + # Recreate index for performance + $node->safe_psql("postgres", "DROP INDEX idx;"); + $node->safe_psql("postgres", + "INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(2001, 6000) i;" + ); + $node->safe_psql("postgres", "CREATE INDEX idx ON tst USING hnsw (v vector_l2_ops);"); + $node->safe_psql("postgres", "ANALYZE tst;"); + $explain = $node->safe_psql("postgres", qq( EXPLAIN ANALYZE SELECT i FROM tst WHERE v <-> '$query' < 1 ORDER BY v <-> '$query' LIMIT $limit; ));