From ff400ce5f1edba94869f2181d1759abdfe24aa4a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 12 Feb 2022 23:56:28 -0800 Subject: [PATCH 01/23] Use macro for UpdateProgress --- src/ivfbuild.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/ivfbuild.c b/src/ivfbuild.c index 65e4f38..a106246 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -36,16 +36,11 @@ #define CALLBACK_ITEM_POINTER HeapTuple hup #endif -/* - * Update build phase progress - */ -static inline void -UpdateProgress(int index, int64 val) -{ #if PG_VERSION_NUM >= 120000 - pgstat_progress_update_param(index, val); +#define UpdateProgress(index, val) pgstat_progress_update_param(index, val) +#else +#define UpdateProgress(index, val) ((void)val) #endif -} /* * Callback for sampling From 4f2c937a1fb43ea6b93c1b58013fe6fbbc41859e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Feb 2022 02:35:28 -0800 Subject: [PATCH 02/23] Improved recall test --- test/t/003_recall.pl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index 4992b0a..c7ce251 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -2,7 +2,7 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 2; +use Test::More tests => 3; my $node; my @queries = (); @@ -67,6 +67,5 @@ $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); # Test approximate results test_recall(1, 0.8); - -# Test probes +test_recall(10, 0.95); test_recall(100, 1.0); From ce72ca8620656cee103ce9a4daa31ab02489b652 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Feb 2022 02:59:27 -0800 Subject: [PATCH 03/23] Switched to heap for nearest lists for performance --- src/ivfflat.h | 3 +++ src/ivfscan.c | 55 ++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/src/ivfflat.h b/src/ivfflat.h index 1337941..c29d6ad 100644 --- a/src/ivfflat.h +++ b/src/ivfflat.h @@ -165,6 +165,7 @@ typedef IvfflatListData * IvfflatList; typedef struct IvfflatScanList { + pairingheap_node ph_node; /* must come first */ BlockNumber startPage; double distance; } IvfflatScanList; @@ -186,6 +187,8 @@ typedef struct IvfflatScanOpaqueData FmgrInfo *normprocinfo; Oid collation; + /* Lists */ + pairingheap *listQueue; IvfflatScanList lists[FLEXIBLE_ARRAY_MEMBER]; /* must come last */ } IvfflatScanOpaqueData; diff --git a/src/ivfscan.c b/src/ivfscan.c index dfd52af..09b2baa 100644 --- a/src/ivfscan.c +++ b/src/ivfscan.c @@ -17,7 +17,7 @@ * Compare list distances */ static int -CompareLists(const void *a, const void *b) +CompareLists(const pairingheap_node *a, const pairingheap_node *b, void *arg) { double diff = (((IvfflatScanList *) a)->distance - ((IvfflatScanList *) b)->distance); @@ -45,6 +45,8 @@ GetScanLists(IndexScanDesc scan, Datum value) int listCount = 0; IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; double distance; + IvfflatScanList *scanlist; + double maxDistance; /* Search all list pages */ while (BlockNumberIsValid(nextblkno)) @@ -62,22 +64,39 @@ GetScanLists(IndexScanDesc scan, Datum value) /* Use procinfo from the index instead of scan key for performance */ distance = DatumGetFloat8(FunctionCall2Coll(so->procinfo, so->collation, PointerGetDatum(&list->center), value)); - so->lists[listCount].startPage = list->startPage; - so->lists[listCount].distance = distance; - listCount++; + if (listCount < so->probes) + { + scanlist = &so->lists[listCount]; + scanlist->startPage = list->startPage; + scanlist->distance = distance; + listCount++; + + /* Add to heap */ + pairingheap_add(so->listQueue, &scanlist->ph_node); + + /* Calculate max distance */ + if (listCount == so->probes) + maxDistance = ((IvfflatScanList *) pairingheap_first(so->listQueue))->distance; + } + else if (distance < maxDistance) + { + /* Remove */ + scanlist = (IvfflatScanList *) pairingheap_remove_first(so->listQueue); + + /* Reuse */ + scanlist->startPage = list->startPage; + scanlist->distance = distance; + pairingheap_add(so->listQueue, &scanlist->ph_node); + + /* Update max distance */ + maxDistance = ((IvfflatScanList *) pairingheap_first(so->listQueue))->distance; + } } nextblkno = IvfflatPageGetOpaque(cpage)->nextblkno; UnlockReleaseBuffer(cbuf); } - - /* Sort by distance */ - /* TODO Use heap for performance */ - qsort(so->lists, listCount, sizeof(IvfflatScanList), CompareLists); - - if (so->probes > listCount) - so->probes = listCount; } /* @@ -95,7 +114,6 @@ GetScanItems(IndexScanDesc scan, Datum value) OffsetNumber maxoffno; Datum datum; bool isnull; - int i; TupleDesc tupdesc = RelationGetDescr(scan->indexRelation); #if PG_VERSION_NUM >= 120000 @@ -112,9 +130,9 @@ GetScanItems(IndexScanDesc scan, Datum value) BufferAccessStrategy bas = GetAccessStrategy(BAS_BULKREAD); /* Search closest probes lists */ - for (i = 0; i < so->probes; i++) + while (!pairingheap_is_empty(so->listQueue)) { - searchPage = so->lists[i].startPage; + searchPage = ((IvfflatScanList *) pairingheap_remove_first(so->listQueue))->startPage; /* Search all entry pages for list */ while (BlockNumberIsValid(searchPage)) @@ -171,13 +189,15 @@ ivfflatbeginscan(Relation index, int nkeys, int norderbys) Oid sortOperators[] = {Float8LessOperator}; Oid sortCollations[] = {InvalidOid}; bool nullsFirstFlags[] = {false}; + int probes = ivfflat_probes; scan = RelationGetIndexScan(index, nkeys, norderbys); lists = IvfflatGetLists(scan->indexRelation); - so = (IvfflatScanOpaque) palloc(offsetof(IvfflatScanOpaqueData, lists) + lists * sizeof(IvfflatScanList)); + so = (IvfflatScanOpaque) palloc(offsetof(IvfflatScanOpaqueData, lists) + probes * sizeof(IvfflatScanList)); so->buf = InvalidBuffer; so->first = true; + so->probes = probes; /* Set support functions */ so->procinfo = index_getprocinfo(index, 1, IVFFLAT_DISTANCE_PROC); @@ -208,6 +228,8 @@ ivfflatbeginscan(Relation index, int nkeys, int norderbys) so->slot = MakeSingleTupleTableSlot(so->tupdesc); #endif + so->listQueue = pairingheap_allocate(CompareLists, scan); + scan->opaque = so; return scan; @@ -227,7 +249,7 @@ ivfflatrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, int #endif so->first = true; - so->probes = ivfflat_probes; + pairingheap_reset(so->listQueue); if (keys && scan->numberOfKeys > 0) memmove(scan->keyData, keys, scan->numberOfKeys * sizeof(ScanKeyData)); @@ -326,6 +348,7 @@ ivfflatendscan(IndexScanDesc scan) if (BufferIsValid(so->buf)) ReleaseBuffer(so->buf); + pairingheap_free(so->listQueue); tuplesort_end(so->sortstate); pfree(so); From 310809d0e576fa6af283cacbf951631c000d57a4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Feb 2022 03:03:17 -0800 Subject: [PATCH 04/23] Fixed warnings --- src/ivfscan.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/ivfscan.c b/src/ivfscan.c index 09b2baa..b557fea 100644 --- a/src/ivfscan.c +++ b/src/ivfscan.c @@ -1,5 +1,7 @@ #include "postgres.h" +#include + #include "access/relscan.h" #include "ivfflat.h" #include "miscadmin.h" @@ -46,7 +48,7 @@ GetScanLists(IndexScanDesc scan, Datum value) IvfflatScanOpaque so = (IvfflatScanOpaque) scan->opaque; double distance; IvfflatScanList *scanlist; - double maxDistance; + double maxDistance = DBL_MAX; /* Search all list pages */ while (BlockNumberIsValid(nextblkno)) @@ -194,6 +196,9 @@ ivfflatbeginscan(Relation index, int nkeys, int norderbys) scan = RelationGetIndexScan(index, nkeys, norderbys); lists = IvfflatGetLists(scan->indexRelation); + if (probes > lists) + probes = lists; + so = (IvfflatScanOpaque) palloc(offsetof(IvfflatScanOpaqueData, lists) + probes * sizeof(IvfflatScanList)); so->buf = InvalidBuffer; so->first = true; From 9549d93260ec25df50b5e91bfa6045b451397357 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Feb 2022 03:08:39 -0800 Subject: [PATCH 05/23] Improved recall test --- test/t/003_recall.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index c7ce251..dc8d6e7 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -7,6 +7,7 @@ use Test::More tests => 3; my $node; my @queries = (); my @expected = (); +my $limit = 30; sub test_recall { @@ -18,7 +19,7 @@ sub test_recall my $actual = $node->safe_psql("postgres", qq( SET enable_seqscan = off; SET ivfflat.probes = $probes; - SELECT i FROM tst ORDER BY v <-> '$queries[$i]' LIMIT 10; + SELECT i FROM tst ORDER BY v <-> '$queries[$i]' LIMIT $limit; )); my @actual_ids = split("\n", $actual); my %actual_set = map { $_ => 1 } @actual_ids; @@ -58,7 +59,7 @@ for (1..20) { # Get exact results foreach (@queries) { - my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v <-> '$_' LIMIT 10;"); + my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v <-> '$_' LIMIT $limit;"); push(@expected, $res); } From c35e9f3b847ff3ba100d8a6e8153639a57d9c75e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Feb 2022 03:53:30 -0800 Subject: [PATCH 06/23] Removed comment [skip ci] --- src/ivfflat.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ivfflat.h b/src/ivfflat.h index c29d6ad..b55a75d 100644 --- a/src/ivfflat.h +++ b/src/ivfflat.h @@ -165,7 +165,7 @@ typedef IvfflatListData * IvfflatList; typedef struct IvfflatScanList { - pairingheap_node ph_node; /* must come first */ + pairingheap_node ph_node; BlockNumber startPage; double distance; } IvfflatScanList; From 7bba0e2a0199f57167fc2cf2ce86970747d7efd0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 15:22:53 -0800 Subject: [PATCH 07/23] Test recall for all operators --- test/t/003_recall.pl | 48 +++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 16 deletions(-) diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index dc8d6e7..39b19dd 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -2,16 +2,16 @@ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 3; +use Test::More tests => 9; my $node; my @queries = (); -my @expected = (); +my @expected; my $limit = 30; sub test_recall { - my ($probes, $min) = @_; + my ($probes, $min, $operator) = @_; my $correct = 0; my $total = 0; @@ -19,7 +19,7 @@ sub test_recall my $actual = $node->safe_psql("postgres", qq( SET enable_seqscan = off; SET ivfflat.probes = $probes; - SELECT i FROM tst ORDER BY v <-> '$queries[$i]' LIMIT $limit; + SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit; )); my @actual_ids = split("\n", $actual); my %actual_set = map { $_ => 1 } @actual_ids; @@ -57,16 +57,32 @@ for (1..20) { push(@queries, "[$r1,$r2,$r3]"); } -# Get exact results -foreach (@queries) { - my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v <-> '$_' LIMIT $limit;"); - push(@expected, $res); +# Check each index type +my @operators = ("<->", "<#>", "<=>"); + +foreach (@operators) { + my $operator = $_; + + # Get exact results + @expected = (); + foreach (@queries) { + my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v $operator '$_' LIMIT $limit;"); + push(@expected, $res); + } + + # Add index + my $opclass; + if ($operator == "<->") { + $opclass = "vector_l2_ops"; + } elsif ($operator == "<#>") { + $opclass = "vector_ip_ops"; + } else { + $opclass = "vector_cosine_ops"; + } + $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); + + # Test approximate results + test_recall(1, 0.8, $operator); + test_recall(10, 0.95, $operator); + test_recall(100, 1.0, $operator); } - -# Add index -$node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); - -# Test approximate results -test_recall(1, 0.8); -test_recall(10, 0.95); -test_recall(100, 1.0); From ac65ec2856ec32973b7798d22f68367b84ef7cb8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 19:38:59 -0800 Subject: [PATCH 08/23] Improved code [skip ci] --- src/ivfscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ivfscan.c b/src/ivfscan.c index b557fea..cc3bdfd 100644 --- a/src/ivfscan.c +++ b/src/ivfscan.c @@ -21,7 +21,7 @@ static int CompareLists(const pairingheap_node *a, const pairingheap_node *b, void *arg) { - double diff = (((IvfflatScanList *) a)->distance - ((IvfflatScanList *) b)->distance); + double diff = ((const IvfflatScanList *) a)->distance - ((const IvfflatScanList *) b)->distance; if (diff > 0) return 1; From bf5b2c8d7eae8a68f2659c8337fbc4956bac4408 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 21:31:08 -0800 Subject: [PATCH 09/23] Use tuple id directly --- src/ivfbuild.c | 25 +++++++++---------------- src/ivfscan.c | 23 +++++++++-------------- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/src/ivfbuild.c b/src/ivfbuild.c index a106246..904f759 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -173,12 +173,10 @@ BuildCallback(Relation index, CALLBACK_ITEM_POINTER, Datum *values, ExecClearTuple(slot); slot->tts_values[0] = Int32GetDatum(closestCenter); slot->tts_isnull[0] = false; - slot->tts_values[1] = Int32GetDatum(ItemPointerGetBlockNumberNoCheck(tid)); + slot->tts_values[1] = PointerGetDatum(tid); slot->tts_isnull[1] = false; - slot->tts_values[2] = Int32GetDatum(ItemPointerGetOffsetNumberNoCheck(tid)); + slot->tts_values[2] = value; slot->tts_isnull[2] = false; - slot->tts_values[3] = value; - slot->tts_isnull[3] = false; ExecStoreVirtualTuple(slot); /* @@ -200,8 +198,6 @@ GetNextTuple(Tuplesortstate *sortstate, TupleDesc tupdesc, TupleTableSlot *slot, { Datum value; bool isnull; - int tupblk; - int tupoff; #if PG_VERSION_NUM >= 100000 if (tuplesort_gettupleslot(sortstate, true, false, slot, NULL)) @@ -210,13 +206,11 @@ GetNextTuple(Tuplesortstate *sortstate, TupleDesc tupdesc, TupleTableSlot *slot, #endif { *list = DatumGetInt32(slot_getattr(slot, 1, &isnull)); - tupblk = DatumGetInt32(slot_getattr(slot, 2, &isnull)); - tupoff = DatumGetInt32(slot_getattr(slot, 3, &isnull)); - value = slot_getattr(slot, 4, &isnull); + value = slot_getattr(slot, 3, &isnull); /* Form the index tuple */ *itup = index_form_tuple(tupdesc, &value, &isnull); - ItemPointerSet(&(*itup)->t_tid, tupblk, tupoff); + (*itup)->t_tid = *((ItemPointer) DatumGetPointer(slot_getattr(slot, 2, &isnull))); } else *list = -1; @@ -325,17 +319,16 @@ InitBuildState(IvfflatBuildState * buildstate, Relation heap, Relation index, In /* Create tuple description for sorting */ #if PG_VERSION_NUM >= 120000 - buildstate->tupdesc = CreateTemplateTupleDesc(4); + buildstate->tupdesc = CreateTemplateTupleDesc(3); #else - buildstate->tupdesc = CreateTemplateTupleDesc(4, false); + buildstate->tupdesc = CreateTemplateTupleDesc(3, false); #endif TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 1, "list", INT4OID, -1, 0); - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 2, "blkno", INT4OID, -1, 0); - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 3, "offset", INT4OID, -1, 0); + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); #if PG_VERSION_NUM >= 110000 - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 4, "vector", RelationGetDescr(index)->attrs[0].atttypid, -1, 0); + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 3, "vector", RelationGetDescr(index)->attrs[0].atttypid, -1, 0); #else - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 4, "vector", RelationGetDescr(index)->attrs[0]->atttypid, -1, 0); + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 3, "vector", RelationGetDescr(index)->attrs[0]->atttypid, -1, 0); #endif #if PG_VERSION_NUM >= 120000 diff --git a/src/ivfscan.c b/src/ivfscan.c index cc3bdfd..958faae 100644 --- a/src/ivfscan.c +++ b/src/ivfscan.c @@ -158,12 +158,10 @@ GetScanItems(IndexScanDesc scan, Datum value) ExecClearTuple(slot); slot->tts_values[0] = FunctionCall2Coll(so->procinfo, so->collation, datum, value); slot->tts_isnull[0] = false; - slot->tts_values[1] = Int32GetDatum((int) ItemPointerGetBlockNumberNoCheck(&itup->t_tid)); + slot->tts_values[1] = PointerGetDatum(&itup->t_tid); slot->tts_isnull[1] = false; - slot->tts_values[2] = Int32GetDatum((int) ItemPointerGetOffsetNumberNoCheck(&itup->t_tid)); + slot->tts_values[2] = Int32GetDatum((int) searchPage); slot->tts_isnull[2] = false; - slot->tts_values[3] = Int32GetDatum((int) searchPage); - slot->tts_isnull[3] = false; ExecStoreVirtualTuple(slot); tuplesort_puttupleslot(so->sortstate, slot); @@ -211,14 +209,13 @@ ivfflatbeginscan(Relation index, int nkeys, int norderbys) /* Create tuple description for sorting */ #if PG_VERSION_NUM >= 120000 - so->tupdesc = CreateTemplateTupleDesc(4); + so->tupdesc = CreateTemplateTupleDesc(3); #else - so->tupdesc = CreateTemplateTupleDesc(4, false); + so->tupdesc = CreateTemplateTupleDesc(3, false); #endif TupleDescInitEntry(so->tupdesc, (AttrNumber) 1, "distance", FLOAT8OID, -1, 0); - TupleDescInitEntry(so->tupdesc, (AttrNumber) 2, "blkno", INT4OID, -1, 0); - TupleDescInitEntry(so->tupdesc, (AttrNumber) 3, "offset", INT4OID, -1, 0); - TupleDescInitEntry(so->tupdesc, (AttrNumber) 4, "indexblkno", INT4OID, -1, 0); + TupleDescInitEntry(so->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); + TupleDescInitEntry(so->tupdesc, (AttrNumber) 3, "indexblkno", INT4OID, -1, 0); /* Prep sort */ #if PG_VERSION_NUM >= 110000 @@ -313,14 +310,12 @@ ivfflatgettuple(IndexScanDesc scan, ScanDirection dir) if (tuplesort_gettupleslot(so->sortstate, true, so->slot, NULL)) #endif { - BlockNumber blkno = DatumGetInt32(slot_getattr(so->slot, 2, &so->isnull)); - OffsetNumber offset = DatumGetInt32(slot_getattr(so->slot, 3, &so->isnull)); - BlockNumber indexblkno = DatumGetInt32(slot_getattr(so->slot, 4, &so->isnull)); + BlockNumber indexblkno = DatumGetInt32(slot_getattr(so->slot, 3, &so->isnull)); #if PG_VERSION_NUM >= 120000 - ItemPointerSet(&scan->xs_heaptid, blkno, offset); + scan->xs_heaptid = *((ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull))); #else - ItemPointerSet(&scan->xs_ctup.t_self, blkno, offset); + scan->xs_ctup.t_self = *((ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull))); #endif if (BufferIsValid(so->buf)) From 88be03a3fa8268e11dfd2d203c7518b47a93fb7f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 21:37:53 -0800 Subject: [PATCH 10/23] Removed unused code --- src/ivfflat.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/ivfflat.h b/src/ivfflat.h index b55a75d..e5a8a08 100644 --- a/src/ivfflat.h +++ b/src/ivfflat.h @@ -62,11 +62,6 @@ #define IvfflatBench(name, code) (code) #endif -#if PG_VERSION_NUM < 100000 -#define ItemPointerGetBlockNumberNoCheck ItemPointerGetBlockNumber -#define ItemPointerGetOffsetNumberNoCheck ItemPointerGetOffsetNumber -#endif - /* Variables */ extern int ivfflat_probes; From 6b9c6516f469f312c91a79ed5e93344dd3300983 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 21:41:27 -0800 Subject: [PATCH 11/23] Improved code --- src/ivfscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ivfscan.c b/src/ivfscan.c index 958faae..aa41275 100644 --- a/src/ivfscan.c +++ b/src/ivfscan.c @@ -310,12 +310,13 @@ ivfflatgettuple(IndexScanDesc scan, ScanDirection dir) if (tuplesort_gettupleslot(so->sortstate, true, so->slot, NULL)) #endif { + ItemPointer tid = (ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull)); BlockNumber indexblkno = DatumGetInt32(slot_getattr(so->slot, 3, &so->isnull)); #if PG_VERSION_NUM >= 120000 - scan->xs_heaptid = *((ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull))); + scan->xs_heaptid = *tid; #else - scan->xs_ctup.t_self = *((ItemPointer) DatumGetPointer(slot_getattr(so->slot, 2, &so->isnull))); + scan->xs_ctup.t_self = *tid; #endif if (BufferIsValid(so->buf)) From 9658d3c1adb8019959043d1274fdcbd7fc37b335 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 21:49:08 -0800 Subject: [PATCH 12/23] Made recall test less flaky --- test/t/003_recall.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index 39b19dd..953e199 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -7,7 +7,7 @@ use Test::More tests => 9; my $node; my @queries = (); my @expected; -my $limit = 30; +my $limit = 20; sub test_recall { From 01926a418e72bbef863166a31d06f57b8e633d46 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 21:52:31 -0800 Subject: [PATCH 13/23] Print operator name for failed recall test [skip ci] --- test/t/003_recall.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index 953e199..b5e5395 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -34,7 +34,7 @@ sub test_recall } } - cmp_ok($correct / $total, ">=", $min); + cmp_ok($correct / $total, ">=", $min, $operator); } # Initialize node From ad8acc00d4954cb87aee756238b8df4018541e26 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 21:58:20 -0800 Subject: [PATCH 14/23] Fixed spacing [skip ci] --- src/ivfbuild.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ivfbuild.c b/src/ivfbuild.c index 904f759..11283d0 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -324,7 +324,7 @@ InitBuildState(IvfflatBuildState * buildstate, Relation heap, Relation index, In buildstate->tupdesc = CreateTemplateTupleDesc(3, false); #endif TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 1, "list", INT4OID, -1, 0); - TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); + TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 2, "tid", TIDOID, -1, 0); #if PG_VERSION_NUM >= 110000 TupleDescInitEntry(buildstate->tupdesc, (AttrNumber) 3, "vector", RelationGetDescr(index)->attrs[0].atttypid, -1, 0); #else From 38f869e0bd8546b813d4942147192de8e031905b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 23:13:36 -0800 Subject: [PATCH 15/23] Added test for 100% recall --- test/t/005_query_recall.pl | 45 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 test/t/005_query_recall.pl diff --git a/test/t/005_query_recall.pl b/test/t/005_query_recall.pl new file mode 100644 index 0000000..0d93e7a --- /dev/null +++ b/test/t/005_query_recall.pl @@ -0,0 +1,45 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 60; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (i int4 primary key, v vector(3));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" +); + +# Check each index type +my @operators = ("<->", "<#>", "<=>"); +foreach (@operators) { + my $operator = $_; + + # Add index + my $opclass; + if ($operator == "<->") { + $opclass = "vector_l2_ops"; + } elsif ($operator == "<#>") { + $opclass = "vector_ip_ops"; + } else { + $opclass = "vector_cosine_ops"; + } + $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); + + # Test 100% recall + for (1..20) { + my $i = int(rand() * 100000); + my $query = $node->safe_psql("postgres", "SELECT v FROM tst WHERE i = $i"); + my $res = $node->safe_psql("postgres", qq( + SET enable_seqscan = off; + SELECT v FROM tst ORDER BY v <-> '$query' LIMIT 1; + )); + is($res, $query); + } +} From 4bdb27e85a39b8b2b0675839e417632ab6a1c162 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Feb 2022 23:15:27 -0800 Subject: [PATCH 16/23] Fixed style [skip ci] --- test/t/005_query_recall.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/t/005_query_recall.pl b/test/t/005_query_recall.pl index 0d93e7a..0e58135 100644 --- a/test/t/005_query_recall.pl +++ b/test/t/005_query_recall.pl @@ -35,7 +35,7 @@ foreach (@operators) { # Test 100% recall for (1..20) { my $i = int(rand() * 100000); - my $query = $node->safe_psql("postgres", "SELECT v FROM tst WHERE i = $i"); + my $query = $node->safe_psql("postgres", "SELECT v FROM tst WHERE i = $i;"); my $res = $node->safe_psql("postgres", qq( SET enable_seqscan = off; SELECT v FROM tst ORDER BY v <-> '$query' LIMIT 1; From a37f5eea4a72d04733ad73b88b382dd4a734fb51 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 11:50:35 -0800 Subject: [PATCH 17/23] Improved compare method [skip ci] --- src/ivfscan.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/ivfscan.c b/src/ivfscan.c index aa41275..e2171b0 100644 --- a/src/ivfscan.c +++ b/src/ivfscan.c @@ -21,12 +21,10 @@ static int CompareLists(const pairingheap_node *a, const pairingheap_node *b, void *arg) { - double diff = ((const IvfflatScanList *) a)->distance - ((const IvfflatScanList *) b)->distance; - - if (diff > 0) + if (((const IvfflatScanList *) a)->distance > ((const IvfflatScanList *) b)->distance) return 1; - if (diff < 0) + if (((const IvfflatScanList *) a)->distance < ((const IvfflatScanList *) b)->distance) return -1; return 0; From fed60dce78fa6a34ef595465a3b49f85cb8eb143 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 17:04:20 -0800 Subject: [PATCH 18/23] Added tests for lists --- test/t/006_lists.pl | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 test/t/006_lists.pl diff --git a/test/t/006_lists.pl b/test/t/006_lists.pl new file mode 100644 index 0000000..eeb11aa --- /dev/null +++ b/test/t/006_lists.pl @@ -0,0 +1,31 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More tests => 3; + +# Initialize node +my $node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (v vector(3));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" +); + +$node->safe_psql("postgres", "CREATE INDEX lists50 ON tst USING ivfflat (v) WITH (lists = 50);"); +$node->safe_psql("postgres", "CREATE INDEX lists100 ON tst USING ivfflat (v) WITH (lists = 100);"); + +# Test prefers more lists +my $res = $node->safe_psql("postgres", "EXPLAIN SELECT v FROM tst ORDER BY v <-> '[0.5,0.5,0.5]' LIMIT 10;"); +like($res, qr/lists100/); +unlike($res, qr/lists50/); + +# Test errors with too much memory +my ($ret, $stdout, $stderr) = $node->psql("postgres", + "CREATE INDEX lists10000 ON tst USING ivfflat (v) WITH (lists = 10000);" +); +like($stderr, qr/memory required is/); From 0d025be9d3e1cad590b9c4042deb32fa347626a4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 18:05:33 -0800 Subject: [PATCH 19/23] Improved performance of index creation for Postgres < 12 --- CHANGELOG.md | 4 ++++ src/ivfbuild.c | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 240efd8..d9646d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.6 (unreleased) + +- Improved performance of index creation for Postgres < 12 + ## 0.2.5 (2022-02-11) - Reduced memory usage during index creation diff --git a/src/ivfbuild.c b/src/ivfbuild.c index 11283d0..645031c 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -115,10 +115,10 @@ SampleRows(IvfflatBuildState * buildstate) false, true, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); #elif PG_VERSION_NUM >= 110000 IndexBuildHeapRangeScan(buildstate->heap, buildstate->index, buildstate->indexInfo, - true, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); + false, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); #else IndexBuildHeapRangeScan(buildstate->heap, buildstate->index, buildstate->indexInfo, - true, true, targblock, 1, SampleCallback, (void *) buildstate); + false, true, targblock, 1, SampleCallback, (void *) buildstate); #endif } } From e64ed39acb10fe4597a0473c5f7c6a0da53e97f2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 18:14:59 -0800 Subject: [PATCH 20/23] Disabled scan progress for sampling --- src/ivfbuild.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ivfbuild.c b/src/ivfbuild.c index 645031c..2cd1f3c 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -112,7 +112,7 @@ SampleRows(IvfflatBuildState * buildstate) #if PG_VERSION_NUM >= 120000 table_index_build_range_scan(buildstate->heap, buildstate->index, buildstate->indexInfo, - false, true, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); + false, true, false, targblock, 1, SampleCallback, (void *) buildstate, NULL); #elif PG_VERSION_NUM >= 110000 IndexBuildHeapRangeScan(buildstate->heap, buildstate->index, buildstate->indexInfo, false, true, targblock, 1, SampleCallback, (void *) buildstate, NULL); From f5458414b89514751bb5b7ac4a71a78c357dff70 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 18:17:05 -0800 Subject: [PATCH 21/23] Made recall test less flaky --- test/t/003_recall.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index b5e5395..dddc4d5 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -82,7 +82,7 @@ foreach (@operators) { $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v $opclass);"); # Test approximate results - test_recall(1, 0.8, $operator); + test_recall(1, 0.75, $operator); test_recall(10, 0.95, $operator); test_recall(100, 1.0, $operator); } From 4ca264ba02af09e74e21be0d3bb1c90d5d3b4a07 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 19:08:47 -0800 Subject: [PATCH 22/23] Added Davies-Bouldin index [skip ci] --- src/ivfbuild.c | 43 +++++++++++++++++++++++++++++++++++++++++++ src/ivfflat.h | 2 ++ 2 files changed, 45 insertions(+) diff --git a/src/ivfbuild.c b/src/ivfbuild.c index 2cd1f3c..666e751 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -167,6 +167,8 @@ BuildCallback(Relation index, CALLBACK_ITEM_POINTER, Datum *values, #ifdef IVFFLAT_KMEANS_DEBUG buildstate->inertia += minDistance; + buildstate->listSums[closestCenter] += minDistance; + buildstate->listCounts[closestCenter]++; #endif /* Create a virtual tuple */ @@ -345,6 +347,8 @@ InitBuildState(IvfflatBuildState * buildstate, Relation heap, Relation index, In #ifdef IVFFLAT_KMEANS_DEBUG buildstate->inertia = 0; + buildstate->listSums = palloc0(sizeof(double) * buildstate->lists); + buildstate->listCounts = palloc0(sizeof(int) * buildstate->lists); #endif } @@ -357,6 +361,11 @@ FreeBuildState(IvfflatBuildState * buildstate) pfree(buildstate->centers); pfree(buildstate->listInfo); pfree(buildstate->normvec); + +#ifdef IVFFLAT_KMEANS_DEBUG + pfree(buildstate->listSums); + pfree(buildstate->listCounts); +#endif } /* @@ -503,6 +512,40 @@ CreateEntryPages(IvfflatBuildState * buildstate, ForkNumber forkNum) #ifdef IVFFLAT_KMEANS_DEBUG elog(INFO, "inertia: %.3e", buildstate->inertia); + + /* Calculate Davies-Bouldin index */ + if (buildstate->lists > 1) + { + double db = 0.0; + + /* Calculate average distance */ + for (int i = 0; i < buildstate->lists; i++) + { + if (buildstate->listCounts[i] > 0) + buildstate->listSums[i] /= buildstate->listCounts[i]; + } + + for (int i = 0; i < buildstate->lists; i++) + { + double max = 0.0; + double distance; + + for (int j = 0; j < buildstate->lists; j++) + { + if (j == i) + continue; + + distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, PointerGetDatum(VectorArrayGet(buildstate->centers, i)), PointerGetDatum(VectorArrayGet(buildstate->centers, j)))); + distance = (buildstate->listSums[i] + buildstate->listSums[j]) / distance; + + if (distance > max) + max = distance; + } + db += max; + } + db /= buildstate->lists; + elog(INFO, "davies-bouldin: %.3f", db); + } #endif /* Insert */ diff --git a/src/ivfflat.h b/src/ivfflat.h index e5a8a08..bce7d42 100644 --- a/src/ivfflat.h +++ b/src/ivfflat.h @@ -117,6 +117,8 @@ typedef struct IvfflatBuildState #ifdef IVFFLAT_KMEANS_DEBUG double inertia; + double *listSums; + int *listCounts; #endif /* Sampling */ From ecbf46938f92db24322b78e90830dd44772e6d5b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 19:13:23 -0800 Subject: [PATCH 23/23] Moved metrics to separate function [skip ci] --- src/ivfbuild.c | 81 ++++++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/src/ivfbuild.c b/src/ivfbuild.c index 666e751..6b7d6e9 100644 --- a/src/ivfbuild.c +++ b/src/ivfbuild.c @@ -473,6 +473,51 @@ CreateListPages(Relation index, VectorArray centers, int dimensions, pfree(list); } +/* + * Print k-means metrics + */ +#ifdef IVFFLAT_KMEANS_DEBUG +static void +PrintKmeansMetrics(IvfflatBuildState * buildstate) +{ + elog(INFO, "inertia: %.3e", buildstate->inertia); + + /* Calculate Davies-Bouldin index */ + if (buildstate->lists > 1) + { + double db = 0.0; + + /* Calculate average distance */ + for (int i = 0; i < buildstate->lists; i++) + { + if (buildstate->listCounts[i] > 0) + buildstate->listSums[i] /= buildstate->listCounts[i]; + } + + for (int i = 0; i < buildstate->lists; i++) + { + double max = 0.0; + double distance; + + for (int j = 0; j < buildstate->lists; j++) + { + if (j == i) + continue; + + distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, PointerGetDatum(VectorArrayGet(buildstate->centers, i)), PointerGetDatum(VectorArrayGet(buildstate->centers, j)))); + distance = (buildstate->listSums[i] + buildstate->listSums[j]) / distance; + + if (distance > max) + max = distance; + } + db += max; + } + db /= buildstate->lists; + elog(INFO, "davies-bouldin: %.3f", db); + } +} +#endif + /* * Create entry pages */ @@ -511,41 +556,7 @@ CreateEntryPages(IvfflatBuildState * buildstate, ForkNumber forkNum) tuplesort_performsort(buildstate->sortstate); #ifdef IVFFLAT_KMEANS_DEBUG - elog(INFO, "inertia: %.3e", buildstate->inertia); - - /* Calculate Davies-Bouldin index */ - if (buildstate->lists > 1) - { - double db = 0.0; - - /* Calculate average distance */ - for (int i = 0; i < buildstate->lists; i++) - { - if (buildstate->listCounts[i] > 0) - buildstate->listSums[i] /= buildstate->listCounts[i]; - } - - for (int i = 0; i < buildstate->lists; i++) - { - double max = 0.0; - double distance; - - for (int j = 0; j < buildstate->lists; j++) - { - if (j == i) - continue; - - distance = DatumGetFloat8(FunctionCall2Coll(buildstate->procinfo, buildstate->collation, PointerGetDatum(VectorArrayGet(buildstate->centers, i)), PointerGetDatum(VectorArrayGet(buildstate->centers, j)))); - distance = (buildstate->listSums[i] + buildstate->listSums[j]) / distance; - - if (distance > max) - max = distance; - } - db += max; - } - db /= buildstate->lists; - elog(INFO, "davies-bouldin: %.3f", db); - } + PrintKmeansMetrics(buildstate); #endif /* Insert */