From cbf3eb4fa5b08a42aa4497ab2df7567ab6554e98 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 13 Jan 2024 10:07:42 -0800 Subject: [PATCH] Improved HNSW build and insert code ---
/*
 * Review notes (patch commentary, not part of the commit message):
 *
 * - src/hnsw.h, src/hnswutils.c: the exported HnswFindDuplicate(HnswElement),
 *   which returned the matching element or NULL, is removed together with its
 *   prototype.
 * - src/hnswbuild.c: a file-local HnswFindDuplicateInMemory() takes its place.
 *   It walks neighbors[0] of the new element, returns false at the first
 *   neighbor whose value is not datumIsEqual() to the element's value, and,
 *   for an equal neighbor with heaptidsLength < HNSW_HEAPTIDS, passes
 *   element->heaptids[0] to HnswAddHeapTid() itself and returns true.
 *   InsertTupleInMemory() now updates graph->memoryUsed before calling
 *   HnswInsertElement() and returns early on a duplicate instead of nesting
 *   the add-element path inside an if/else.
 * - src/hnswinsert.c: HnswAddDuplicateToPage() is renamed HnswAddDuplicate()
 *   and HnswAddDuplicateIfFound() is renamed HnswFindDuplicate(); both remain
 *   static. The new static HnswFindDuplicate() has a different signature from
 *   the removed exported one, so dropping the hnsw.h prototype goes hand in
 *   hand with the rename (assuming hnswinsert.c includes hnsw.h, which is not
 *   visible in this patch).
 * - The comment "If adding fails, continue to next duplicate element" is
 *   dropped; the visible loop body still falls through to the next iteration
 *   when HnswAddDuplicate() returns false.
 */
src/hnsw.h | 1 - src/hnswbuild.c | 87 ++++++++++++++++++++++++++++++------------------ src/hnswinsert.c | 17 +++++----- src/hnswutils.c | 24 ------------- 4 files changed, 62 insertions(+), 67 deletions(-) diff --git a/src/hnsw.h b/src/hnsw.h index 6fe0ca8..b538284 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -339,7 +339,6 @@ void HnswGetMetaPageInfo(Relation index, int *m, HnswElement * entryPoint); HnswElement HnswInitElement(ItemPointer tid, int m, double ml, int maxLevel); HnswElement HnswInitElementFromBlock(BlockNumber blkno, OffsetNumber offno); void HnswInsertElement(HnswElement element, HnswElement entryPoint, Relation index, FmgrInfo *procinfo, Oid collation, int m, int efConstruction, bool existing); -HnswElement HnswFindDuplicate(HnswElement e); HnswCandidate *HnswEntryCandidate(HnswElement em, Datum q, Relation rel, FmgrInfo *procinfo, Oid collation, bool loadVec); void HnswUpdateMetaPage(Relation index, int updateEntry, HnswElement entryPoint, BlockNumber insertPage, ForkNumber forkNum, bool building); void HnswSetNeighborTuple(HnswNeighborTuple ntup, HnswElement e, int m); diff --git a/src/hnswbuild.c b/src/hnswbuild.c index 4c89ee6..1aedfe4 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -316,6 +316,33 @@ HnswElementMemory(HnswElement e, int m) } #endif +/* + * Find duplicate element + */ +static bool +HnswFindDuplicateInMemory(HnswElement element) +{ + HnswNeighborArray *neighbors = &element->neighbors[0]; + + for (int i = 0; i < neighbors->length; i++) + { + HnswCandidate *neighbor = &neighbors->items[i]; + + /* Exit early since ordered by distance */ + if (!datumIsEqual(element->value, neighbor->element->value, false, -1)) + return false; + + /* Check for space */ + if (neighbor->element->heaptidsLength < HNSW_HEAPTIDS) + { + HnswAddHeapTid(neighbor->element, 
&element->heaptids[0]); + return true; + } + } + + return false; +} + /* * Insert tuple into in-memory graph */ @@ -330,7 +357,6 @@ InsertTupleInMemory(Relation index, Datum *values, ItemPointer heaptid, HnswBuil int m = buildstate->m; MemoryContext oldCtx; HnswElement element; - HnswElement dup; /* Detoast once for all calls */ Datum value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); @@ -348,38 +374,6 @@ InsertTupleInMemory(Relation index, Datum *values, ItemPointer heaptid, HnswBuil element->value = datumCopy(value, false, -1); MemoryContextSwitchTo(oldCtx); - /* Insert element in graph */ - HnswInsertElement(element, entryPoint, NULL, procinfo, collation, m, efConstruction, false); - - /* Look for duplicate */ - dup = HnswFindDuplicate(element); - - if (dup == NULL) - { - /* Add element */ - slist_push_head(&graph->elements, &element->next); - - /* Update neighbors */ - for (int lc = element->level; lc >= 0; lc--) - { - int lm = HnswGetLayerM(m, lc); - HnswNeighborArray *neighbors = &element->neighbors[lc]; - - for (int i = 0; i < neighbors->length; i++) - HnswUpdateConnection(element, &neighbors->items[i], lm, lc, NULL, NULL, procinfo, collation); - } - - /* Update entry point if needed */ - if (entryPoint == NULL || element->level > entryPoint->level) - graph->entryPoint = element; - } - else - { - /* No need to free element since memory unlikely to be reallocated */ - /* Element is also used to estimate memory usage below */ - HnswAddHeapTid(dup, heaptid); - } - /* Update memory usage */ #if PG_VERSION_NUM >= 130000 graph->memoryUsed = MemoryContextMemAllocated(buildstate->graphCtx, false); @@ -387,6 +381,33 @@ InsertTupleInMemory(Relation index, Datum *values, ItemPointer heaptid, HnswBuil graph->memoryUsed += HnswElementMemory(element, buildstate->m); #endif + /* Insert element in graph */ + HnswInsertElement(element, entryPoint, NULL, procinfo, collation, m, efConstruction, false); + + /* Look for duplicate */ + if (HnswFindDuplicateInMemory(element)) + 
{ + /* No need to free element since memory unlikely to be reallocated */ + return true; + } + + /* Add element */ + slist_push_head(&graph->elements, &element->next); + + /* Update neighbors */ + for (int lc = element->level; lc >= 0; lc--) + { + int lm = HnswGetLayerM(m, lc); + HnswNeighborArray *neighbors = &element->neighbors[lc]; + + for (int i = 0; i < neighbors->length; i++) + HnswUpdateConnection(element, &neighbors->items[i], lm, lc, NULL, NULL, procinfo, collation); + } + + /* Update entry point if needed */ + if (entryPoint == NULL || element->level > entryPoint->level) + graph->entryPoint = element; + return true; } diff --git a/src/hnswinsert.c b/src/hnswinsert.c index 085516b..d411dd2 100644 --- a/src/hnswinsert.c +++ b/src/hnswinsert.c @@ -447,7 +447,7 @@ HnswUpdateNeighborPages(Relation index, FmgrInfo *procinfo, Oid collation, HnswE * Add a heap TID to an existing element */ static bool -HnswAddDuplicateToPage(Relation index, HnswElement element, HnswElement dup, bool building) +HnswAddDuplicate(Relation index, HnswElement element, HnswElement dup, bool building) { Buffer buf; Page page; @@ -508,10 +508,10 @@ HnswAddDuplicateToPage(Relation index, HnswElement element, HnswElement dup, boo } /* - * Add duplicate if found + * Find duplicate element */ static bool -HnswAddDuplicateIfFound(Relation index, HnswElement element, bool building) +HnswFindDuplicate(Relation index, HnswElement element, bool building) { HnswNeighborArray *neighbors = &element->neighbors[0]; @@ -519,12 +519,11 @@ HnswAddDuplicateIfFound(Relation index, HnswElement element, bool building) { HnswCandidate *neighbor = &neighbors->items[i]; - /* Exit early if not duplicate since ordered by distance */ + /* Exit early since ordered by distance */ if (!datumIsEqual(element->value, neighbor->element->value, false, -1)) return false; - /* If adding fails, continue to next duplicate element */ - if (HnswAddDuplicateToPage(index, element, neighbor->element, building)) + if 
(HnswAddDuplicate(index, element, neighbor->element, building)) return true; } @@ -539,8 +538,8 @@ WriteElement(Relation index, FmgrInfo *procinfo, Oid collation, HnswElement elem { BlockNumber newInsertPage = InvalidBlockNumber; - /* Try to add to existing page */ - if (HnswAddDuplicateIfFound(index, element, building)) + /* Look for duplicate */ + if (HnswFindDuplicate(index, element, building)) return; /* Write element and neighbor tuples */ @@ -553,7 +552,7 @@ WriteElement(Relation index, FmgrInfo *procinfo, Oid collation, HnswElement elem /* Update neighbors */ HnswUpdateNeighborPages(index, procinfo, collation, element, m, false, building); - /* Update metapage if needed */ + /* Update entry point if needed */ if (entryPoint == NULL || element->level > entryPoint->level) HnswUpdateMetaPage(index, HNSW_UPDATE_ENTRY_GREATER, element, InvalidBlockNumber, MAIN_FORKNUM, building); } diff --git a/src/hnswutils.c b/src/hnswutils.c index d8170df..3888765 100644 --- a/src/hnswutils.c +++ b/src/hnswutils.c @@ -908,30 +908,6 @@ SelectNeighbors(List *c, int m, int lc, FmgrInfo *procinfo, Oid collation, HnswE return r; } -/* - * Find duplicate element - */ -HnswElement -HnswFindDuplicate(HnswElement e) -{ - HnswNeighborArray *neighbors = &e->neighbors[0]; - - for (int i = 0; i < neighbors->length; i++) - { - HnswCandidate *neighbor = &neighbors->items[i]; - - /* Exit early since ordered by distance */ - if (!datumIsEqual(e->value, neighbor->element->value, false, -1)) - break; - - /* Check for space */ - if (neighbor->element->heaptidsLength < HNSW_HEAPTIDS) - return neighbor->element; - } - - return NULL; -} - /* * Add connections */