From 49c1f130950de2a82691b93b31e8b96828b67aa3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 29 May 2024 12:03:58 -0700 Subject: [PATCH] Improved performance of on-disk HNSW index builds - #570 --- CHANGELOG.md | 1 + src/hnsw.h | 2 +- src/hnswutils.c | 32 +++++++++++++++++--------------- src/hnswvacuum.c | 4 ++-- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3eae03e..d4fba85 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.7.1 (unreleased) +- Improved performance of on-disk HNSW index builds - Fixed `undefined symbol` error with GCC 8 - Fixed compilation error with universal binaries on Mac - Fixed compilation warning with Clang < 14 diff --git a/src/hnsw.h b/src/hnsw.h index 2f11137..480ad9f 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -393,7 +393,7 @@ void HnswInitNeighbors(char *base, HnswElement element, int m, HnswAllocator * bool HnswInsertTupleOnDisk(Relation index, Datum value, Datum *values, bool *isnull, ItemPointer heap_tid, bool building); void HnswUpdateNeighborsOnDisk(Relation index, FmgrInfo *procinfo, Oid collation, HnswElement e, int m, bool checkExisting, bool building); void HnswLoadElementFromTuple(HnswElement element, HnswElementTuple etup, bool loadHeaptids, bool loadVec); -void HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, FmgrInfo *procinfo, Oid collation, bool loadVec); +void HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, FmgrInfo *procinfo, Oid collation, bool loadVec, float *maxDistance); void HnswSetElementTuple(char *base, HnswElementTuple etup, HnswElement element); void HnswUpdateConnection(char *base, HnswElement element, HnswCandidate * hc, int lm, int lc, int *updateIdx, Relation index, FmgrInfo *procinfo, Oid collation); void HnswLoadNeighbors(HnswElement element, Relation index, int m); diff --git a/src/hnswutils.c b/src/hnswutils.c index 48acef0..d3ba911 100644 --- a/src/hnswutils.c +++ b/src/hnswutils.c @@ -545,7 +545,7 @@ HnswLoadElementFromTuple(HnswElement element, HnswElementTuple etup, bool loadHe * Load an element and optionally get its distance from q */ void -HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, FmgrInfo *procinfo, Oid collation, bool loadVec) +HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, FmgrInfo *procinfo, Oid collation, bool loadVec, float *maxDistance) { Buffer buf; Page page; @@ -560,9 +560,6 @@ HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, Assert(HnswIsElementTuple(etup)); - /* Load element */ - HnswLoadElementFromTuple(element, etup, true, loadVec); - /* Calculate distance */ if (distance != NULL) { @@ -572,6 +569,10 @@ HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, *distance = (float) DatumGetFloat8(FunctionCall2Coll(procinfo, collation, *q, PointerGetDatum(&etup->data))); } + /* Load element */ + if (distance == NULL || maxDistance == NULL || *distance < *maxDistance) + HnswLoadElementFromTuple(element, etup, true, loadVec); + UnlockReleaseBuffer(buf); } @@ -599,7 +600,7 @@ HnswEntryCandidate(char *base, HnswElement entryPoint, Datum q, Relation index, if (index == NULL) hc->distance = GetCandidateDistance(base, hc, q, procinfo, collation); else - HnswLoadElement(entryPoint, &hc->distance, &q, index, procinfo, collation, loadVec); + HnswLoadElement(entryPoint, &hc->distance, &q, index, procinfo, collation, loadVec, NULL); return hc; } @@ -801,19 +802,20 @@ HnswSearchLayer(char *base, Datum q, List *ep, int ef, int lc, Relation index, F if (index == NULL) eDistance = GetCandidateDistance(base, e, q, procinfo, collation); else - HnswLoadElement(eElement, &eDistance, &q, index, procinfo, collation, inserting); - - Assert(!eElement->deleted); - - /* Make robust to issues */ - if (eElement->level < lc) - continue; + HnswLoadElement(eElement, &eDistance, &q, index, procinfo, collation, inserting, wlen >= ef ? &f->distance : NULL); if (eDistance < f->distance || wlen < ef) { - /* Copy e */ - HnswCandidate *ec = palloc(sizeof(HnswCandidate)); + HnswCandidate *ec; + Assert(!eElement->deleted); + + /* Make robust to issues */ + if (eElement->level < lc) + continue; + + /* Copy e */ + ec = palloc(sizeof(HnswCandidate)); HnswPtrStore(base, ec->element, eElement); ec->distance = eDistance; @@ -1102,7 +1104,7 @@ HnswUpdateConnection(char *base, HnswElement element, HnswCandidate * hc, int lm HnswElement hc3Element = HnswPtrAccess(base, hc3->element); if (HnswPtrIsNull(base, hc3Element->value)) - HnswLoadElement(hc3Element, &hc3->distance, &q, index, procinfo, collation, true); + HnswLoadElement(hc3Element, &hc3->distance, &q, index, procinfo, collation, true, NULL); else hc3->distance = GetCandidateDistance(base, hc3, q, procinfo, collation); diff --git a/src/hnswvacuum.c b/src/hnswvacuum.c index 7c14e54..67cc645 100644 --- a/src/hnswvacuum.c +++ b/src/hnswvacuum.c @@ -256,7 +256,7 @@ RepairGraphEntryPoint(HnswVacuumState * vacuumstate) LockPage(index, HNSW_UPDATE_LOCK, ShareLock); /* Load element */ - HnswLoadElement(highestPoint, NULL, NULL, index, vacuumstate->procinfo, vacuumstate->collation, true); + HnswLoadElement(highestPoint, NULL, NULL, index, vacuumstate->procinfo, vacuumstate->collation, true, NULL); /* Repair if needed */ if (NeedsUpdated(vacuumstate, highestPoint)) @@ -294,7 +294,7 @@ RepairGraphEntryPoint(HnswVacuumState * vacuumstate) * is outdated, this can remove connections at higher levels in * the graph until they are repaired, but this should be fine. */ - HnswLoadElement(entryPoint, NULL, NULL, index, vacuumstate->procinfo, vacuumstate->collation, true); + HnswLoadElement(entryPoint, NULL, NULL, index, vacuumstate->procinfo, vacuumstate->collation, true, NULL); if (NeedsUpdated(vacuumstate, entryPoint)) {