From 921427ee034e64053d8590b0db74a8c46fc1627e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 17 Dec 2023 11:24:13 -0500 Subject: [PATCH] Replace dynahash hash table in HNSW with simplehash for speed - #378 Co-authored-by: Heikki Linnakangas --- CHANGELOG.md | 1 + src/hnsw.h | 30 +++++++++++- src/hnswutils.c | 120 ++++++++++++++++++++++++++++++++++++++++------- src/hnswvacuum.c | 19 +++----- 4 files changed, 140 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 07918f9..5dd6456 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.5.2 (unreleased) +- Improved performance of HNSW - Added support for on-disk parallel index builds for HNSW - Fixed `invalid memory alloc request size` error with HNSW index build diff --git a/src/hnsw.h b/src/hnsw.h index 5853fc9..dfab7ab 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -104,6 +104,7 @@ typedef struct HnswElementData List *heaptids; uint8 level; uint8 deleted; + uint32 hash; HnswNeighborArray *neighbors; BlockNumber blkno; OffsetNumber offno; @@ -303,7 +304,7 @@ typedef struct HnswVacuumState Oid collation; /* Variables */ - HTAB *deleted; + struct tidhash_hash *deleted; BufferAccessStrategy bas; HnswNeighborTuple ntup; HnswElementData highestPoint; @@ -360,4 +361,31 @@ void hnswrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys, bool hnswgettuple(IndexScanDesc scan, ScanDirection dir); void hnswendscan(IndexScanDesc scan); +/* Hash tables */ +typedef struct TidHashEntry +{ + ItemPointerData tid; + char status; +} TidHashEntry; + +#define SH_PREFIX tidhash +#define SH_ELEMENT_TYPE TidHashEntry +#define SH_KEY_TYPE ItemPointerData +#define SH_SCOPE extern +#define SH_DECLARE +#include "lib/simplehash.h" + +typedef struct PointerHashEntry +{ + uintptr_t ptr; + char status; +} PointerHashEntry; + +#define SH_PREFIX pointerhash +#define SH_ELEMENT_TYPE PointerHashEntry +#define SH_KEY_TYPE uintptr_t +#define SH_SCOPE extern +#define SH_DECLARE +#include "lib/simplehash.h" + #endif diff --git a/src/hnswutils.c b/src/hnswutils.c index 0eee24e..a79f626 100644 --- a/src/hnswutils.c +++ b/src/hnswutils.c @@ -7,6 +7,89 @@ #include "utils/datum.h" #include "vector.h" +#if PG_VERSION_NUM >= 130000 +#include "common/hashfn.h" +#else +#include "utils/hashutils.h" +#endif + +#if PG_VERSION_NUM < 170000 +static inline uint64 +murmurhash64(uint64 data) +{ + uint64 h = data; + + h ^= h >> 33; + h *= 0xff51afd7ed558ccd; + h ^= h >> 33; + h *= 0xc4ceb9fe1a85ec53; + h ^= h >> 33; + + return h; +} +#endif + +/* TID hash table */ +static uint32 +hash_tid(ItemPointerData tid) +{ + union + { + uint64 i; + ItemPointerData tid; + } x; + + /* Initialize unused bytes */ + x.i = 0; + x.tid = tid; + + return murmurhash64(x.i); +} + +#define SH_PREFIX tidhash +#define SH_ELEMENT_TYPE TidHashEntry +#define SH_KEY_TYPE ItemPointerData +#define SH_KEY tid +#define SH_HASH_KEY(tb, key) hash_tid(key) +#define SH_EQUAL(tb, a, b) ItemPointerEquals(&a, &b) +#define SH_SCOPE extern +#define SH_DEFINE +#include "lib/simplehash.h" + +/* Needed to include simplehash.h twice */ +#if PG_VERSION_NUM < 120000 +#undef SH_EQUAL +#define sh_log2 pointerhash_sh_log2 +#define sh_pow2 pointerhash_sh_pow2 +#endif + +/* Pointer hash table */ +static uint32 +hash_pointer(uintptr_t ptr) +{ +#if SIZEOF_VOID_P == 8 + return murmurhash64((uint64) ptr); +#else + return murmurhash32((uint32) ptr); +#endif +} + +#define SH_PREFIX pointerhash +#define SH_ELEMENT_TYPE PointerHashEntry +#define SH_KEY_TYPE uintptr_t +#define SH_KEY ptr +#define SH_HASH_KEY(tb, key) hash_pointer(key) +#define SH_EQUAL(tb, a, b) (a == b) +#define SH_SCOPE extern +#define SH_DEFINE +#include "lib/simplehash.h" + +typedef union +{ + pointerhash_hash *pointers; + tidhash_hash *tids; +} visited_hash; + /* * Get the max number of connections in an upper layer for each element in the index */ @@ -553,16 +636,22 @@ CreatePairingHeapNode(HnswCandidate * c) * Add to visited */ static inline void -AddToVisited(HTAB *v, HnswCandidate * hc, Relation index, bool *found) +AddToVisited(visited_hash v, HnswCandidate * hc, Relation index, bool *found) { if (index == NULL) - hash_search(v, &hc->element, HASH_ENTER, found); + { +#if PG_VERSION_NUM >= 130000 + pointerhash_insert_hash(v.pointers, (uintptr_t) hc->element, hc->element->hash, found); +#else + pointerhash_insert(v.pointers, (uintptr_t) hc->element, found); +#endif + } else { ItemPointerData indextid; ItemPointerSet(&indextid, hc->element->blkno, hc->element->offno); - hash_search(v, &indextid, HASH_ENTER, found); + tidhash_insert(v.tids, indextid, found); } } @@ -578,30 +667,21 @@ HnswSearchLayer(Datum q, List *ep, int ef, int lc, Relation index, FmgrInfo *pro pairingheap *C = pairingheap_allocate(CompareNearestCandidates, NULL); pairingheap *W = pairingheap_allocate(CompareFurthestCandidates, NULL); int wlen = 0; - HASHCTL hash_ctl; - HTAB *v; + visited_hash v; /* Create hash table */ if (index == NULL) - { - hash_ctl.keysize = sizeof(HnswElement *); - hash_ctl.entrysize = sizeof(HnswElement *); - } + v.pointers = pointerhash_create(CurrentMemoryContext, ef * m * 2, NULL); else - { - hash_ctl.keysize = sizeof(ItemPointerData); - hash_ctl.entrysize = sizeof(ItemPointerData); - } - - hash_ctl.hcxt = CurrentMemoryContext; - v = hash_create("hnsw visited", 256, &hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + v.tids = tidhash_create(CurrentMemoryContext, ef * m * 2, NULL); /* Add entry points to v, C, and W */ foreach(lc2, ep) { HnswCandidate *hc = (HnswCandidate *) lfirst(lc2); + bool found; - AddToVisited(v, hc, index, NULL); + AddToVisited(v, hc, index, &found); pairingheap_add(C, &(CreatePairingHeapNode(hc)->ph_node)); pairingheap_add(W, &(CreatePairingHeapNode(hc)->ph_node)); @@ -1021,6 +1101,12 @@ HnswInsertElement(HnswElement element, HnswElement entryPoint, Relation index, F Datum q = element->value; HnswElement skipElement = existing ? element : NULL; +#if PG_VERSION_NUM >= 130000 + /* Precompute hash */ + if (index == NULL) + element->hash = hash_pointer((uintptr_t) element); +#endif + /* No neighbors if no entry point */ if (entryPoint == NULL) return; diff --git a/src/hnswvacuum.c b/src/hnswvacuum.c index 8fd5d4f..1d9b01e 100644 --- a/src/hnswvacuum.c +++ b/src/hnswvacuum.c @@ -12,12 +12,9 @@ * Check if deleted list contains an index TID */ static bool -DeletedContains(HTAB *deleted, ItemPointer indextid) +DeletedContains(tidhash_hash * deleted, ItemPointer indextid) { - bool found; - - hash_search(deleted, indextid, HASH_FIND, &found); - return found; + return tidhash_lookup(deleted, *indextid) != NULL; } /* @@ -110,11 +107,13 @@ RemoveHeapTids(HnswVacuumState * vacuumstate) if (!ItemPointerIsValid(&etup->heaptids[0])) { ItemPointerData ip; + bool found; /* Add to deleted list */ ItemPointerSet(&ip, blkno, offno); - (void) hash_search(vacuumstate->deleted, &ip, HASH_ENTER, NULL); + tidhash_insert(vacuumstate->deleted, ip, &found); + Assert(!found); } else if (etup->level > highestLevel && !(entryPoint != NULL && blkno == entryPoint->blkno && offno == entryPoint->offno)) { @@ -575,7 +574,6 @@ static void InitVacuumState(HnswVacuumState * vacuumstate, IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state) { Relation index = info->index; - HASHCTL hash_ctl; if (stats == NULL) stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); @@ -597,10 +595,7 @@ InitVacuumState(HnswVacuumState * vacuumstate, IndexVacuumInfo *info, IndexBulkD HnswGetMetaPageInfo(index, &vacuumstate->m, NULL); /* Create hash table */ - hash_ctl.keysize = sizeof(ItemPointerData); - hash_ctl.entrysize = sizeof(ItemPointerData); - hash_ctl.hcxt = CurrentMemoryContext; - vacuumstate->deleted = hash_create("hnswbulkdelete indextids", 256, &hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT); + vacuumstate->deleted = tidhash_create(CurrentMemoryContext, 256, NULL); } /* @@ -609,7 +604,7 @@ InitVacuumState(HnswVacuumState * vacuumstate, IndexVacuumInfo *info, IndexBulkD static void FreeVacuumState(HnswVacuumState * vacuumstate) { - hash_destroy(vacuumstate->deleted); + tidhash_destroy(vacuumstate->deleted); FreeAccessStrategy(vacuumstate->bas); pfree(vacuumstate->ntup); MemoryContextDelete(vacuumstate->tmpCtx);