mirror of
https://github.com/pgvector/pgvector.git
synced 2026-06-06 05:51:21 +08:00
Replace dynahash hash table in HNSW with simplehash for speed - #378
Co-authored-by: Heikki Linnakangas <heikki.linnakangas@iki.fi>
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
## 0.5.2 (unreleased)
|
||||
|
||||
- Improved performance of HNSW
|
||||
- Added support for on-disk parallel index builds for HNSW
|
||||
- Fixed `invalid memory alloc request size` error with HNSW index build
|
||||
|
||||
|
||||
30
src/hnsw.h
30
src/hnsw.h
@@ -104,6 +104,7 @@ typedef struct HnswElementData
|
||||
List *heaptids;
|
||||
uint8 level;
|
||||
uint8 deleted;
|
||||
uint32 hash;
|
||||
HnswNeighborArray *neighbors;
|
||||
BlockNumber blkno;
|
||||
OffsetNumber offno;
|
||||
@@ -303,7 +304,7 @@ typedef struct HnswVacuumState
|
||||
Oid collation;
|
||||
|
||||
/* Variables */
|
||||
HTAB *deleted;
|
||||
struct tidhash_hash *deleted;
|
||||
BufferAccessStrategy bas;
|
||||
HnswNeighborTuple ntup;
|
||||
HnswElementData highestPoint;
|
||||
@@ -360,4 +361,31 @@ void hnswrescan(IndexScanDesc scan, ScanKey keys, int nkeys, ScanKey orderbys,
|
||||
bool hnswgettuple(IndexScanDesc scan, ScanDirection dir);
|
||||
void hnswendscan(IndexScanDesc scan);
|
||||
|
||||
/* Hash tables */
|
||||
typedef struct TidHashEntry
|
||||
{
|
||||
ItemPointerData tid;
|
||||
char status;
|
||||
} TidHashEntry;
|
||||
|
||||
#define SH_PREFIX tidhash
|
||||
#define SH_ELEMENT_TYPE TidHashEntry
|
||||
#define SH_KEY_TYPE ItemPointerData
|
||||
#define SH_SCOPE extern
|
||||
#define SH_DECLARE
|
||||
#include "lib/simplehash.h"
|
||||
|
||||
/*
 * Hash table keyed by a pointer value stored as uintptr_t, instantiated
 * under the "pointerhash" prefix.
 *
 * An entry holds nothing but the key and the "status" member that
 * lib/simplehash.h uses to track the state of each slot.
 */
typedef struct PointerHashEntry
{
	uintptr_t	ptr;			/* key */
	char		status;			/* slot state, owned by simplehash */
} PointerHashEntry;

/*
 * Declarations only (SH_DECLARE); the functions themselves are generated
 * by a matching SH_DEFINE instantiation in a .c file.
 */
#define SH_PREFIX pointerhash
#define SH_ELEMENT_TYPE PointerHashEntry
#define SH_KEY_TYPE uintptr_t
#define SH_SCOPE extern
#define SH_DECLARE
#include "lib/simplehash.h"
|
||||
|
||||
#endif
|
||||
|
||||
120
src/hnswutils.c
120
src/hnswutils.c
@@ -7,6 +7,89 @@
|
||||
#include "utils/datum.h"
|
||||
#include "vector.h"
|
||||
|
||||
#if PG_VERSION_NUM >= 130000
|
||||
#include "common/hashfn.h"
|
||||
#else
|
||||
#include "utils/hashutils.h"
|
||||
#endif
|
||||
|
||||
#if PG_VERSION_NUM < 170000
|
||||
static inline uint64
|
||||
murmurhash64(uint64 data)
|
||||
{
|
||||
uint64 h = data;
|
||||
|
||||
h ^= h >> 33;
|
||||
h *= 0xff51afd7ed558ccd;
|
||||
h ^= h >> 33;
|
||||
h *= 0xc4ceb9fe1a85ec53;
|
||||
h ^= h >> 33;
|
||||
|
||||
return h;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* TID hash table */
|
||||
static uint32
|
||||
hash_tid(ItemPointerData tid)
|
||||
{
|
||||
union
|
||||
{
|
||||
uint64 i;
|
||||
ItemPointerData tid;
|
||||
} x;
|
||||
|
||||
/* Initialize unused bytes */
|
||||
x.i = 0;
|
||||
x.tid = tid;
|
||||
|
||||
return murmurhash64(x.i);
|
||||
}
|
||||
|
||||
/*
 * Generate the tidhash functions (SH_DEFINE).
 *
 * Keys live in the "tid" member of TidHashEntry, are hashed with hash_tid()
 * and compared with ItemPointerEquals().
 */
#define SH_PREFIX tidhash
#define SH_ELEMENT_TYPE TidHashEntry
#define SH_KEY_TYPE ItemPointerData
#define SH_KEY tid
#define SH_HASH_KEY(tb, key) hash_tid(key)
#define SH_EQUAL(tb, a, b) ItemPointerEquals(&a, &b)
#define SH_SCOPE extern
#define SH_DEFINE
#include "lib/simplehash.h"
|
||||
|
||||
/* Needed to include simplehash.h twice */
|
||||
/*
 * Before PostgreSQL 12, drop the previous SH_EQUAL definition and give the
 * sh_log2/sh_pow2 helper names a pointerhash_ prefix, so that the second
 * inclusion of simplehash.h does not clash with the first one.
 * NOTE(review): presumably newer simplehash.h versions handle both
 * themselves - confirm before removing.
 */
#if PG_VERSION_NUM < 120000
#undef SH_EQUAL
#define sh_log2 pointerhash_sh_log2
#define sh_pow2 pointerhash_sh_pow2
#endif
|
||||
|
||||
/* Pointer hash table */
|
||||
static uint32
|
||||
hash_pointer(uintptr_t ptr)
|
||||
{
|
||||
#if SIZEOF_VOID_P == 8
|
||||
return murmurhash64((uint64) ptr);
|
||||
#else
|
||||
return murmurhash32((uint32) ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
 * Generate the pointerhash functions (SH_DEFINE).
 *
 * Keys live in the "ptr" member of PointerHashEntry, are hashed with
 * hash_pointer() and compared by value.
 */
#define SH_PREFIX pointerhash
#define SH_ELEMENT_TYPE PointerHashEntry
#define SH_KEY_TYPE uintptr_t
#define SH_KEY ptr
#define SH_HASH_KEY(tb, key) hash_pointer(key)
#define SH_EQUAL(tb, a, b) (a == b)
#define SH_SCOPE extern
#define SH_DEFINE
#include "lib/simplehash.h"
|
||||
|
||||
typedef union
|
||||
{
|
||||
pointerhash_hash *pointers;
|
||||
tidhash_hash *tids;
|
||||
} visited_hash;
|
||||
|
||||
/*
|
||||
* Get the max number of connections in an upper layer for each element in the index
|
||||
*/
|
||||
@@ -553,16 +636,22 @@ CreatePairingHeapNode(HnswCandidate * c)
|
||||
* Add to visited
|
||||
*/
|
||||
/*
 * Insert the candidate's element into the visited set v and report through
 * *found whether it was already present.
 */
static inline void
|
||||
AddToVisited(HTAB *v, HnswCandidate * hc, Relation index, bool *found)
|
||||
AddToVisited(visited_hash v, HnswCandidate * hc, Relation index, bool *found)
|
||||
{
|
||||
if (index == NULL)
|
||||
hash_search(v, &hc->element, HASH_ENTER, found);
|
||||
{
|
||||
/*
 * No index relation: the key is the element pointer itself. Where the
 * insert_hash variant is used, the hash stored in element->hash is passed
 * in instead of being recomputed.
 */
#if PG_VERSION_NUM >= 130000
|
||||
pointerhash_insert_hash(v.pointers, (uintptr_t) hc->element, hc->element->hash, found);
|
||||
#else
|
||||
pointerhash_insert(v.pointers, (uintptr_t) hc->element, found);
|
||||
#endif
|
||||
}
|
||||
else
|
||||
{
|
||||
/* With an index relation: the key is the TID built from the element's blkno and offno */
ItemPointerData indextid;
|
||||
|
||||
ItemPointerSet(&indextid, hc->element->blkno, hc->element->offno);
|
||||
hash_search(v, &indextid, HASH_ENTER, found);
|
||||
tidhash_insert(v.tids, indextid, found);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -578,30 +667,21 @@ HnswSearchLayer(Datum q, List *ep, int ef, int lc, Relation index, FmgrInfo *pro
|
||||
pairingheap *C = pairingheap_allocate(CompareNearestCandidates, NULL);
|
||||
pairingheap *W = pairingheap_allocate(CompareFurthestCandidates, NULL);
|
||||
int wlen = 0;
|
||||
HASHCTL hash_ctl;
|
||||
HTAB *v;
|
||||
visited_hash v;
|
||||
|
||||
/* Create hash table */
|
||||
if (index == NULL)
|
||||
{
|
||||
hash_ctl.keysize = sizeof(HnswElement *);
|
||||
hash_ctl.entrysize = sizeof(HnswElement *);
|
||||
}
|
||||
v.pointers = pointerhash_create(CurrentMemoryContext, ef * m * 2, NULL);
|
||||
else
|
||||
{
|
||||
hash_ctl.keysize = sizeof(ItemPointerData);
|
||||
hash_ctl.entrysize = sizeof(ItemPointerData);
|
||||
}
|
||||
|
||||
hash_ctl.hcxt = CurrentMemoryContext;
|
||||
v = hash_create("hnsw visited", 256, &hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
|
||||
v.tids = tidhash_create(CurrentMemoryContext, ef * m * 2, NULL);
|
||||
|
||||
/* Add entry points to v, C, and W */
|
||||
foreach(lc2, ep)
|
||||
{
|
||||
HnswCandidate *hc = (HnswCandidate *) lfirst(lc2);
|
||||
bool found;
|
||||
|
||||
AddToVisited(v, hc, index, NULL);
|
||||
AddToVisited(v, hc, index, &found);
|
||||
|
||||
pairingheap_add(C, &(CreatePairingHeapNode(hc)->ph_node));
|
||||
pairingheap_add(W, &(CreatePairingHeapNode(hc)->ph_node));
|
||||
@@ -1021,6 +1101,12 @@ HnswInsertElement(HnswElement element, HnswElement entryPoint, Relation index, F
|
||||
Datum q = element->value;
|
||||
HnswElement skipElement = existing ? element : NULL;
|
||||
|
||||
#if PG_VERSION_NUM >= 130000
|
||||
/* Precompute hash */
|
||||
if (index == NULL)
|
||||
element->hash = hash_pointer((uintptr_t) element);
|
||||
#endif
|
||||
|
||||
/* No neighbors if no entry point */
|
||||
if (entryPoint == NULL)
|
||||
return;
|
||||
|
||||
@@ -12,12 +12,9 @@
|
||||
* Check if deleted list contains an index TID
|
||||
*/
|
||||
/*
 * Return true when indextid is present in the "deleted" hash table.
 */
static bool
|
||||
DeletedContains(HTAB *deleted, ItemPointer indextid)
|
||||
DeletedContains(tidhash_hash * deleted, ItemPointer indextid)
|
||||
{
|
||||
bool found;
|
||||
|
||||
hash_search(deleted, indextid, HASH_FIND, &found);
|
||||
return found;
|
||||
/* Membership is reported as "the lookup returned a non-NULL entry" */
return tidhash_lookup(deleted, *indextid) != NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -110,11 +107,13 @@ RemoveHeapTids(HnswVacuumState * vacuumstate)
|
||||
if (!ItemPointerIsValid(&etup->heaptids[0]))
|
||||
{
|
||||
ItemPointerData ip;
|
||||
bool found;
|
||||
|
||||
/* Add to deleted list */
|
||||
ItemPointerSet(&ip, blkno, offno);
|
||||
|
||||
(void) hash_search(vacuumstate->deleted, &ip, HASH_ENTER, NULL);
|
||||
tidhash_insert(vacuumstate->deleted, ip, &found);
|
||||
Assert(!found);
|
||||
}
|
||||
else if (etup->level > highestLevel && !(entryPoint != NULL && blkno == entryPoint->blkno && offno == entryPoint->offno))
|
||||
{
|
||||
@@ -575,7 +574,6 @@ static void
|
||||
InitVacuumState(HnswVacuumState * vacuumstate, IndexVacuumInfo *info, IndexBulkDeleteResult *stats, IndexBulkDeleteCallback callback, void *callback_state)
|
||||
{
|
||||
Relation index = info->index;
|
||||
HASHCTL hash_ctl;
|
||||
|
||||
if (stats == NULL)
|
||||
stats = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult));
|
||||
@@ -597,10 +595,7 @@ InitVacuumState(HnswVacuumState * vacuumstate, IndexVacuumInfo *info, IndexBulkD
|
||||
HnswGetMetaPageInfo(index, &vacuumstate->m, NULL);
|
||||
|
||||
/* Create hash table */
|
||||
hash_ctl.keysize = sizeof(ItemPointerData);
|
||||
hash_ctl.entrysize = sizeof(ItemPointerData);
|
||||
hash_ctl.hcxt = CurrentMemoryContext;
|
||||
vacuumstate->deleted = hash_create("hnswbulkdelete indextids", 256, &hash_ctl, HASH_ELEM | HASH_BLOBS | HASH_CONTEXT);
|
||||
vacuumstate->deleted = tidhash_create(CurrentMemoryContext, 256, NULL);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -609,7 +604,7 @@ InitVacuumState(HnswVacuumState * vacuumstate, IndexVacuumInfo *info, IndexBulkD
|
||||
static void
|
||||
FreeVacuumState(HnswVacuumState * vacuumstate)
|
||||
{
|
||||
hash_destroy(vacuumstate->deleted);
|
||||
tidhash_destroy(vacuumstate->deleted);
|
||||
FreeAccessStrategy(vacuumstate->bas);
|
||||
pfree(vacuumstate->ntup);
|
||||
MemoryContextDelete(vacuumstate->tmpCtx);
|
||||
|
||||
Reference in New Issue
Block a user