From 719b4b74368f00b0070a7bc6bb14250878813735 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 16 Jan 2024 23:25:03 +0200
Subject: [PATCH] Use LWLocks instead of SpinLocks (#410)

Spinlocks should be held only for a few instructions, for multiple
reasons:

- You have to be very careful not to elog() out while holding a
  spinlock, because there is no mechanism to release the spinlock on
  error.

- Waiters can waste a lot of cycles spinning if the lock is
  contended. I you wait on a spinlock for too long, the PostgreSQL
  implementation will actually PANIC, see s_lock_stuck().

The flushLock is particularly problematic. It is held in exclusive
mode, which means it holds a spinlock, over the call to
FlushPages(). FlushPages() performs lots of I/O so it can take a very
long time (>= minutes), and can also easily error out for various
reasons.

allocatorLock would perhaps be OK as a spinlocks, but even that feels
a bit heavy, so I converted that to an LWLock, too.

entryLock is usually held for a very short time, in shared mode, so
that would be fine as a spinlock. However, in the rare case that the
entry point is updated, it's held for a very long time. An LWLock used
in shared mode is about as fast a spinlock, that path is pretty
heavily optimized.

I think we have some problems with the per-element spinlocks too. In
HnswUpdateNeighborPagesInMemory(), it's held over a call to
HnswUpdateConnection(), but HnswUpdateConnection() can error out at
least in case of an out-of-memory error (it uses lappend(), which
calls palloc()). It also calls the distance function, and I don't
think they are guaranteed to be ereport-free either. However, I didn't
address that in this PR, it needs a bit more thinking.
---
 README.md       |  1 -
 src/hnsw.c      | 36 +++++++++++++++++++++++
 src/hnsw.h      | 29 +++++--------------
 src/hnswbuild.c | 77 ++++++++++---------------------------------------
 4 files changed, 60 insertions(+), 83 deletions(-)

diff --git a/README.md b/README.md
index e163886..d09c4fd 100644
--- a/README.md
+++ b/README.md
@@ -741,7 +741,6 @@ Thanks to:
 - [k-means++: The Advantage of Careful Seeding](https://theory.stanford.edu/~sergei/papers/kMeansPP-soda.pdf)
 - [Concept Decompositions for Large Sparse Text Data using Clustering](https://www.cs.utexas.edu/users/inderjit/public_papers/concept_mlj.pdf)
 - [Efficient and Robust Approximate Nearest Neighbor Search using Hierarchical Navigable Small World Graphs](https://arxiv.org/ftp/arxiv/papers/1603/1603.09320.pdf)
-- [Concurrent Programming: Algorithms, Principles, and Foundations](https://doi.org/10.1007/978-3-642-32027-9)
 
 ## History
 
diff --git a/src/hnsw.c b/src/hnsw.c
index 758e418..2ea5895 100644
--- a/src/hnsw.c
+++ b/src/hnsw.c
@@ -16,12 +16,48 @@
 int			hnsw_ef_search;
 static relopt_kind hnsw_relopt_kind;
 
+int			entryLockTrancheId;
+int			allocatorLockTrancheId;
+int			flushLockTrancheId;
+
 /*
  * Initialize index options and variables
  */
 void
 HnswInit(void)
 {
+	int		   *tranche_ids;
+	bool		found;
+
+	/*
+	 * Assign tranche IDs for our LWLocks. This only needs to be done by one
+	 * backend, the tranche IDs are remembered in shared memory.
+	 *
+	 * This shared memory area is very small, so we just allocate it from the
+	 * "slop" that PostgreSQL reserves for small allocations like this. If
+	 * this grows bigger, we should use a shmem_request_hook and
+	 * RequestAddinShmemSpace() to pre-reserve space for this.
+	 */
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	tranche_ids = ShmemInitStruct("pgvector LWLock ids",
+								  sizeof(int) * 3,
+								  &found);
+	if (!found)
+	{
+		tranche_ids[0] = LWLockNewTrancheId();
+		tranche_ids[1] = LWLockNewTrancheId();
+		tranche_ids[2] = LWLockNewTrancheId();
+	}
+	entryLockTrancheId = tranche_ids[0];
+	allocatorLockTrancheId = tranche_ids[1];
+	flushLockTrancheId = tranche_ids[2];
+	LWLockRelease(AddinShmemInitLock);
+
+	/* Per-backend registration of the tranche IDs */
+	LWLockRegisterTranche(entryLockTrancheId, "pgvector entryLock");
+	LWLockRegisterTranche(allocatorLockTrancheId, "pgvector allocatorLock");
+	LWLockRegisterTranche(flushLockTrancheId, "pgvector flushLock");
+
 	hnsw_relopt_kind = add_reloption_kind();
 	add_int_reloption(hnsw_relopt_kind, "m", "Max number of connections",
 					  HNSW_DEFAULT_M, HNSW_MIN_M, HNSW_MAX_M
diff --git a/src/hnsw.h b/src/hnsw.h
index 0ee3cf7..9627c71 100644
--- a/src/hnsw.h
+++ b/src/hnsw.h
@@ -116,6 +116,11 @@
 /* Variables */
 extern int	hnsw_ef_search;
 
+/* These are initialized when the module is loaded */
+extern int	entryLockTrancheId;
+extern int	allocatorLockTrancheId;
+extern int	flushLockTrancheId;
+
 typedef struct HnswElementData HnswElementData;
 typedef struct HnswNeighborArray HnswNeighborArray;
 
@@ -177,24 +182,6 @@ typedef struct HnswOptions
 	int			efConstruction; /* size of dynamic candidate list */
 }			HnswOptions;
 
-typedef enum HnswLWLockMode
-{
-	RW_EXCLUSIVE,
-	RW_SHARED
-}			HnswLWLockMode;
-
-/*
- * Readers-writers with weak priority to the readers
- *
- * https://doi.org/10.1007/978-3-642-32027-9
- */
-typedef struct HnswRWLock
-{
-	volatile int readers;
-	slock_t		readersMutex;
-	slock_t		globalMutex;
-}			HnswRWLock;
-
 typedef struct HnswGraph
 {
 	/* Graph state */
@@ -203,16 +190,16 @@ typedef struct HnswGraph
 	double		indtuples;
 
 	/* Entry state */
-	slock_t		entryLock;
+	LWLock		entryLock;
 	HnswElementPtr entryPoint;
 
 	/* Allocations state */
-	slock_t		allocatorLock;
+	LWLock		allocatorLock;
 	long		memoryUsed;
 	long		memoryTotal;
 
 	/* Flushed state */
-	HnswRWLock	flushLock;
+	LWLock		flushLock;
 	bool		flushed;
 }			HnswGraph;
 
diff --git a/src/hnswbuild.c b/src/hnswbuild.c
index ddb31b3..fcc22b3 100644
--- a/src/hnswbuild.c
+++ b/src/hnswbuild.c
@@ -291,51 +291,6 @@ FlushPages(HnswBuildState * buildstate)
 	MemoryContextReset(buildstate->graphCtx);
 }
 
-/*
- * Initialize a readers-writer lock
- */
-static void
-HnswRWLockInitialize(HnswRWLock * lock)
-{
-	lock->readers = 0;
-	SpinLockInit(&lock->readersMutex);
-	SpinLockInit(&lock->globalMutex);
-}
-
-/*
- * Acquire a readers-writer lock
- */
-static void
-HnswRWLockAcquire(HnswRWLock * lock, HnswLWLockMode lockmode)
-{
-	if (lockmode == RW_EXCLUSIVE)
-		SpinLockAcquire(&lock->globalMutex);
-	else
-	{
-		SpinLockAcquire(&lock->readersMutex);
-		if (++lock->readers == 1)
-			SpinLockAcquire(&lock->globalMutex);
-		SpinLockRelease(&lock->readersMutex);
-	}
-}
-
-/*
- * Release a readers-writer lock
- */
-static void
-HnswRWLockRelease(HnswRWLock * lock, HnswLWLockMode lockmode)
-{
-	if (lockmode == RW_EXCLUSIVE)
-		SpinLockRelease(&lock->globalMutex);
-	else
-	{
-		SpinLockAcquire(&lock->readersMutex);
-		if (--lock->readers == 0)
-			SpinLockRelease(&lock->globalMutex);
-		SpinLockRelease(&lock->readersMutex);
-	}
-}
-
 /*
  * Add a heap TID to an existing element
  */
@@ -463,7 +418,7 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn
 	Size		valueSize;
 	Pointer		valuePtr;
 	bool		updateEntryPoint;
-	HnswRWLock *flushLock = &graph->flushLock;
+	LWLock	   *flushLock = &graph->flushLock;
 	char	   *base = buildstate->hnswarea;
 
 	/* Detoast once for all calls */
@@ -480,25 +435,25 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn
 	valueSize = VARSIZE_ANY(DatumGetPointer(value));
 
 	/* Ensure graph not flushed when inserting */
-	HnswRWLockAcquire(flushLock, RW_SHARED);
+	LWLockAcquire(flushLock, LW_SHARED);
 
 	if (graph->flushed)
 	{
-		HnswRWLockRelease(flushLock, RW_SHARED);
+		LWLockRelease(flushLock);
 
 		return HnswInsertTupleOnDisk(index, value, values, isnull, heaptid, buildstate->heap, true);
 	}
 
 	/* Get lock for allocator */
-	SpinLockAcquire(&graph->allocatorLock);
+	LWLockAcquire(&graph->allocatorLock, LW_EXCLUSIVE);
 
 	/* Flush pages if needed */
 	if (graph->memoryUsed >= graph->memoryTotal)
 	{
-		SpinLockRelease(&graph->allocatorLock);
+		LWLockRelease(&graph->allocatorLock);
 
-		HnswRWLockRelease(flushLock, RW_SHARED);
-		HnswRWLockAcquire(flushLock, RW_EXCLUSIVE);
+		LWLockRelease(flushLock);
+		LWLockAcquire(flushLock, LW_EXCLUSIVE);
 
 		if (!graph->flushed)
 		{
@@ -510,7 +465,7 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn
 			FlushPages(buildstate);
 		}
 
-		HnswRWLockRelease(flushLock, RW_EXCLUSIVE);
+		LWLockRelease(flushLock);
 
 		return HnswInsertTupleOnDisk(index, value, values, isnull, heaptid, buildstate->heap, true);
 	}
@@ -520,7 +475,7 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn
 	valuePtr = HnswAlloc(allocator, valueSize);
 
 	/* Release allocator lock */
-	SpinLockRelease(&graph->allocatorLock);
+	LWLockRelease(&graph->allocatorLock);
 
 	/* Copy datum */
 	memcpy(valuePtr, DatumGetPointer(value), valueSize);
@@ -530,13 +485,13 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn
 	SpinLockInit(&element->lock);
 
 	/* Get entry point */
-	SpinLockAcquire(&graph->entryLock);
+	LWLockAcquire(&graph->entryLock, LW_EXCLUSIVE);
 	entryPoint = HnswPtrAccess(base, graph->entryPoint);
 	updateEntryPoint = entryPoint == NULL || element->level > entryPoint->level;
 
 	/* Release lock if not updating entry point */
 	if (!updateEntryPoint)
-		SpinLockRelease(&graph->entryLock);
+		LWLockRelease(&graph->entryLock);
 
 	/* Insert element in graph */
 	HnswInsertElement(base, element, entryPoint, NULL, procinfo, collation, m, efConstruction, false);
@@ -546,10 +501,10 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn
 
 	/* Release lock if needed */
 	if (updateEntryPoint)
-		SpinLockRelease(&graph->entryLock);
+		LWLockRelease(&graph->entryLock);
 
 	/* Release flush lock */
-	HnswRWLockRelease(flushLock, RW_SHARED);
+	LWLockRelease(flushLock);
 
 	return true;
 }
@@ -603,9 +558,9 @@ InitGraph(HnswGraph * graph, char *base, long memoryTotal)
 	graph->flushed = false;
 	graph->indtuples = 0;
 	SpinLockInit(&graph->lock);
-	SpinLockInit(&graph->entryLock);
-	SpinLockInit(&graph->allocatorLock);
-	HnswRWLockInitialize(&graph->flushLock);
+	LWLockInitialize(&graph->entryLock, entryLockTrancheId);
+	LWLockInitialize(&graph->allocatorLock, allocatorLockTrancheId);
+	LWLockInitialize(&graph->flushLock, flushLockTrancheId);
 }
 
 /*