From abac7a3f776d4edbb423a000ba5234d3e8eab465 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 2 Apr 2024 14:25:09 -0700 Subject: [PATCH] Added sparsevec type --- CHANGELOG.md | 1 + Makefile | 4 +- Makefile.win | 6 +- README.md | 22 + sql/vector--0.6.2--0.7.0.sql | 93 +++ sql/vector.sql | 107 ++++ src/hnsw.h | 5 +- src/hnswbuild.c | 5 + src/hnswinsert.c | 6 +- src/hnswutils.c | 33 + src/sparsevec.c | 778 ++++++++++++++++++++++++ src/sparsevec.h | 24 + src/vector.c | 24 + test/expected/hnsw_sparsevec_cosine.out | 26 + test/expected/hnsw_sparsevec_ip.out | 21 + test/expected/hnsw_sparsevec_l2.out | 43 ++ test/expected/sparsevec_functions.out | 62 ++ test/expected/sparsevec_input.out | 62 ++ test/sql/hnsw_sparsevec_cosine.sql | 13 + test/sql/hnsw_sparsevec_ip.sql | 12 + test/sql/hnsw_sparsevec_l2.sql | 25 + test/sql/sparsevec_functions.sql | 13 + test/sql/sparsevec_input.sql | 19 + 23 files changed, 1397 insertions(+), 7 deletions(-) create mode 100644 src/sparsevec.c create mode 100644 src/sparsevec.h create mode 100644 test/expected/hnsw_sparsevec_cosine.out create mode 100644 test/expected/hnsw_sparsevec_ip.out create mode 100644 test/expected/hnsw_sparsevec_l2.out create mode 100644 test/expected/sparsevec_functions.out create mode 100644 test/expected/sparsevec_input.out create mode 100644 test/sql/hnsw_sparsevec_cosine.sql create mode 100644 test/sql/hnsw_sparsevec_ip.sql create mode 100644 test/sql/hnsw_sparsevec_l2.sql create mode 100644 test/sql/sparsevec_functions.sql create mode 100644 test/sql/sparsevec_input.sql diff --git a/CHANGELOG.md b/CHANGELOG.md index 5bf1395..961b6df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.7.0 (unreleased) - Added `halfvec` type +- Added `sparsevec` type - Added support for bit vectors to HNSW - Added `hamming_distance` function - Added `jaccard_distance` function diff --git a/Makefile b/Makefile index cab9397..a7be0ef 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ EXTVERSION = 0.6.2 MODULE_big 
= vector DATA = $(wildcard sql/*--*.sql) -OBJS = src/bitvector.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o -HEADERS = src/halfvec.h src/vector.h +OBJS = src/bitvector.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o +HEADERS = src/halfvec.h src/sparsevec.h src/vector.h TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) diff --git a/Makefile.win b/Makefile.win index 04ece60..48fd71b 100644 --- a/Makefile.win +++ b/Makefile.win @@ -1,10 +1,10 @@ EXTENSION = vector EXTVERSION = 0.6.2 -OBJS = src\bitvector.obj src\halfvec.obj src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj -HEADERS = src\halfvec.h src\vector.h +OBJS = src\bitvector.obj src\halfvec.obj src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\sparsevec.obj src\vector.obj +HEADERS = src\halfvec.h src\sparsevec.h src\vector.h -REGRESS = bit_functions btree cast copy halfvec_functions halfvec_input hnsw_bit_hamming hnsw_bit_jaccard hnsw_halfvec_cosine hnsw_halfvec_ip hnsw_halfvec_l2 hnsw_options hnsw_unlogged hnsw_vector_cosine hnsw_vector_ip hnsw_vector_l2 ivfflat_options ivfflat_unlogged ivfflat_vector_cosine ivfflat_vector_ip ivfflat_vector_l2 vector_functions vector_input +REGRESS = bit_functions btree cast copy halfvec_functions halfvec_input 
hnsw_bit_hamming hnsw_bit_jaccard hnsw_halfvec_cosine hnsw_halfvec_ip hnsw_halfvec_l2 hnsw_options hnsw_sparsevec_cosine hnsw_sparsevec_ip hnsw_sparsevec_l2 hnsw_unlogged hnsw_vector_cosine hnsw_vector_ip hnsw_vector_l2 ivfflat_options ivfflat_unlogged ivfflat_vector_cosine ivfflat_vector_ip ivfflat_vector_l2 sparsevec_functions sparsevec_input vector_functions vector_input REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) # For /arch flags diff --git a/README.md b/README.md index cafe953..4a78093 100644 --- a/README.md +++ b/README.md @@ -714,6 +714,7 @@ Also, note that `NULL` vectors are not indexed (as well as zero vectors for cosi - [Vector](#vector-type) - [Halfvec](#halfvec-type) - [Bit](#bit-type) +- [Sparsevec](#sparsevec-type) ### Vector Type @@ -789,6 +790,27 @@ Function | Description | Added hamming_distance(bit, bit) → double precision | Hamming distance | unreleased jaccard_distance(bit, bit) → double precision | Jaccard distance | unreleased +### Sparsevec Type + +Each sparse vector takes `8 * nnz + 16` bytes of storage. Each element is a single-precision floating-point number, and all elements must be finite (no `NaN`, `Infinity` or `-Infinity`). 
+ +### Sparsevec Operators + +Operator | Description | Added +--- | --- | --- +<-> | Euclidean distance | unreleased +<#> | negative inner product | unreleased +<=> | cosine distance | unreleased + +### Sparsevec Functions + +Function | Description | Added +--- | --- | --- +cosine_distance(sparsevec, sparsevec) → double precision | cosine distance | unreleased +inner_product(sparsevec, sparsevec) → double precision | inner product | unreleased +l2_distance(sparsevec, sparsevec) → double precision | Euclidean distance | unreleased +l1_distance(sparsevec, sparsevec) → double precision | taxicab distance | unreleased + ## Installation Notes - Linux and Mac ### Postgres Location diff --git a/sql/vector--0.6.2--0.7.0.sql b/sql/vector--0.6.2--0.7.0.sql index dffd83c..f767d6a 100644 --- a/sql/vector--0.6.2--0.7.0.sql +++ b/sql/vector--0.6.2--0.7.0.sql @@ -158,3 +158,96 @@ CREATE CAST (halfvec AS vector) CREATE CAST (vector AS halfvec) WITH FUNCTION vector_to_halfvec(vector, integer, boolean) AS IMPLICIT; + +CREATE TYPE sparsevec; + +CREATE FUNCTION sparsevec_in(cstring, oid, integer) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_out(sparsevec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_recv(internal, oid, integer) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_send(sparsevec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE sparsevec ( + INPUT = sparsevec_in, + OUTPUT = sparsevec_out, + TYPMOD_IN = sparsevec_typmod_in, + RECEIVE = sparsevec_recv, + SEND = sparsevec_send, + STORAGE = external +); + +CREATE FUNCTION l2_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 
'sparsevec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_norm(sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_l2_squared_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_negative_inner_product(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec(sparsevec, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_to_sparsevec(vector, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_to_vector(sparsevec, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE CAST (sparsevec AS sparsevec) + WITH FUNCTION sparsevec(sparsevec, integer, boolean) AS IMPLICIT; + +CREATE CAST (sparsevec AS vector) + WITH FUNCTION sparsevec_to_vector(sparsevec, integer, boolean) AS IMPLICIT; + +CREATE CAST (vector AS sparsevec) + WITH FUNCTION vector_to_sparsevec(vector, integer, boolean) AS IMPLICIT; + +CREATE OPERATOR <-> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE 
OPERATOR CLASS sparsevec_l2_ops + FOR TYPE sparsevec USING hnsw AS + OPERATOR 1 <-> (sparsevec, sparsevec) FOR ORDER BY float_ops, + FUNCTION 1 sparsevec_l2_squared_distance(sparsevec, sparsevec); + +CREATE OPERATOR CLASS sparsevec_ip_ops + FOR TYPE sparsevec USING hnsw AS + OPERATOR 1 <#> (sparsevec, sparsevec) FOR ORDER BY float_ops, + FUNCTION 1 sparsevec_negative_inner_product(sparsevec, sparsevec); + +CREATE OPERATOR CLASS sparsevec_cosine_ops + FOR TYPE sparsevec USING hnsw AS + OPERATOR 1 <=> (sparsevec, sparsevec) FOR ORDER BY float_ops, + FUNCTION 1 sparsevec_negative_inner_product(sparsevec, sparsevec), + FUNCTION 2 sparsevec_norm(sparsevec); diff --git a/sql/vector.sql b/sql/vector.sql index 3fc5081..f21b100 100644 --- a/sql/vector.sql +++ b/sql/vector.sql @@ -463,3 +463,110 @@ CREATE CAST (halfvec AS vector) CREATE CAST (vector AS halfvec) WITH FUNCTION vector_to_halfvec(vector, integer, boolean) AS IMPLICIT; + +--- sparsevec type + +CREATE TYPE sparsevec; + +CREATE FUNCTION sparsevec_in(cstring, oid, integer) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_out(sparsevec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_recv(internal, oid, integer) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_send(sparsevec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE sparsevec ( + INPUT = sparsevec_in, + OUTPUT = sparsevec_out, + TYPMOD_IN = sparsevec_typmod_in, + RECEIVE = sparsevec_recv, + SEND = sparsevec_send, + STORAGE = external +); + +-- sparsevec functions + +CREATE FUNCTION l2_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_l2_distance' LANGUAGE C IMMUTABLE 
STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME', 'sparsevec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_norm(sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- sparsevec private functions + +CREATE FUNCTION sparsevec_l2_squared_distance(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_negative_inner_product(sparsevec, sparsevec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- sparsevec cast functions + +CREATE FUNCTION sparsevec(sparsevec, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_to_sparsevec(vector, integer, boolean) RETURNS sparsevec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION sparsevec_to_vector(sparsevec, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- sparsevec casts + +CREATE CAST (sparsevec AS sparsevec) + WITH FUNCTION sparsevec(sparsevec, integer, boolean) AS IMPLICIT; + +CREATE CAST (sparsevec AS vector) + WITH FUNCTION sparsevec_to_vector(sparsevec, integer, boolean) AS IMPLICIT; + +CREATE CAST (vector AS sparsevec) + WITH FUNCTION vector_to_sparsevec(vector, integer, boolean) AS IMPLICIT; + +-- sparsevec operators + +CREATE OPERATOR <-> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = sparsevec, RIGHTARG = sparsevec, PROCEDURE = sparsevec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = sparsevec, RIGHTARG = 
sparsevec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +-- sparsevec opclasses + +CREATE OPERATOR CLASS sparsevec_l2_ops + FOR TYPE sparsevec USING hnsw AS + OPERATOR 1 <-> (sparsevec, sparsevec) FOR ORDER BY float_ops, + FUNCTION 1 sparsevec_l2_squared_distance(sparsevec, sparsevec); + +CREATE OPERATOR CLASS sparsevec_ip_ops + FOR TYPE sparsevec USING hnsw AS + OPERATOR 1 <#> (sparsevec, sparsevec) FOR ORDER BY float_ops, + FUNCTION 1 sparsevec_negative_inner_product(sparsevec, sparsevec); + +CREATE OPERATOR CLASS sparsevec_cosine_ops + FOR TYPE sparsevec USING hnsw AS + OPERATOR 1 <=> (sparsevec, sparsevec) FOR ORDER BY float_ops, + FUNCTION 1 sparsevec_negative_inner_product(sparsevec, sparsevec), + FUNCTION 2 sparsevec_norm(sparsevec); diff --git a/src/hnsw.h b/src/hnsw.h index 3012f5f..772b228 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -17,6 +17,7 @@ #endif #define HNSW_MAX_DIM 2000 +#define HNSW_MAX_NNZ 1000 /* Support functions */ #define HNSW_DISTANCE_PROC 1 @@ -59,7 +60,8 @@ typedef enum HnswType { HNSW_TYPE_VECTOR, HNSW_TYPE_HALFVEC, - HNSW_TYPE_BIT + HNSW_TYPE_BIT, + HNSW_TYPE_SPARSEVEC } HnswType; /* Build phases */ @@ -376,6 +378,7 @@ int HnswGetEfConstruction(Relation index); FmgrInfo *HnswOptionalProcInfo(Relation index, uint16 procnum); HnswType HnswGetType(Relation index); bool HnswNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, HnswType type); +void HnswCheckValue(Datum value, HnswType type); Buffer HnswNewBuffer(Relation index, ForkNumber forkNum); void HnswInitPage(Buffer buf, Page page); void HnswInit(void); diff --git a/src/hnswbuild.c b/src/hnswbuild.c index 5e586f6..2300127 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -487,6 +487,9 @@ InsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heaptid, Hn /* Detoast once for all calls */ Datum value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + /* Check value */ + HnswCheckValue(value, buildstate->type); + /* Normalize if needed */ if 
(buildstate->normprocinfo != NULL) { @@ -678,6 +681,8 @@ GetMaxDimensions(HnswType type) maxDimensions *= 2; else if (type == HNSW_TYPE_BIT) maxDimensions *= 32; + else if (type == HNSW_TYPE_SPARSEVEC) + maxDimensions = INT_MAX; return maxDimensions; } diff --git a/src/hnswinsert.c b/src/hnswinsert.c index 0e09cfa..c5ea1fd 100644 --- a/src/hnswinsert.c +++ b/src/hnswinsert.c @@ -614,15 +614,19 @@ HnswInsertTuple(Relation index, Datum *values, bool *isnull, ItemPointer heap_ti Datum value; FmgrInfo *normprocinfo; Oid collation = index->rd_indcollation[0]; + HnswType type = HnswGetType(index); /* Detoast once for all calls */ value = PointerGetDatum(PG_DETOAST_DATUM(values[0])); + /* Check value */ + HnswCheckValue(value, type); + /* Normalize if needed */ normprocinfo = HnswOptionalProcInfo(index, HNSW_NORM_PROC); if (normprocinfo != NULL) { - if (!HnswNormValue(normprocinfo, collation, &value, HnswGetType(index))) + if (!HnswNormValue(normprocinfo, collation, &value, type)) return; } diff --git a/src/hnswutils.c b/src/hnswutils.c index 272934c..e082808 100644 --- a/src/hnswutils.c +++ b/src/hnswutils.c @@ -8,6 +8,7 @@ #include "halfvec.h" #include "hnsw.h" #include "lib/pairingheap.h" +#include "sparsevec.h" #include "storage/bufmgr.h" #include "utils/datum.h" #include "utils/memdebug.h" @@ -176,6 +177,8 @@ HnswGetType(Relation index) result = HNSW_TYPE_VECTOR; else if (strcmp(NameStr(type->typname), "halfvec") == 0) result = HNSW_TYPE_HALFVEC; + else if (strcmp(NameStr(type->typname), "sparsevec") == 0) + result = HNSW_TYPE_SPARSEVEC; else elog(ERROR, "Unsupported type"); @@ -223,6 +226,21 @@ HnswNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, HnswType type) *value = PointerGetDatum(result); } + else if (type == HNSW_TYPE_SPARSEVEC) + { + SparseVector *v = DatumGetSparseVector(*value); + SparseVector *result = InitSparseVector(v->dim, v->nnz); + float *vx = SPARSEVEC_VALUES(v); + float *rx = SPARSEVEC_VALUES(result); + + for (int i = 0; i < v->nnz; i++) 
+ { + result->indices[i] = v->indices[i]; + rx[i] = vx[i] / norm; + } + + *value = PointerGetDatum(result); + } else elog(ERROR, "Unsupported type"); @@ -232,6 +250,21 @@ HnswNormValue(FmgrInfo *procinfo, Oid collation, Datum *value, HnswType type) return false; } +/* + * Check if a value can be indexed + */ +void +HnswCheckValue(Datum value, HnswType type) +{ + if (type == HNSW_TYPE_SPARSEVEC) + { + SparseVector *vec = DatumGetSparseVector(value); + + if (vec->nnz > HNSW_MAX_NNZ) + elog(ERROR, "sparsevec cannot have more than %d non-zero elements for hnsw index", HNSW_MAX_NNZ); + } +} + /* * New buffer */ diff --git a/src/sparsevec.c b/src/sparsevec.c new file mode 100644 index 0000000..22649cd --- /dev/null +++ b/src/sparsevec.c @@ -0,0 +1,778 @@ +#include "postgres.h" + +#include +#include + +#include "fmgr.h" +#include "libpq/pqformat.h" +#include "sparsevec.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "vector.h" + +#if PG_VERSION_NUM >= 120000 +#include "common/shortest_dec.h" +#include "utils/float.h" +#else +#include +#include "utils/builtins.h" +#endif + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(SparseVector * a, SparseVector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different sparsevec dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions + */ +static inline void +CheckExpectedDim(int32 typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("sparsevec must have at least 1 dimension"))); + + if (dim > SPARSEVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("sparsevec cannot have more than %d dimensions", SPARSEVEC_MAX_DIM))); +} 
+
+/*
+ * Ensure valid nnz: not negative and not larger than dim (zero is allowed)
+ */
+static inline void
+CheckNnz(int nnz, int dim)
+{
+	if (nnz < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("sparsevec cannot have negative number of elements")));
+
+	if (nnz > dim)
+		ereport(ERROR,
+				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+				 errmsg("sparsevec cannot have more elements than dimensions")));
+}
+
+/*
+ * Ensure valid index: in [0, dim) and strictly greater than the previous one
+ */
+static inline void
+CheckIndex(int32 *indices, int i, int dim)
+{
+	int32 index = indices[i];
+
+	if (index < 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("index must not be negative")));
+
+	if (index >= dim)
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("index must be less than dimensions")));
+
+	if (i > 0)
+	{
+		if (index < indices[i - 1])
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_EXCEPTION),
+					 errmsg("indexes must be in ascending order")));
+
+		if (index == indices[i - 1])
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_EXCEPTION),
+					 errmsg("indexes must not contain duplicates")));
+	}
+}
+
+/*
+ * Ensure finite element (rejects NaN and infinities)
+ */
+static inline void
+CheckElement(float value)
+{
+	if (isnan(value))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("NaN not allowed in sparsevec")));
+
+	if (isinf(value))
+		ereport(ERROR,
+				(errcode(ERRCODE_DATA_EXCEPTION),
+				 errmsg("infinite value not allowed in sparsevec")));
+}
+
+/*
+ * Allocate a zero-filled sparse vector with its varlena size, dim and nnz set
+ */
+SparseVector *
+InitSparseVector(int dim, int nnz)
+{
+	SparseVector *result;
+	int size;
+
+	size = SPARSEVEC_SIZE(nnz);
+	result = (SparseVector *) palloc0(size);
+	SET_VARSIZE(result, size);
+	result->dim = dim;
+	result->nnz = nnz;
+
+	return result;
+}
+
+/*
+ * Check for whitespace, since array_isspace() is static
+ */
+static inline bool
+sparsevec_isspace(char ch)
+{
+	if (ch == ' ' ||
+		ch == '\t' ||
+		ch == '\n' ||
+		ch == '\r' ||
+		ch == '\v' ||
+		ch == '\f')
+		return true;
+	return false;
+}
+
+/*
+ * Convert textual representation ({index:value,...}/dim) to internal representation
+ */
+PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_in);
+Datum
+sparsevec_in(PG_FUNCTION_ARGS)
+{
+	char *lit = PG_GETARG_CSTRING(0);
+	int32 typmod = PG_GETARG_INT32(2);
+	long dim;					/* long so values outside int range survive strtol */
+	char *pt;
+	char *stringEnd;
+	SparseVector *result;
+	float *rvalues;
+	char *litcopy = pstrdup(lit);	/* strtok modifies its input */
+	char *str = litcopy;
+	int32 *indices;
+	float *values;
+	int maxNnz;
+	int nnz = 0;
+
+	maxNnz = 1;					/* number of commas + 1 bounds the element count */
+	pt = str;
+	while (*pt != '\0')
+	{
+		if (*pt == ',')
+			maxNnz++;
+
+		pt++;
+	}
+
+	indices = palloc(maxNnz * sizeof(int32));
+	values = palloc(maxNnz * sizeof(float));
+
+	while (sparsevec_isspace(*str))
+		str++;
+
+	if (*str != '{')
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed sparsevec literal: \"%s\"", lit),
+				 errdetail("Vector contents must start with \"{\".")));
+
+	str++;
+	pt = strtok(str, ",");
+	stringEnd = pt;
+
+	while (pt != NULL && *stringEnd != '}')
+	{
+		long index;
+		float value;
+
+		/* TODO Better error */
+		if (nnz == maxNnz)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("ran out of buffer: \"%s\"", lit)));
+
+		while (sparsevec_isspace(*pt))
+			pt++;
+
+		/* Check for empty string like float4in */
+		if (*pt == '\0')
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type sparsevec: \"%s\"", lit)));
+
+		/* Use similar logic as int2vectorin */
+		errno = 0;
+		index = strtol(pt, &stringEnd, 10);
+
+		if (stringEnd == pt)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type sparsevec: \"%s\"", lit)));
+
+		if (errno == ERANGE || index < 0 || index > INT_MAX)
+			ereport(ERROR,
+					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+					 errmsg("index \"%ld\" is out of range for type sparsevec", index)));
+
+		while (sparsevec_isspace(*stringEnd))
+			stringEnd++;
+
+		if (*stringEnd != ':')
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type sparsevec: \"%s\"", lit)));
+
+		stringEnd++;
+
+		while (sparsevec_isspace(*stringEnd))
+			stringEnd++;
+
+		errno = 0;
+		pt = stringEnd;
+		value = strtof(pt, &stringEnd);
+
+		if (stringEnd == pt)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type sparsevec: \"%s\"", lit)));
+
+		/* Check for range error like float4in */
+		if (errno == ERANGE && (value == 0 || isinf(value)))
+			ereport(ERROR,
+					(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
+					 errmsg("\"%s\" is out of range for type sparsevec", pt)));
+
+		/* TODO Decide whether to store zero values */
+		if (value != 0)
+		{
+			indices[nnz] = index;
+			values[nnz] = value;
+			nnz++;
+		}
+
+		if (*stringEnd != '\0' && *stringEnd != '}')
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+					 errmsg("invalid input syntax for type sparsevec: \"%s\"", lit)));
+
+		pt = strtok(NULL, ",");
+	}
+
+	if (stringEnd == NULL || *stringEnd != '}')
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed sparsevec literal: \"%s\"", lit),
+				 errdetail("Unexpected end of input.")));
+
+	stringEnd++;
+
+	if (*stringEnd != '/')
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed sparsevec literal: \"%s\"", lit),
+				 errdetail("Unexpected end of input.")));
+
+	stringEnd++;
+
+	/* Use similar logic as int2vectorin; clamp below so CheckDim sees overflow */
+	errno = 0;
+	pt = stringEnd;
+	dim = strtol(pt, &stringEnd, 10);
+
+	if (stringEnd == pt)
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("invalid input syntax for type sparsevec: \"%s\"", lit)));
+
+	if (dim > INT_MAX)
+		dim = INT_MAX;
+	else if (dim < INT_MIN)
+		dim = INT_MIN;
+
+	/* Only whitespace is allowed after the closing brace */
+	while (sparsevec_isspace(*stringEnd))
+		stringEnd++;
+
+	if (*stringEnd != '\0')
+		ereport(ERROR,
+				(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
+				 errmsg("malformed sparsevec literal: \"%s\"", lit),
+				 errdetail("Junk after closing.")));
+
+	pfree(litcopy);
+
+	CheckDim(dim);
+	CheckExpectedDim(typmod, dim);
+
+	result = InitSparseVector(dim, nnz);
+	rvalues = SPARSEVEC_VALUES(result);
+	for (int i = 0; i < nnz; i++)
+	{
+		result->indices[i] = indices[i];
+		rvalues[i] = values[i];
+
+		CheckIndex(result->indices, i, dim);
+		CheckElement(rvalues[i]);
+	}
+
+	PG_RETURN_POINTER(result);
+}
+
+#define AppendChar(ptr, c) (*(ptr)++ = (c))
+#define AppendFloat(ptr, f) ((ptr) += float_to_shortest_decimal_bufn((f), (ptr)))
+
+#if PG_VERSION_NUM >= 140000
+#define AppendInt(ptr, i) ((ptr) += pg_ltoa((i), (ptr)))
+#else
+#define AppendInt(ptr, i) \
+	do { \
+		pg_ltoa(i, ptr); \
+		while (*ptr != '\0') \
+			ptr++; \
+	} while (0)
+#endif
+
+/*
+ * Convert internal representation to textual representation
+ */
+PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_out);
+Datum
+sparsevec_out(PG_FUNCTION_ARGS)
+{
+	SparseVector *sparsevec = PG_GETARG_SPARSEVEC_P(0);
+	float *values = SPARSEVEC_VALUES(sparsevec);
+	char *buf;
+	char *ptr;
+
+	/*
+	 * Need:
+	 *
+	 * nnz * 10 bytes for index (positive integer)
+	 *
+	 * nnz bytes for :
+	 *
+	 * nnz * (FLOAT_SHORTEST_DECIMAL_LEN - 1) bytes for
+	 * float_to_shortest_decimal_bufn
+	 *
+	 * nnz - 1 bytes for ,
+	 *
+	 * 10 bytes for dimensions
+	 *
+	 * 4 bytes for {, }, /, and \0
+	 */
+	buf = (char *) palloc((11 + FLOAT_SHORTEST_DECIMAL_LEN) * sparsevec->nnz + 13);
+	ptr = buf;
+
+	AppendChar(ptr, '{');
+
+	for (int i = 0; i < sparsevec->nnz; i++)
+	{
+		if (i > 0)
+			AppendChar(ptr, ',');
+
+		AppendInt(ptr, sparsevec->indices[i]);
+		AppendChar(ptr, ':');
+		AppendFloat(ptr, values[i]);
+	}
+
+	AppendChar(ptr, '}');
+	AppendChar(ptr, '/');
+	AppendInt(ptr, sparsevec->dim);
+	*ptr = '\0';
+
+	PG_FREE_IF_COPY(sparsevec, 0);
+	PG_RETURN_CSTRING(buf);
+}
+
+/*
+ * Convert type modifier
+ */
+PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_typmod_in);
+Datum
+sparsevec_typmod_in(PG_FUNCTION_ARGS)
+{
+	ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0);
+	int32 *tl;
+	int n;
+ + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type sparsevec must be at least 1"))); + + if (*tl > SPARSEVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type sparsevec cannot exceed %d", SPARSEVEC_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_recv); +Datum +sparsevec_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + SparseVector *result; + int32 dim; + int32 nnz; + int32 unused; + float *values; + + dim = pq_getmsgint(buf, sizeof(int32)); + nnz = pq_getmsgint(buf, sizeof(int32)); + unused = pq_getmsgint(buf, sizeof(int32)); + + CheckDim(dim); + CheckNnz(nnz, dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitSparseVector(dim, nnz); + values = SPARSEVEC_VALUES(result); + + for (int i = 0; i < nnz; i++) + { + result->indices[i] = pq_getmsgint(buf, sizeof(int32)); + CheckIndex(result->indices, i, dim); + } + + for (int i = 0; i < nnz; i++) + { + values[i] = pq_getmsgfloat4(buf); + CheckElement(values[i]); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_send); +Datum +sparsevec_send(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + float *values = SPARSEVEC_VALUES(svec); + StringInfoData buf; + + pq_begintypsend(&buf); + pq_sendint(&buf, svec->dim, sizeof(int32)); + pq_sendint(&buf, svec->nnz, sizeof(int32)); + pq_sendint(&buf, svec->unused, 
sizeof(int32)); + for (int i = 0; i < svec->nnz; i++) + pq_sendint(&buf, svec->indices[i], sizeof(int32)); + for (int i = 0; i < svec->nnz; i++) + pq_sendfloat4(&buf, values[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert sparse vector to sparse vector + * This is needed to check the type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec); +Datum +sparsevec(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, svec->dim); + + PG_RETURN_POINTER(svec); +} + +/* + * Convert dense vector to sparse vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_to_sparsevec); +Datum +vector_to_sparsevec(PG_FUNCTION_ARGS) +{ + Vector *vec = PG_GETARG_VECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + SparseVector *result; + int dim = vec->dim; + int nnz = 0; + float *values; + int j = 0; + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + for (int i = 0; i < dim; i++) + { + if (vec->x[i] != 0) + nnz++; + } + + result = InitSparseVector(dim, nnz); + values = SPARSEVEC_VALUES(result); + for (int i = 0; i < dim; i++) + { + if (vec->x[i] != 0) + { + /* Safety check */ + if (j == nnz) + elog(ERROR, "safety check failed"); + + result->indices[j] = i; + values[j] = vec->x[i]; + j++; + } + } + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 squared distance between sparse vectors + */ +static double +l2_distance_squared_internal(SparseVector * a, SparseVector * b) +{ + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + double distance = 0.0; + int bpos = 0; + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + int bi = -1; + + for (int j = bpos; j < b->nnz; j++) + { + bi = b->indices[j]; + + if (ai == bi) + { + double diff = ax[i] - bx[j]; + + distance += diff * diff; + } + else if (ai > bi) + distance += bx[j] * bx[j]; + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + 
break; + } + + if (ai != bi) + distance += ax[i] * ax[i]; + } + + for (int j = bpos; j < b->nnz; j++) + distance += bx[j] * bx[j]; + + return distance; +} + +/* + * Get the L2 distance between sparse vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_l2_distance); +Datum +sparsevec_l2_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(sqrt(l2_distance_squared_internal(a, b))); +} + +/* + * Get the L2 squared distance between sparse vectors + * This saves a sqrt calculation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_l2_squared_distance); +Datum +sparsevec_l2_squared_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(l2_distance_squared_internal(a, b)); +} + +/* + * Get the inner product of two sparse vectors + */ +static double +inner_product_internal(SparseVector * a, SparseVector * b) +{ + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + double distance = 0.0; + int bpos = 0; + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + + for (int j = bpos; j < b->nnz; j++) + { + int bi = b->indices[j]; + + /* Only update when the same index */ + if (ai == bi) + distance += ax[i] * bx[j]; + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + } + + return distance; +} + +/* + * Get the inner product of two sparse vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_inner_product); +Datum +sparsevec_inner_product(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(inner_product_internal(a, b)); +} + +/* + * Get the negative inner product of two sparse vectors + */ +PGDLLEXPORT 
PG_FUNCTION_INFO_V1(sparsevec_negative_inner_product); +Datum +sparsevec_negative_inner_product(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(-inner_product_internal(a, b)); +} + +/* + * Get the cosine distance between two sparse vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_cosine_distance); +Datum +sparsevec_cosine_distance(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + SparseVector *b = PG_GETARG_SPARSEVEC_P(1); + float *ax = SPARSEVEC_VALUES(a); + float *bx = SPARSEVEC_VALUES(b); + float norma = 0.0; /* squared norms are accumulated in float and widened to double only for the product below */ + float normb = 0.0; + double similarity; + + CheckDims(a, b); + + similarity = inner_product_internal(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->nnz; i++) + norma += ax[i] * ax[i]; + + /* Auto-vectorized */ + for (int i = 0; i < b->nnz; i++) + normb += bx[i] * bx[i]; + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + similarity /= sqrt((double) norma * (double) normb); /* a zero norm makes this 0/0 = NaN, which the tests expect for an empty vector */ + +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Keep in range */ + if (similarity > 1) + similarity = 1.0; + else if (similarity < -1) + similarity = -1.0; + + PG_RETURN_FLOAT8(1.0 - similarity); +} + +/* + * Get the L2 norm of a sparse vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_norm); +Datum +sparsevec_norm(PG_FUNCTION_ARGS) +{ + SparseVector *a = PG_GETARG_SPARSEVEC_P(0); + float *ax = SPARSEVEC_VALUES(a); + double norm = 0.0; + + /* Auto-vectorized */ + for (int i = 0; i < a->nnz; i++) + norm += (double) ax[i] * (double) ax[i]; + + PG_RETURN_FLOAT8(sqrt(norm)); +} diff --git a/src/sparsevec.h b/src/sparsevec.h new file mode 100644 index 0000000..673c5b0 --- /dev/null +++ b/src/sparsevec.h @@ -0,0 +1,24 @@ +#ifndef SPARSEVEC_H +#define SPARSEVEC_H + +#define SPARSEVEC_MAX_DIM 100000 + +/* Ensure values are aligned */ +#define SPARSEVEC_SIZE(_nnz)
(offsetof(SparseVector, indices) + MAXALIGN((_nnz) * sizeof(int32)) + ((_nnz) * sizeof(float))) +#define SPARSEVEC_VALUES(x) ((float *) (((char *) (x)) + offsetof(SparseVector, indices) + MAXALIGN((x)->nnz * sizeof(int32)))) +#define DatumGetSparseVector(x) ((SparseVector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_SPARSEVEC_P(x) DatumGetSparseVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_SPARSEVEC_P(x) PG_RETURN_POINTER(x) + +typedef struct SparseVector +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int32 dim; /* number of dimensions */ + int32 nnz; /* number of stored (non-zero) elements */ + int32 unused; /* presumably reserved for future use -- TODO confirm */ + int32 indices[FLEXIBLE_ARRAY_MEMBER]; /* nnz element positions; the nnz float values follow at SPARSEVEC_VALUES() */ +} SparseVector; + +SparseVector *InitSparseVector(int dim, int nnz); + +#endif diff --git a/src/vector.c b/src/vector.c index 97d922f..c3871ea 100644 --- a/src/vector.c +++ b/src/vector.c @@ -12,6 +12,7 @@ #include "lib/stringinfo.h" #include "libpq/pqformat.h" #include "port.h" /* for strtof() */ +#include "sparsevec.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/float.h" @@ -1214,3 +1215,26 @@ vector_avg(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } + +/* + * Convert sparse vector to dense vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(sparsevec_to_vector); +Datum +sparsevec_to_vector(PG_FUNCTION_ARGS) +{ + SparseVector *svec = PG_GETARG_SPARSEVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + Vector *result; + int dim = svec->dim; + float *values = SPARSEVEC_VALUES(svec); + + CheckDim(dim); /* the dense vector limit applies here: tests show "vector cannot have more than 16000 dimensions" */ + CheckExpectedDim(typmod, dim); + + result = InitVector(dim); /* positions not listed in svec keep InitVector's initial contents (presumably zero -- confirm) */ + for (int i = 0; i < svec->nnz; i++) + result->x[svec->indices[i]] = values[i]; + + PG_RETURN_POINTER(result); +} diff --git a/test/expected/hnsw_sparsevec_cosine.out b/test/expected/hnsw_sparsevec_cosine.out new file mode 100644 index 0000000..778415e --- /dev/null +++ b/test/expected/hnsw_sparsevec_cosine.out @@ -0,0 +1,26 @@ +SET enable_seqscan = off; +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{0:1,1:2,2:3}/3'), ('{0:1,1:1,2:1}/3'),
(NULL); +CREATE INDEX ON t USING hnsw (val sparsevec_cosine_ops); +INSERT INTO t (val) VALUES ('{0:1,1:2,2:4}/3'); +SELECT * FROM t ORDER BY val <=> '{0:3,1:3,2:3}/3'; + val +----------------- + {0:1,1:1,2:1}/3 + {0:1,1:2,2:3}/3 + {0:1,1:2,2:4}/3 +(3 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '{}/3') t2; + count +------- + 3 +(1 row) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::sparsevec)) t2; + count +------- + 3 +(1 row) + +DROP TABLE t; diff --git a/test/expected/hnsw_sparsevec_ip.out b/test/expected/hnsw_sparsevec_ip.out new file mode 100644 index 0000000..1c303f0 --- /dev/null +++ b/test/expected/hnsw_sparsevec_ip.out @@ -0,0 +1,21 @@ +SET enable_seqscan = off; +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{0:1,1:2,2:3}/3'), ('{0:1,1:1,2:1}/3'), (NULL); +CREATE INDEX ON t USING hnsw (val sparsevec_ip_ops); +INSERT INTO t (val) VALUES ('{0:1,1:2,2:4}/3'); +SELECT * FROM t ORDER BY val <#> '{0:3,1:3,2:3}/3'; + val +----------------- + {0:1,1:2,2:4}/3 + {0:1,1:2,2:3}/3 + {0:1,1:1,2:1}/3 + {}/3 +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::sparsevec)) t2; + count +------- + 4 +(1 row) + +DROP TABLE t; diff --git a/test/expected/hnsw_sparsevec_l2.out b/test/expected/hnsw_sparsevec_l2.out new file mode 100644 index 0000000..adc5cfd --- /dev/null +++ b/test/expected/hnsw_sparsevec_l2.out @@ -0,0 +1,43 @@ +SET enable_seqscan = off; +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{0:1,1:2,2:3}/3'), ('{0:1,1:1,2:1}/3'), (NULL); +CREATE INDEX ON t USING hnsw (val sparsevec_l2_ops); +INSERT INTO t (val) VALUES ('{0:1,1:2,2:4}/3'); +SELECT * FROM t ORDER BY val <-> '{0:3,1:3,2:3}/3'; + val +----------------- + {0:1,1:2,2:3}/3 + {0:1,1:2,2:4}/3 + {0:1,1:1,2:1}/3 + {}/3 +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::sparsevec)) t2; + count +------- + 4 +(1 row) + +SELECT COUNT(*) FROM t; + count 
+------- + 5 +(1 row) + +TRUNCATE t; +SELECT * FROM t ORDER BY val <-> '{0:3,1:3,2:3}/3'; + val +----- +(0 rows) + +DROP TABLE t; +-- TODO move +CREATE TABLE t (val sparsevec(1001)); +INSERT INTO t (val) VALUES (array_fill(1, ARRAY[1001])::vector::sparsevec); +CREATE INDEX ON t USING hnsw (val sparsevec_l2_ops); +ERROR: sparsevec cannot have more than 1000 non-zero elements for hnsw index +TRUNCATE t; +CREATE INDEX ON t USING hnsw (val sparsevec_l2_ops); +INSERT INTO t (val) VALUES (array_fill(1, ARRAY[1001])::vector::sparsevec); +ERROR: sparsevec cannot have more than 1000 non-zero elements for hnsw index +DROP TABLE t; diff --git a/test/expected/sparsevec_functions.out b/test/expected/sparsevec_functions.out new file mode 100644 index 0000000..07117d8 --- /dev/null +++ b/test/expected/sparsevec_functions.out @@ -0,0 +1,62 @@ +SELECT l2_distance('{}/2'::sparsevec, '{0:3,1:4}/2'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('{}/2'::sparsevec, '{1:1}/2'); + l2_distance +------------- + 1 +(1 row) + +SELECT '{}/2'::sparsevec <-> '{0:3,1:4}/2'; + ?column? 
+---------- + 5 +(1 row) + +SELECT inner_product('{0:1,1:2}/2'::sparsevec, '{0:2,1:4}/2'); + inner_product +--------------- + 10 +(1 row) + +SELECT sparsevec_negative_inner_product('{0:1,1:2}/2', '{0:2,1:4}/2'); + sparsevec_negative_inner_product +---------------------------------- + -10 +(1 row) + +SELECT cosine_distance('{0:1,1:2}/2'::sparsevec, '{0:2,1:4}/2'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('{0:1,1:2}/2'::sparsevec, '{}/2'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('{0:1,1:1}/2'::sparsevec, '{0:-1,1:-1}/2'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('{0:1}/2'::sparsevec, '{1:2}/2'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('{}/1'::sparsevec, '{}/1'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('{0:1}/2'::sparsevec, '{0:1}/3'); +ERROR: different sparsevec dimensions 2 and 3 diff --git a/test/expected/sparsevec_input.out b/test/expected/sparsevec_input.out new file mode 100644 index 0000000..bd2faf5 --- /dev/null +++ b/test/expected/sparsevec_input.out @@ -0,0 +1,62 @@ +SELECT '{0:1.5,2:3.5}/5'::sparsevec; + sparsevec +----------------- + {0:1.5,2:3.5}/5 +(1 row) + +SELECT '{0:1.5,2:3.5}/5'::sparsevec::vector; + vector +----------------- + [1.5,0,3.5,0,0] +(1 row) + +SELECT '{0:1.5,2:3.5}/5'::sparsevec::vector(5); + vector +----------------- + [1.5,0,3.5,0,0] +(1 row) + +SELECT '{0:1.5,2:3.5}/5'::sparsevec::vector(4); +ERROR: expected 4 dimensions, not 5 +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec; + sparsevec +----------------- + {1:1.5,3:3.5}/5 +(1 row) + +SELECT '{0:0,1:1,2:0}/3'::sparsevec; + sparsevec +----------- + {1:1}/3 +(1 row) + +SELECT '{1:1,0:1}/2'::sparsevec; +ERROR: indexes must be in ascending order +LINE 1: SELECT '{1:1,0:1}/2'::sparsevec; + ^ +SELECT '{}/5'::sparsevec; + sparsevec +----------- + {}/5 +(1 row) + +SELECT '{}/-1'::sparsevec; +ERROR: sparsevec must 
have at least 1 dimension +LINE 1: SELECT '{}/-1'::sparsevec; + ^ +SELECT '{}/100001'::sparsevec; +ERROR: sparsevec cannot have more than 100000 dimensions +LINE 1: SELECT '{}/100001'::sparsevec; + ^ +SELECT '{}/16001'::sparsevec::vector; +ERROR: vector cannot have more than 16000 dimensions +SELECT '{-1:1}/1'::sparsevec; +ERROR: index "-1" is out of range for type sparsevec +LINE 1: SELECT '{-1:1}/1'::sparsevec; + ^ +SELECT '{1:1}/1'::sparsevec; +ERROR: index must be less than dimensions +LINE 1: SELECT '{1:1}/1'::sparsevec; + ^ +SELECT '{}/1'::sparsevec(2); +ERROR: expected 2 dimensions, not 1 diff --git a/test/sql/hnsw_sparsevec_cosine.sql b/test/sql/hnsw_sparsevec_cosine.sql new file mode 100644 index 0000000..685423c --- /dev/null +++ b/test/sql/hnsw_sparsevec_cosine.sql @@ -0,0 +1,13 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{0:1,1:2,2:3}/3'), ('{0:1,1:1,2:1}/3'), (NULL); +CREATE INDEX ON t USING hnsw (val sparsevec_cosine_ops); + +INSERT INTO t (val) VALUES ('{0:1,1:2,2:4}/3'); + +SELECT * FROM t ORDER BY val <=> '{0:3,1:3,2:3}/3'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '{}/3') t2; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::sparsevec)) t2; + +DROP TABLE t; diff --git a/test/sql/hnsw_sparsevec_ip.sql b/test/sql/hnsw_sparsevec_ip.sql new file mode 100644 index 0000000..1888d9c --- /dev/null +++ b/test/sql/hnsw_sparsevec_ip.sql @@ -0,0 +1,12 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{0:1,1:2,2:3}/3'), ('{0:1,1:1,2:1}/3'), (NULL); +CREATE INDEX ON t USING hnsw (val sparsevec_ip_ops); + +INSERT INTO t (val) VALUES ('{0:1,1:2,2:4}/3'); + +SELECT * FROM t ORDER BY val <#> '{0:3,1:3,2:3}/3'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::sparsevec)) t2; + +DROP TABLE t; diff --git a/test/sql/hnsw_sparsevec_l2.sql b/test/sql/hnsw_sparsevec_l2.sql new file mode 100644 
index 0000000..b472607 --- /dev/null +++ b/test/sql/hnsw_sparsevec_l2.sql @@ -0,0 +1,25 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val sparsevec(3)); +INSERT INTO t (val) VALUES ('{}/3'), ('{0:1,1:2,2:3}/3'), ('{0:1,1:1,2:1}/3'), (NULL); +CREATE INDEX ON t USING hnsw (val sparsevec_l2_ops); + +INSERT INTO t (val) VALUES ('{0:1,1:2,2:4}/3'); + +SELECT * FROM t ORDER BY val <-> '{0:3,1:3,2:3}/3'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::sparsevec)) t2; +SELECT COUNT(*) FROM t; + +TRUNCATE t; +SELECT * FROM t ORDER BY val <-> '{0:3,1:3,2:3}/3'; + +DROP TABLE t; + +-- TODO move +CREATE TABLE t (val sparsevec(1001)); +INSERT INTO t (val) VALUES (array_fill(1, ARRAY[1001])::vector::sparsevec); +CREATE INDEX ON t USING hnsw (val sparsevec_l2_ops); +TRUNCATE t; +CREATE INDEX ON t USING hnsw (val sparsevec_l2_ops); +INSERT INTO t (val) VALUES (array_fill(1, ARRAY[1001])::vector::sparsevec); +DROP TABLE t; diff --git a/test/sql/sparsevec_functions.sql b/test/sql/sparsevec_functions.sql new file mode 100644 index 0000000..86f7990 --- /dev/null +++ b/test/sql/sparsevec_functions.sql @@ -0,0 +1,13 @@ +SELECT l2_distance('{}/2'::sparsevec, '{0:3,1:4}/2'); +SELECT l2_distance('{}/2'::sparsevec, '{1:1}/2'); +SELECT '{}/2'::sparsevec <-> '{0:3,1:4}/2'; + +SELECT inner_product('{0:1,1:2}/2'::sparsevec, '{0:2,1:4}/2'); +SELECT sparsevec_negative_inner_product('{0:1,1:2}/2', '{0:2,1:4}/2'); + +SELECT cosine_distance('{0:1,1:2}/2'::sparsevec, '{0:2,1:4}/2'); +SELECT cosine_distance('{0:1,1:2}/2'::sparsevec, '{}/2'); +SELECT cosine_distance('{0:1,1:1}/2'::sparsevec, '{0:-1,1:-1}/2'); +SELECT cosine_distance('{0:1}/2'::sparsevec, '{1:2}/2'); +SELECT cosine_distance('{}/1'::sparsevec, '{}/1'); +SELECT cosine_distance('{0:1}/2'::sparsevec, '{0:1}/3'); diff --git a/test/sql/sparsevec_input.sql b/test/sql/sparsevec_input.sql new file mode 100644 index 0000000..1fdfd88 --- /dev/null +++ b/test/sql/sparsevec_input.sql @@ -0,0 +1,19 @@ +SELECT 
'{0:1.5,2:3.5}/5'::sparsevec; +SELECT '{0:1.5,2:3.5}/5'::sparsevec::vector; +SELECT '{0:1.5,2:3.5}/5'::sparsevec::vector(5); +SELECT '{0:1.5,2:3.5}/5'::sparsevec::vector(4); +SELECT '[0,1.5,0,3.5,0]'::vector::sparsevec; + +SELECT '{0:0,1:1,2:0}/3'::sparsevec; + +SELECT '{1:1,0:1}/2'::sparsevec; + +SELECT '{}/5'::sparsevec; +SELECT '{}/-1'::sparsevec; +SELECT '{}/100001'::sparsevec; +SELECT '{}/16001'::sparsevec::vector; + +SELECT '{-1:1}/1'::sparsevec; +SELECT '{1:1}/1'::sparsevec; + +SELECT '{}/1'::sparsevec(2);