diff --git a/CHANGELOG.md b/CHANGELOG.md index 07040d0..87242a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.6.0 (unreleased) + +- Added support for sparse vectors + ## 0.5.1 (2023-10-10) - Improved performance of HNSW index builds diff --git a/Makefile b/Makefile index f6c1f20..0cf1c7b 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ EXTVERSION = 0.5.1 MODULE_big = vector DATA = $(wildcard sql/*--*.sql) -OBJS = src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o -HEADERS = src/vector.h +OBJS = src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/svector.o src/vector.o +HEADERS = src/svector.h src/vector.h TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) diff --git a/Makefile.win b/Makefile.win index f6d955a..45db7be 100644 --- a/Makefile.win +++ b/Makefile.win @@ -1,8 +1,8 @@ EXTENSION = vector EXTVERSION = 0.5.1 -OBJS = src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj -HEADERS = src\vector.h +OBJS = src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\svector.obj src\vector.obj +HEADERS = src\svector.h src\vector.h REGRESS = btree cast copy functions input ivfflat_cosine ivfflat_ip ivfflat_l2 ivfflat_options ivfflat_unlogged REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) diff --git a/README.md b/README.md index bca3de8..eda757d 
100644 --- a/README.md +++ b/README.md @@ -369,6 +369,26 @@ To speed up queries with an IVFFlat index, increase the number of inverted lists CREATE INDEX ON items USING ivfflat (embedding vector_l2_ops) WITH (lists = 1000); ``` +## Sparse Vectors + +Create a sparse vector column with 10 dimensions + +```sql +CREATE TABLE items (id bigserial PRIMARY KEY, embedding svector(10)); +``` + +Insert vectors + +```sql +INSERT INTO items (embedding) VALUES ('(0,1),(1,2),(2,3)|10|'), ('(0,4),(1,5),(4,6)|10|'); +``` + +Get the nearest neighbors by L2 distance + +```sql +SELECT * FROM items ORDER BY embedding <-> '(0,3),(1,1),(2,2)|10|' LIMIT 5; +``` + ## Languages Use pgvector from any language with a Postgres client. You can even generate and store vectors in one language and query them in another. diff --git a/sql/vector--0.5.1--0.6.0.sql b/sql/vector--0.5.1--0.6.0.sql new file mode 100644 index 0000000..0b1a51e --- /dev/null +++ b/sql/vector--0.5.1--0.6.0.sql @@ -0,0 +1,79 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.6.0'" to load this file. 
\quit + +CREATE TYPE svector; -- shell type so the I/O functions below can reference svector + +CREATE FUNCTION svector_in(cstring, oid, integer) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_out(svector) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_recv(internal, oid, integer) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_send(svector) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE svector ( + INPUT = svector_in, + OUTPUT = svector_out, + TYPMOD_IN = svector_typmod_in, + RECEIVE = svector_recv, + SEND = svector_send, + STORAGE = external +); + +CREATE FUNCTION l2_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION jaccard_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_jaccard_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_l2_squared_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_negative_inner_product(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector(svector, integer, boolean) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_to_svector(vector, integer, boolean) RETURNS svector + 
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_to_vector(svector, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE CAST (svector AS svector) + WITH FUNCTION svector(svector, integer, boolean) AS IMPLICIT; -- length coercion: checks the declared dimension + +CREATE CAST (svector AS vector) + WITH FUNCTION svector_to_vector(svector, integer, boolean) AS IMPLICIT; + +CREATE CAST (vector AS svector) + WITH FUNCTION vector_to_svector(vector, integer, boolean) AS IMPLICIT; + +CREATE OPERATOR <-> ( + LEFTARG = svector, RIGHTARG = svector, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = svector, RIGHTARG = svector, PROCEDURE = svector_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = svector, RIGHTARG = svector, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); diff --git a/sql/vector.sql b/sql/vector.sql index 137931f..4ca8908 100644 --- a/sql/vector.sql +++ b/sql/vector.sql @@ -290,3 +290,92 @@ CREATE OPERATOR CLASS vector_cosine_ops OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops, FUNCTION 1 vector_negative_inner_product(vector, vector), FUNCTION 2 vector_norm(vector); + +-- svector type + +CREATE TYPE svector; -- shell type so the I/O functions below can reference svector + +CREATE FUNCTION svector_in(cstring, oid, integer) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_out(svector) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_recv(internal, oid, integer) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_send(svector) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE svector ( + INPUT = svector_in, + OUTPUT = svector_out, + TYPMOD_IN = 
svector_typmod_in, + RECEIVE = svector_recv, + SEND = svector_send, + STORAGE = external +); + +-- svector functions + +CREATE FUNCTION l2_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION jaccard_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME', 'svector_jaccard_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- svector private functions (svector_negative_inner_product backs the <#> operator) + +CREATE FUNCTION svector_l2_squared_distance(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_negative_inner_product(svector, svector) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- svector cast functions + +CREATE FUNCTION svector(svector, integer, boolean) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION vector_to_svector(vector, integer, boolean) RETURNS svector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION svector_to_vector(svector, integer, boolean) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- svector casts (svector -> svector checks the declared dimension) + +CREATE CAST (svector AS svector) + WITH FUNCTION svector(svector, integer, boolean) AS IMPLICIT; + +CREATE CAST (svector AS vector) + WITH FUNCTION svector_to_vector(svector, integer, boolean) AS IMPLICIT; + +CREATE CAST (vector AS svector) + WITH FUNCTION vector_to_svector(vector, integer, boolean) AS IMPLICIT; + +-- svector operators + +CREATE OPERATOR <-> ( + LEFTARG = svector, RIGHTARG = svector, PROCEDURE = l2_distance, + COMMUTATOR = 
'<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = svector, RIGHTARG = svector, PROCEDURE = svector_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = svector, RIGHTARG = svector, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); diff --git a/src/svector.c b/src/svector.c new file mode 100644 index 0000000..7e3d192 --- /dev/null +++ b/src/svector.c @@ -0,0 +1,704 @@ +#include "postgres.h" + +#include <math.h> + +#include "fmgr.h" +#include "libpq/pqformat.h" +#include "svector.h" +#include "utils/array.h" +#include "vector.h" + +#if PG_VERSION_NUM >= 120000 +#include "common/shortest_dec.h" +#include "utils/float.h" +#else +#include <float.h> +#include "utils/builtins.h" +#endif + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(SVector * a, SVector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different svector dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions (a typmod of -1 skips the check) + */ +static inline void +CheckExpectedDim(int32 typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("svector must have at least 1 dimension"))); + + if (dim > SVECTOR_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("svector cannot have more than %d dimensions", SVECTOR_MAX_DIM))); +} + +/* + * Ensure valid nnz (element count between 0 and dim, inclusive) + */ +static inline void +CheckNnz(int nnz, int dim) +{ + if (nnz < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("svector cannot have negative number of elements"))); + + if (nnz > dim) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("svector cannot have more elements than dimensions"))); +} + +/* + * Ensure valid index + */ +static inline void 
+CheckIndex(int32 *indices, int i, int dim) +{ + int32 index = indices[i]; + + if (index < 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("index must not be negative"))); + + if (index >= dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("index must be less than dimensions"))); + + if (i > 0) + { + if (index < indices[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("indexes must be in ascending order"))); + + if (index == indices[i - 1]) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("indexes must not contain duplicates"))); + } +} + +/* + * Ensure finite element + */ +static inline void +CheckElement(float value) +{ + if (isnan(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("NaN not allowed in svector"))); + + if (isinf(value)) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("infinite value not allowed in svector"))); +} + +/* + * Allocate and initialize a new sparse vector + */ +SVector * +InitSVector(int dim, int nnz) +{ + SVector *result; + int size; + + size = SVECTOR_SIZE(nnz); + result = (SVector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + result->nnz = nnz; + + return result; +} + +/* + * Convert textual representation "(index,value),...|dim|" to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_in); +Datum +svector_in(PG_FUNCTION_ARGS) +{ + char *str = PG_GETARG_CSTRING(0); + int32 typmod = PG_GETARG_INT32(2); + int dim; + char *pt; + SVector *result; + float *rvalues; + char *lit = pstrdup(str); + int n; + int32 *indices; + float *values; + int index; + float value; + int maxNnz; + int nnz = 0; + + /* TODO Improve code and checks after deciding on format */ + + maxNnz = 1; + pt = str; + while (*pt != '\0') + { + if (*pt == ',') + maxNnz++; + + pt++; + } + maxNnz /= 2; + + indices = palloc(maxNnz * sizeof(int32)); + values = palloc(maxNnz * sizeof(float)); + + while (sscanf(str, "(%d,%f%n", &index, &value, &n) == 2 && str[n] == ')') /* %n precedes ')' so n is always set when both fields parse */ + { + /* 
TODO Better error */ + if (nnz == maxNnz) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("ran out of buffer: \"%s\"", lit))); + + indices[nnz] = index; + values[nnz] = value; + nnz++; + + str += n + 1; /* step over the element and its closing paren */ + + if (*str == ',') + str++; + else if (*str == '|') + break; + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed svector literal: \"%s\"", lit))); + } + + if (sscanf(str, "|%d%n", &dim, &n) != 1 || str[n] != '|') /* n is set whenever dim parses; closing pipe checked by hand */ + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed svector literal: \"%s\"", lit))); + + str += n + 1; /* step over the dimension and its closing pipe */ + + if (*str != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed svector literal: \"%s\"", lit), + errdetail("Junk after closing pipe."))); + + pfree(lit); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + result = InitSVector(dim, nnz); + rvalues = SVECTOR_VALUES(result); + for (int i = 0; i < nnz; i++) + { + result->indices[i] = indices[i]; + rvalues[i] = values[i]; + + CheckIndex(result->indices, i, dim); + CheckElement(rvalues[i]); + } + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to textual representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_out); +Datum +svector_out(PG_FUNCTION_ARGS) +{ + SVector *svector = PG_GETARG_SVECTOR_P(0); + float *values = SVECTOR_VALUES(svector); + char *buf; + char *ptr; + int n; + + /* TODO Improve code after deciding on format */ + +#if PG_VERSION_NUM < 120000 + int ndig = FLT_DIG + extra_float_digits; + + if (ndig < 1) + ndig = 1; + +#define FLOAT_SHORTEST_DECIMAL_LEN (ndig + 10) +#endif + + /* TODO Move */ +#define APPEND_CHAR(ptr, ch) (*(ptr)++ = (ch)) + + /* TODO Improve */ + buf = (char *) palloc((FLOAT_SHORTEST_DECIMAL_LEN + 20) * svector->nnz + 20); + ptr = buf; + + for (int i = 0; i < svector->nnz; i++) + { + if (i > 0) + APPEND_CHAR(ptr, ','); + + n = sprintf(ptr, "(%d,", svector->indices[i]); + ptr += n; + +#if PG_VERSION_NUM >= 120000 + n = 
float_to_shortest_decimal_bufn(values[i], ptr); +#else + n = sprintf(ptr, "%.*g", ndig, values[i]); +#endif + ptr += n; + + APPEND_CHAR(ptr, ')'); + } + + n = sprintf(ptr, "|%d|", svector->dim); + ptr += n; + + APPEND_CHAR(ptr, '\0'); + + PG_FREE_IF_COPY(svector, 0); + PG_RETURN_CSTRING(buf); +} + +/* + * Convert type modifier (exactly one integer, 1..SVECTOR_MAX_DIM) + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_typmod_in); +Datum +svector_typmod_in(PG_FUNCTION_ARGS) +{ + ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0); + int32 *tl; + int n; + + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type svector must be at least 1"))); + + if (*tl > SVECTOR_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type svector cannot exceed %d", SVECTOR_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation (dim, nnz, unused, nnz indices, nnz float4 values) to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_recv); +Datum +svector_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + SVector *result; + int32 dim; + int32 nnz; + int32 unused; + float *values; + + dim = pq_getmsgint(buf, sizeof(int32)); + nnz = pq_getmsgint(buf, sizeof(int32)); + unused = pq_getmsgint(buf, sizeof(int32)); + + CheckDim(dim); + CheckNnz(nnz, dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitSVector(dim, nnz); + values = SVECTOR_VALUES(result); + + for (int i = 0; i < nnz; i++) + { + result->indices[i] = pq_getmsgint(buf, sizeof(int32)); + CheckIndex(result->indices, i, dim); + } + + for (int i = 0; i < nnz; i++) + { + values[i] = pq_getmsgfloat4(buf); + CheckElement(values[i]); + } + + 
PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_send); +Datum +svector_send(PG_FUNCTION_ARGS) +{ + SVector *svec = PG_GETARG_SVECTOR_P(0); + float *values = SVECTOR_VALUES(svec); + StringInfoData buf; + + pq_begintypsend(&buf); + pq_sendint(&buf, svec->dim, sizeof(int32)); + pq_sendint(&buf, svec->nnz, sizeof(int32)); + pq_sendint(&buf, svec->unused, sizeof(int32)); /* svector_recv rejects any value other than 0 */ + for (int i = 0; i < svec->nnz; i++) + pq_sendint(&buf, svec->indices[i], sizeof(int32)); + for (int i = 0; i < svec->nnz; i++) + pq_sendfloat4(&buf, values[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert sparse vector to sparse vector + * This is needed to check the type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector); +Datum +svector(PG_FUNCTION_ARGS) +{ + SVector *svec = PG_GETARG_SVECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, svec->dim); + + PG_RETURN_POINTER(svec); +} + +/* + * Convert dense vector to sparse vector (only non-zero elements are stored) + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(vector_to_svector); +Datum +vector_to_svector(PG_FUNCTION_ARGS) +{ + Vector *vec = PG_GETARG_VECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + SVector *result; + int dim = vec->dim; + int nnz = 0; + float *values; + int j = 0; + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + for (int i = 0; i < dim; i++) + { + if (vec->x[i] != 0) + nnz++; /* first pass only counts, to size the result */ + } + + result = InitSVector(dim, nnz); + values = SVECTOR_VALUES(result); + for (int i = 0; i < dim; i++) + { + if (vec->x[i] != 0) + { + /* Safety check */ + if (j == nnz) + elog(ERROR, "safety check failed"); + + result->indices[j] = i; + values[j] = vec->x[i]; + j++; + } + } + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 squared distance between sparse vectors (walks both ascending index lists once) + */ +static double +l2_distance_squared_internal(SVector * a, SVector * b) +{ + float *ax = SVECTOR_VALUES(a); + float *bx = SVECTOR_VALUES(b); + double distance = 0.0; 
+ int bpos = 0; /* first position in b not yet consumed */ + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + int bi = -1; + + for (int j = bpos; j < b->nnz; j++) + { + bi = b->indices[j]; + + if (ai == bi) + { + double diff = ax[i] - bx[j]; + + distance += diff * diff; + } + else if (ai > bi) + distance += bx[j] * bx[j]; /* index present only in b */ + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + + if (ai != bi) + distance += ax[i] * ax[i]; /* index present only in a */ + } + + for (int j = bpos; j < b->nnz; j++) + distance += bx[j] * bx[j]; /* elements of b left over after a is exhausted */ + + return distance; +} + +/* + * Get the L2 distance between sparse vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_l2_distance); +Datum +svector_l2_distance(PG_FUNCTION_ARGS) +{ + SVector *a = PG_GETARG_SVECTOR_P(0); + SVector *b = PG_GETARG_SVECTOR_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(sqrt(l2_distance_squared_internal(a, b))); +} + +/* + * Get the L2 squared distance between sparse vectors + * This saves a sqrt calculation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_l2_squared_distance); +Datum +svector_l2_squared_distance(PG_FUNCTION_ARGS) +{ + SVector *a = PG_GETARG_SVECTOR_P(0); + SVector *b = PG_GETARG_SVECTOR_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(l2_distance_squared_internal(a, b)); +} + +/* + * Get the inner product of two sparse vectors + */ +static double +inner_product_internal(SVector * a, SVector * b) +{ + float *ax = SVECTOR_VALUES(a); + float *bx = SVECTOR_VALUES(b); + double distance = 0.0; + int bpos = 0; /* first position in b not yet consumed */ + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + + for (int j = bpos; j < b->nnz; j++) + { + int bi = b->indices[j]; + + /* Only update when the same index */ + if (ai == bi) + distance += ax[i] * bx[j]; + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + } + + return distance; +} + +/* + * Get the inner product of two sparse vectors + */ +PGDLLEXPORT 
PG_FUNCTION_INFO_V1(svector_inner_product); +Datum +svector_inner_product(PG_FUNCTION_ARGS) +{ + SVector *a = PG_GETARG_SVECTOR_P(0); + SVector *b = PG_GETARG_SVECTOR_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(inner_product_internal(a, b)); +} + +/* + * Get the negative inner product of two sparse vectors (backs the <#> operator) + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_negative_inner_product); +Datum +svector_negative_inner_product(PG_FUNCTION_ARGS) +{ + SVector *a = PG_GETARG_SVECTOR_P(0); + SVector *b = PG_GETARG_SVECTOR_P(1); + + CheckDims(a, b); + + PG_RETURN_FLOAT8(-inner_product_internal(a, b)); +} + +/* + * Get the cosine distance between two sparse vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_cosine_distance); +Datum +svector_cosine_distance(PG_FUNCTION_ARGS) +{ + SVector *a = PG_GETARG_SVECTOR_P(0); + SVector *b = PG_GETARG_SVECTOR_P(1); + float *ax = SVECTOR_VALUES(a); + float *bx = SVECTOR_VALUES(b); + float norma = 0.0; + float normb = 0.0; + double similarity; + + CheckDims(a, b); + + similarity = inner_product_internal(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->nnz; i++) + norma += ax[i] * ax[i]; + + /* Auto-vectorized */ + for (int i = 0; i < b->nnz; i++) + normb += bx[i] * bx[i]; + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + similarity /= sqrt((double) norma * (double) normb); /* 0 / 0 = NaN when either norm is 0 */ + +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Keep in range */ + if (similarity > 1) + similarity = 1.0; + else if (similarity < -1) + similarity = -1.0; + + PG_RETURN_FLOAT8(1.0 - similarity); +} + +/* + * Get the weighted Jaccard distance between two sparse vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_jaccard_distance); +Datum +svector_jaccard_distance(PG_FUNCTION_ARGS) +{ + SVector *a = PG_GETARG_SVECTOR_P(0); + SVector *b = PG_GETARG_SVECTOR_P(1); + float *ax = SVECTOR_VALUES(a); + float *bx = SVECTOR_VALUES(b); + double num = 0.0; + double denom = 0.0; + int bpos = 0; + + 
CheckDims(a, b); + + /* + * Weighted Jaccard distance is not defined for vectors with negative + * values. Could check and return NaN if minimal impact on performance. + */ + + for (int i = 0; i < a->nnz; i++) + { + int ai = a->indices[i]; + int bi = -1; + + for (int j = bpos; j < b->nnz; j++) + { + bi = b->indices[j]; + + if (ai == bi) + { + num += ax[i] < bx[j] ? ax[i] : bx[j]; /* min of the pair */ + denom += ax[i] > bx[j] ? ax[i] : bx[j]; /* max of the pair */ + } + else if (ai > bi) + denom += bx[j]; /* index present only in b */ + + /* Update start for next iteration */ + if (ai >= bi) + bpos = j + 1; + + /* Found or passed it */ + if (bi >= ai) + break; + } + + if (ai != bi) + denom += ax[i]; /* index present only in a */ + } + + for (int j = bpos; j < b->nnz; j++) + denom += bx[j]; /* elements of b left over after a is exhausted */ + + if (denom > 0) + PG_RETURN_FLOAT8(1.0 - (num / denom)); + else + PG_RETURN_FLOAT8(NAN); /* e.g. both vectors have no stored elements */ +} diff --git a/src/svector.h b/src/svector.h new file mode 100644 index 0000000..29fbc29 --- /dev/null +++ b/src/svector.h @@ -0,0 +1,23 @@ +#ifndef SVECTOR_H +#define SVECTOR_H + +#define SVECTOR_MAX_DIM 100000 + +#define SVECTOR_SIZE(_nnz) (offsetof(SVector, indices) + (_nnz) * sizeof(int32) + (_nnz) * sizeof(float)) /* header + _nnz indices + _nnz values */ +#define SVECTOR_VALUES(x) ((float *) (((char *) (x)) + offsetof(SVector, indices) + (x)->nnz * sizeof(int32))) +#define DatumGetSVector(x) ((SVector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_SVECTOR_P(x) DatumGetSVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_SVECTOR_P(x) PG_RETURN_POINTER(x) + +typedef struct SVector +{ + int32 vl_len_; /* varlena header (do not touch directly!) 
*/ + int32 dim; /* number of dimensions */ + int32 nnz; /* number of stored elements */ + int32 unused; /* must be 0 (svector_recv rejects anything else) */ + int32 indices[FLEXIBLE_ARRAY_MEMBER]; /* nnz ascending indices, followed by nnz float values (see SVECTOR_VALUES) */ +} SVector; + +SVector *InitSVector(int dim, int nnz); + +#endif diff --git a/src/vector.c b/src/vector.c index 2f1e886..0db856d 100644 --- a/src/vector.c +++ b/src/vector.c @@ -9,6 +9,7 @@ #include "lib/stringinfo.h" #include "libpq/pqformat.h" #include "port.h" /* for strtof() */ +#include "svector.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/lsyscache.h" @@ -1151,3 +1152,26 @@ vector_avg(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } + +/* + * Convert sparse vector to dense vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(svector_to_vector); +Datum +svector_to_vector(PG_FUNCTION_ARGS) +{ + SVector *svec = PG_GETARG_SVECTOR_P(0); + int32 typmod = PG_GETARG_INT32(1); + Vector *result; + int dim = svec->dim; + float *values = SVECTOR_VALUES(svec); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + result = InitVector(dim); /* NOTE(review): relies on InitVector zero-filling x for positions not stored - confirm */ + for (int i = 0; i < svec->nnz; i++) + result->x[svec->indices[i]] = values[i]; + + PG_RETURN_POINTER(result); +} diff --git a/test/expected/functions.out b/test/expected/functions.out index 2840688..0973b94 100644 --- a/test/expected/functions.out +++ b/test/expected/functions.out @@ -54,85 +54,85 @@ SELECT vector_norm('[3e37,4e37]')::real; 5e+37 (1 row) -SELECT l2_distance('[0,0]', '[3,4]'); +SELECT l2_distance('[0,0]'::vector, '[3,4]'); l2_distance ------------- 5 (1 row) -SELECT l2_distance('[0,0]', '[0,1]'); +SELECT l2_distance('[0,0]'::vector, '[0,1]'); l2_distance ------------- 1 (1 row) -SELECT l2_distance('[1,2]', '[3]'); +SELECT l2_distance('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT l2_distance('[3e38]', '[-3e38]'); +SELECT l2_distance('[3e38]'::vector, '[-3e38]'); l2_distance ------------- Infinity (1 row) -SELECT inner_product('[1,2]', '[3,4]'); +SELECT inner_product('[1,2]'::vector, '[3,4]'); inner_product --------------- 11 (1 row) -SELECT inner_product('[1,2]', 
'[3]'); +SELECT inner_product('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT inner_product('[3e38]', '[3e38]'); +SELECT inner_product('[3e38]'::vector, '[3e38]'); inner_product --------------- Infinity (1 row) -SELECT cosine_distance('[1,2]', '[2,4]'); +SELECT cosine_distance('[1,2]'::vector, '[2,4]'); cosine_distance ----------------- 0 (1 row) -SELECT cosine_distance('[1,2]', '[0,0]'); +SELECT cosine_distance('[1,2]'::vector, '[0,0]'); cosine_distance ----------------- NaN (1 row) -SELECT cosine_distance('[1,1]', '[1,1]'); +SELECT cosine_distance('[1,1]'::vector, '[1,1]'); cosine_distance ----------------- 0 (1 row) -SELECT cosine_distance('[1,0]', '[0,2]'); +SELECT cosine_distance('[1,0]'::vector, '[0,2]'); cosine_distance ----------------- 1 (1 row) -SELECT cosine_distance('[1,1]', '[-1,-1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1,-1]'); cosine_distance ----------------- 2 (1 row) -SELECT cosine_distance('[1,2]', '[3]'); +SELECT cosine_distance('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT cosine_distance('[1,1]', '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[1.1,1.1]'); cosine_distance ----------------- 0 (1 row) -SELECT cosine_distance('[1,1]', '[-1.1,-1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1.1,-1.1]'); cosine_distance ----------------- 2 (1 row) -SELECT cosine_distance('[3e38]', '[3e38]'); +SELECT cosine_distance('[3e38]'::vector, '[3e38]'); cosine_distance ----------------- NaN diff --git a/test/expected/svector.out b/test/expected/svector.out new file mode 100644 index 0000000..6b0d6a5 --- /dev/null +++ b/test/expected/svector.out @@ -0,0 +1,134 @@ +SELECT '(0,1.5),(2,3.5)|5|'::svector; + svector +-------------------- + (0,1.5),(2,3.5)|5| +(1 row) + +SELECT '(0,1.5),(2,3.5)|5|'::svector::vector; + vector +----------------- + [1.5,0,3.5,0,0] +(1 row) + +SELECT '(0,1.5),(2,3.5)|5|'::svector::vector(5); + vector +----------------- + [1.5,0,3.5,0,0] +(1 row) + +SELECT 
'(0,1.5),(2,3.5)|5|'::svector::vector(4); +ERROR: expected 4 dimensions, not 5 +SELECT '[0,1.5,0,3.5,0]'::vector::svector; + svector +-------------------- + (1,1.5),(3,3.5)|5| +(1 row) + +SELECT '|5|'::svector; + svector +--------- + |5| +(1 row) + +SELECT '|-1|'::svector; +ERROR: svector must have at least 1 dimension +LINE 1: SELECT '|-1|'::svector; + ^ +SELECT '|100001|'::svector; +ERROR: svector cannot have more than 100000 dimensions +LINE 1: SELECT '|100001|'::svector; + ^ +SELECT '|16001|'::svector::vector; +ERROR: vector cannot have more than 16000 dimensions +SELECT '(-1,1)|1|'::svector; +ERROR: index must not be negative +LINE 1: SELECT '(-1,1)|1|'::svector; + ^ +SELECT '(1,1)|1|'::svector; +ERROR: index must be less than dimensions +LINE 1: SELECT '(1,1)|1|'::svector; + ^ +SELECT '|1|'::svector(2); +ERROR: expected 2 dimensions, not 1 +SELECT l2_distance('|2|'::svector, '(0,3),(1,4)|2|'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('|2|'::svector, '(1,1)|2|'); + l2_distance +------------- + 1 +(1 row) + +SELECT '|2|'::svector <-> '(0,3),(1,4)|2|'; + ?column? 
+---------- + 5 +(1 row) + +SELECT inner_product('(0,1),(1,2)|2|'::svector, '(0,2),(1,4)|2|'); + inner_product +--------------- + 10 +(1 row) + +SELECT svector_negative_inner_product('(0,1),(1,2)|2|', '(0,2),(1,4)|2|'); + svector_negative_inner_product +-------------------------------- + -10 +(1 row) + +SELECT cosine_distance('(0,1),(1,2)|2|'::svector, '(0,2),(1,4)|2|'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('(0,1),(1,2)|2|'::svector, '|2|'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('(0,1),(1,1)|2|'::svector, '(0,-1),(1,-1)|2|'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('(0,1)|2|'::svector, '(1,2)|2|'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('|1|'::svector, '|1|'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('(0,1)|2|'::svector, '(0,1)|3|'); +ERROR: different svector dimensions 2 and 3 +SELECT jaccard_distance('(0,1)|2|', '(0,1)|2|'); + jaccard_distance +------------------ + 0 +(1 row) + +SELECT jaccard_distance('(0,1)|2|', '(1,1)|2|'); + jaccard_distance +------------------ + 1 +(1 row) + +SELECT jaccard_distance('|1|', '|1|'); + jaccard_distance +------------------ + NaN +(1 row) + +SELECT jaccard_distance('(0,1)|2|', '(0,1)|3|'); +ERROR: different svector dimensions 2 and 3 diff --git a/test/sql/functions.sql b/test/sql/functions.sql index 914df36..1c2fe71 100644 --- a/test/sql/functions.sql +++ b/test/sql/functions.sql @@ -13,24 +13,24 @@ SELECT vector_norm('[3,4]'); SELECT vector_norm('[0,1]'); SELECT vector_norm('[3e37,4e37]')::real; -SELECT l2_distance('[0,0]', '[3,4]'); -SELECT l2_distance('[0,0]', '[0,1]'); -SELECT l2_distance('[1,2]', '[3]'); -SELECT l2_distance('[3e38]', '[-3e38]'); +SELECT l2_distance('[0,0]'::vector, '[3,4]'); +SELECT l2_distance('[0,0]'::vector, '[0,1]'); +SELECT l2_distance('[1,2]'::vector, '[3]'); +SELECT l2_distance('[3e38]'::vector, '[-3e38]'); -SELECT 
inner_product('[1,2]', '[3,4]'); -SELECT inner_product('[1,2]', '[3]'); -SELECT inner_product('[3e38]', '[3e38]'); +SELECT inner_product('[1,2]'::vector, '[3,4]'); +SELECT inner_product('[1,2]'::vector, '[3]'); +SELECT inner_product('[3e38]'::vector, '[3e38]'); -SELECT cosine_distance('[1,2]', '[2,4]'); -SELECT cosine_distance('[1,2]', '[0,0]'); -SELECT cosine_distance('[1,1]', '[1,1]'); -SELECT cosine_distance('[1,0]', '[0,2]'); -SELECT cosine_distance('[1,1]', '[-1,-1]'); -SELECT cosine_distance('[1,2]', '[3]'); -SELECT cosine_distance('[1,1]', '[1.1,1.1]'); -SELECT cosine_distance('[1,1]', '[-1.1,-1.1]'); -SELECT cosine_distance('[3e38]', '[3e38]'); +SELECT cosine_distance('[1,2]'::vector, '[2,4]'); +SELECT cosine_distance('[1,2]'::vector, '[0,0]'); +SELECT cosine_distance('[1,1]'::vector, '[1,1]'); +SELECT cosine_distance('[1,0]'::vector, '[0,2]'); +SELECT cosine_distance('[1,1]'::vector, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::vector, '[3]'); +SELECT cosine_distance('[1,1]'::vector, '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1.1,-1.1]'); +SELECT cosine_distance('[3e38]'::vector, '[3e38]'); SELECT l1_distance('[0,0]', '[3,4]'); SELECT l1_distance('[0,0]', '[0,1]'); diff --git a/test/sql/svector.sql b/test/sql/svector.sql new file mode 100644 index 0000000..3b5b0bd --- /dev/null +++ b/test/sql/svector.sql @@ -0,0 +1,34 @@ +SELECT '(0,1.5),(2,3.5)|5|'::svector; +SELECT '(0,1.5),(2,3.5)|5|'::svector::vector; +SELECT '(0,1.5),(2,3.5)|5|'::svector::vector(5); +SELECT '(0,1.5),(2,3.5)|5|'::svector::vector(4); +SELECT '[0,1.5,0,3.5,0]'::vector::svector; + +SELECT '|5|'::svector; +SELECT '|-1|'::svector; +SELECT '|100001|'::svector; +SELECT '|16001|'::svector::vector; + +SELECT '(-1,1)|1|'::svector; +SELECT '(1,1)|1|'::svector; + +SELECT '|1|'::svector(2); + +SELECT l2_distance('|2|'::svector, '(0,3),(1,4)|2|'); +SELECT l2_distance('|2|'::svector, '(1,1)|2|'); +SELECT '|2|'::svector <-> '(0,3),(1,4)|2|'; + +SELECT 
inner_product('(0,1),(1,2)|2|'::svector, '(0,2),(1,4)|2|'); +SELECT svector_negative_inner_product('(0,1),(1,2)|2|', '(0,2),(1,4)|2|'); + +SELECT cosine_distance('(0,1),(1,2)|2|'::svector, '(0,2),(1,4)|2|'); +SELECT cosine_distance('(0,1),(1,2)|2|'::svector, '|2|'); +SELECT cosine_distance('(0,1),(1,1)|2|'::svector, '(0,-1),(1,-1)|2|'); +SELECT cosine_distance('(0,1)|2|'::svector, '(1,2)|2|'); +SELECT cosine_distance('|1|'::svector, '|1|'); +SELECT cosine_distance('(0,1)|2|'::svector, '(0,1)|3|'); + +SELECT jaccard_distance('(0,1)|2|', '(0,1)|2|'); +SELECT jaccard_distance('(0,1)|2|', '(1,1)|2|'); +SELECT jaccard_distance('|1|', '|1|'); +SELECT jaccard_distance('(0,1)|2|', '(0,1)|3|');