diff --git a/CHANGELOG.md b/CHANGELOG.md index db6798c..4c2e827 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.8.0 (unreleased) +- Added `intvec` type - Added casts for arrays to `sparsevec` - Reduced memory usage for HNSW index scans - Dropped support for Postgres 12 diff --git a/Makefile b/Makefile index e7ae85e..b68263d 100644 --- a/Makefile +++ b/Makefile @@ -4,8 +4,8 @@ EXTVERSION = 0.7.4 MODULE_big = vector DATA = $(wildcard sql/*--*--*.sql) DATA_built = sql/$(EXTENSION)--$(EXTVERSION).sql -OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o -HEADERS = src/halfvec.h src/sparsevec.h src/vector.h +OBJS = src/bitutils.o src/bitvec.o src/halfutils.o src/halfvec.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/intvec.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/sparsevec.o src/vector.o +HEADERS = src/halfvec.h src/intvec.h src/sparsevec.h src/vector.h TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) diff --git a/Makefile.win b/Makefile.win index c44cb1f..86cd9b5 100644 --- a/Makefile.win +++ b/Makefile.win @@ -2,8 +2,8 @@ EXTENSION = vector EXTVERSION = 0.7.4 DATA_built = sql\$(EXTENSION)--$(EXTVERSION).sql -OBJS = src\bitutils.obj src\bitvec.obj src\halfutils.obj src\halfvec.obj src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\sparsevec.obj src\vector.obj -HEADERS = src\halfvec.h src\sparsevec.h src\vector.h +OBJS = src\bitutils.obj src\bitvec.obj src\halfutils.obj src\halfvec.obj 
src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\intvec.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\sparsevec.obj src\vector.obj +HEADERS = src\halfvec.h src\intvec.h src\sparsevec.h src\vector.h REGRESS = bit btree cast copy halfvec hnsw_bit hnsw_halfvec hnsw_sparsevec hnsw_vector ivfflat_bit ivfflat_halfvec ivfflat_vector sparsevec vector_type REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) diff --git a/README.md b/README.md index 7439964..7a234ff 100644 --- a/README.md +++ b/README.md @@ -934,6 +934,28 @@ Function | Description | Added avg(halfvec) → halfvec | average | 0.7.0 sum(halfvec) → halfvec | sum | 0.7.0 +### Intvec Type + +Each int vector takes `dimensions + 8` bytes of storage. Each element is a single-byte signed integer. Int vectors can have up to 16,000 dimensions. + +### Intvec Operators + +Operator | Description | Added +--- | --- | --- +<-> | Euclidean distance | 0.8.0 +<#> | negative inner product | 0.8.0 +<=> | cosine distance | 0.8.0 +<+> | taxicab distance | 0.8.0 + +### Intvec Functions + +Function | Description | Added +--- | --- | --- +cosine_distance(intvec, intvec) → double precision | cosine distance | 0.8.0 +inner_product(intvec, intvec) → double precision | inner product | 0.8.0 +l2_distance(intvec, intvec) → double precision | Euclidean distance | 0.8.0 +l1_distance(intvec, intvec) → double precision | taxicab distance | 0.8.0 + ### Bit Type Each bit vector takes `dimensions / 8 + 8` bytes of storage. See the [Postgres docs](https://www.postgresql.org/docs/current/datatype-bit.html) for more info. 
diff --git a/sql/vector--0.7.4--0.8.0.sql b/sql/vector--0.7.4--0.8.0.sql index e00348d..b490a94 100644 --- a/sql/vector--0.7.4--0.8.0.sql +++ b/sql/vector--0.7.4--0.8.0.sql @@ -1,6 +1,109 @@ -- complain if script is sourced in psql, rather than via CREATE EXTENSION \echo Use "ALTER EXTENSION vector UPDATE TO '0.8.0'" to load this file. \quit +CREATE FUNCTION hnsw_intvec_support(internal) RETURNS internal + AS 'MODULE_PATHNAME' LANGUAGE C; + +CREATE TYPE intvec; + +CREATE FUNCTION intvec_in(cstring, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_out(intvec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_recv(internal, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_send(intvec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE intvec ( + INPUT = intvec_in, + OUTPUT = intvec_out, + TYPMOD_IN = intvec_typmod_in, + RECEIVE = intvec_recv, + SEND = intvec_send, + STORAGE = external +); + +CREATE FUNCTION l2_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l1_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l1_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_l2_squared_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C 
IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_negative_inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec(intvec, integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_intvec(integer[], integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE CAST (intvec AS intvec) + WITH FUNCTION intvec(intvec, integer, boolean) AS IMPLICIT; + +CREATE CAST (integer[] AS intvec) + WITH FUNCTION array_to_intvec(integer[], integer, boolean) AS ASSIGNMENT; + +CREATE OPERATOR <-> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = intvec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE OPERATOR <+> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = l1_distance, + COMMUTATOR = '<+>' +); + +CREATE OPERATOR CLASS intvec_l2_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <-> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_l2_squared_distance(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + +CREATE OPERATOR CLASS intvec_ip_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <#> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_negative_inner_product(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + +CREATE OPERATOR CLASS intvec_cosine_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <=> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 cosine_distance(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + +CREATE OPERATOR CLASS intvec_l1_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <+> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 l1_distance(intvec, intvec), 
+ FUNCTION 3 hnsw_intvec_support(internal); + CREATE FUNCTION array_to_sparsevec(integer[], integer, boolean) RETURNS sparsevec AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; diff --git a/sql/vector.sql b/sql/vector.sql index 7fc3671..395418c 100644 --- a/sql/vector.sql +++ b/sql/vector.sql @@ -272,6 +272,9 @@ CREATE FUNCTION ivfflat_bit_support(internal) RETURNS internal CREATE FUNCTION hnsw_halfvec_support(internal) RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C; +CREATE FUNCTION hnsw_intvec_support(internal) RETURNS internal + AS 'MODULE_PATHNAME' LANGUAGE C; + CREATE FUNCTION hnsw_bit_support(internal) RETURNS internal AS 'MODULE_PATHNAME' LANGUAGE C; @@ -647,6 +650,120 @@ CREATE OPERATOR CLASS halfvec_l1_ops FUNCTION 1 l1_distance(halfvec, halfvec), FUNCTION 3 hnsw_halfvec_support(internal); +-- intvec type + +CREATE TYPE intvec; + +CREATE FUNCTION intvec_in(cstring, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_out(intvec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_recv(internal, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_send(intvec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE intvec ( + INPUT = intvec_in, + OUTPUT = intvec_out, + TYPMOD_IN = intvec_typmod_in, + RECEIVE = intvec_recv, + SEND = intvec_send, + STORAGE = external +); + +-- intvec functions + +CREATE FUNCTION l2_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL 
SAFE; + +CREATE FUNCTION cosine_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l1_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l1_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- intvec private functions + +CREATE FUNCTION intvec_l2_squared_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_negative_inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- intvec cast functions + +CREATE FUNCTION intvec(intvec, integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_intvec(integer[], integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- intvec casts + +CREATE CAST (intvec AS intvec) + WITH FUNCTION intvec(intvec, integer, boolean) AS IMPLICIT; + +CREATE CAST (integer[] AS intvec) + WITH FUNCTION array_to_intvec(integer[], integer, boolean) AS ASSIGNMENT; + +-- intvec operators + +CREATE OPERATOR <-> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = intvec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE OPERATOR <+> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = l1_distance, + COMMUTATOR = '<+>' +); + +-- intvec opclasses + +CREATE OPERATOR CLASS intvec_l2_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <-> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_l2_squared_distance(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + +CREATE OPERATOR CLASS intvec_ip_ops + FOR TYPE intvec USING hnsw AS + 
OPERATOR 1 <#> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_negative_inner_product(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + +CREATE OPERATOR CLASS intvec_cosine_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <=> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 cosine_distance(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + +CREATE OPERATOR CLASS intvec_l1_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <+> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 l1_distance(intvec, intvec), + FUNCTION 3 hnsw_intvec_support(internal); + -- bit functions CREATE FUNCTION hamming_distance(bit, bit) RETURNS float8 diff --git a/src/hnswutils.c b/src/hnswutils.c index ac1e7de..2b92a36 100644 --- a/src/hnswutils.c +++ b/src/hnswutils.c @@ -1375,6 +1375,19 @@ hnsw_halfvec_support(PG_FUNCTION_ARGS) PG_RETURN_POINTER(&typeInfo); }; +FUNCTION_PREFIX PG_FUNCTION_INFO_V1(hnsw_intvec_support); +Datum +hnsw_intvec_support(PG_FUNCTION_ARGS) +{ + static const HnswTypeInfo typeInfo = { + .maxDimensions = HNSW_MAX_DIM * 4, + .normalize = NULL, + .checkValue = NULL + }; + + PG_RETURN_POINTER(&typeInfo); +}; + FUNCTION_PREFIX PG_FUNCTION_INFO_V1(hnsw_bit_support); Datum hnsw_bit_support(PG_FUNCTION_ARGS) diff --git a/src/intvec.c b/src/intvec.c new file mode 100644 index 0000000..540782b --- /dev/null +++ b/src/intvec.c @@ -0,0 +1,599 @@ +#include "postgres.h" + +#include <limits.h> +#include <math.h> + +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "intvec.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +/* + * Ensure same dimensions + */ +static inline void +CheckDims(IntVector * a, IntVector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different intvec dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions + */ +static inline void +CheckExpectedDim(int32 
typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("intvec must have at least 1 dimension"))); + + if (dim > INTVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("intvec cannot have more than %d dimensions", INTVEC_MAX_DIM))); +} + +/* + * Ensure element in range + */ +static inline void +CheckElement(long value) +{ + if (value < SCHAR_MIN || value > SCHAR_MAX) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value \"%ld\" is out of range for type intvec", value))); +} + +/* + * Allocate and initialize a new int vector + */ +IntVector * +InitIntVector(int dim) +{ + IntVector *result; + int size; + + size = INTVEC_SIZE(dim); + result = (IntVector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + + return result; +} + +/* + * Check for whitespace, since array_isspace() is static + */ +static inline bool +intvec_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\v' || + ch == '\f') + return true; + return false; +} + +/* + * Convert textual representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_in); +Datum +intvec_in(PG_FUNCTION_ARGS) +{ + char *lit = PG_GETARG_CSTRING(0); + int32 typmod = PG_GETARG_INT32(2); + int8 x[INTVEC_MAX_DIM]; + int dim = 0; + char *pt = lit; + IntVector *result; + + while (intvec_isspace(*pt)) + pt++; + + if (*pt != '[') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit), + errdetail("Vector contents must start with \"[\"."))); + + pt++; + + while (intvec_isspace(*pt)) + pt++; + + if (*pt == ']') + ereport(ERROR, + 
(errcode(ERRCODE_DATA_EXCEPTION), + errmsg("intvec must have at least 1 dimension"))); + + for (;;) + { + long val; + char *stringEnd; + + if (dim == INTVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("intvec cannot have more than %d dimensions", INTVEC_MAX_DIM))); + + while (intvec_isspace(*pt)) + pt++; + + /* Check for empty string like float4in */ + if (*pt == '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit))); + + errno = 0; + + /* Use similar logic as int2vectorin */ + val = strtol(pt, &stringEnd, 10); + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit))); + + /* Check for range error like float4in */ + if (errno == ERANGE || val < SCHAR_MIN || val > SCHAR_MAX) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("\"%s\" is out of range for type intvec", pnstrdup(pt, stringEnd - pt)))); + + CheckElement(val); + x[dim++] = val; + + pt = stringEnd; + + while (intvec_isspace(*pt)) + pt++; + + if (*pt == ',') + pt++; + else if (*pt == ']') + { + pt++; + break; + } + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit))); + } + + /* Only whitespace is allowed after the closing bracket */ + while (intvec_isspace(*pt)) + pt++; + + if (*pt != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit), + errdetail("Junk after closing right brace."))); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + result = InitIntVector(dim); + for (int i = 0; i < dim; i++) + result->x[i] = x[i]; + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to textual representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_out); +Datum +intvec_out(PG_FUNCTION_ARGS) +{ + 
IntVector *vector = PG_GETARG_INTVEC_P(0); + int dim = vector->dim; + char *buf; + char *ptr; + + /* + * Need: + * + * dim * 4 bytes for elements (-128 to 127) + * + * dim - 1 bytes for separator + * + * 3 bytes for [, ], and \0 + */ + buf = (char *) palloc(5 * dim + 2); + ptr = buf; + + *ptr = '['; + ptr++; + for (int i = 0; i < dim; i++) + { + if (i > 0) + { + *ptr = ','; + ptr++; + } + +#if PG_VERSION_NUM >= 140000 + ptr += pg_ltoa(vector->x[i], ptr); +#else + pg_ltoa(vector->x[i], ptr); + while (*ptr != '\0') + ptr++; +#endif + } + *ptr = ']'; + ptr++; + *ptr = '\0'; + + PG_FREE_IF_COPY(vector, 0); + PG_RETURN_CSTRING(buf); +} + +/* + * Convert type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_typmod_in); +Datum +intvec_typmod_in(PG_FUNCTION_ARGS) +{ + ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0); + int32 *tl; + int n; + + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type intvec must be at least 1"))); + + if (*tl > INTVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type intvec cannot exceed %d", INTVEC_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_recv); +Datum +intvec_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + IntVector *result; + int16 dim; + int16 unused; + + dim = pq_getmsgint(buf, sizeof(int16)); + unused = pq_getmsgint(buf, sizeof(int16)); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitIntVector(dim); + for (int i = 0; i < dim; i++) + result->x[i] = 
pq_getmsgint(buf, sizeof(int8)); + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_send); +Datum +intvec_send(PG_FUNCTION_ARGS) +{ + IntVector *vec = PG_GETARG_INTVEC_P(0); + StringInfoData buf; + + pq_begintypsend(&buf); + pq_sendint(&buf, vec->dim, sizeof(int16)); + pq_sendint(&buf, vec->unused, sizeof(int16)); + for (int i = 0; i < vec->dim; i++) + pq_sendint8(&buf, vec->x[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert int vector to int vector + * This is needed to check the type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec); +Datum +intvec(PG_FUNCTION_ARGS) +{ + IntVector *vec = PG_GETARG_INTVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, vec->dim); + + PG_RETURN_POINTER(vec); +} + +/* + * Convert integer array to int vector (each element must fit in int8) + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(array_to_intvec); +Datum +array_to_intvec(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int32 typmod = PG_GETARG_INT32(1); + IntVector *result; + int16 typlen; + bool typbyval; + char typalign; + Datum *elemsp; + int nelemsp; + + if (ARR_NDIM(array) > 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("array must be 1-D"))); + + if (ARR_HASNULL(array) && array_contains_nulls(array)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array must not contain nulls"))); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, NULL, &nelemsp); + + CheckDim(nelemsp); + CheckExpectedDim(typmod, nelemsp); + + result = InitIntVector(nelemsp); + + if (ARR_ELEMTYPE(array) == INT4OID) + { + for (int i = 0; i < nelemsp; i++) + { + long l = DatumGetInt32(elemsp[i]); + + CheckElement(l); + + result->x[i] = l; + } + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported 
array type"))); + } + + /* + * Free allocation from deconstruct_array. Do not free individual elements + * when pass-by-reference since they point to original array. + */ + pfree(elemsp); + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 distance between int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l2_distance); +Datum +intvec_l2_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + int diff = ax[i] - bx[i]; + + distance += diff * diff; + } + + PG_RETURN_FLOAT8(sqrt((double) distance)); +} + +/* + * Get the L2 squared distance between int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l2_squared_distance); +Datum +intvec_l2_squared_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + int diff = ax[i] - bx[i]; + + distance += diff * diff; + } + + PG_RETURN_FLOAT8((double) distance); +} + +/* + * Get the inner product of two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_inner_product); +Datum +intvec_inner_product(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += ax[i] * bx[i]; + + PG_RETURN_FLOAT8((double) distance); +} + +/* + * Get the negative inner product of two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_negative_inner_product); +Datum +intvec_negative_inner_product(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int 
distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += ax[i] * bx[i]; + + PG_RETURN_FLOAT8((double) -distance); +} + +/* + * Get the cosine distance between two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_cosine_distance); +Datum +intvec_cosine_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + int norma = 0; + int normb = 0; + double similarity; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + int8 axi = ax[i]; + int8 bxi = bx[i]; + + distance += axi * bxi; + norma += axi * axi; + normb += bxi * bxi; + } + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + similarity = (double) distance / sqrt((double) norma * (double) normb); + +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Keep in range */ + if (similarity > 1) + similarity = 1; + else if (similarity < -1) + similarity = -1; + + PG_RETURN_FLOAT8(1 - similarity); +} + +/* + * Get the L1 distance between two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l1_distance); +Datum +intvec_l1_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += abs(ax[i] - bx[i]); + + PG_RETURN_FLOAT8((double) distance); +} + +/* + * Get the L2 norm of an int vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l2_norm); +Datum +intvec_l2_norm(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + int8 *ax = a->x; + int norm = 0; + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + norm += ax[i] * ax[i]; + + PG_RETURN_FLOAT8(sqrt((double) norm)); +} diff --git a/src/intvec.h b/src/intvec.h new file mode 100644 
index 0000000..cf5f367 --- /dev/null +++ b/src/intvec.h @@ -0,0 +1,23 @@ +#ifndef INTVEC_H +#define INTVEC_H + +#include "vector.h" + +#define INTVEC_MAX_DIM VECTOR_MAX_DIM + +#define INTVEC_SIZE(_dim) (offsetof(IntVector, x) + sizeof(int8)*(_dim)) +#define DatumGetIntVector(x) ((IntVector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_INTVEC_P(x) DatumGetIntVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_INTVEC_P(x) PG_RETURN_POINTER(x) + +typedef struct IntVector +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int16 dim; /* number of dimensions */ + int16 unused; + int8 x[FLEXIBLE_ARRAY_MEMBER]; +} IntVector; + +IntVector *InitIntVector(int dim); + +#endif diff --git a/test/expected/copy.out b/test/expected/copy.out index 9b4ebc0..b8ee75c 100644 --- a/test/expected/copy.out +++ b/test/expected/copy.out @@ -30,6 +30,19 @@ SELECT * FROM t2 ORDER BY val; (4 rows) +DROP TABLE t; +DROP TABLE t2; +-- intvec +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t2 (val intvec(3)); +\copy t TO 'results/intvec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/intvec.bin' WITH (FORMAT binary) +SELECT * FROM t2 ORDER BY val; +ERROR: could not identify an ordering operator for type intvec +LINE 1: SELECT * FROM t2 ORDER BY val; + ^ +HINT: Use an explicit ordering operator or modify the query. 
DROP TABLE t; DROP TABLE t2; -- sparsevec diff --git a/test/expected/hnsw_intvec.out b/test/expected/hnsw_intvec.out new file mode 100644 index 0000000..53d6136 --- /dev/null +++ b/test/expected/hnsw_intvec.out @@ -0,0 +1,103 @@ +SET enable_seqscan = off; +-- L2 +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_l2_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + val +--------- + [1,2,3] + [1,2,4] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +SELECT COUNT(*) FROM t; + count +------- + 5 +(1 row) + +TRUNCATE t; +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + val +----- +(0 rows) + +DROP TABLE t; +-- inner product +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_ip_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <#> '[3,3,3]'; + val +--------- + [1,2,4] + [1,2,3] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +DROP TABLE t; +-- cosine +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_cosine_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <=> '[3,3,3]'; + val +--------- + [1,1,1] + [1,2,3] + [1,2,4] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2; + count +------- + 4 +(1 row) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +DROP TABLE t; +-- L1 +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX 
ON t USING hnsw (val intvec_l1_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <+> '[3,3,3]'; + val +--------- + [1,2,3] + [1,2,4] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <+> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +DROP TABLE t; diff --git a/test/expected/intvec.out b/test/expected/intvec.out new file mode 100644 index 0000000..32a9326 --- /dev/null +++ b/test/expected/intvec.out @@ -0,0 +1,208 @@ +SELECT '[1,2,3]'::intvec; + intvec +--------- + [1,2,3] +(1 row) + +SELECT '[-1,-2,-3]'::intvec; + intvec +------------ + [-1,-2,-3] +(1 row) + +SELECT ' [ 1, 2 , 3 ] '::intvec; + intvec +--------- + [1,2,3] +(1 row) + +SELECT '[1.23456]'::intvec; +ERROR: invalid input syntax for type intvec: "[1.23456]" +LINE 1: SELECT '[1.23456]'::intvec; + ^ +SELECT '[hello,1]'::intvec; +ERROR: invalid input syntax for type intvec: "[hello,1]" +LINE 1: SELECT '[hello,1]'::intvec; + ^ +SELECT '[127,-128]'::intvec; + intvec +------------ + [127,-128] +(1 row) + +SELECT '[128,-129]'::intvec; +ERROR: "128" is out of range for type intvec +LINE 1: SELECT '[128,-129]'::intvec; + ^ +SELECT '[1,2,3'::intvec; +ERROR: invalid input syntax for type intvec: "[1,2,3" +LINE 1: SELECT '[1,2,3'::intvec; + ^ +SELECT '[1,2,3]9'::intvec; +ERROR: invalid input syntax for type intvec: "[1,2,3]9" +LINE 1: SELECT '[1,2,3]9'::intvec; + ^ +DETAIL: Junk after closing right brace. +SELECT '1,2,3'::intvec; +ERROR: invalid input syntax for type intvec: "1,2,3" +LINE 1: SELECT '1,2,3'::intvec; + ^ +DETAIL: Vector contents must start with "[". +SELECT ''::intvec; +ERROR: invalid input syntax for type intvec: "" +LINE 1: SELECT ''::intvec; + ^ +DETAIL: Vector contents must start with "[". 
+SELECT '['::intvec; +ERROR: invalid input syntax for type intvec: "[" +LINE 1: SELECT '['::intvec; + ^ +SELECT '[,'::intvec; +ERROR: invalid input syntax for type intvec: "[," +LINE 1: SELECT '[,'::intvec; + ^ +SELECT '[]'::intvec; +ERROR: intvec must have at least 1 dimension +LINE 1: SELECT '[]'::intvec; + ^ +SELECT '[1,]'::intvec; +ERROR: invalid input syntax for type intvec: "[1,]" +LINE 1: SELECT '[1,]'::intvec; + ^ +SELECT '[1a]'::intvec; +ERROR: invalid input syntax for type intvec: "[1a]" +LINE 1: SELECT '[1a]'::intvec; + ^ +SELECT '[1,,3]'::intvec; +ERROR: invalid input syntax for type intvec: "[1,,3]" +LINE 1: SELECT '[1,,3]'::intvec; + ^ +SELECT '[1, ,3]'::intvec; +ERROR: invalid input syntax for type intvec: "[1, ,3]" +LINE 1: SELECT '[1, ,3]'::intvec; + ^ +SELECT '[1,2,3]'::intvec(3); + intvec +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::intvec(2); +ERROR: expected 2 dimensions, not 3 +SELECT '[1,2,3]'::intvec(3, 2); +ERROR: invalid type modifier +LINE 1: SELECT '[1,2,3]'::intvec(3, 2); + ^ +SELECT '[1,2,3]'::intvec('a'); +ERROR: invalid input syntax for type integer: "a" +LINE 1: SELECT '[1,2,3]'::intvec('a'); + ^ +SELECT '[1,2,3]'::intvec(0); +ERROR: dimensions for type intvec must be at least 1 +LINE 1: SELECT '[1,2,3]'::intvec(0); + ^ +SELECT '[1,2,3]'::intvec(16001); +ERROR: dimensions for type intvec cannot exceed 16000 +LINE 1: SELECT '[1,2,3]'::intvec(16001); + ^ +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::intvec[]); + unnest +--------- + [1,2,3] + [4,5,6] +(2 rows) + +SELECT '{"[1,2,3]"}'::intvec(2)[]; +ERROR: expected 2 dimensions, not 3 +SELECT l2_distance('[0,0]'::intvec, '[3,4]'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('[0,0]'::intvec, '[0,1]'); + l2_distance +------------- + 1 +(1 row) + +SELECT l2_distance('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 +SELECT '[0,0]'::intvec <-> '[3,4]'; + ?column? 
+---------- + 5 +(1 row) + +SELECT inner_product('[1,2]'::intvec, '[3,4]'); + inner_product +--------------- + 11 +(1 row) + +SELECT inner_product('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 +SELECT inner_product('[127]'::intvec, '[127]'); + inner_product +--------------- + 16129 +(1 row) + +SELECT '[1,2]'::intvec <#> '[3,4]'; + ?column? +---------- + -11 +(1 row) + +SELECT cosine_distance('[1,2]'::intvec, '[2,4]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2]'::intvec, '[0,0]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,1]'::intvec, '[1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,0]'::intvec, '[0,2]'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('[1,1]'::intvec, '[-1,-1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 +SELECT '[1,2]'::intvec <=> '[2,4]'; + ?column? 
+---------- + 0 +(1 row) + +SELECT l1_distance('[0,0]'::intvec, '[3,4]'); + l1_distance +------------- + 7 +(1 row) + +SELECT l1_distance('[0,0]'::intvec, '[0,1]'); + l1_distance +------------- + 1 +(1 row) + +SELECT l1_distance('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 diff --git a/test/sql/copy.sql b/test/sql/copy.sql index 2dff3ff..8a57f60 100644 --- a/test/sql/copy.sql +++ b/test/sql/copy.sql @@ -28,6 +28,21 @@ SELECT * FROM t2 ORDER BY val; DROP TABLE t; DROP TABLE t2; +-- intvec + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); + +CREATE TABLE t2 (val intvec(3)); + +\copy t TO 'results/intvec.bin' WITH (FORMAT binary) +\copy t2 FROM 'results/intvec.bin' WITH (FORMAT binary) + +SELECT * FROM t2 ORDER BY val; + +DROP TABLE t; +DROP TABLE t2; + -- sparsevec CREATE TABLE t (val sparsevec(3)); diff --git a/test/sql/hnsw_intvec.sql b/test/sql/hnsw_intvec.sql new file mode 100644 index 0000000..8903184 --- /dev/null +++ b/test/sql/hnsw_intvec.sql @@ -0,0 +1,58 @@ +SET enable_seqscan = off; + +-- L2 + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_l2_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::intvec)) t2; +SELECT COUNT(*) FROM t; + +TRUNCATE t; +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + +DROP TABLE t; + +-- inner product + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_ip_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <#> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::intvec)) t2; + +DROP TABLE t; + +-- cosine + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), 
('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_cosine_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <=> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::intvec)) t2; + +DROP TABLE t; + +-- L1 + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_l1_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <+> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <+> (SELECT NULL::intvec)) t2; + +DROP TABLE t; diff --git a/test/sql/intvec.sql b/test/sql/intvec.sql new file mode 100644 index 0000000..0a76d0b --- /dev/null +++ b/test/sql/intvec.sql @@ -0,0 +1,50 @@ +SELECT '[1,2,3]'::intvec; +SELECT '[-1,-2,-3]'::intvec; +SELECT ' [ 1, 2 , 3 ] '::intvec; +SELECT '[1.23456]'::intvec; +SELECT '[hello,1]'::intvec; +SELECT '[127,-128]'::intvec; +SELECT '[128,-129]'::intvec; +SELECT '[1,2,3'::intvec; +SELECT '[1,2,3]9'::intvec; +SELECT '1,2,3'::intvec; +SELECT ''::intvec; +SELECT '['::intvec; +SELECT '[,'::intvec; +SELECT '[]'::intvec; +SELECT '[1,]'::intvec; +SELECT '[1a]'::intvec; +SELECT '[1,,3]'::intvec; +SELECT '[1, ,3]'::intvec; + +SELECT '[1,2,3]'::intvec(3); +SELECT '[1,2,3]'::intvec(2); +SELECT '[1,2,3]'::intvec(3, 2); +SELECT '[1,2,3]'::intvec('a'); +SELECT '[1,2,3]'::intvec(0); +SELECT '[1,2,3]'::intvec(16001); + +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::intvec[]); +SELECT '{"[1,2,3]"}'::intvec(2)[]; + +SELECT l2_distance('[0,0]'::intvec, '[3,4]'); +SELECT l2_distance('[0,0]'::intvec, '[0,1]'); +SELECT l2_distance('[1,2]'::intvec, '[3]'); +SELECT '[0,0]'::intvec <-> '[3,4]'; + +SELECT inner_product('[1,2]'::intvec, '[3,4]'); +SELECT inner_product('[1,2]'::intvec, '[3]'); +SELECT inner_product('[127]'::intvec, '[127]'); +SELECT '[1,2]'::intvec <#> '[3,4]'; + +SELECT 
cosine_distance('[1,2]'::intvec, '[2,4]'); +SELECT cosine_distance('[1,2]'::intvec, '[0,0]'); +SELECT cosine_distance('[1,1]'::intvec, '[1,1]'); +SELECT cosine_distance('[1,0]'::intvec, '[0,2]'); +SELECT cosine_distance('[1,1]'::intvec, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::intvec, '[3]'); +SELECT '[1,2]'::intvec <=> '[2,4]'; + +SELECT l1_distance('[0,0]'::intvec, '[3,4]'); +SELECT l1_distance('[0,0]'::intvec, '[0,1]'); +SELECT l1_distance('[1,2]'::intvec, '[3]');
diff --git a/test/t/039_hnsw_intvec_build_recall.pl b/test/t/039_hnsw_intvec_build_recall.pl
new file mode 100644
index 0000000..6aaff95
--- /dev/null
+++ b/test/t/039_hnsw_intvec_build_recall.pl
@@ -0,0 +1,145 @@
+use strict;
+use warnings FATAL => 'all';
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Checks HNSW index builds on an intvec column for each distance operator.
+# Recall against exact (pre-index) results is tested after the serial and the
+# in-memory parallel build; the on-disk parallel build is checked through its
+# log output only.
+
+my $node;
+my @queries = ();
+my @expected;
+my $limit = 20;
+my $dim = 20;
+# One random integer expression per dimension, used inside ARRAY[...] below
+my $array_sql = join(",", ('(random() * 255)::int - 128') x $dim);
+
+# Runs each entry of @queries with $operator and checks that the fraction of
+# ids listed in @expected that also show up in the query results is at
+# least $min. Reads the file-level $node, @queries, @expected and $limit.
+sub test_recall
+{
+	my ($min, $operator) = @_;
+	my $correct = 0;
+	my $total = 0;
+
+	# Recall is only meaningful if the query plan goes through the index
+	my $explain = $node->safe_psql("postgres", qq(
+		SET enable_seqscan = off;
+		EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v $operator '$queries[0]' LIMIT $limit;
+	));
+	like($explain, qr/Index Scan/, "$operator uses index scan");
+
+	for my $i (0 .. $#queries)
+	{
+		my $actual = $node->safe_psql("postgres", qq(
+			SET enable_seqscan = off;
+			SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit;
+		));
+		my @actual_ids = split("\n", $actual);
+		my %actual_set = map { $_ => 1 } @actual_ids;
+
+		my @expected_ids = split("\n", $expected[$i]);
+
+		foreach (@expected_ids)
+		{
+			if (exists($actual_set{$_}))
+			{
+				$correct++;
+			}
+			$total++;
+		}
+	}
+
+	# Fail with a recall of 0 instead of dying on a division by zero when the
+	# exact queries returned no rows
+	my $recall = $total > 0 ? $correct / $total : 0;
+	cmp_ok($recall, ">=", $min, $operator);
+}
+
+# Initialize node
+$node = PostgreSQL::Test::Cluster->new('node');
+$node->init;
+$node->start;
+
+# Create table
+$node->safe_psql("postgres", "CREATE EXTENSION vector;");
+$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v intvec($dim));");
+$node->safe_psql("postgres",
+	"INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 10000) i;"
+);
+
+# Generate queries with integer components in -128 .. 127
+for (1 .. 20)
+{
+	my @r = ();
+	for (1 .. $dim)
+	{
+		push(@r, int(rand(256)) - 128);
+	}
+	push(@queries, "[" . join(",", @r) . "]");
+}
+
+# Check each index type
+my @operators = ("<->", "<#>", "<=>");
+my @opclasses = ("intvec_l2_ops", "intvec_ip_ops", "intvec_cosine_ops");
+
+for my $i (0 .. $#operators)
+{
+	my $operator = $operators[$i];
+	my $opclass = $opclasses[$i];
+
+	# Get exact results
+	@expected = ();
+	foreach (@queries)
+	{
+		my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v $operator '$_' LIMIT $limit;");
+		push(@expected, $res);
+	}
+
+	# Build index serially
+	$node->safe_psql("postgres", qq(
+		SET max_parallel_maintenance_workers = 0;
+		CREATE INDEX idx ON tst USING hnsw (v $opclass);
+	));
+
+	# Test approximate results
+	my $min = 0.99;
+	test_recall($min, $operator);
+
+	$node->safe_psql("postgres", "DROP INDEX idx;");
+
+	# Build index in parallel in memory
+	my ($ret, $stdout, $stderr) = $node->psql("postgres", qq(
+		SET client_min_messages = DEBUG;
+		SET min_parallel_table_scan_size = 1;
+		CREATE INDEX idx ON tst USING hnsw (v $opclass);
+	));
+	is($ret, 0, $stderr);
+	like($stderr, qr/using \d+ parallel workers/, "$opclass in-memory build uses parallel workers");
+
+	# Test approximate results
+	test_recall($min, $operator);
+
+	$node->safe_psql("postgres", "DROP INDEX idx;");
+
+	# Build index in parallel on disk
+	# Set parallel_workers on table to use workers with low maintenance_work_mem
+	($ret, $stdout, $stderr) = $node->psql("postgres", qq(
+		ALTER TABLE tst SET (parallel_workers = 2);
+		SET client_min_messages = DEBUG;
+		SET maintenance_work_mem = '4MB';
+		CREATE INDEX idx ON tst USING hnsw (v $opclass);
+		ALTER TABLE tst RESET (parallel_workers);
+	));
+	is($ret, 0, $stderr);
+	like($stderr, qr/using \d+ parallel workers/, "$opclass on-disk build uses parallel workers");
+	like($stderr, qr/hnsw graph no longer fits into maintenance_work_mem/, "$opclass graph exceeds maintenance_work_mem");
+
+	$node->safe_psql("postgres", "DROP INDEX idx;");
+}
+
+done_testing();