diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bcea19..dceb5ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.7.0 (unreleased) + +- Added `intvec` type + ## 0.6.2 (2024-03-18) - Reduced lock contention with parallel HNSW index builds diff --git a/Makefile b/Makefile index d5f61ff..15645d5 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,8 @@ EXTVERSION = 0.6.2 MODULE_big = vector DATA = $(wildcard sql/*--*.sql) -OBJS = src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o -HEADERS = src/vector.h +OBJS = src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/intvec.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o +HEADERS = src/intvec.h src/vector.h TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) diff --git a/Makefile.win b/Makefile.win index 1bb193e..d8e7fc1 100644 --- a/Makefile.win +++ b/Makefile.win @@ -1,8 +1,8 @@ EXTENSION = vector EXTVERSION = 0.6.2 -OBJS = src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj -HEADERS = src\vector.h +OBJS = src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\intvec.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj +HEADERS = src\intvec.h src\vector.h REGRESS = btree cast copy functions input ivfflat_cosine ivfflat_ip ivfflat_l2 ivfflat_options ivfflat_unlogged REGRESS_OPTS = --inputdir=test --load-extension=$(EXTENSION) diff --git a/README.md b/README.md index f9849c8..4a5906f 
100644 --- a/README.md +++ b/README.md @@ -732,6 +732,27 @@ Function | Description | Added avg(vector) → vector | average | sum(vector) → vector | sum | 0.5.0 +### Intvec Type + +Each int vector takes `dimensions + 8` bytes of storage. Each element is a single byte signed integer. Int vectors can have up to 16,000 dimensions. + +### Intvec Operators + +Operator | Description | Added +--- | --- | --- +<-> | Euclidean distance | 0.7.0 +<#> | negative inner product | 0.7.0 +<=> | cosine distance | 0.7.0 + +### Intvec Functions + +Function | Description | Added +--- | --- | --- +cosine_distance(intvec, intvec) → double precision | cosine distance | 0.7.0 +inner_product(intvec, intvec) → double precision | inner product | 0.7.0 +l2_distance(intvec, intvec) → double precision | Euclidean distance | 0.7.0 +l1_distance(intvec, intvec) → double precision | taxicab distance | 0.7.0 + ## Installation Notes - Linux and Mac ### Postgres Location diff --git a/sql/vector--0.6.2--0.7.0.sql b/sql/vector--0.6.2--0.7.0.sql new file mode 100644 index 0000000..3b8e346 --- /dev/null +++ b/sql/vector--0.6.2--0.7.0.sql @@ -0,0 +1,88 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.7.0'" to load this file. 
\quit + +CREATE TYPE intvec; + +CREATE FUNCTION intvec_in(cstring, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_out(intvec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_recv(internal, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_send(intvec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE intvec ( + INPUT = intvec_in, + OUTPUT = intvec_out, + TYPMOD_IN = intvec_typmod_in, + RECEIVE = intvec_recv, + SEND = intvec_send, + STORAGE = external +); + +CREATE FUNCTION l2_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l1_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l1_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_l2_squared_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_negative_inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec(intvec, integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_intvec(integer[], integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE 
STRICT PARALLEL SAFE; + +CREATE CAST (intvec AS intvec) + WITH FUNCTION intvec(intvec, integer, boolean) AS IMPLICIT; + +CREATE CAST (integer[] AS intvec) + WITH FUNCTION array_to_intvec(integer[], integer, boolean) AS ASSIGNMENT; + +CREATE OPERATOR <-> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = intvec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +CREATE OPERATOR CLASS intvec_l2_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <-> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_l2_squared_distance(intvec, intvec); + +CREATE OPERATOR CLASS intvec_ip_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <#> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_negative_inner_product(intvec, intvec); + +CREATE OPERATOR CLASS intvec_cosine_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <=> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 cosine_distance(intvec, intvec); diff --git a/sql/vector.sql b/sql/vector.sql index 141e83c..05d97a2 100644 --- a/sql/vector.sql +++ b/sql/vector.sql @@ -287,3 +287,103 @@ CREATE OPERATOR CLASS vector_cosine_ops OPERATOR 1 <=> (vector, vector) FOR ORDER BY float_ops, FUNCTION 1 vector_negative_inner_product(vector, vector), FUNCTION 2 vector_norm(vector); + +-- intvec type + +CREATE TYPE intvec; + +CREATE FUNCTION intvec_in(cstring, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_out(intvec) RETURNS cstring + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_typmod_in(cstring[]) RETURNS integer + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_recv(internal, oid, integer) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE 
C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_send(intvec) RETURNS bytea + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE TYPE intvec ( + INPUT = intvec_in, + OUTPUT = intvec_out, + TYPMOD_IN = intvec_typmod_in, + RECEIVE = intvec_recv, + SEND = intvec_send, + STORAGE = external +); + +-- intvec functions + +CREATE FUNCTION l2_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l2_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_inner_product' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION cosine_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_cosine_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION l1_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME', 'intvec_l1_distance' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- intvec private functions + +CREATE FUNCTION intvec_l2_squared_distance(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION intvec_negative_inner_product(intvec, intvec) RETURNS float8 + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- intvec cast functions + +CREATE FUNCTION intvec(intvec, integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +CREATE FUNCTION array_to_intvec(integer[], integer, boolean) RETURNS intvec + AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; + +-- intvec casts + +CREATE CAST (intvec AS intvec) + WITH FUNCTION intvec(intvec, integer, boolean) AS IMPLICIT; + +CREATE CAST (integer[] AS intvec) + WITH FUNCTION array_to_intvec(integer[], integer, boolean) AS ASSIGNMENT; + +-- intvec operators + +CREATE OPERATOR <-> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = l2_distance, + COMMUTATOR = '<->' +); + +CREATE OPERATOR <#> ( + LEFTARG = intvec, 
RIGHTARG = intvec, PROCEDURE = intvec_negative_inner_product, + COMMUTATOR = '<#>' +); + +CREATE OPERATOR <=> ( + LEFTARG = intvec, RIGHTARG = intvec, PROCEDURE = cosine_distance, + COMMUTATOR = '<=>' +); + +-- intvec opclasses + +CREATE OPERATOR CLASS intvec_l2_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <-> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_l2_squared_distance(intvec, intvec); + +CREATE OPERATOR CLASS intvec_ip_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <#> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 intvec_negative_inner_product(intvec, intvec); + +CREATE OPERATOR CLASS intvec_cosine_ops + FOR TYPE intvec USING hnsw AS + OPERATOR 1 <=> (intvec, intvec) FOR ORDER BY float_ops, + FUNCTION 1 cosine_distance(intvec, intvec); diff --git a/src/hnsw.h b/src/hnsw.h index 901f22b..207ded6 100644 --- a/src/hnsw.h +++ b/src/hnsw.h @@ -57,7 +57,8 @@ typedef enum HnswType { - HNSW_TYPE_VECTOR + HNSW_TYPE_VECTOR, + HNSW_TYPE_INTVEC } HnswType; /* Build phases */ diff --git a/src/hnswbuild.c b/src/hnswbuild.c index cd7150d..1806d94 100644 --- a/src/hnswbuild.c +++ b/src/hnswbuild.c @@ -683,6 +683,9 @@ InitBuildState(HnswBuildState * buildstate, Relation heap, Relation index, Index buildstate->efConstruction = HnswGetEfConstruction(index); buildstate->dimensions = TupleDescAttr(index->rd_att, 0)->atttypmod; + if (buildstate->type == HNSW_TYPE_INTVEC) + maxDimensions *= 4; + /* Require column to have dimensions to be indexed */ if (buildstate->dimensions < 0) elog(ERROR, "column does not have dimensions"); diff --git a/src/hnswutils.c b/src/hnswutils.c index 0c828ae..0104c81 100644 --- a/src/hnswutils.c +++ b/src/hnswutils.c @@ -1,14 +1,17 @@ #include "postgres.h" +#include <float.h> #include <math.h> #include "access/generic_xlog.h" +#include "catalog/pg_type.h" #include "hnsw.h" #include "lib/pairingheap.h" #include "storage/bufmgr.h" #include "utils/datum.h" #include "utils/memdebug.h" #include "utils/rel.h" +#include "utils/syscache.h" #include 
"vector.h" #if PG_VERSION_NUM >= 130000 @@ -155,7 +158,24 @@ HnswOptionalProcInfo(Relation index, uint16 procnum) HnswType HnswGetType(Relation index) { - return HNSW_TYPE_VECTOR; + Oid typeOid = TupleDescAttr(index->rd_att, 0)->atttypid; + HeapTuple tuple; + Form_pg_type type; + HnswType result; + + tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typeOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for type %u", typeOid); + + type = (Form_pg_type) GETSTRUCT(tuple); + if (strcmp(NameStr(type->typname), "intvec") == 0) /* matched on type name alone */ + result = HNSW_TYPE_INTVEC; + else + result = HNSW_TYPE_VECTOR; + + ReleaseSysCache(tuple); + + return result; } /* @@ -592,7 +612,14 @@ HnswLoadElement(HnswElement element, float *distance, Datum *q, Relation index, if (DatumGetPointer(*q) == NULL) *distance = 0; else + { *distance = (float) DatumGetFloat8(FunctionCall2Coll(procinfo, collation, *q, PointerGetDatum(&etup->data))); + + /* intvec cosine distance is NaN for zero vectors; treat those as farthest */ + /* TODO Improve */ + if (isnan(*distance)) + *distance = FLT_MAX; + } } UnlockReleaseBuffer(buf); diff --git a/src/intvec.c b/src/intvec.c new file mode 100644 index 0000000..164b931 --- /dev/null +++ b/src/intvec.c @@ -0,0 +1,586 @@ +#include "postgres.h" + +#include <limits.h> +#include <math.h> + +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "intvec.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" + +/* + * Ensure both int vectors have the same number of dimensions + */ +static inline void +CheckDims(IntVector * a, IntVector * b) +{ + if (a->dim != b->dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different intvec dimensions %d and %d", a->dim, b->dim))); +} + +/* + * Ensure expected dimensions (a typmod of -1 accepts any dimension count) + */ +static inline void +CheckExpectedDim(int32 typmod, int dim) +{ + if (typmod != -1 && typmod != dim) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected %d dimensions, not %d", typmod, dim))); +} + +/* + * Ensure 
valid dimensions + */ +static inline void +CheckDim(int dim) +{ + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("intvec must have at least 1 dimension"))); + + if (dim > INTVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("intvec cannot have more than %d dimensions", INTVEC_MAX_DIM))); +} + +/* + * Ensure element in range + */ +static inline void +CheckElement(long value) +{ + if (value < SCHAR_MIN || value > SCHAR_MAX) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value \"%ld\" is out of range for type intvec", value))); +} + +/* + * Allocate and initialize a new int vector + */ +IntVector * +InitIntVector(int dim) +{ + IntVector *result; + int size; + + size = INTVEC_SIZE(dim); + result = (IntVector *) palloc0(size); + SET_VARSIZE(result, size); + result->dim = dim; + + return result; +} + +/* + * Check for whitespace, since array_isspace() is static + */ +static inline bool +intvec_isspace(char ch) +{ + if (ch == ' ' || + ch == '\t' || + ch == '\n' || + ch == '\r' || + ch == '\v' || + ch == '\f') + return true; + return false; +} + +/* + * Convert textual representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_in); +Datum +intvec_in(PG_FUNCTION_ARGS) +{ + char *lit = PG_GETARG_CSTRING(0); + int32 typmod = PG_GETARG_INT32(2); + int8 x[INTVEC_MAX_DIM]; + int dim = 0; + char *pt; + char *stringEnd; + IntVector *result; + char *litcopy = pstrdup(lit); + char *str = litcopy; + + while (intvec_isspace(*str)) + str++; + + if (*str != '[') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed intvec literal: \"%s\"", lit), + errdetail("Vector contents must start with \"[\"."))); + + str++; + pt = strtok(str, ","); + stringEnd = pt; + + while (pt != NULL && *stringEnd != ']') + { + long l; + + if (dim == INTVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("intvec cannot have more than 
%d dimensions", INTVEC_MAX_DIM))); + + while (intvec_isspace(*pt)) + pt++; + + /* Check for empty string like float4in */ + if (*pt == '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit))); + + /* Use similar logic as int2vectorin */ + errno = 0; + l = strtol(pt, &stringEnd, 10); + + if (stringEnd == pt) + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit))); + + if (errno == ERANGE || l < SCHAR_MIN || l > SCHAR_MAX) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value \"%s\" is out of range for type intvec", pt))); + + x[dim++] = l; + + while (intvec_isspace(*stringEnd)) + stringEnd++; + + if (*stringEnd != '\0' && *stringEnd != ']') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("invalid input syntax for type intvec: \"%s\"", lit))); + + pt = strtok(NULL, ","); + } + + if (stringEnd == NULL || *stringEnd != ']') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed intvec literal: \"%s\"", lit), + errdetail("Unexpected end of input."))); + + stringEnd++; + + /* Only whitespace is allowed after the closing brace */ + while (intvec_isspace(*stringEnd)) + stringEnd++; + + if (*stringEnd != '\0') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed intvec literal: \"%s\"", lit), + errdetail("Junk after closing right brace."))); + + /* Ensure no consecutive delimiters since strtok skips */ + for (pt = lit + 1; *pt != '\0'; pt++) + { + if (pt[-1] == ',' && *pt == ',') + ereport(ERROR, + (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION), + errmsg("malformed intvec literal: \"%s\"", lit))); + } + + if (dim < 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("intvec must have at least 1 dimension"))); + + pfree(litcopy); + + CheckExpectedDim(typmod, dim); + + result = InitIntVector(dim); 
+ for (int i = 0; i < dim; i++) + result->x[i] = x[i]; + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to textual representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_out); +Datum +intvec_out(PG_FUNCTION_ARGS) +{ + IntVector *vector = PG_GETARG_INTVEC_P(0); + int dim = vector->dim; + char *buf; + char *ptr; + int n; + + /* + * Need: + * + * dim * 4 bytes for elements (-128 to 127) + * + * dim - 1 bytes for separator + * + * 3 bytes for [, ], and \0 + */ + buf = (char *) palloc(5 * dim + 2); + ptr = buf; + + *ptr = '['; + ptr++; + for (int i = 0; i < dim; i++) + { + if (i > 0) + { + *ptr = ','; + ptr++; + } + + n = pg_ltoa(vector->x[i], ptr); + ptr += n; + } + *ptr = ']'; + ptr++; + *ptr = '\0'; + + PG_FREE_IF_COPY(vector, 0); + PG_RETURN_CSTRING(buf); +} + +/* + * Convert type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_typmod_in); +Datum +intvec_typmod_in(PG_FUNCTION_ARGS) +{ + ArrayType *ta = PG_GETARG_ARRAYTYPE_P(0); + int32 *tl; + int n; + + tl = ArrayGetIntegerTypmods(ta, &n); + + if (n != 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid type modifier"))); + + if (*tl < 1) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type intvec must be at least 1"))); + + if (*tl > INTVEC_MAX_DIM) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("dimensions for type intvec cannot exceed %d", INTVEC_MAX_DIM))); + + PG_RETURN_INT32(*tl); +} + +/* + * Convert external binary representation to internal representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_recv); +Datum +intvec_recv(PG_FUNCTION_ARGS) +{ + StringInfo buf = (StringInfo) PG_GETARG_POINTER(0); + int32 typmod = PG_GETARG_INT32(2); + IntVector *result; + int16 dim; + int16 unused; + + dim = pq_getmsgint(buf, sizeof(int16)); + unused = pq_getmsgint(buf, sizeof(int16)); + + CheckDim(dim); + CheckExpectedDim(typmod, dim); + + if (unused != 0) + ereport(ERROR, + 
(errcode(ERRCODE_DATA_EXCEPTION), + errmsg("expected unused to be 0, not %d", unused))); + + result = InitIntVector(dim); + for (int i = 0; i < dim; i++) + result->x[i] = pq_getmsgint(buf, sizeof(int8)); + + PG_RETURN_POINTER(result); +} + +/* + * Convert internal representation to the external binary representation + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_send); +Datum +intvec_send(PG_FUNCTION_ARGS) +{ + IntVector *vec = PG_GETARG_INTVEC_P(0); + StringInfoData buf; + + pq_begintypsend(&buf); + pq_sendint(&buf, vec->dim, sizeof(int16)); + pq_sendint(&buf, vec->unused, sizeof(int16)); + for (int i = 0; i < vec->dim; i++) + pq_sendint8(&buf, vec->x[i]); + + PG_RETURN_BYTEA_P(pq_endtypsend(&buf)); +} + +/* + * Convert int vector to int vector + * This is needed to check the type modifier + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec); +Datum +intvec(PG_FUNCTION_ARGS) +{ + IntVector *vec = PG_GETARG_INTVEC_P(0); + int32 typmod = PG_GETARG_INT32(1); + + CheckExpectedDim(typmod, vec->dim); + + PG_RETURN_POINTER(vec); +} + +/* + * Convert array to intvec vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(array_to_intvec); +Datum +array_to_intvec(PG_FUNCTION_ARGS) +{ + ArrayType *array = PG_GETARG_ARRAYTYPE_P(0); + int32 typmod = PG_GETARG_INT32(1); + Vector *result; + int16 typlen; + bool typbyval; + char typalign; + Datum *elemsp; + int nelemsp; + + if (ARR_NDIM(array) > 1) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("array must be 1-D"))); + + if (ARR_HASNULL(array) && array_contains_nulls(array)) + ereport(ERROR, + (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), + errmsg("array must not contain nulls"))); + + get_typlenbyvalalign(ARR_ELEMTYPE(array), &typlen, &typbyval, &typalign); + deconstruct_array(array, ARR_ELEMTYPE(array), typlen, typbyval, typalign, &elemsp, NULL, &nelemsp); + + CheckDim(nelemsp); + CheckExpectedDim(typmod, nelemsp); + + result = InitVector(nelemsp); + + if (ARR_ELEMTYPE(array) == INT4OID) + { + for (int i = 0; i < nelemsp; i++) + { + 
long l = DatumGetInt32(elemsp[i]); + + CheckElement(l); + + result->x[i] = l; + } + } + else + { + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("unsupported array type"))); + } + + /* + * Free allocation from deconstruct_array. Do not free individual elements + * when pass-by-reference since they point to original array. + */ + pfree(elemsp); + + PG_RETURN_POINTER(result); +} + +/* + * Get the L2 distance between int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l2_distance); +Datum +intvec_l2_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + int diff = ax[i] - bx[i]; + + distance += diff * diff; + } + + PG_RETURN_FLOAT8(sqrt((double) distance)); +} + +/* + * Get the L2 squared distance between int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l2_squared_distance); +Datum +intvec_l2_squared_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + int diff = ax[i] - bx[i]; + + distance += diff * diff; + } + + PG_RETURN_FLOAT8((double) distance); +} + +/* + * Get the inner product of two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_inner_product); +Datum +intvec_inner_product(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += ax[i] * bx[i]; + + PG_RETURN_FLOAT8((double) distance); +} + +/* + * Get the negative inner product of two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_negative_inner_product); +Datum 
+intvec_negative_inner_product(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += ax[i] * bx[i]; + + PG_RETURN_FLOAT8((double) -distance); +} + +/* + * Get the cosine distance (1 - cosine similarity) between two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_cosine_distance); +Datum +intvec_cosine_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + int norma = 0; + int normb = 0; + double similarity; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + { + int8 axi = ax[i]; + int8 bxi = bx[i]; + + distance += axi * bxi; + norma += axi * axi; + normb += bxi * bxi; + } + + /* Use sqrt(a * b) over sqrt(a) * sqrt(b) */ + similarity = (double) distance / sqrt((double) norma * (double) normb); + +#ifdef _MSC_VER + /* /fp:fast may not propagate NaN */ + if (isnan(similarity)) + PG_RETURN_FLOAT8(NAN); +#endif + + /* Clamp similarity to [-1, 1] so the returned distance stays within [0, 2] */ + if (similarity > 1) + similarity = 1; + else if (similarity < -1) + similarity = -1; + + PG_RETURN_FLOAT8(1 - similarity); +} + +/* + * Get the L1 distance (sum of absolute element differences) between two int vectors + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(intvec_l1_distance); +Datum +intvec_l1_distance(PG_FUNCTION_ARGS) +{ + IntVector *a = PG_GETARG_INTVEC_P(0); + IntVector *b = PG_GETARG_INTVEC_P(1); + int8 *ax = a->x; + int8 *bx = b->x; + int distance = 0; + + CheckDims(a, b); + + /* Auto-vectorized */ + for (int i = 0; i < a->dim; i++) + distance += abs(ax[i] - bx[i]); + + PG_RETURN_FLOAT8((double) distance); +} diff --git a/src/intvec.h b/src/intvec.h new file mode 100644 index 0000000..cf5f367 --- /dev/null +++ b/src/intvec.h @@ -0,0 +1,23 @@ +#ifndef INTVEC_H +#define INTVEC_H + +#include "vector.h" + +#define INTVEC_MAX_DIM VECTOR_MAX_DIM + 
+#define INTVEC_SIZE(_dim) (offsetof(IntVector, x) + sizeof(int8)*(_dim)) +#define DatumGetIntVector(x) ((IntVector *) PG_DETOAST_DATUM(x)) +#define PG_GETARG_INTVEC_P(x) DatumGetIntVector(PG_GETARG_DATUM(x)) +#define PG_RETURN_INTVEC_P(x) PG_RETURN_POINTER(x) + +typedef struct IntVector +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + int16 dim; /* number of dimensions */ + int16 unused; /* reserved; intvec_recv rejects any value other than 0 */ + int8 x[FLEXIBLE_ARRAY_MEMBER]; /* elements, one int8 per dimension */ +} IntVector; + +IntVector *InitIntVector(int dim); + +#endif diff --git a/test/expected/copy.out b/test/expected/copy.out index 36d4620..ce85454 100644 --- a/test/expected/copy.out +++ b/test/expected/copy.out @@ -1,15 +1,15 @@ -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); -CREATE TABLE t2 (val vector(3)); +CREATE TABLE t (val vector(3), val2 intvec(3)); +INSERT INTO t (val, val2) VALUES ('[0,0,0]', '[0,0,0]'), ('[1,2,3]', '[1,2,3]'), ('[1,1,1]', '[1,1,1]'), (NULL, NULL); +CREATE TABLE t2 (val vector(3), val2 intvec(3)); \copy t TO 'results/data.bin' WITH (FORMAT binary) \copy t2 FROM 'results/data.bin' WITH (FORMAT binary) SELECT * FROM t2 ORDER BY val; - val ---------- - [0,0,0] - [1,1,1] - [1,2,3] - + val | val2 +---------+--------- + [0,0,0] | [0,0,0] + [1,1,1] | [1,1,1] + [1,2,3] | [1,2,3] + | (4 rows) DROP TABLE t; diff --git a/test/expected/functions.out b/test/expected/functions.out index 85d1a2f..12f8f6d 100644 --- a/test/expected/functions.out +++ b/test/expected/functions.out @@ -104,105 +104,105 @@ SELECT vector_norm('[3e37,4e37]')::real; 5e+37 (1 row) -SELECT l2_distance('[0,0]', '[3,4]'); +SELECT l2_distance('[0,0]'::vector, '[3,4]'); l2_distance ------------- 5 (1 row) -SELECT l2_distance('[0,0]', '[0,1]'); +SELECT l2_distance('[0,0]'::vector, '[0,1]'); l2_distance ------------- 1 (1 row) -SELECT l2_distance('[1,2]', '[3]'); +SELECT l2_distance('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT l2_distance('[3e38]', '[-3e38]'); 
+SELECT l2_distance('[3e38]'::vector, '[-3e38]'); l2_distance ------------- Infinity (1 row) -SELECT inner_product('[1,2]', '[3,4]'); +SELECT inner_product('[1,2]'::vector, '[3,4]'); inner_product --------------- 11 (1 row) -SELECT inner_product('[1,2]', '[3]'); +SELECT inner_product('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT inner_product('[3e38]', '[3e38]'); +SELECT inner_product('[3e38]'::vector, '[3e38]'); inner_product --------------- Infinity (1 row) -SELECT cosine_distance('[1,2]', '[2,4]'); +SELECT cosine_distance('[1,2]'::vector, '[2,4]'); cosine_distance ----------------- 0 (1 row) -SELECT cosine_distance('[1,2]', '[0,0]'); +SELECT cosine_distance('[1,2]'::vector, '[0,0]'); cosine_distance ----------------- NaN (1 row) -SELECT cosine_distance('[1,1]', '[1,1]'); +SELECT cosine_distance('[1,1]'::vector, '[1,1]'); cosine_distance ----------------- 0 (1 row) -SELECT cosine_distance('[1,0]', '[0,2]'); +SELECT cosine_distance('[1,0]'::vector, '[0,2]'); cosine_distance ----------------- 1 (1 row) -SELECT cosine_distance('[1,1]', '[-1,-1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1,-1]'); cosine_distance ----------------- 2 (1 row) -SELECT cosine_distance('[1,2]', '[3]'); +SELECT cosine_distance('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT cosine_distance('[1,1]', '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[1.1,1.1]'); cosine_distance ----------------- 0 (1 row) -SELECT cosine_distance('[1,1]', '[-1.1,-1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1.1,-1.1]'); cosine_distance ----------------- 2 (1 row) -SELECT cosine_distance('[3e38]', '[3e38]'); +SELECT cosine_distance('[3e38]'::vector, '[3e38]'); cosine_distance ----------------- NaN (1 row) -SELECT l1_distance('[0,0]', '[3,4]'); +SELECT l1_distance('[0,0]'::vector, '[3,4]'); l1_distance ------------- 7 (1 row) -SELECT l1_distance('[0,0]', '[0,1]'); +SELECT l1_distance('[0,0]'::vector, '[0,1]'); l1_distance ------------- 1 
(1 row) -SELECT l1_distance('[1,2]', '[3]'); +SELECT l1_distance('[1,2]'::vector, '[3]'); ERROR: different vector dimensions 2 and 1 -SELECT l1_distance('[3e38]', '[-3e38]'); +SELECT l1_distance('[3e38]'::vector, '[-3e38]'); l1_distance ------------- Infinity diff --git a/test/expected/hnsw_intvec_cosine.out b/test/expected/hnsw_intvec_cosine.out new file mode 100644 index 0000000..306e685 --- /dev/null +++ b/test/expected/hnsw_intvec_cosine.out @@ -0,0 +1,27 @@ +SET enable_seqscan = off; +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_cosine_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <=> '[3,3,3]'; + val +--------- + [1,1,1] + [1,2,3] + [1,2,4] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2; + count +------- + 4 +(1 row) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +DROP TABLE t; diff --git a/test/expected/hnsw_intvec_cosine.out.diff b/test/expected/hnsw_intvec_cosine.out.diff new file mode 100644 index 0000000..e69de29 diff --git a/test/expected/hnsw_intvec_ip.out b/test/expected/hnsw_intvec_ip.out new file mode 100644 index 0000000..8f8a879 --- /dev/null +++ b/test/expected/hnsw_intvec_ip.out @@ -0,0 +1,21 @@ +SET enable_seqscan = off; +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_ip_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <#> '[3,3,3]'; + val +--------- + [1,2,4] + [1,2,3] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +DROP TABLE t; diff --git a/test/expected/hnsw_intvec_l2.out b/test/expected/hnsw_intvec_l2.out new file mode 100644 index 0000000..33165b7 --- /dev/null +++ 
b/test/expected/hnsw_intvec_l2.out @@ -0,0 +1,33 @@ +SET enable_seqscan = off; +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_l2_ops); +INSERT INTO t (val) VALUES ('[1,2,4]'); +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + val +--------- + [1,2,3] + [1,2,4] + [1,1,1] + [0,0,0] +(4 rows) + +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::intvec)) t2; + count +------- + 4 +(1 row) + +SELECT COUNT(*) FROM t; + count +------- + 5 +(1 row) + +TRUNCATE t; +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + val +----- +(0 rows) + +DROP TABLE t; diff --git a/test/expected/intvec_functions.out b/test/expected/intvec_functions.out new file mode 100644 index 0000000..983cc90 --- /dev/null +++ b/test/expected/intvec_functions.out @@ -0,0 +1,92 @@ +SELECT l2_distance('[0,0]'::intvec, '[3,4]'); + l2_distance +------------- + 5 +(1 row) + +SELECT l2_distance('[0,0]'::intvec, '[0,1]'); + l2_distance +------------- + 1 +(1 row) + +SELECT l2_distance('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 +SELECT '[0,0]'::intvec <-> '[3,4]'; + ?column? +---------- + 5 +(1 row) + +SELECT inner_product('[1,2]'::intvec, '[3,4]'); + inner_product +--------------- + 11 +(1 row) + +SELECT inner_product('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 +SELECT inner_product('[127]'::intvec, '[127]'); + inner_product +--------------- + 16129 +(1 row) + +SELECT '[1,2]'::intvec <#> '[3,4]'; + ?column? 
+---------- + -11 +(1 row) + +SELECT cosine_distance('[1,2]'::intvec, '[2,4]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,2]'::intvec, '[0,0]'); + cosine_distance +----------------- + NaN +(1 row) + +SELECT cosine_distance('[1,1]'::intvec, '[1,1]'); + cosine_distance +----------------- + 0 +(1 row) + +SELECT cosine_distance('[1,0]'::intvec, '[0,2]'); + cosine_distance +----------------- + 1 +(1 row) + +SELECT cosine_distance('[1,1]'::intvec, '[-1,-1]'); + cosine_distance +----------------- + 2 +(1 row) + +SELECT cosine_distance('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 +SELECT '[1,2]'::intvec <=> '[2,4]'; + ?column? +---------- + 0 +(1 row) + +SELECT l1_distance('[0,0]'::intvec, '[3,4]'); + l1_distance +------------- + 7 +(1 row) + +SELECT l1_distance('[0,0]'::intvec, '[0,1]'); + l1_distance +------------- + 1 +(1 row) + +SELECT l1_distance('[1,2]'::intvec, '[3]'); +ERROR: different intvec dimensions 2 and 1 diff --git a/test/expected/intvec_input.out b/test/expected/intvec_input.out new file mode 100644 index 0000000..41e53f1 --- /dev/null +++ b/test/expected/intvec_input.out @@ -0,0 +1,119 @@ +SELECT '[1,2,3]'::intvec; + intvec +--------- + [1,2,3] +(1 row) + +SELECT '[-1,-2,-3]'::intvec; + intvec +------------ + [-1,-2,-3] +(1 row) + +SELECT ' [ 1, 2 , 3 ] '::intvec; + intvec +--------- + [1,2,3] +(1 row) + +SELECT '[1.23456]'::intvec; +ERROR: invalid input syntax for type intvec: "[1.23456]" +LINE 1: SELECT '[1.23456]'::intvec; + ^ +SELECT '[hello,1]'::intvec; +ERROR: invalid input syntax for type intvec: "[hello,1]" +LINE 1: SELECT '[hello,1]'::intvec; + ^ +SELECT '[127,-128]'::intvec; + intvec +------------ + [127,-128] +(1 row) + +SELECT '[128,-129]'::intvec; +ERROR: value "128" is out of range for type intvec +LINE 1: SELECT '[128,-129]'::intvec; + ^ +SELECT '[1,2,3'::intvec; +ERROR: malformed intvec literal: "[1,2,3" +LINE 1: SELECT '[1,2,3'::intvec; + ^ +DETAIL: Unexpected end of input. 
+SELECT '[1,2,3]9'::intvec; +ERROR: malformed intvec literal: "[1,2,3]9" +LINE 1: SELECT '[1,2,3]9'::intvec; + ^ +DETAIL: Junk after closing right brace. +SELECT '1,2,3'::intvec; +ERROR: malformed intvec literal: "1,2,3" +LINE 1: SELECT '1,2,3'::intvec; + ^ +DETAIL: Vector contents must start with "[". +SELECT ''::intvec; +ERROR: malformed intvec literal: "" +LINE 1: SELECT ''::intvec; + ^ +DETAIL: Vector contents must start with "[". +SELECT '['::intvec; +ERROR: malformed intvec literal: "[" +LINE 1: SELECT '['::intvec; + ^ +DETAIL: Unexpected end of input. +SELECT '[,'::intvec; +ERROR: malformed intvec literal: "[," +LINE 1: SELECT '[,'::intvec; + ^ +DETAIL: Unexpected end of input. +SELECT '[]'::intvec; +ERROR: intvec must have at least 1 dimension +LINE 1: SELECT '[]'::intvec; + ^ +SELECT '[1,]'::intvec; +ERROR: invalid input syntax for type intvec: "[1,]" +LINE 1: SELECT '[1,]'::intvec; + ^ +SELECT '[1a]'::intvec; +ERROR: invalid input syntax for type intvec: "[1a]" +LINE 1: SELECT '[1a]'::intvec; + ^ +SELECT '[1,,3]'::intvec; +ERROR: malformed intvec literal: "[1,,3]" +LINE 1: SELECT '[1,,3]'::intvec; + ^ +SELECT '[1, ,3]'::intvec; +ERROR: invalid input syntax for type intvec: "[1, ,3]" +LINE 1: SELECT '[1, ,3]'::intvec; + ^ +SELECT '[1,2,3]'::intvec(3); + intvec +--------- + [1,2,3] +(1 row) + +SELECT '[1,2,3]'::intvec(2); +ERROR: expected 2 dimensions, not 3 +SELECT '[1,2,3]'::intvec(3, 2); +ERROR: invalid type modifier +LINE 1: SELECT '[1,2,3]'::intvec(3, 2); + ^ +SELECT '[1,2,3]'::intvec('a'); +ERROR: invalid input syntax for type integer: "a" +LINE 1: SELECT '[1,2,3]'::intvec('a'); + ^ +SELECT '[1,2,3]'::intvec(0); +ERROR: dimensions for type intvec must be at least 1 +LINE 1: SELECT '[1,2,3]'::intvec(0); + ^ +SELECT '[1,2,3]'::intvec(16001); +ERROR: dimensions for type intvec cannot exceed 16000 +LINE 1: SELECT '[1,2,3]'::intvec(16001); + ^ +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::intvec[]); + unnest +--------- + [1,2,3] + [4,5,6] +(2 rows) + +SELECT 
'{"[1,2,3]"}'::intvec(2)[]; +ERROR: expected 2 dimensions, not 3 diff --git a/test/sql/copy.sql b/test/sql/copy.sql index 2820090..7d2dc37 100644 --- a/test/sql/copy.sql +++ b/test/sql/copy.sql @@ -1,7 +1,7 @@ -CREATE TABLE t (val vector(3)); -INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE TABLE t (val vector(3), val2 intvec(3)); +INSERT INTO t (val, val2) VALUES ('[0,0,0]', '[0,0,0]'), ('[1,2,3]', '[1,2,3]'), ('[1,1,1]', '[1,1,1]'), (NULL, NULL); -CREATE TABLE t2 (val vector(3)); +CREATE TABLE t2 (val vector(3), val2 intvec(3)); \copy t TO 'results/data.bin' WITH (FORMAT binary) \copy t2 FROM 'results/data.bin' WITH (FORMAT binary) diff --git a/test/sql/functions.sql b/test/sql/functions.sql index 6235684..7e820d7 100644 --- a/test/sql/functions.sql +++ b/test/sql/functions.sql @@ -24,29 +24,29 @@ SELECT vector_norm('[3,4]'); SELECT vector_norm('[0,1]'); SELECT vector_norm('[3e37,4e37]')::real; -SELECT l2_distance('[0,0]', '[3,4]'); -SELECT l2_distance('[0,0]', '[0,1]'); -SELECT l2_distance('[1,2]', '[3]'); -SELECT l2_distance('[3e38]', '[-3e38]'); +SELECT l2_distance('[0,0]'::vector, '[3,4]'); +SELECT l2_distance('[0,0]'::vector, '[0,1]'); +SELECT l2_distance('[1,2]'::vector, '[3]'); +SELECT l2_distance('[3e38]'::vector, '[-3e38]'); -SELECT inner_product('[1,2]', '[3,4]'); -SELECT inner_product('[1,2]', '[3]'); -SELECT inner_product('[3e38]', '[3e38]'); +SELECT inner_product('[1,2]'::vector, '[3,4]'); +SELECT inner_product('[1,2]'::vector, '[3]'); +SELECT inner_product('[3e38]'::vector, '[3e38]'); -SELECT cosine_distance('[1,2]', '[2,4]'); -SELECT cosine_distance('[1,2]', '[0,0]'); -SELECT cosine_distance('[1,1]', '[1,1]'); -SELECT cosine_distance('[1,0]', '[0,2]'); -SELECT cosine_distance('[1,1]', '[-1,-1]'); -SELECT cosine_distance('[1,2]', '[3]'); -SELECT cosine_distance('[1,1]', '[1.1,1.1]'); -SELECT cosine_distance('[1,1]', '[-1.1,-1.1]'); -SELECT cosine_distance('[3e38]', '[3e38]'); +SELECT cosine_distance('[1,2]'::vector, 
'[2,4]'); +SELECT cosine_distance('[1,2]'::vector, '[0,0]'); +SELECT cosine_distance('[1,1]'::vector, '[1,1]'); +SELECT cosine_distance('[1,0]'::vector, '[0,2]'); +SELECT cosine_distance('[1,1]'::vector, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::vector, '[3]'); +SELECT cosine_distance('[1,1]'::vector, '[1.1,1.1]'); +SELECT cosine_distance('[1,1]'::vector, '[-1.1,-1.1]'); +SELECT cosine_distance('[3e38]'::vector, '[3e38]'); -SELECT l1_distance('[0,0]', '[3,4]'); -SELECT l1_distance('[0,0]', '[0,1]'); -SELECT l1_distance('[1,2]', '[3]'); -SELECT l1_distance('[3e38]', '[-3e38]'); +SELECT l1_distance('[0,0]'::vector, '[3,4]'); +SELECT l1_distance('[0,0]'::vector, '[0,1]'); +SELECT l1_distance('[1,2]'::vector, '[3]'); +SELECT l1_distance('[3e38]'::vector, '[-3e38]'); SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]']) v; SELECT avg(v) FROM unnest(ARRAY['[1,2,3]'::vector, '[3,5,7]', NULL]) v; diff --git a/test/sql/hnsw_intvec_cosine.sql b/test/sql/hnsw_intvec_cosine.sql new file mode 100644 index 0000000..c8a8ddc --- /dev/null +++ b/test/sql/hnsw_intvec_cosine.sql @@ -0,0 +1,13 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_cosine_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <=> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> '[0,0,0]') t2; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <=> (SELECT NULL::intvec)) t2; + +DROP TABLE t; diff --git a/test/sql/hnsw_intvec_ip.sql b/test/sql/hnsw_intvec_ip.sql new file mode 100644 index 0000000..7045951 --- /dev/null +++ b/test/sql/hnsw_intvec_ip.sql @@ -0,0 +1,12 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_ip_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY 
val <#> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <#> (SELECT NULL::intvec)) t2; + +DROP TABLE t; diff --git a/test/sql/hnsw_intvec_l2.sql b/test/sql/hnsw_intvec_l2.sql new file mode 100644 index 0000000..6c08b3d --- /dev/null +++ b/test/sql/hnsw_intvec_l2.sql @@ -0,0 +1,16 @@ +SET enable_seqscan = off; + +CREATE TABLE t (val intvec(3)); +INSERT INTO t (val) VALUES ('[0,0,0]'), ('[1,2,3]'), ('[1,1,1]'), (NULL); +CREATE INDEX ON t USING hnsw (val intvec_l2_ops); + +INSERT INTO t (val) VALUES ('[1,2,4]'); + +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; +SELECT COUNT(*) FROM (SELECT * FROM t ORDER BY val <-> (SELECT NULL::intvec)) t2; +SELECT COUNT(*) FROM t; + +TRUNCATE t; +SELECT * FROM t ORDER BY val <-> '[3,3,3]'; + +DROP TABLE t; diff --git a/test/sql/intvec_functions.sql b/test/sql/intvec_functions.sql new file mode 100644 index 0000000..8570d3d --- /dev/null +++ b/test/sql/intvec_functions.sql @@ -0,0 +1,21 @@ +SELECT l2_distance('[0,0]'::intvec, '[3,4]'); +SELECT l2_distance('[0,0]'::intvec, '[0,1]'); +SELECT l2_distance('[1,2]'::intvec, '[3]'); +SELECT '[0,0]'::intvec <-> '[3,4]'; + +SELECT inner_product('[1,2]'::intvec, '[3,4]'); +SELECT inner_product('[1,2]'::intvec, '[3]'); +SELECT inner_product('[127]'::intvec, '[127]'); +SELECT '[1,2]'::intvec <#> '[3,4]'; + +SELECT cosine_distance('[1,2]'::intvec, '[2,4]'); +SELECT cosine_distance('[1,2]'::intvec, '[0,0]'); +SELECT cosine_distance('[1,1]'::intvec, '[1,1]'); +SELECT cosine_distance('[1,0]'::intvec, '[0,2]'); +SELECT cosine_distance('[1,1]'::intvec, '[-1,-1]'); +SELECT cosine_distance('[1,2]'::intvec, '[3]'); +SELECT '[1,2]'::intvec <=> '[2,4]'; + +SELECT l1_distance('[0,0]'::intvec, '[3,4]'); +SELECT l1_distance('[0,0]'::intvec, '[0,1]'); +SELECT l1_distance('[1,2]'::intvec, '[3]'); diff --git a/test/sql/intvec_input.sql b/test/sql/intvec_input.sql new file mode 100644 index 0000000..fefa5bf --- /dev/null +++ b/test/sql/intvec_input.sql @@ -0,0 +1,28 @@ +SELECT '[1,2,3]'::intvec; 
+SELECT '[-1,-2,-3]'::intvec; +SELECT ' [ 1, 2 , 3 ] '::intvec; +SELECT '[1.23456]'::intvec; +SELECT '[hello,1]'::intvec; +SELECT '[127,-128]'::intvec; +SELECT '[128,-129]'::intvec; +SELECT '[1,2,3'::intvec; +SELECT '[1,2,3]9'::intvec; +SELECT '1,2,3'::intvec; +SELECT ''::intvec; +SELECT '['::intvec; +SELECT '[,'::intvec; +SELECT '[]'::intvec; +SELECT '[1,]'::intvec; +SELECT '[1a]'::intvec; +SELECT '[1,,3]'::intvec; +SELECT '[1, ,3]'::intvec; + +SELECT '[1,2,3]'::intvec(3); +SELECT '[1,2,3]'::intvec(2); +SELECT '[1,2,3]'::intvec(3, 2); +SELECT '[1,2,3]'::intvec('a'); +SELECT '[1,2,3]'::intvec(0); +SELECT '[1,2,3]'::intvec(16001); + +SELECT unnest('{"[1,2,3]", "[4,5,6]"}'::intvec[]); +SELECT '{"[1,2,3]"}'::intvec(2)[]; diff --git a/test/t/020_hnsw_intvec_build_recall.pl b/test/t/020_hnsw_intvec_build_recall.pl new file mode 100644 index 0000000..dcc6886 --- /dev/null +++ b/test/t/020_hnsw_intvec_build_recall.pl @@ -0,0 +1,132 @@ +use strict; +use warnings; +use PostgresNode; +use TestLib; +use Test::More; + +my $node; +my @queries = (); +my @expected; +my $limit = 20; +my $dim = 20; +my $array_sql = join(",", ('(random() * 255)::int - 128') x $dim); + +sub test_recall +{ + my ($min, $operator) = @_; + my $correct = 0; + my $total = 0; + + my $explain = $node->safe_psql("postgres", qq( + SET enable_seqscan = off; + EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v $operator '$queries[0]' LIMIT $limit; + )); + like($explain, qr/Index Scan/); + + for my $i (0 .. 
$#queries) + { + my $actual = $node->safe_psql("postgres", qq( + SET enable_seqscan = off; + SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit; + )); + my @actual_ids = split("\n", $actual); + my %actual_set = map { $_ => 1 } @actual_ids; + + my @expected_ids = split("\n", $expected[$i]); + + foreach (@expected_ids) + { + if (exists($actual_set{$_})) + { + $correct++; + } + $total++; + } + } + + cmp_ok($correct / $total, ">=", $min, $operator); +} + +# Initialize node +$node = get_new_node('node'); +$node->init; +$node->start; + +# Create table +$node->safe_psql("postgres", "CREATE EXTENSION vector;"); +$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v intvec($dim));"); +$node->safe_psql("postgres", + "INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 10000) i;" +); + +# Generate queries +for (1 .. 20) +{ + my @r = (); + for (1 .. $dim) + { + push(@r, int(rand(256)) - 128); + } + push(@queries, "[" . join(",", @r) . "]"); +} + +# Check each index type +my @operators = ("<->", "<#>", "<=>"); +my @opclasses = ("intvec_l2_ops", "intvec_ip_ops", "intvec_cosine_ops"); + +for my $i (0 .. 
$#operators) +{ + my $operator = $operators[$i]; + my $opclass = $opclasses[$i]; + + # Get exact results + @expected = (); + foreach (@queries) + { + my $res = $node->safe_psql("postgres", "SELECT i FROM tst ORDER BY v $operator '$_' LIMIT $limit;"); + push(@expected, $res); + } + + # Build index serially + $node->safe_psql("postgres", qq( + SET max_parallel_maintenance_workers = 0; + CREATE INDEX idx ON tst USING hnsw (v $opclass); + )); + + # Test approximate results + my $min = 0.99; + test_recall($min, $operator); + + $node->safe_psql("postgres", "DROP INDEX idx;"); + + # Build index in parallel in memory + my ($ret, $stdout, $stderr) = $node->psql("postgres", qq( + SET client_min_messages = DEBUG; + SET min_parallel_table_scan_size = 1; + CREATE INDEX idx ON tst USING hnsw (v $opclass); + )); + is($ret, 0, $stderr); + like($stderr, qr/using \d+ parallel workers/); + + # Test approximate results + test_recall($min, $operator); + + $node->safe_psql("postgres", "DROP INDEX idx;"); + + # Build index in parallel on disk + # Set parallel_workers on table to use workers with low maintenance_work_mem + ($ret, $stdout, $stderr) = $node->psql("postgres", qq( + ALTER TABLE tst SET (parallel_workers = 2); + SET client_min_messages = DEBUG; + SET maintenance_work_mem = '4MB'; + CREATE INDEX idx ON tst USING hnsw (v $opclass); + ALTER TABLE tst RESET (parallel_workers); + )); + is($ret, 0, $stderr); + like($stderr, qr/using \d+ parallel workers/); + like($stderr, qr/hnsw graph no longer fits into maintenance_work_mem/); + + $node->safe_psql("postgres", "DROP INDEX idx;"); +} + +done_testing();