diff --git a/Makefile b/Makefile index d5f61ff..9ac5ac2 100644 --- a/Makefile +++ b/Makefile @@ -3,7 +3,7 @@ EXTVERSION = 0.6.2 MODULE_big = vector DATA = $(wildcard sql/*--*.sql) -OBJS = src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o +OBJS = src/bitvector.o src/hnsw.o src/hnswbuild.o src/hnswinsert.o src/hnswscan.o src/hnswutils.o src/hnswvacuum.o src/ivfbuild.o src/ivfflat.o src/ivfinsert.o src/ivfkmeans.o src/ivfscan.o src/ivfutils.o src/ivfvacuum.o src/vector.o HEADERS = src/vector.h TESTS = $(wildcard test/sql/*.sql) diff --git a/Makefile.win b/Makefile.win index 1bb193e..b3928bc 100644 --- a/Makefile.win +++ b/Makefile.win @@ -1,7 +1,7 @@ EXTENSION = vector EXTVERSION = 0.6.2 -OBJS = src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj +OBJS = src\bitvector.obj src\hnsw.obj src\hnswbuild.obj src\hnswinsert.obj src\hnswscan.obj src\hnswutils.obj src\hnswvacuum.obj src\ivfbuild.obj src\ivfflat.obj src\ivfinsert.obj src\ivfkmeans.obj src\ivfscan.obj src\ivfutils.obj src\ivfvacuum.obj src\vector.obj HEADERS = src\vector.h REGRESS = btree cast copy functions input ivfflat_cosine ivfflat_ip ivfflat_l2 ivfflat_options ivfflat_unlogged diff --git a/src/bitvector.c b/src/bitvector.c new file mode 100644 index 0000000..113d830 --- /dev/null +++ b/src/bitvector.c @@ -0,0 +1,60 @@ +#include "postgres.h" + +#include "bitvector.h" +#include "port/pg_bitutils.h" +#include "utils/varbit.h" + +#if PG_VERSION_NUM >= 160000 +#include "varatt.h" +#endif + +/* + * Allocate and initialize a new bit vector + */ +VarBit * +InitBitVector(int dim) +{ + VarBit *result; + int size; + + size = VARBITTOTALLEN(dim); + result = (VarBit *) palloc0(size); + SET_VARSIZE(result, size); + VARBITLEN(result) = dim; + + return result; +} + +/* + * Ensure same number of bits + */ +static inline void +CheckBitLengths(uint32 aLen, uint32 bLen) +{ + if (aLen != bLen) + ereport(ERROR, + (errcode(ERRCODE_DATA_EXCEPTION), + errmsg("different bit lengths %u and %u", aLen, bLen))); +} + +/* + * Get the Hamming distance between two bit strings + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(hamming_distance); +Datum +hamming_distance(PG_FUNCTION_ARGS) +{ + VarBit *a = PG_GETARG_VARBIT_P(0); + VarBit *b = PG_GETARG_VARBIT_P(1); + unsigned char *ax = VARBITS(a); + unsigned char *bx = VARBITS(b); + uint64 distance = 0; + + CheckBitLengths(VARBITLEN(a), VARBITLEN(b)); + + /* TODO Improve performance */ + for (uint32 i = 0; i < VARBITBYTES(a); i++) + distance += pg_number_of_ones[ax[i] ^ bx[i]]; + + PG_RETURN_FLOAT8((double) distance); +} diff --git a/src/bitvector.h b/src/bitvector.h new file mode 100644 index 0000000..b7dec9f --- /dev/null +++ b/src/bitvector.h @@ -0,0 +1,8 @@ +#ifndef BITVECTOR_H +#define BITVECTOR_H + +#include "utils/varbit.h" + +VarBit *InitBitVector(int dim); + +#endif diff --git a/src/hnswscan.c b/src/hnswscan.c index e659c14..cb015a1 100644 --- a/src/hnswscan.c +++ b/src/hnswscan.c @@ -1,13 +1,13 @@ #include "postgres.h" #include "access/relscan.h" +#include "bitvector.h" #include "catalog/pg_type_d.h" #include "hnsw.h" #include "pgstat.h" #include "storage/bufmgr.h" #include "storage/lmgr.h" #include "utils/memutils.h" -#include "utils/varbit.h" /* * Algorithm 5 from paper diff --git a/src/vector.c b/src/vector.c index 7be4c8f..0fee33e 100644 --- a/src/vector.c +++ b/src/vector.c @@ -2,6 +2,7 @@ #include +#include "bitvector.h" #include "catalog/pg_type.h" #include "common/shortest_dec.h" #include "fmgr.h" @@ -10,13 +11,11 @@ #include "lib/stringinfo.h" #include "libpq/pqformat.h" #include "port.h" /* for strtof() */ -#include "port/pg_bitutils.h" #include "utils/array.h" #include "utils/builtins.h" #include "utils/float.h" #include "utils/lsyscache.h" #include "utils/numeric.h" -#include "utils/varbit.h" #include "vector.h" #if PG_VERSION_NUM >= 160000 @@ -862,6 +861,26 @@ vector_mul(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } +/* + * Quantize a vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(quantize_binary); +Datum +quantize_binary(PG_FUNCTION_ARGS) +{ + Vector *a = PG_GETARG_VECTOR_P(0); + float *ax = a->x; + VarBit *result = InitBitVector(a->dim); + unsigned char *rx = VARBITS(result); + + /* TODO Improve */ + for (int i = 0; i < a->dim; i++) + rx[i / 8] |= (ax[i] > 0) << (7 - (i % 8)); + + PG_RETURN_VARBIT_P(result); +} + + /* * Internal helper to compare vectors */ @@ -1162,73 +1181,3 @@ vector_avg(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } - -/* - * Allocate and initialize a new bit vector - */ -VarBit * -InitBitVector(int dim) -{ - VarBit *result; - int size; - - size = VARBITTOTALLEN(dim); - result = (VarBit *) palloc0(size); - SET_VARSIZE(result, size); - VARBITLEN(result) = dim; - - return result; -} - -/* - * Quantize a vector - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(quantize_binary); -Datum -quantize_binary(PG_FUNCTION_ARGS) -{ - Vector *a = PG_GETARG_VECTOR_P(0); - float *ax = a->x; - VarBit *result = InitBitVector(a->dim); - unsigned char *rx = VARBITS(result); - - /* TODO Improve */ - for (int i = 0; i < a->dim; i++) - rx[i / 8] |= (ax[i] > 0) << (7 - (i % 8)); - - PG_RETURN_VARBIT_P(result); -} - -/* - * Ensure same number of bits - */ -static inline void -CheckBitLengths(uint32 aLen, uint32 bLen) -{ - if (aLen != bLen) - ereport(ERROR, - (errcode(ERRCODE_DATA_EXCEPTION), - errmsg("different bit lengths %u and %u", aLen, bLen))); -} - -/* - * Get the Hamming distance between two bit strings - */ -PGDLLEXPORT PG_FUNCTION_INFO_V1(hamming_distance); -Datum -hamming_distance(PG_FUNCTION_ARGS) -{ - VarBit *a = PG_GETARG_VARBIT_P(0); - VarBit *b = PG_GETARG_VARBIT_P(1); - unsigned char *ax = VARBITS(a); - unsigned char *bx = VARBITS(b); - uint64 distance = 0; - - CheckBitLengths(VARBITLEN(a), VARBITLEN(b)); - - /* TODO Improve performance */ - for (uint32 i = 0; i < VARBITBYTES(a); i++) - distance += pg_number_of_ones[ax[i] ^ bx[i]]; - - PG_RETURN_FLOAT8((double) distance); -} diff --git a/src/vector.h b/src/vector.h index d50c00f..e649471 100644 --- a/src/vector.h +++ b/src/vector.h @@ -1,8 +1,6 @@ #ifndef VECTOR_H #define VECTOR_H -#include "utils/varbit.h" - #define VECTOR_MAX_DIM 16000 #define VECTOR_SIZE(_dim) (offsetof(Vector, x) + sizeof(float)*(_dim)) @@ -19,7 +17,6 @@ typedef struct Vector } Vector; Vector *InitVector(int dim); -VarBit *InitBitVector(int dim); void PrintVector(char *msg, Vector * vector); int vector_cmp_internal(Vector * a, Vector * b);