diff --git a/CHANGELOG.md b/CHANGELOG.md index 5338387..92db86f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.1 (unreleased) + +- Added `random_vector` function + ## 0.4.0 (2023-01-11) If upgrading with Postgres < 13, see [this note](https://github.com/pgvector/pgvector#040). diff --git a/README.md b/README.md index 5a7babf..7991949 100644 --- a/README.md +++ b/README.md @@ -184,6 +184,7 @@ inner_product(vector, vector) → double precision | inner product l2_distance(vector, vector) → double precision | Euclidean distance vector_dims(vector) → integer | number of dimensions vector_norm(vector) → double precision | Euclidean norm +random_vector(integer) → vector | random vector [unreleased] ### Aggregate Functions diff --git a/sql/vector--0.4.0--0.4.1.sql b/sql/vector--0.4.0--0.4.1.sql new file mode 100644 index 0000000..8661aff --- /dev/null +++ b/sql/vector--0.4.0--0.4.1.sql @@ -0,0 +1,5 @@ +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.1'" to load this file. \quit + +CREATE FUNCTION random_vector(integer) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C VOLATILE STRICT PARALLEL SAFE; diff --git a/sql/vector.sql b/sql/vector.sql index 6188e2e..dc7fefb 100644 --- a/sql/vector.sql +++ b/sql/vector.sql @@ -52,6 +52,9 @@ CREATE FUNCTION vector_add(vector, vector) RETURNS vector CREATE FUNCTION vector_sub(vector, vector) RETURNS vector AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE; +CREATE FUNCTION random_vector(integer) RETURNS vector + AS 'MODULE_PATHNAME' LANGUAGE C VOLATILE STRICT PARALLEL SAFE; + -- private functions CREATE FUNCTION vector_lt(vector, vector) RETURNS bool diff --git a/src/ivfflat.h b/src/ivfflat.h index 5bd7622..3918b94 100644 --- a/src/ivfflat.h +++ b/src/ivfflat.h @@ -10,15 +10,10 @@ #include "access/generic_xlog.h" #include "access/reloptions.h" #include "nodes/execnodes.h" -#include "port.h" /* for strtof() and random() */ #include "utils/sampling.h" #include "utils/tuplesort.h" #include "vector.h" -#if PG_VERSION_NUM >= 150000 -#include "common/pg_prng.h" -#endif - #ifdef IVFFLAT_BENCH #include "portability/instr_time.h" #endif @@ -68,14 +63,6 @@ #define IvfflatBench(name, code) (code) #endif -#if PG_VERSION_NUM >= 150000 -#define RandomDouble() pg_prng_double(&pg_global_prng_state) -#define RandomInt() pg_prng_uint32(&pg_global_prng_state) -#else -#define RandomDouble() (((double) random()) / MAX_RANDOM_VALUE) -#define RandomInt() random() -#endif - /* Variables */ extern int ivfflat_probes; diff --git a/src/vector.c b/src/vector.c index d232a23..0ea2574 100644 --- a/src/vector.c +++ b/src/vector.c @@ -950,3 +950,22 @@ vector_avg(PG_FUNCTION_ARGS) PG_RETURN_POINTER(result); } + +/* + * Generate a random vector + */ +PGDLLEXPORT PG_FUNCTION_INFO_V1(random_vector); +Datum +random_vector(PG_FUNCTION_ARGS) +{ + int32 dim = PG_GETARG_INT32(0); + Vector *result; + + CheckDim(dim); + + result = InitVector(dim); + for (int i = 0; i < dim; i++) + result->x[i] = RandomDouble(); + + PG_RETURN_POINTER(result); +} diff --git a/src/vector.h b/src/vector.h index 3e41130..ca04971 100644 --- a/src/vector.h +++ b/src/vector.h @@ -3,6 +3,12 @@ #include "postgres.h" +#include "port.h" /* for strtof() and random() */ + +#if PG_VERSION_NUM >= 150000 +#include "common/pg_prng.h" +#endif + #define VECTOR_MAX_DIM 16000 #define VECTOR_SIZE(_dim) (offsetof(Vector, x) + sizeof(float)*(_dim)) @@ -10,6 +16,14 @@ #define PG_GETARG_VECTOR_P(x) DatumGetVector(PG_GETARG_DATUM(x)) #define PG_RETURN_VECTOR_P(x) PG_RETURN_POINTER(x) +#if PG_VERSION_NUM >= 150000 +#define RandomDouble() pg_prng_double(&pg_global_prng_state) +#define RandomInt() pg_prng_uint32(&pg_global_prng_state) +#else +#define RandomDouble() (((double) random()) / MAX_RANDOM_VALUE) +#define RandomInt() random() +#endif + typedef struct Vector { int32 vl_len_; /* varlena header (do not touch directly!) */ diff --git a/test/t/001_wal.pl b/test/t/001_wal.pl index 46060ed..f9fc4df 100644 --- a/test/t/001_wal.pl +++ b/test/t/001_wal.pl @@ -44,11 +44,6 @@ sub test_index_replay return; } -# Use ARRAY[random(), random(), random(), ...] over -# SELECT array_agg(random()) FROM generate_series(1, $dim) -# to generate different values for each row -my $array_sql = join(",", ('random()') x $dim); - # Initialize primary node $node_primary = get_new_node('primary'); $node_primary->init(allows_streaming => 1); @@ -75,7 +70,7 @@ $node_replica->start; $node_primary->safe_psql("postgres", "CREATE EXTENSION vector;"); $node_primary->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));"); $node_primary->safe_psql("postgres", - "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;" + "INSERT INTO tst SELECT i % 10, random_vector($dim) FROM generate_series(1, 100000) i;" ); $node_primary->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); @@ -91,7 +86,7 @@ for my $i (1 .. 10) test_index_replay("vacuum $i"); my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000); $node_primary->safe_psql("postgres", - "INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series($start, $end) i;" + "INSERT INTO tst SELECT i % 10, random_vector($dim) FROM generate_series($start, $end) i;" ); test_index_replay("insert $i"); } diff --git a/test/t/003_recall.pl b/test/t/003_recall.pl index dddc4d5..c5012bc 100644 --- a/test/t/003_recall.pl +++ b/test/t/003_recall.pl @@ -46,7 +46,7 @@ $node->start; $node->safe_psql("postgres", "CREATE EXTENSION vector;"); $node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));"); $node->safe_psql("postgres", - "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" + "INSERT INTO tst SELECT i, random_vector(3) FROM generate_series(1, 100000) i;" ); # Generate queries diff --git a/test/t/005_query_recall.pl b/test/t/005_query_recall.pl index 0e58135..1cc1cd0 100644 --- a/test/t/005_query_recall.pl +++ b/test/t/005_query_recall.pl @@ -13,7 +13,7 @@ $node->start; $node->safe_psql("postgres", "CREATE EXTENSION vector;"); $node->safe_psql("postgres", "CREATE TABLE tst (i int4 primary key, v vector(3));"); $node->safe_psql("postgres", - "INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" + "INSERT INTO tst SELECT i, random_vector(3) FROM generate_series(1, 100000) i;" ); # Check each index type diff --git a/test/t/006_lists.pl b/test/t/006_lists.pl index eeb11aa..d0a2059 100644 --- a/test/t/006_lists.pl +++ b/test/t/006_lists.pl @@ -13,7 +13,7 @@ $node->start; $node->safe_psql("postgres", "CREATE EXTENSION vector;"); $node->safe_psql("postgres", "CREATE TABLE tst (v vector(3));"); $node->safe_psql("postgres", - "INSERT INTO tst SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;" + "INSERT INTO tst SELECT random_vector(3) FROM generate_series(1, 100000) i;" ); $node->safe_psql("postgres", "CREATE INDEX lists50 ON tst USING ivfflat (v) WITH (lists = 50);"); diff --git a/test/t/007_inserts.pl b/test/t/007_inserts.pl index 85f354c..b9b93c9 100644 --- a/test/t/007_inserts.pl +++ b/test/t/007_inserts.pl @@ -6,8 +6,6 @@ use Test::More tests => 5; my $dim = 768; -my $array_sql = join(",", ('random()') x $dim); - # Initialize node my $node = get_new_node('node'); $node->init; @@ -17,7 +15,7 @@ $node->start; $node->safe_psql("postgres", "CREATE EXTENSION vector;"); $node->safe_psql("postgres", "CREATE TABLE tst (v vector($dim));"); $node->safe_psql("postgres", - "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10000) i;" + "INSERT INTO tst SELECT random_vector($dim) FROM generate_series(1, 10000) i;" ); $node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);"); @@ -28,7 +26,7 @@ $node->pgbench( [qr{^$}], "concurrent INSERTs", { - "007_inserts" => "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10) i;" + "007_inserts" => "INSERT INTO tst SELECT random_vector($dim) FROM generate_series(1, 10) i;" } ); diff --git a/test/t/009_storage.pl b/test/t/009_storage.pl index de818c7..b63e3a2 100644 --- a/test/t/009_storage.pl +++ b/test/t/009_storage.pl @@ -17,7 +17,7 @@ $node->safe_psql("postgres", "CREATE TABLE tst (v1 vector(1024), v2 vector(1024) # Test insert succeeds $node->safe_psql("postgres", - "INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n" + "INSERT INTO tst SELECT random_vector($dim), random_vector($dim), random_vector($dim)" ); # Change storage to PLAIN @@ -27,6 +27,6 @@ $node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v3 SET STORAGE PLAIN" # Test insert fails my ($ret, $stdout, $stderr) = $node->psql("postgres", - "INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n" + "INSERT INTO tst SELECT random_vector($dim), random_vector($dim), random_vector($dim)" ); like($stderr, qr/row is too big/);