Added random_vector function

This commit is contained in:
Andrew Kane
2023-01-26 18:41:46 -08:00
parent 9b13db5c5c
commit f7a0abe6ad
13 changed files with 55 additions and 29 deletions

View File

@@ -1,3 +1,7 @@
## 0.4.1 (unreleased)
- Added `random_vector` function
## 0.4.0 (2023-01-11)
If upgrading with Postgres < 13, see [this note](https://github.com/pgvector/pgvector#040).

View File

@@ -184,6 +184,7 @@ inner_product(vector, vector) → double precision | inner product
l2_distance(vector, vector) → double precision | Euclidean distance
vector_dims(vector) → integer | number of dimensions
vector_norm(vector) → double precision | Euclidean norm
random_vector(integer) → vector | random vector [unreleased]
### Aggregate Functions

View File

@@ -0,0 +1,5 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "ALTER EXTENSION vector UPDATE TO '0.4.1'" to load this file. \quit
-- random_vector(integer) -> vector, implemented in C in the extension's
-- shared library (MODULE_PATHNAME).
--   VOLATILE: the result may differ between calls with the same argument,
--             so the planner must not cache or pre-evaluate it.
--   STRICT:   a NULL argument yields NULL without calling the C function.
CREATE FUNCTION random_vector(integer) RETURNS vector
AS 'MODULE_PATHNAME' LANGUAGE C VOLATILE STRICT PARALLEL SAFE;

View File

@@ -52,6 +52,9 @@ CREATE FUNCTION vector_add(vector, vector) RETURNS vector
CREATE FUNCTION vector_sub(vector, vector) RETURNS vector
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
-- random_vector(integer) -> vector; unlike the IMMUTABLE functions in this
-- script it is declared VOLATILE, since its result is not determined by
-- its argument alone.
CREATE FUNCTION random_vector(integer) RETURNS vector
AS 'MODULE_PATHNAME' LANGUAGE C VOLATILE STRICT PARALLEL SAFE;
-- private functions
CREATE FUNCTION vector_lt(vector, vector) RETURNS bool

View File

@@ -10,15 +10,10 @@
#include "access/generic_xlog.h"
#include "access/reloptions.h"
#include "nodes/execnodes.h"
#include "port.h" /* for strtof() and random() */
#include "utils/sampling.h"
#include "utils/tuplesort.h"
#include "vector.h"
#if PG_VERSION_NUM >= 150000
#include "common/pg_prng.h"
#endif
#ifdef IVFFLAT_BENCH
#include "portability/instr_time.h"
#endif
@@ -68,14 +63,6 @@
#define IvfflatBench(name, code) (code)
#endif
#if PG_VERSION_NUM >= 150000
#define RandomDouble() pg_prng_double(&pg_global_prng_state)
#define RandomInt() pg_prng_uint32(&pg_global_prng_state)
#else
#define RandomDouble() (((double) random()) / MAX_RANDOM_VALUE)
#define RandomInt() random()
#endif
/* Variables */
extern int ivfflat_probes;

View File

@@ -950,3 +950,22 @@ vector_avg(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(result);
}
/*
 * random_vector(dim)
 *
 * SQL-callable function: builds a new vector with "dim" elements and
 * assigns each element a value obtained from RandomDouble().
 */
PGDLLEXPORT PG_FUNCTION_INFO_V1(random_vector);
Datum
random_vector(PG_FUNCTION_ARGS)
{
	int32		ndims = PG_GETARG_INT32(0);
	Vector	   *vec;
	int			k;

	/* Run the dimension check before allocating the result */
	CheckDim(ndims);

	vec = InitVector(ndims);

	/* One RandomDouble() draw per element, in index order */
	for (k = 0; k < ndims; k++)
		vec->x[k] = RandomDouble();

	PG_RETURN_POINTER(vec);
}

View File

@@ -3,6 +3,12 @@
#include "postgres.h"
#include "port.h" /* for strtof() and random() */
#if PG_VERSION_NUM >= 150000
#include "common/pg_prng.h"
#endif
#define VECTOR_MAX_DIM 16000
#define VECTOR_SIZE(_dim) (offsetof(Vector, x) + sizeof(float)*(_dim))
@@ -10,6 +16,14 @@
#define PG_GETARG_VECTOR_P(x) DatumGetVector(PG_GETARG_DATUM(x))
#define PG_RETURN_VECTOR_P(x) PG_RETURN_POINTER(x)
/*
 * Random number helpers shared across the extension.
 *
 * On PostgreSQL 15 and later both macros draw from pg_global_prng_state
 * through the pg_prng API. On older versions they fall back to random();
 * there RandomDouble() scales the result by dividing by MAX_RANDOM_VALUE.
 */
#if PG_VERSION_NUM >= 150000
#define RandomDouble() pg_prng_double(&pg_global_prng_state)
#define RandomInt() pg_prng_uint32(&pg_global_prng_state)
#else
#define RandomDouble() (((double) random()) / MAX_RANDOM_VALUE)
#define RandomInt() random()
#endif
typedef struct Vector
{
int32 vl_len_; /* varlena header (do not touch directly!) */

View File

@@ -44,11 +44,6 @@ sub test_index_replay
return;
}
# Use ARRAY[random(), random(), random(), ...] over
# SELECT array_agg(random()) FROM generate_series(1, $dim)
# to generate different values for each row
my $array_sql = join(",", ('random()') x $dim);
# Initialize primary node
$node_primary = get_new_node('primary');
$node_primary->init(allows_streaming => 1);
@@ -75,7 +70,7 @@ $node_replica->start;
$node_primary->safe_psql("postgres", "CREATE EXTENSION vector;");
$node_primary->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector($dim));");
$node_primary->safe_psql("postgres",
"INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series(1, 100000) i;"
"INSERT INTO tst SELECT i % 10, random_vector($dim) FROM generate_series(1, 100000) i;"
);
$node_primary->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);");
@@ -91,7 +86,7 @@ for my $i (1 .. 10)
test_index_replay("vacuum $i");
my ($start, $end) = (100001 + ($i - 1) * 10000, 100000 + $i * 10000);
$node_primary->safe_psql("postgres",
"INSERT INTO tst SELECT i % 10, ARRAY[$array_sql] FROM generate_series($start, $end) i;"
"INSERT INTO tst SELECT i % 10, random_vector($dim) FROM generate_series($start, $end) i;"
);
test_index_replay("insert $i");
}

View File

@@ -46,7 +46,7 @@ $node->start;
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v vector(3));");
$node->safe_psql("postgres",
"INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;"
"INSERT INTO tst SELECT i, random_vector(3) FROM generate_series(1, 100000) i;"
);
# Generate queries

View File

@@ -13,7 +13,7 @@ $node->start;
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4 primary key, v vector(3));");
$node->safe_psql("postgres",
"INSERT INTO tst SELECT i, ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;"
"INSERT INTO tst SELECT i, random_vector(3) FROM generate_series(1, 100000) i;"
);
# Check each index type

View File

@@ -13,7 +13,7 @@ $node->start;
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (v vector(3));");
$node->safe_psql("postgres",
"INSERT INTO tst SELECT ARRAY[random(), random(), random()] FROM generate_series(1, 100000) i;"
"INSERT INTO tst SELECT random_vector(3) FROM generate_series(1, 100000) i;"
);
$node->safe_psql("postgres", "CREATE INDEX lists50 ON tst USING ivfflat (v) WITH (lists = 50);");

View File

@@ -6,8 +6,6 @@ use Test::More tests => 5;
my $dim = 768;
my $array_sql = join(",", ('random()') x $dim);
# Initialize node
my $node = get_new_node('node');
$node->init;
@@ -17,7 +15,7 @@ $node->start;
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (v vector($dim));");
$node->safe_psql("postgres",
"INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10000) i;"
"INSERT INTO tst SELECT random_vector($dim) FROM generate_series(1, 10000) i;"
);
$node->safe_psql("postgres", "CREATE INDEX ON tst USING ivfflat (v);");
@@ -28,7 +26,7 @@ $node->pgbench(
[qr{^$}],
"concurrent INSERTs",
{
"007_inserts" => "INSERT INTO tst SELECT ARRAY[$array_sql] FROM generate_series(1, 10) i;"
"007_inserts" => "INSERT INTO tst SELECT random_vector($dim) FROM generate_series(1, 10) i;"
}
);

View File

@@ -17,7 +17,7 @@ $node->safe_psql("postgres", "CREATE TABLE tst (v1 vector(1024), v2 vector(1024)
# Test insert succeeds
$node->safe_psql("postgres",
"INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n"
"INSERT INTO tst SELECT random_vector($dim), random_vector($dim), random_vector($dim)"
);
# Change storage to PLAIN
@@ -27,6 +27,6 @@ $node->safe_psql("postgres", "ALTER TABLE tst ALTER COLUMN v3 SET STORAGE PLAIN"
# Test insert fails
my ($ret, $stdout, $stderr) = $node->psql("postgres",
"INSERT INTO tst SELECT array_agg(n), array_agg(n), array_agg(n) FROM generate_series(1, $dim) n"
"INSERT INTO tst SELECT random_vector($dim), random_vector($dim), random_vector($dim)"
);
like($stderr, qr/row is too big/);