mirror of
https://github.com/pgvector/pgvector.git
synced 2026-07-03 11:10:56 +08:00
Added support for IVFFlat [skip ci]
This commit is contained in:
@@ -266,6 +266,9 @@ COMMENT ON ACCESS METHOD hnsw IS 'hnsw index access method';
|
||||
CREATE FUNCTION ivfflat_halfvec_support(internal) RETURNS internal
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
CREATE FUNCTION ivfflat_minivec_support(internal) RETURNS internal
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
CREATE FUNCTION ivfflat_bit_support(internal) RETURNS internal
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C;
|
||||
|
||||
@@ -748,6 +751,9 @@ CREATE FUNCTION minivec_l2_squared_distance(minivec, minivec) RETURNS float8
|
||||
CREATE FUNCTION minivec_negative_inner_product(minivec, minivec) RETURNS float8
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
||||
|
||||
CREATE FUNCTION minivec_spherical_distance(minivec, minivec) RETURNS float8
|
||||
AS 'MODULE_PATHNAME' LANGUAGE C IMMUTABLE STRICT PARALLEL SAFE;
|
||||
|
||||
-- minivec cast functions
|
||||
|
||||
CREATE FUNCTION minivec(minivec, integer, boolean) RETURNS minivec
|
||||
@@ -887,6 +893,30 @@ CREATE OPERATOR CLASS minivec_ops
|
||||
OPERATOR 5 > ,
|
||||
FUNCTION 1 minivec_cmp(minivec, minivec);
|
||||
|
||||
CREATE OPERATOR CLASS minivec_l2_ops
|
||||
FOR TYPE minivec USING ivfflat AS
|
||||
OPERATOR 1 <-> (minivec, minivec) FOR ORDER BY float_ops,
|
||||
FUNCTION 1 minivec_l2_squared_distance(minivec, minivec),
|
||||
FUNCTION 3 l2_distance(minivec, minivec),
|
||||
FUNCTION 5 ivfflat_minivec_support(internal);
|
||||
|
||||
CREATE OPERATOR CLASS minivec_ip_ops
|
||||
FOR TYPE minivec USING ivfflat AS
|
||||
OPERATOR 1 <#> (minivec, minivec) FOR ORDER BY float_ops,
|
||||
FUNCTION 1 minivec_negative_inner_product(minivec, minivec),
|
||||
FUNCTION 3 minivec_spherical_distance(minivec, minivec),
|
||||
FUNCTION 4 l2_norm(minivec),
|
||||
FUNCTION 5 ivfflat_minivec_support(internal);
|
||||
|
||||
CREATE OPERATOR CLASS minivec_cosine_ops
|
||||
FOR TYPE minivec USING ivfflat AS
|
||||
OPERATOR 1 <=> (minivec, minivec) FOR ORDER BY float_ops,
|
||||
FUNCTION 1 minivec_negative_inner_product(minivec, minivec),
|
||||
FUNCTION 2 l2_norm(minivec),
|
||||
FUNCTION 3 minivec_spherical_distance(minivec, minivec),
|
||||
FUNCTION 4 l2_norm(minivec),
|
||||
FUNCTION 5 ivfflat_minivec_support(internal);
|
||||
|
||||
CREATE OPERATOR CLASS minivec_l2_ops
|
||||
FOR TYPE minivec USING hnsw AS
|
||||
OPERATOR 1 <-> (minivec, minivec) FOR ORDER BY float_ops,
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include "halfutils.h"
|
||||
#include "halfvec.h"
|
||||
#include "ivfflat.h"
|
||||
#include "minivec.h"
|
||||
#include "storage/bufmgr.h"
|
||||
|
||||
/*
|
||||
@@ -231,6 +232,7 @@ IvfflatUpdateList(Relation index, ListInfo listInfo,
|
||||
|
||||
PGDLLEXPORT Datum l2_normalize(PG_FUNCTION_ARGS);
|
||||
PGDLLEXPORT Datum halfvec_l2_normalize(PG_FUNCTION_ARGS);
|
||||
PGDLLEXPORT Datum minivec_l2_normalize(PG_FUNCTION_ARGS);
|
||||
PGDLLEXPORT Datum sparsevec_l2_normalize(PG_FUNCTION_ARGS);
|
||||
|
||||
static Size
|
||||
@@ -245,6 +247,12 @@ HalfvecItemSize(int dimensions)
|
||||
return HALFVEC_SIZE(dimensions);
|
||||
}
|
||||
|
||||
static Size
|
||||
MinivecItemSize(int dimensions)
|
||||
{
|
||||
return MINIVEC_SIZE(dimensions);
|
||||
}
|
||||
|
||||
static Size
|
||||
BitItemSize(int dimensions)
|
||||
{
|
||||
@@ -275,6 +283,18 @@ HalfvecUpdateCenter(Pointer v, int dimensions, float *x)
|
||||
vec->x[k] = Float4ToHalfUnchecked(x[k]);
|
||||
}
|
||||
|
||||
static void
|
||||
MinivecUpdateCenter(Pointer v, int dimensions, float *x)
|
||||
{
|
||||
MiniVector *vec = (MiniVector *) v;
|
||||
|
||||
SET_VARSIZE(vec, MINIVEC_SIZE(dimensions));
|
||||
vec->dim = dimensions;
|
||||
|
||||
for (int k = 0; k < dimensions; k++)
|
||||
vec->x[k] = Float4ToFp8Unchecked(x[k]);
|
||||
}
|
||||
|
||||
static void
|
||||
BitUpdateCenter(Pointer v, int dimensions, float *x)
|
||||
{
|
||||
@@ -309,6 +329,15 @@ HalfvecSumCenter(Pointer v, float *x)
|
||||
x[k] += HalfToFloat4(vec->x[k]);
|
||||
}
|
||||
|
||||
static void
|
||||
MinivecSumCenter(Pointer v, float *x)
|
||||
{
|
||||
MiniVector *vec = (MiniVector *) v;
|
||||
|
||||
for (int k = 0; k < vec->dim; k++)
|
||||
x[k] += Fp8ToFloat4(vec->x[k]);
|
||||
}
|
||||
|
||||
static void
|
||||
BitSumCenter(Pointer v, float *x)
|
||||
{
|
||||
@@ -357,6 +386,21 @@ ivfflat_halfvec_support(PG_FUNCTION_ARGS)
|
||||
PG_RETURN_POINTER(&typeInfo);
|
||||
};
|
||||
|
||||
FUNCTION_PREFIX PG_FUNCTION_INFO_V1(ivfflat_minivec_support);
|
||||
Datum
|
||||
ivfflat_minivec_support(PG_FUNCTION_ARGS)
|
||||
{
|
||||
static const IvfflatTypeInfo typeInfo = {
|
||||
.maxDimensions = IVFFLAT_MAX_DIM * 4,
|
||||
.normalize = minivec_l2_normalize,
|
||||
.itemSize = MinivecItemSize,
|
||||
.updateCenter = MinivecUpdateCenter,
|
||||
.sumCenter = MinivecSumCenter
|
||||
};
|
||||
|
||||
PG_RETURN_POINTER(&typeInfo);
|
||||
};
|
||||
|
||||
FUNCTION_PREFIX PG_FUNCTION_INFO_V1(ivfflat_bit_support);
|
||||
Datum
|
||||
ivfflat_bit_support(PG_FUNCTION_ARGS)
|
||||
|
||||
154
test/t/043_ivfflat_minivec_build_recall.pl
Normal file
154
test/t/043_ivfflat_minivec_build_recall.pl
Normal file
@@ -0,0 +1,154 @@
|
||||
use strict;
|
||||
use warnings FATAL => 'all';
|
||||
use PostgreSQL::Test::Cluster;
|
||||
use PostgreSQL::Test::Utils;
|
||||
use Test::More;
|
||||
|
||||
my $node;
|
||||
my @queries = ();
|
||||
my @expected;
|
||||
my $limit = 20;
|
||||
my $dim = 10;
|
||||
my $array_sql = join(",", ('random()') x $dim);
|
||||
|
||||
# Measure recall of the index "idx" on table "tst" for one operator and
# assert that it meets a minimum.
#
# Arguments:
#   $probes   - value to set for ivfflat.probes
#   $min      - minimum acceptable recall, as a fraction of the ids requested
#   $operator - distance operator used in ORDER BY
#
# Reads the file-level $node, @queries, @expected and $limit. Recall is the
# number of returned ids that appear in the matching @expected entry, divided
# by ($limit * number of queries).
sub test_recall
{
    my ($probes, $min, $operator) = @_;
    my $label = "$operator probes=$probes";
    my $correct = 0;
    my $total = 0;

    # Without queries there is nothing to EXPLAIN or measure; report a
    # failure instead of dying on an undefined value or a division by zero.
    if (!@queries)
    {
        fail("$label: no queries to test");
        return;
    }

    # Check that the plan goes through the index rather than a seq scan
    my $explain = $node->safe_psql("postgres", qq(
        SET enable_seqscan = off;
        SET ivfflat.probes = $probes;
        EXPLAIN ANALYZE SELECT i FROM tst ORDER BY v $operator '$queries[0]' LIMIT $limit;
    ));
    like($explain, qr/Index Scan using idx on tst/, "$label uses index");

    for my $i (0 .. $#queries)
    {
        my $actual = $node->safe_psql("postgres", qq(
            SET enable_seqscan = off;
            SET ivfflat.probes = $probes;
            SELECT i FROM tst ORDER BY v $operator '$queries[$i]' LIMIT $limit;
        ));

        # Count the returned ids that are part of the exact result set
        my %is_expected = map { $_ => 1 } split("\n", $expected[$i]);
        $correct += grep { exists($is_expected{$_}) } split("\n", $actual);

        $total += $limit;
    }

    # $total is only zero when $limit is zero
    my $recall = $total ? $correct / $total : 0;
    cmp_ok($recall, ">=", $min, "$label recall");

    return;
}
|
||||
|
||||
# Run the recall assertions for one operator against the index that is
# currently built. Shared by the serial and parallel build checks so both
# apply the same thresholds.
sub check_recall_for_operator
{
    my ($operator) = @_;

    # Test approximate results (low probe counts are only asserted for <->)
    if ($operator eq "<->")
    {
        # TODO Fix test (uniform random vectors all have similar inner product)
        test_recall(1, 0.33, $operator);
        test_recall(10, 0.93, $operator);
    }

    # Test probes equals lists
    my $min = $operator eq "<=>" ? 0.30 : 1.00;
    test_recall(100, $min, $operator);

    return;
}

# Initialize node
$node = PostgreSQL::Test::Cluster->new('node');
$node->init;
$node->start;

# Create table
$node->safe_psql("postgres", "CREATE EXTENSION vector;");
$node->safe_psql("postgres", "CREATE TABLE tst (i int4, v minivec($dim));");
$node->safe_psql("postgres",
    "INSERT INTO tst SELECT i, ARRAY[$array_sql] FROM generate_series(1, 100000) i;"
);

# Generate queries
for (1 .. 20)
{
    my @r = map { rand() } 1 .. $dim;
    push(@queries, "[" . join(",", @r) . "]");
}

# Check each index type: distance operator paired with its operator class
my @index_types = (
    [ "<->", "minivec_l2_ops" ],
    [ "<#>", "minivec_ip_ops" ],
    [ "<=>", "minivec_cosine_ops" ],
);

for my $index_type (@index_types)
{
    my ($operator, $opclass) = @$index_type;

    # Get exact results (no index exists at this point, so this is a full
    # scan; ties with the last of the top $limit distances are included)
    @expected = ();
    for my $query (@queries)
    {
        my $res = $node->safe_psql("postgres", qq(
            WITH top AS (
                SELECT v $operator '$query' AS distance FROM tst ORDER BY distance LIMIT $limit
            )
            SELECT i FROM tst WHERE (v $operator '$query') <= (SELECT MAX(distance) FROM top)
        ));
        push(@expected, $res);
    }

    # Build index serially
    $node->safe_psql("postgres", qq(
        SET max_parallel_maintenance_workers = 0;
        CREATE INDEX idx ON tst USING ivfflat (v $opclass);
    ));

    check_recall_for_operator($operator);

    $node->safe_psql("postgres", "DROP INDEX idx;");

    # Build index in parallel
    my ($ret, $stdout, $stderr) = $node->psql("postgres", qq(
        SET client_min_messages = DEBUG;
        SET min_parallel_table_scan_size = 1;
        CREATE INDEX idx ON tst USING ivfflat (v $opclass);
    ));
    is($ret, 0, $stderr);
    like($stderr, qr/using \d+ parallel workers/, "$opclass built with parallel workers");

    check_recall_for_operator($operator);

    $node->safe_psql("postgres", "DROP INDEX idx;");
}

done_testing();
|
||||
Reference in New Issue
Block a user