From 925aa4e048f029491851bf25375890b3ace4a75b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 7 Apr 2024 20:22:19 -0700 Subject: [PATCH] Added SIMD version of L2 distance --- .github/workflows/build.yml | 2 +- src/halfvec.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index efa0b8e..478cc0a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -123,6 +123,6 @@ jobs: - uses: ankane/setup-postgres-valgrind@v1 with: postgres-version: 16 - - run: make + - run: make OPTFLAGS="" - run: sudo --preserve-env=PG_CONFIG make install - run: make installcheck diff --git a/src/halfvec.c b/src/halfvec.c index 3772dbb..df51710 100644 --- a/src/halfvec.c +++ b/src/halfvec.c @@ -810,6 +810,34 @@ l2_distance_squared_internal(HalfVector * a, HalfVector * b) half *bx = b->x; float distance = 0.0; +#if defined(F16C_SUPPORT) && defined(__FMA__) + int i; + float s[8]; + int count = (a->dim / 8) * 8; + __m256 dist = _mm256_setzero_ps(); + + for (i = 0; i < count; i += 8) + { + __m128i axi = _mm_loadu_si128((__m128i *) (ax + i)); + __m128i bxi = _mm_loadu_si128((__m128i *) (bx + i)); + __m256 axs = _mm256_cvtph_ps(axi); + __m256 bxs = _mm256_cvtph_ps(bxi); + __m256 diff = _mm256_sub_ps(axs, bxs); + + dist = _mm256_fmadd_ps(diff, diff, dist); + } + + _mm256_store_ps(s, dist); + + distance = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + for (; i < a->dim; i++) + { + float diff = HalfToFloat4(ax[i]) - HalfToFloat4(bx[i]); + + distance += diff * diff; + } +#else /* Auto-vectorized */ for (int i = 0; i < a->dim; i++) { @@ -817,6 +845,7 @@ l2_distance_squared_internal(HalfVector * a, HalfVector * b) distance += diff * diff; } +#endif return (double) distance; }