From 0c9ae4b18754cd09fb9e2d81dc8bcb55adf6d156 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 22 Apr 2024 15:02:17 -0700 Subject: [PATCH] Added CPU dispatching for L1 distance for halfvec --- src/halfutils.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/src/halfutils.c b/src/halfutils.c index 1d8266a..6f499dd 100644 --- a/src/halfutils.c +++ b/src/halfutils.c @@ -204,6 +204,39 @@ HalfvecL1DistanceDefault(int dim, half * ax, half * bx) return distance; } +#ifdef HALFVEC_DISPATCH +/* Does not require FMA, but keep logic simple */ +TARGET_F16C static float +HalfvecL1DistanceF16c(int dim, half * ax, half * bx) +{ + float distance; + int i; + float s[8]; + int count = (dim / 8) * 8; + __m256 dist = _mm256_setzero_ps(); + __m256 sign = _mm256_set1_ps(-0.0); + + for (i = 0; i < count; i += 8) + { + __m128i axi = _mm_loadu_si128((__m128i *) (ax + i)); + __m128i bxi = _mm_loadu_si128((__m128i *) (bx + i)); + __m256 axs = _mm256_cvtph_ps(axi); + __m256 bxs = _mm256_cvtph_ps(bxi); + + dist = _mm256_add_ps(dist, _mm256_andnot_ps(sign, _mm256_sub_ps(axs, bxs))); + } + + _mm256_storeu_ps(s, dist); + + distance = s[0] + s[1] + s[2] + s[3] + s[4] + s[5] + s[6] + s[7]; + + for (; i < dim; i++) + distance += fabsf(HalfToFloat4(ax[i]) - HalfToFloat4(bx[i])); + + return distance; +} +#endif + #ifdef HALFVEC_DISPATCH #define CPU_FEATURE_FMA (1 << 12) #define CPU_FEATURE_OSXSAVE (1 << 27) @@ -258,6 +291,8 @@ HalfvecInit(void) HalfvecL2SquaredDistance = HalfvecL2SquaredDistanceF16c; HalfvecInnerProduct = HalfvecInnerProductF16c; HalfvecCosineSimilarity = HalfvecCosineSimilarityF16c; + /* Does not require FMA, but keep logic simple */ + HalfvecL1Distance = HalfvecL1DistanceF16c; } #endif }