From d02bdcf2bd23742db65ef4ccca30af045ec12e93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Thu, 8 Jun 2023 16:46:15 +0300 Subject: [PATCH] =?UTF-8?q?mdbx:=20=D0=BA=D0=BE=D1=81=D1=82=D1=8B=D0=BB?= =?UTF-8?q?=D1=8C=20=D0=B4=D0=BB=D1=8F=20GCC=20=D0=BF=D1=80=D0=B8=20=D1=81?= =?UTF-8?q?=D0=B1=D0=BE=D1=80=D0=BA=D0=B5=20=D1=81=20`-m32=20-arch=3Di686?= =?UTF-8?q?=20-Ofast`.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Обходное решение проблем сборки посредством GCC с использование опций `-m32 -arch=i686 -Ofast`. Проблема обусловлена ошибкой GCC, из-за которой конструкция `__attribute__((__target__("sse2")))` не включает полноценное использование инструкций SSE и SSE2, если это не было сделано посредством опций командной строки, но была использована опция `-Ofast`. В результате сборка заканчивалась сообщением об ошибке: gcc/i686-buildroot-linux-gnu/12.2.0/include/xmmintrin.h: In function 'diffcmp2mask_sse2': gcc/i686-buildroot-linux-gnu/12.2.0/include/xmmintrin.h:814:1: error: inlining failed in call to 'always_inline' '_mm_movemask_ps': target specific option mismatch 814 | _mm_movemask_ps (__m128 __A) --- src/core.c | 45 +++++++++++++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/core.c b/src/core.c index d47e24ca..0b0997a1 100644 --- a/src/core.c +++ b/src/core.c @@ -6472,27 +6472,47 @@ MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) #endif /* MDBX_ATTRIBUTE_TARGET */ -#if defined(__SSE2__) +#ifndef MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND +/* Workaround for GCC's bug with `-m32 -march=i686 -Ofast` + * gcc/i686-buildroot-linux-gnu/12.2.0/include/xmmintrin.h:814:1: + * error: inlining failed in call to 'always_inline' '_mm_movemask_ps': + * target specific option mismatch */ +#if !defined(__FAST_MATH__) || !__FAST_MATH__ || !defined(__GNUC__) || \ + defined(__e2k__) || defined(__clang__) || defined(__amd64__) || \ + defined(__SSE2__) +#define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 0 +#else +#define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 1 +#endif +#endif /* MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND */ + +#if defined(__SSE2__) && defined(__SSE__) #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ #elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) #define __SSE2__ #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ -#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) -#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND +#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse,sse2") #endif /* __SSE2__ */ #if defined(__AVX2__) #define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ -#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) -#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2") +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND +#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2") #endif /* __AVX2__ */ +#if defined(MDBX_ATTRIBUTE_TARGET_AVX2) #if defined(__AVX512BW__) #define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */ #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND && \ (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0)) -#define MDBX_ATTRIBUTE_TARGET_AVX512BW MDBX_ATTRIBUTE_TARGET("avx512bw") +#define MDBX_ATTRIBUTE_TARGET_AVX512BW \ + MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2,avx512bw") #endif /* __AVX512BW__ */ +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 for MDBX_ATTRIBUTE_TARGET_AVX512BW */ #ifdef MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned @@ -6566,6 +6586,15 @@ diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, return _mm256_movemask_ps(*(const __m256 *)&cmp); } +MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned +diffcmp2mask_sse2avx(const pgno_t *const ptr, const ptrdiff_t offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} + MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { assert(seq > 0 && len > seq); @@ -6611,7 +6640,7 @@ scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { } #endif /* __SANITIZE_ADDRESS__ */ if (range - 3 > detent) { - mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); if (mask) return range + 28 - __builtin_clz(mask); range -= 4; @@ -6685,7 +6714,7 @@ scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { range -= 8; } if (range - 3 > detent) { - mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern); if (mask) return range + 28 - __builtin_clz(mask); range -= 4;