mirror of https://github.com/isar/libmdbx.git (synced 2025-10-31)
/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024

#include "internals.h"

#if MDBX_ENABLE_MINCORE
/*------------------------------------------------------------------------------
 * Checking whether the mapped DB pages are resident in RAM (mem-in-core),
 * with caching of this information. */
 | ||
| 
 | ||
| static inline bool bit_tas(uint64_t *field, char bit) {
 | ||
|   const uint64_t m = UINT64_C(1) << bit;
 | ||
|   const bool r = (*field & m) != 0;
 | ||
|   *field |= m;
 | ||
|   return r;
 | ||
| }
 | ||
| 
 | ||
| static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) {
 | ||
|   lck_t *const lck = env->lck;
 | ||
|   for (size_t i = 1; i < ARRAY_LENGTH(lck->mincore_cache.begin); ++i) {
 | ||
|     const ptrdiff_t dist = unit_begin - lck->mincore_cache.begin[i];
 | ||
|     if (likely(dist >= 0 && dist < 64)) {
 | ||
|       const pgno_t tmp_begin = lck->mincore_cache.begin[i];
 | ||
|       const uint64_t tmp_mask = lck->mincore_cache.mask[i];
 | ||
|       do {
 | ||
|         lck->mincore_cache.begin[i] = lck->mincore_cache.begin[i - 1];
 | ||
|         lck->mincore_cache.mask[i] = lck->mincore_cache.mask[i - 1];
 | ||
|       } while (--i);
 | ||
|       lck->mincore_cache.begin[0] = tmp_begin;
 | ||
|       lck->mincore_cache.mask[0] = tmp_mask;
 | ||
|       return bit_tas(lck->mincore_cache.mask, (char)dist);
 | ||
|     }
 | ||
|   }
 | ||
| 
 | ||
|   size_t pages = 64;
 | ||
|   unsigned unit_log = globals.sys_pagesize_ln2;
 | ||
|   unsigned shift = 0;
 | ||
|   if (env->ps > globals.sys_pagesize) {
 | ||
|     unit_log = env->ps2ln;
 | ||
|     shift = env->ps2ln - globals.sys_pagesize_ln2;
 | ||
|     pages <<= shift;
 | ||
|   }
 | ||
| 
 | ||
|   const size_t offset = unit_begin << unit_log;
 | ||
|   size_t length = pages << globals.sys_pagesize_ln2;
 | ||
|   if (offset + length > env->dxb_mmap.current) {
 | ||
|     length = env->dxb_mmap.current - offset;
 | ||
|     pages = length >> globals.sys_pagesize_ln2;
 | ||
|   }
 | ||
| 
 | ||
| #if MDBX_ENABLE_PGOP_STAT
 | ||
|   env->lck->pgops.mincore.weak += 1;
 | ||
| #endif /* MDBX_ENABLE_PGOP_STAT */
 | ||
|   uint8_t *const vector = alloca(pages);
 | ||
|   if (unlikely(mincore(ptr_disp(env->dxb_mmap.base, offset), length,
 | ||
|                        (void *)vector))) {
 | ||
|     NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno);
 | ||
|     return false;
 | ||
|   }
 | ||
| 
 | ||
|   for (size_t i = 1; i < ARRAY_LENGTH(lck->mincore_cache.begin); ++i) {
 | ||
|     lck->mincore_cache.begin[i] = lck->mincore_cache.begin[i - 1];
 | ||
|     lck->mincore_cache.mask[i] = lck->mincore_cache.mask[i - 1];
 | ||
|   }
 | ||
|   lck->mincore_cache.begin[0] = unit_begin;
 | ||
| 
 | ||
|   uint64_t mask = 0;
 | ||
| #ifdef MINCORE_INCORE
 | ||
|   STATIC_ASSERT(MINCORE_INCORE == 1);
 | ||
| #endif
 | ||
|   for (size_t i = 0; i < pages; ++i) {
 | ||
|     uint64_t bit = (vector[i] & 1) == 0;
 | ||
|     bit <<= i >> shift;
 | ||
|     mask |= bit;
 | ||
|   }
 | ||
| 
 | ||
|   lck->mincore_cache.mask[0] = ~mask;
 | ||
|   return bit_tas(lck->mincore_cache.mask, 0);
 | ||
| }
 | ||
| #endif /* MDBX_ENABLE_MINCORE */
 | ||
| 
 | ||
| MDBX_MAYBE_UNUSED static inline bool mincore_probe(MDBX_env *const env,
 | ||
|                                                    const pgno_t pgno) {
 | ||
| #if MDBX_ENABLE_MINCORE
 | ||
|   const size_t offset_aligned =
 | ||
|       floor_powerof2(pgno2bytes(env, pgno), globals.sys_pagesize);
 | ||
|   const unsigned unit_log2 = (env->ps2ln > globals.sys_pagesize_ln2)
 | ||
|                                  ? env->ps2ln
 | ||
|                                  : globals.sys_pagesize_ln2;
 | ||
|   const size_t unit_begin = offset_aligned >> unit_log2;
 | ||
|   eASSERT(env, (unit_begin << unit_log2) == offset_aligned);
 | ||
|   const ptrdiff_t dist = unit_begin - env->lck->mincore_cache.begin[0];
 | ||
|   if (likely(dist >= 0 && dist < 64))
 | ||
|     return bit_tas(env->lck->mincore_cache.mask, (char)dist);
 | ||
|   return mincore_fetch(env, unit_begin);
 | ||
| #else
 | ||
|   (void)env;
 | ||
|   (void)pgno;
 | ||
|   return false;
 | ||
| #endif /* MDBX_ENABLE_MINCORE */
 | ||
| }
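/* Illustration of the cache above (not normative): a "unit" is the larger of
 * the system page and the DB page, and each cache slot covers a window of 64
 * units. With 4 KiB system pages and a 4 KiB DB page size, unit_begin == pgno,
 * so begin[0] == 1000 with bit 5 of mask[0] set means the residency of pages
 * 1000..1063 is cached and page 1005 is either resident or has already been
 * probed. A probe of page 1010 then hits slot 0 with dist == 10, while a probe
 * of page 2000 misses all slots, so mincore_fetch() shifts the slots (dropping
 * the oldest) and refills slot 0 via mincore(2). Since bit_tas() also sets the
 * bit it tests, repeated probes of the same unit report "resident", which
 * keeps page_alloc_finalize() from issuing the same prefault-write twice. */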
 | ||
| 
 | ||
| /*----------------------------------------------------------------------------*/
 | ||
| 
 | ||
| MDBX_MAYBE_UNUSED __hot static pgno_t *
 | ||
| scan4seq_fallback(pgno_t *range, const size_t len, const size_t seq) {
 | ||
|   assert(seq > 0 && len > seq);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
|   assert(range[-1] == len);
 | ||
|   const pgno_t *const detent = range + len - seq;
 | ||
|   const ptrdiff_t offset = (ptrdiff_t)seq;
 | ||
|   const pgno_t target = (pgno_t)offset;
 | ||
|   if (likely(len > seq + 3)) {
 | ||
|     do {
 | ||
|       const pgno_t diff0 = range[offset + 0] - range[0];
 | ||
|       const pgno_t diff1 = range[offset + 1] - range[1];
 | ||
|       const pgno_t diff2 = range[offset + 2] - range[2];
 | ||
|       const pgno_t diff3 = range[offset + 3] - range[3];
 | ||
|       if (diff0 == target)
 | ||
|         return range + 0;
 | ||
|       if (diff1 == target)
 | ||
|         return range + 1;
 | ||
|       if (diff2 == target)
 | ||
|         return range + 2;
 | ||
|       if (diff3 == target)
 | ||
|         return range + 3;
 | ||
|       range += 4;
 | ||
|     } while (range + 3 < detent);
 | ||
|     if (range == detent)
 | ||
|       return nullptr;
 | ||
|   }
 | ||
|   do
 | ||
|     if (range[offset] - *range == target)
 | ||
|       return range;
 | ||
|   while (++range < detent);
 | ||
| #else
 | ||
|   assert(range[-(ptrdiff_t)len] == len);
 | ||
|   const pgno_t *const detent = range - len + seq;
 | ||
|   const ptrdiff_t offset = -(ptrdiff_t)seq;
 | ||
|   const pgno_t target = (pgno_t)offset;
 | ||
|   if (likely(len > seq + 3)) {
 | ||
|     do {
 | ||
|       const pgno_t diff0 = range[-0] - range[offset - 0];
 | ||
|       const pgno_t diff1 = range[-1] - range[offset - 1];
 | ||
|       const pgno_t diff2 = range[-2] - range[offset - 2];
 | ||
|       const pgno_t diff3 = range[-3] - range[offset - 3];
 | ||
      /* The point of doing these computations before the branches is to let
       * the compiler load and compute all the values in parallel. */
 | ||
|       if (diff0 == target)
 | ||
|         return range - 0;
 | ||
|       if (diff1 == target)
 | ||
|         return range - 1;
 | ||
|       if (diff2 == target)
 | ||
|         return range - 2;
 | ||
|       if (diff3 == target)
 | ||
|         return range - 3;
 | ||
|       range -= 4;
 | ||
|     } while (range > detent + 3);
 | ||
|     if (range == detent)
 | ||
|       return nullptr;
 | ||
|   }
 | ||
|   do
 | ||
|     if (*range - range[offset] == target)
 | ||
|       return range;
 | ||
|   while (--range > detent);
 | ||
| #endif /* pnl_t sort-order */
 | ||
|   return nullptr;
 | ||
| }
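/* Worked example (descending PNL order, the default): suppose the relist is
 * [12, 11, 10, 9, 5] and a run of num = 3 consecutive pages is needed, i.e.
 * seq = 2 and offset = -2. For the element 9 the check reads
 * range[0] - range[-2] == 9 - 11 == (pgno_t)-2 == target, so the scan returns
 * the pointer to 9 and the caller can use pages 9, 10, 11. The SIMD variants
 * below apply the same trick, comparing four, eight or sixteen such
 * differences against the broadcast `target` at once. */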
 | ||
| 
 | ||
| MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const pnl_t pnl,
 | ||
|                                                           const size_t seq) {
 | ||
|   size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pnl);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
|   while (seq <= MDBX_PNL_GETSIZE(pnl) - begin) {
 | ||
|     if (pnl[begin + seq] - pnl[begin] == seq)
 | ||
|       return pnl + begin;
 | ||
|     ++begin;
 | ||
|   }
 | ||
| #else
 | ||
|   while (begin > seq) {
 | ||
|     if (pnl[begin - seq] - pnl[begin] == seq)
 | ||
|       return pnl + begin;
 | ||
|     --begin;
 | ||
|   }
 | ||
| #endif /* pnl_t sort-order */
 | ||
|   return nullptr;
 | ||
| }
 | ||
| 
 | ||
| #if defined(_MSC_VER) && !defined(__builtin_clz) &&                            \
 | ||
|     !__has_builtin(__builtin_clz)
 | ||
| MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(uint32_t value) {
 | ||
|   unsigned long index;
 | ||
|   _BitScanReverse(&index, value);
 | ||
|   return 31 - index;
 | ||
| }
 | ||
| #endif /* _MSC_VER */
 | ||
| 
 | ||
| #if defined(_MSC_VER) && !defined(__builtin_clzl) &&                           \
 | ||
|     !__has_builtin(__builtin_clzl)
 | ||
| MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) {
 | ||
|   unsigned long index;
 | ||
| #ifdef _WIN64
 | ||
|   assert(sizeof(value) == 8);
 | ||
|   _BitScanReverse64(&index, value);
 | ||
|   return 63 - index;
 | ||
| #else
 | ||
|   assert(sizeof(value) == 4);
 | ||
|   _BitScanReverse(&index, value);
 | ||
|   return 31 - index;
 | ||
| #endif
 | ||
| }
 | ||
| #endif /* _MSC_VER */
 | ||
| 
 | ||
| #if !MDBX_PNL_ASCENDING
 | ||
| 
 | ||
| #if !defined(MDBX_ATTRIBUTE_TARGET) &&                                         \
 | ||
|     (__has_attribute(__target__) || __GNUC_PREREQ(5, 0))
 | ||
| #define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target)))
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET */
 | ||
| 
 | ||
| #ifndef MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND
 | ||
| /* Workaround for GCC's bug with `-m32 -march=i686 -Ofast`
 | ||
|  * gcc/i686-buildroot-linux-gnu/12.2.0/include/xmmintrin.h:814:1:
 | ||
|  *     error: inlining failed in call to 'always_inline' '_mm_movemask_ps':
 | ||
|  *            target specific option mismatch */
 | ||
| #if !defined(__FAST_MATH__) || !__FAST_MATH__ || !defined(__GNUC__) ||         \
 | ||
|     defined(__e2k__) || defined(__clang__) || defined(__amd64__) ||            \
 | ||
|     defined(__SSE2__)
 | ||
| #define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 0
 | ||
| #else
 | ||
| #define MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND 1
 | ||
| #endif
 | ||
| #endif /* MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND */
 | ||
| 
 | ||
| #if defined(__SSE2__) && defined(__SSE__)
 | ||
| #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */
 | ||
| #elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__)
 | ||
| #define __SSE2__
 | ||
| #define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */
 | ||
| #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) &&                   \
 | ||
|     !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND
 | ||
| #define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse,sse2")
 | ||
| #endif /* __SSE2__ */
 | ||
| 
 | ||
| #if defined(__AVX2__)
 | ||
| #define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */
 | ||
| #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) &&                   \
 | ||
|     !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND
 | ||
| #define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2")
 | ||
| #endif /* __AVX2__ */
 | ||
| 
 | ||
| #if defined(MDBX_ATTRIBUTE_TARGET_AVX2)
 | ||
| #if defined(__AVX512BW__)
 | ||
| #define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */
 | ||
| #elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) &&                   \
 | ||
|     !MDBX_GCC_FASTMATH_i686_SIMD_WORKAROUND &&                                 \
 | ||
|     (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0))
 | ||
| #define MDBX_ATTRIBUTE_TARGET_AVX512BW                                         \
 | ||
|   MDBX_ATTRIBUTE_TARGET("sse,sse2,avx,avx2,avx512bw")
 | ||
| #endif /* __AVX512BW__ */
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 for MDBX_ATTRIBUTE_TARGET_AVX512BW */
 | ||
| 
 | ||
| #ifdef MDBX_ATTRIBUTE_TARGET_SSE2
 | ||
| MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned
 | ||
| diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset,
 | ||
|                   const __m128i pattern) {
 | ||
|   const __m128i f = _mm_loadu_si128((const __m128i *)ptr);
 | ||
|   const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset));
 | ||
|   const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern);
 | ||
|   return _mm_movemask_ps(*(const __m128 *)&cmp);
 | ||
| }
 | ||
| 
 | ||
| MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t *
 | ||
| scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) {
 | ||
|   assert(seq > 0 && len > seq);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
| #error "FIXME: Not implemented"
 | ||
| #endif /* MDBX_PNL_ASCENDING */
 | ||
|   assert(range[-(ptrdiff_t)len] == len);
 | ||
|   pgno_t *const detent = range - len + seq;
 | ||
|   const ptrdiff_t offset = -(ptrdiff_t)seq;
 | ||
|   const pgno_t target = (pgno_t)offset;
 | ||
|   const __m128i pattern = _mm_set1_epi32(target);
 | ||
|   uint8_t mask;
 | ||
|   if (likely(len > seq + 3)) {
 | ||
|     do {
 | ||
|       mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern);
 | ||
|       if (mask) {
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|       found:
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|         return range + 28 - __builtin_clz(mask);
 | ||
|       }
 | ||
|       range -= 4;
 | ||
|     } while (range > detent + 3);
 | ||
|     if (range == detent)
 | ||
|       return nullptr;
 | ||
|   }
 | ||
| 
 | ||
  /* Below, from 4 to 12 extra bytes are read; they may lie not only outside
   * the region allocated for the PNL, but also cross a memory-page boundary,
   * which can cause both ASAN errors and crashes. Therefore we check the
   * offset within the page, and with ASAN we always play it safe. */
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|   const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */;
 | ||
|   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
 | ||
|       !RUNNING_ON_VALGRIND) {
 | ||
|     const unsigned extra = (unsigned)(detent + 4 - range);
 | ||
|     assert(extra > 0 && extra < 4);
 | ||
|     mask = 0xF << extra;
 | ||
|     mask &= diffcmp2mask_sse2(range - 3, offset, pattern);
 | ||
|     if (mask)
 | ||
|       goto found;
 | ||
|     return nullptr;
 | ||
|   }
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|   do
 | ||
|     if (*range - range[offset] == target)
 | ||
|       return range;
 | ||
|   while (--range != detent);
 | ||
|   return nullptr;
 | ||
| }
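/* Mask-to-pointer arithmetic, illustrated: diffcmp2mask_sse2(range - 3, ...)
 * compares the four elements range[-3..0], and _mm_movemask_ps() puts lane k
 * into bit k of the mask. __builtin_clz() operates on a 32-bit value, so for
 * the highest set bit k it yields 31 - k, and range + 28 - (31 - k) ==
 * range + k - 3 is exactly the element of lane k. E.g. mask == 0b0100 (lane 2)
 * gives __builtin_clz() == 29 and the result range - 1. The AVX2 and AVX512BW
 * variants use the same identity with 24 - clz and 16 - clz for 8 and 16
 * lanes respectively. */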
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */
 | ||
| 
 | ||
| #ifdef MDBX_ATTRIBUTE_TARGET_AVX2
 | ||
| MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned
 | ||
| diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset,
 | ||
|                   const __m256i pattern) {
 | ||
|   const __m256i f = _mm256_loadu_si256((const __m256i *)ptr);
 | ||
|   const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset));
 | ||
|   const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern);
 | ||
|   return _mm256_movemask_ps(*(const __m256 *)&cmp);
 | ||
| }
 | ||
| 
 | ||
| MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned
 | ||
| diffcmp2mask_sse2avx(const pgno_t *const ptr, const ptrdiff_t offset,
 | ||
|                      const __m128i pattern) {
 | ||
|   const __m128i f = _mm_loadu_si128((const __m128i *)ptr);
 | ||
|   const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset));
 | ||
|   const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern);
 | ||
|   return _mm_movemask_ps(*(const __m128 *)&cmp);
 | ||
| }
 | ||
| 
 | ||
| MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t *
 | ||
| scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) {
 | ||
|   assert(seq > 0 && len > seq);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
| #error "FIXME: Not implemented"
 | ||
| #endif /* MDBX_PNL_ASCENDING */
 | ||
|   assert(range[-(ptrdiff_t)len] == len);
 | ||
|   pgno_t *const detent = range - len + seq;
 | ||
|   const ptrdiff_t offset = -(ptrdiff_t)seq;
 | ||
|   const pgno_t target = (pgno_t)offset;
 | ||
|   const __m256i pattern = _mm256_set1_epi32(target);
 | ||
|   uint8_t mask;
 | ||
|   if (likely(len > seq + 7)) {
 | ||
|     do {
 | ||
|       mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern);
 | ||
|       if (mask) {
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|       found:
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|         return range + 24 - __builtin_clz(mask);
 | ||
|       }
 | ||
|       range -= 8;
 | ||
|     } while (range > detent + 7);
 | ||
|     if (range == detent)
 | ||
|       return nullptr;
 | ||
|   }
 | ||
| 
 | ||
  /* Below, from 4 to 28 extra bytes are read; they may lie not only outside
   * the region allocated for the PNL, but also cross a memory-page boundary,
   * which can cause both ASAN errors and crashes. Therefore we check the
   * offset within the page, and with ASAN we always play it safe. */
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|   const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */;
 | ||
|   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
 | ||
|       !RUNNING_ON_VALGRIND) {
 | ||
|     const unsigned extra = (unsigned)(detent + 8 - range);
 | ||
|     assert(extra > 0 && extra < 8);
 | ||
|     mask = 0xFF << extra;
 | ||
|     mask &= diffcmp2mask_avx2(range - 7, offset, pattern);
 | ||
|     if (mask)
 | ||
|       goto found;
 | ||
|     return nullptr;
 | ||
|   }
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|   if (range - 3 > detent) {
 | ||
|     mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern);
 | ||
|     if (mask)
 | ||
|       return range + 28 - __builtin_clz(mask);
 | ||
|     range -= 4;
 | ||
|   }
 | ||
|   while (range > detent) {
 | ||
|     if (*range - range[offset] == target)
 | ||
|       return range;
 | ||
|     --range;
 | ||
|   }
 | ||
|   return nullptr;
 | ||
| }
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */
 | ||
| 
 | ||
| #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW
 | ||
| MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned
 | ||
| diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset,
 | ||
|                       const __m512i pattern) {
 | ||
|   const __m512i f = _mm512_loadu_si512((const __m512i *)ptr);
 | ||
|   const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset));
 | ||
|   return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern);
 | ||
| }
 | ||
| 
 | ||
| MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t *
 | ||
| scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) {
 | ||
|   assert(seq > 0 && len > seq);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
| #error "FIXME: Not implemented"
 | ||
| #endif /* MDBX_PNL_ASCENDING */
 | ||
|   assert(range[-(ptrdiff_t)len] == len);
 | ||
|   pgno_t *const detent = range - len + seq;
 | ||
|   const ptrdiff_t offset = -(ptrdiff_t)seq;
 | ||
|   const pgno_t target = (pgno_t)offset;
 | ||
|   const __m512i pattern = _mm512_set1_epi32(target);
 | ||
|   unsigned mask;
 | ||
|   if (likely(len > seq + 15)) {
 | ||
|     do {
 | ||
|       mask = diffcmp2mask_avx512bw(range - 15, offset, pattern);
 | ||
|       if (mask) {
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|       found:
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|         return range + 16 - __builtin_clz(mask);
 | ||
|       }
 | ||
|       range -= 16;
 | ||
|     } while (range > detent + 15);
 | ||
|     if (range == detent)
 | ||
|       return nullptr;
 | ||
|   }
 | ||
| 
 | ||
  /* Below, from 4 to 60 extra bytes are read; they may lie not only outside
   * the region allocated for the PNL, but also cross a memory-page boundary,
   * which can cause both ASAN errors and crashes. Therefore we check the
   * offset within the page, and with ASAN we always play it safe. */
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|   const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */;
 | ||
|   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
 | ||
|       !RUNNING_ON_VALGRIND) {
 | ||
|     const unsigned extra = (unsigned)(detent + 16 - range);
 | ||
|     assert(extra > 0 && extra < 16);
 | ||
|     mask = 0xFFFF << extra;
 | ||
|     mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern);
 | ||
|     if (mask)
 | ||
|       goto found;
 | ||
|     return nullptr;
 | ||
|   }
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|   if (range - 7 > detent) {
 | ||
|     mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern);
 | ||
|     if (mask)
 | ||
|       return range + 24 - __builtin_clz(mask);
 | ||
|     range -= 8;
 | ||
|   }
 | ||
|   if (range - 3 > detent) {
 | ||
|     mask = diffcmp2mask_sse2avx(range - 3, offset, *(const __m128i *)&pattern);
 | ||
|     if (mask)
 | ||
|       return range + 28 - __builtin_clz(mask);
 | ||
|     range -= 4;
 | ||
|   }
 | ||
|   while (range > detent) {
 | ||
|     if (*range - range[offset] == target)
 | ||
|       return range;
 | ||
|     --range;
 | ||
|   }
 | ||
|   return nullptr;
 | ||
| }
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */
 | ||
| 
 | ||
| #if (defined(__ARM_NEON) || defined(__ARM_NEON__)) &&                          \
 | ||
|     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 | ||
| static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr,
 | ||
|                                                 const ptrdiff_t offset,
 | ||
|                                                 const uint32x4_t pattern) {
 | ||
|   const uint32x4_t f = vld1q_u32(ptr);
 | ||
|   const uint32x4_t l = vld1q_u32(ptr + offset);
 | ||
|   const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern));
 | ||
|   if (sizeof(size_t) > 7)
 | ||
|     return vget_lane_u64(vreinterpret_u64_u16(cmp), 0);
 | ||
|   else
 | ||
|     return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))),
 | ||
|                          0);
 | ||
| }
 | ||
| 
 | ||
| __hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len,
 | ||
|                                    const size_t seq) {
 | ||
|   assert(seq > 0 && len > seq);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
| #error "FIXME: Not implemented"
 | ||
| #endif /* MDBX_PNL_ASCENDING */
 | ||
|   assert(range[-(ptrdiff_t)len] == len);
 | ||
|   pgno_t *const detent = range - len + seq;
 | ||
|   const ptrdiff_t offset = -(ptrdiff_t)seq;
 | ||
|   const pgno_t target = (pgno_t)offset;
 | ||
|   const uint32x4_t pattern = vmovq_n_u32(target);
 | ||
|   size_t mask;
 | ||
|   if (likely(len > seq + 3)) {
 | ||
|     do {
 | ||
|       mask = diffcmp2mask_neon(range - 3, offset, pattern);
 | ||
|       if (mask) {
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|       found:
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|         return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4));
 | ||
|       }
 | ||
|       range -= 4;
 | ||
|     } while (range > detent + 3);
 | ||
|     if (range == detent)
 | ||
|       return nullptr;
 | ||
|   }
 | ||
| 
 | ||
  /* Below, from 4 to 12 extra bytes are read; they may lie not only outside
   * the region allocated for the PNL, but also cross a memory-page boundary,
   * which can cause both ASAN errors and crashes. Therefore we check the
   * offset within the page, and with ASAN we always play it safe. */
 | ||
| #if !defined(ENABLE_MEMCHECK) && !defined(__SANITIZE_ADDRESS__)
 | ||
|   const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */;
 | ||
|   if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) &&
 | ||
|       !RUNNING_ON_VALGRIND) {
 | ||
|     const unsigned extra = (unsigned)(detent + 4 - range);
 | ||
|     assert(extra > 0 && extra < 4);
 | ||
|     mask = (~(size_t)0) << (extra * sizeof(size_t) * 2);
 | ||
|     mask &= diffcmp2mask_neon(range - 3, offset, pattern);
 | ||
|     if (mask)
 | ||
|       goto found;
 | ||
|     return nullptr;
 | ||
|   }
 | ||
| #endif /* !ENABLE_MEMCHECK && !__SANITIZE_ADDRESS__ */
 | ||
|   do
 | ||
|     if (*range - range[offset] == target)
 | ||
|       return range;
 | ||
|   while (--range != detent);
 | ||
|   return nullptr;
 | ||
| }
 | ||
| #endif /* __ARM_NEON || __ARM_NEON__ */
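/* The NEON mask math, illustrated: vmovn_u32() narrows each 32-bit compare
 * result to 16 bits, so on a 64-bit target lane j (which corresponds to
 * range[j - 3]) occupies bits 16*j .. 16*j+15 of the mask; on a 32-bit target
 * a second narrowing leaves 8 bits per lane. For the highest matching lane j
 * this gives __builtin_clzl(mask) == 48 - 16*j (or 24 - 8*j), and shifting
 * right by sizeof(size_t)/4 == 2 (or 1) turns that into the byte offset
 * 12 - 4*j, so ptr_disp(range, -(12 - 4*j)) lands on range[j - 3], matching
 * the SSE2 variant above. */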
 | ||
| 
 | ||
| #if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW)
 | ||
| #define scan4seq_default scan4seq_avx512bw
 | ||
| #define scan4seq_impl scan4seq_default
 | ||
| #elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2)
 | ||
| #define scan4seq_default scan4seq_avx2
 | ||
| #elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2)
 | ||
| #define scan4seq_default scan4seq_sse2
 | ||
| #elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) &&                        \
 | ||
|     (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
 | ||
| #define scan4seq_default scan4seq_neon
 | ||
/* Selection of other variants should be added here. */
 | ||
| #endif /* scan4seq_default */
 | ||
| 
 | ||
| #endif /* MDBX_PNL_ASCENDING */
 | ||
| 
 | ||
| #ifndef scan4seq_default
 | ||
| #define scan4seq_default scan4seq_fallback
 | ||
| #endif /* scan4seq_default */
 | ||
| 
 | ||
| #ifdef scan4seq_impl
 | ||
/* scan4seq_impl() is already the best choice, or there are no alternatives */
 | ||
| #elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS
 | ||
/* The scan4seq_default() will be used since the compiler provides no support
 * for CPU-feature detection. Please don't ask to implement cpuid-based
 * detection and don't make such PRs. */
 | ||
| #define scan4seq_impl scan4seq_default
 | ||
| #else
 | ||
| /* Selecting the most appropriate implementation at runtime,
 | ||
|  * depending on the available CPU features. */
 | ||
| static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
 | ||
|                                  const size_t seq);
 | ||
| static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len,
 | ||
|                                 const size_t seq) = scan4seq_resolver;
 | ||
| 
 | ||
| static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len,
 | ||
|                                  const size_t seq) {
 | ||
|   pgno_t *(*choice)(pgno_t *range, const size_t len, const size_t seq) =
 | ||
|       nullptr;
 | ||
| #if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) ||      \
 | ||
|     __GNUC_PREREQ(4, 8)
 | ||
|   __builtin_cpu_init();
 | ||
| #endif /* __builtin_cpu_init() */
 | ||
| #ifdef MDBX_ATTRIBUTE_TARGET_SSE2
 | ||
|   if (__builtin_cpu_supports("sse2"))
 | ||
|     choice = scan4seq_sse2;
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */
 | ||
| #ifdef MDBX_ATTRIBUTE_TARGET_AVX2
 | ||
|   if (__builtin_cpu_supports("avx2"))
 | ||
|     choice = scan4seq_avx2;
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */
 | ||
| #ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW
 | ||
|   if (__builtin_cpu_supports("avx512bw"))
 | ||
|     choice = scan4seq_avx512bw;
 | ||
| #endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */
 | ||
  /* Selection of other variants should be added here. */
 | ||
|   scan4seq_impl = choice ? choice : scan4seq_default;
 | ||
|   return scan4seq_impl(range, len, seq);
 | ||
| }
 | ||
| #endif /* scan4seq_impl */
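/* The dispatch above is a plain function-pointer trampoline: the pointer
 * starts at scan4seq_resolver(), the first call probes the CPU via
 * __builtin_cpu_supports() and then overwrites the pointer with the chosen
 * implementation, so every later call goes straight to it. A minimal sketch
 * of the same pattern (hypothetical names, not part of the build):
 *
 *   static int impl_generic(int x);
 *   static int impl_avx2(int x);
 *   static int resolve(int x);
 *   static int (*call)(int) = resolve;
 *   static int resolve(int x) {
 *     __builtin_cpu_init();
 *     call = __builtin_cpu_supports("avx2") ? impl_avx2 : impl_generic;
 *     return call(x);
 *   }
 */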
 | ||
| 
 | ||
| /*----------------------------------------------------------------------------*/
 | ||
| 
 | ||
#define ALLOC_COALESCE 4    /* internal state */
#define ALLOC_SHOULD_SCAN 8 /* internal state */
#define ALLOC_LIFO 16       /* internal state */
 | ||
| 
 | ||
| static inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc,
 | ||
|                                 const uint8_t flags) {
 | ||
|   /* If txn is updating the GC, then the retired-list cannot play catch-up with
 | ||
|    * itself by growing while trying to save it. */
 | ||
|   if (mc->tree == &txn->dbs[FREE_DBI] && !(flags & ALLOC_RESERVE) &&
 | ||
|       !(mc->flags & z_gcu_preparation))
 | ||
|     return false;
 | ||
| 
 | ||
  /* avoid searching inside an empty tree and while the tree is being updated,
     https://libmdbx.dqdkfa.ru/dead-github/issues/31 */
 | ||
|   if (unlikely(txn->dbs[FREE_DBI].items == 0)) {
 | ||
|     txn->flags |= txn_gc_drained;
 | ||
|     return false;
 | ||
|   }
 | ||
| 
 | ||
|   return true;
 | ||
| }
 | ||
| 
 | ||
| __hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) {
 | ||
|   const size_t len = MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed);
 | ||
|   for (size_t i = 1; i <= len; ++i)
 | ||
|     if (txn->tw.gc.reclaimed[i] == id)
 | ||
|       return true;
 | ||
|   return false;
 | ||
| }
 | ||
| 
 | ||
| __hot static pgno_t relist_get_single(MDBX_txn *txn) {
 | ||
|   const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist);
 | ||
|   assert(len > 0);
 | ||
|   pgno_t *target = MDBX_PNL_EDGE(txn->tw.relist);
 | ||
|   const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 1 : -1;
 | ||
| 
 | ||
  /* There are THREE potentially winning, but mutually opposed tactics:
   *
   * 1. Prefer pages with the lowest numbers. Disk I/O then becomes more
   * clustered, and pages closer to the end of the DB get a better chance of
   * being reclaimed by auto-compactification. This tactic is partially
   * implemented, but to be effective it requires explicitly prioritizing page
   * allocation:
   *   - maintaining separate relists for near and for far pages;
   *   - using pages from the far list when the near one is empty and the
   *     far one is too large, or when the GC is empty.
   *
   * 2. Prefer allocating pages sequentially. The regions written to disk are
   * then linear, which fundamentally speeds up writing on HDDs. On average
   * this does not affect reads; more precisely, if the read order does not
   * match the modification order (i.e. reads do not correlate with updates
   * and/or inserts) there is no effect, otherwise reads may speed up.
   * However, sequences are fairly rare on average, so to be effective this
   * requires accumulating and keeping huge page lists in RAM and then saving
   * them back into the DB. The current DB format (without bitmaps) is very
   * poorly suited for that, so this tactic has no chance of success without
   * changing the DB format (Mithril).
   *
   * 3. Try to preserve page sequences. This avoids extra reading/searching
   * in the GC when data that needs more than one page is placed and/or
   * updated more or less steadily. The problem is that without information
   * from the application the library cannot know how much the sequences will
   * be needed in the near future, and preserving sequences "just in case" is
   * not only costly by itself, but actually works against us.
   *
   * Therefore:
   *  - splitting the relist into "near" and "far" pages, followed by
   *    implementing the first tactic, is added to the TODO;
   *  - preferential use of sequences goes to MithrilDB as part of the
   *    "HDD friendly" feature;
   *  - the preservation of sequences implemented in
   *    3757eb72f7c6b46862f8f17881ac88e8cecc1979 is disabled via
   *    MDBX_ENABLE_SAVING_SEQUENCES=0.
   *
   * As an alternative to unconditional "preservation" of sequences, future
   * libmdbx versions will likely offer an API for interacting with the GC:
   *  - querying the GC size, including histograms of sequence lengths and
   *    their proximity to the end of the DB;
   *  - enabling the formation of a "linear reserve" for subsequent use
   *    within the current transaction;
   *  - deliberately loading the GC into memory for coagulation and
   *    "straightening";
   *  - deliberately copying data out of pages at the end of the DB so they
   *    can then be freed, i.e. controlled compactification on request. */
 | ||
| 
 | ||
| #ifndef MDBX_ENABLE_SAVING_SEQUENCES
 | ||
| #define MDBX_ENABLE_SAVING_SEQUENCES 0
 | ||
| #endif
 | ||
|   if (MDBX_ENABLE_SAVING_SEQUENCES && unlikely(target[dir] == *target + 1) &&
 | ||
|       len > 2) {
 | ||
    /* Try to skip over sequences when singleton elements are available.
     * TODO: the skipped sequences need to be cached so that the list is not
     * rescanned from the beginning on every allocation. */
 | ||
|     pgno_t *scan = target + dir + dir;
 | ||
|     size_t left = len;
 | ||
|     do {
 | ||
|       if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) {
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
|         target = scan;
 | ||
|         break;
 | ||
| #else
 | ||
        /* cut the element out, moving the tail */
 | ||
|         const pgno_t pgno = *scan;
 | ||
|         MDBX_PNL_SETSIZE(txn->tw.relist, len - 1);
 | ||
|         while (++scan <= target)
 | ||
|           scan[-1] = *scan;
 | ||
|         return pgno;
 | ||
| #endif
 | ||
|       }
 | ||
|       scan += dir;
 | ||
|     } while (--left > 2);
 | ||
|   }
 | ||
| 
 | ||
|   const pgno_t pgno = *target;
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
  /* cut the element out, moving the tail */
 | ||
|   MDBX_PNL_SETSIZE(txn->tw.relist, len - 1);
 | ||
|   for (const pgno_t *const end = txn->tw.relist + len - 1; target <= end;
 | ||
|        ++target)
 | ||
|     *target = target[1];
 | ||
| #else
 | ||
  /* no need to move the tail, just truncate the list */
 | ||
|   MDBX_PNL_SETSIZE(txn->tw.relist, len - 1);
 | ||
| #endif
 | ||
|   return pgno;
 | ||
| }
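/* Worked example for the descending (default) order: with relist = [9, 7, 4, 3]
 * the edge element is 3 and its neighbour is 4 == 3 + 1, so the edge belongs
 * to a sequence. With MDBX_ENABLE_SAVING_SEQUENCES enabled the scan then finds
 * 7, whose neighbours 9 and 4 are not adjacent to it, cuts it out by shifting
 * the tail (leaving [9, 4, 3]) and returns 7, keeping the 4,3 pair intact for
 * a future multi-page request. With it disabled (the default) the edge element
 * 3 is simply taken and the list truncated to [9, 7, 4]. */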
 | ||
| 
 | ||
| __hot static pgno_t relist_get_sequence(MDBX_txn *txn, const size_t num,
 | ||
|                                         uint8_t flags) {
 | ||
|   const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist);
 | ||
|   pgno_t *edge = MDBX_PNL_EDGE(txn->tw.relist);
 | ||
|   assert(len >= num && num > 1);
 | ||
|   const size_t seq = num - 1;
 | ||
| #if !MDBX_PNL_ASCENDING
 | ||
|   if (edge[-(ptrdiff_t)seq] - *edge == seq) {
 | ||
|     if (unlikely(flags & ALLOC_RESERVE))
 | ||
|       return P_INVALID;
 | ||
|     assert(edge == scan4range_checker(txn->tw.relist, seq));
 | ||
    /* no need to move the tail, just truncate the list */
 | ||
|     MDBX_PNL_SETSIZE(txn->tw.relist, len - num);
 | ||
|     return *edge;
 | ||
|   }
 | ||
| #endif
 | ||
|   pgno_t *target = scan4seq_impl(edge, len, seq);
 | ||
|   assert(target == scan4range_checker(txn->tw.relist, seq));
 | ||
|   if (target) {
 | ||
|     if (unlikely(flags & ALLOC_RESERVE))
 | ||
|       return P_INVALID;
 | ||
|     const pgno_t pgno = *target;
 | ||
    /* cut the found sequence out, moving the tail */
 | ||
|     MDBX_PNL_SETSIZE(txn->tw.relist, len - num);
 | ||
| #if MDBX_PNL_ASCENDING
 | ||
|     for (const pgno_t *const end = txn->tw.relist + len - num; target <= end;
 | ||
|          ++target)
 | ||
|       *target = target[num];
 | ||
| #else
 | ||
|     for (const pgno_t *const end = txn->tw.relist + len; ++target <= end;)
 | ||
|       target[-(ptrdiff_t)num] = *target;
 | ||
| #endif
 | ||
|     return pgno;
 | ||
|   }
 | ||
|   return 0;
 | ||
| }
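/* Worked example (descending order): relist = [12, 11, 10, 9, 5], num = 3,
 * seq = 2. The fast path checks the edge first: edge[-2] - *edge == 10 - 5 ==
 * 5 != 2, so scan4seq_impl() is consulted and finds the element 9 (because
 * 9 - 11 == (pgno_t)-2). The run 9,10,11 is cut out by shifting the remaining
 * tail element, leaving [12, 5], and 9 is returned as the first page of the
 * sequence. With ALLOC_RESERVE the function returns P_INVALID instead of
 * consuming the pages, merely reporting that a suitable run exists. */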
 | ||
| 
 | ||
| static inline pgr_t page_alloc_finalize(MDBX_env *const env,
 | ||
|                                         MDBX_txn *const txn,
 | ||
|                                         const MDBX_cursor *const mc,
 | ||
|                                         const pgno_t pgno, const size_t num) {
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|   size_t majflt_before;
 | ||
|   const uint64_t cputime_before = osal_cputime(&majflt_before);
 | ||
|   gc_prof_stat_t *const prof = (mc->mc_dbi == FREE_DBI)
 | ||
|                                    ? &env->lck->pgops.gc_prof.self
 | ||
|                                    : &env->lck->pgops.gc_prof.work;
 | ||
| #else
 | ||
|   (void)mc;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
|   ENSURE(env, pgno >= NUM_METAS);
 | ||
| 
 | ||
|   pgr_t ret;
 | ||
|   bool need_clean = (env->flags & MDBX_PAGEPERTURB) != 0;
 | ||
|   if (env->flags & MDBX_WRITEMAP) {
 | ||
|     ret.page = pgno2page(env, pgno);
 | ||
|     MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
 | ||
|     VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
 | ||
| 
 | ||
    /* The content of the allocated page is not needed, but if the page is
     * not resident in RAM (which is quite likely), any access to it causes a
     * page-fault:
     *  - a page-not-present trap;
     *  - a context switch into kernel mode, putting the process to sleep;
     *  - reading the page from disk;
     *  - updating the PTE and waking the process up;
     *  - another context switch once a CPU becomes available.
     *
     * We try to minimize this overhead by writing the page, which with a
     * unified page cache makes the page appear in RAM without a disk read.
     * The write to disk should then be deferred by a sane kernel, since the
     * page is mapped read-write and is written to by the CPU right after. */
 | ||
| 
 | ||
    /* If the page is already in the process's memory, a redundant write can
     * be rather expensive. Besides the system call and data copying, on
     * particularly "gifted" OSes this may also involve the filesystem,
     * allocation of a temporary page, queuing of asynchronous work, and a
     * PTE update with a subsequent page-fault and a data read from the dirty
     * I/O queue. Because of this, the penalty for the extra write can be
     * comparable to the unneeded read we are trying to avoid. */
 | ||
|     if (env->prefault_write_activated) {
 | ||
|       void *const pattern =
 | ||
|           ptr_disp(env->page_auxbuf, need_clean ? env->ps : env->ps * 2);
 | ||
|       size_t file_offset = pgno2bytes(env, pgno);
 | ||
|       if (likely(num == 1)) {
 | ||
|         if (!mincore_probe(env, pgno)) {
 | ||
|           osal_pwrite(env->lazy_fd, pattern, env->ps, file_offset);
 | ||
| #if MDBX_ENABLE_PGOP_STAT
 | ||
|           env->lck->pgops.prefault.weak += 1;
 | ||
| #endif /* MDBX_ENABLE_PGOP_STAT */
 | ||
|           need_clean = false;
 | ||
|         }
 | ||
|       } else {
 | ||
|         struct iovec iov[MDBX_AUXILARY_IOV_MAX];
 | ||
|         size_t n = 0, cleared = 0;
 | ||
|         for (size_t i = 0; i < num; ++i) {
 | ||
|           if (!mincore_probe(env, pgno + (pgno_t)i)) {
 | ||
|             ++cleared;
 | ||
|             iov[n].iov_len = env->ps;
 | ||
|             iov[n].iov_base = pattern;
 | ||
|             if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) {
 | ||
|               osal_pwritev(env->lazy_fd, iov, MDBX_AUXILARY_IOV_MAX,
 | ||
|                            file_offset);
 | ||
| #if MDBX_ENABLE_PGOP_STAT
 | ||
|               env->lck->pgops.prefault.weak += 1;
 | ||
| #endif /* MDBX_ENABLE_PGOP_STAT */
 | ||
|               file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
 | ||
|               n = 0;
 | ||
|             }
 | ||
|           }
 | ||
|         }
 | ||
|         if (likely(n > 0)) {
 | ||
|           osal_pwritev(env->lazy_fd, iov, n, file_offset);
 | ||
| #if MDBX_ENABLE_PGOP_STAT
 | ||
|           env->lck->pgops.prefault.weak += 1;
 | ||
| #endif /* MDBX_ENABLE_PGOP_STAT */
 | ||
|         }
 | ||
|         if (cleared == num)
 | ||
|           need_clean = false;
 | ||
|       }
 | ||
|     }
 | ||
|   } else {
 | ||
|     ret.page = page_shadow_alloc(txn, num);
 | ||
|     if (unlikely(!ret.page)) {
 | ||
|       ret.err = MDBX_ENOMEM;
 | ||
|       goto bailout;
 | ||
|     }
 | ||
|   }
 | ||
| 
 | ||
|   if (unlikely(need_clean))
 | ||
|     memset(ret.page, -1, pgno2bytes(env, num));
 | ||
| 
 | ||
|   VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
 | ||
|   ret.page->pgno = pgno;
 | ||
|   ret.page->dupfix_ksize = 0;
 | ||
|   ret.page->flags = 0;
 | ||
|   if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) {
 | ||
|     ret.page->pages = (pgno_t)num;
 | ||
|     ret.page->flags = P_LARGE;
 | ||
|   }
 | ||
| 
 | ||
|   ret.err = page_dirty(txn, ret.page, (pgno_t)num);
 | ||
| bailout:
 | ||
|   tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
 | ||
|                                                        MDBX_ENABLE_REFUND));
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|   size_t majflt_after;
 | ||
|   prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before;
 | ||
|   prof->majflt += (uint32_t)(majflt_after - majflt_before);
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
|   return ret;
 | ||
| }
 | ||
| 
 | ||
| pgr_t gc_alloc_ex(const MDBX_cursor *const mc, const size_t num,
 | ||
|                   uint8_t flags) {
 | ||
|   pgr_t ret;
 | ||
|   MDBX_txn *const txn = mc->txn;
 | ||
|   MDBX_env *const env = txn->env;
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|   gc_prof_stat_t *const prof = (mc->mc_dbi == FREE_DBI)
 | ||
|                                    ? &env->lck->pgops.gc_prof.self
 | ||
|                                    : &env->lck->pgops.gc_prof.work;
 | ||
|   prof->spe_counter += 1;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
| 
 | ||
|   eASSERT(env, num > 0 || (flags & ALLOC_RESERVE));
 | ||
|   eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
 | ||
|                                                        MDBX_ENABLE_REFUND));
 | ||
| 
 | ||
|   size_t newnext;
 | ||
|   const uint64_t monotime_begin =
 | ||
|       (MDBX_ENABLE_PROFGC || (num > 1 && env->options.gc_time_limit))
 | ||
|           ? osal_monotime()
 | ||
|           : 0;
 | ||
|   struct monotime_cache now_cache;
 | ||
|   now_cache.expire_countdown =
 | ||
      1 /* starting from 1 avoids both extra system calls when the time limit
           is zero or already exhausted, and time accounting when
           rp_augment_limit has not been reached */
      ;
 | ||
|   now_cache.value = monotime_begin;
 | ||
|   pgno_t pgno = 0;
 | ||
|   if (num > 1) {
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|     prof->xpages += 1;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
|     if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) {
 | ||
|       eASSERT(env,
 | ||
|               MDBX_PNL_LAST(txn->tw.relist) < txn->geo.first_unallocated &&
 | ||
|                   MDBX_PNL_FIRST(txn->tw.relist) < txn->geo.first_unallocated);
 | ||
|       pgno = relist_get_sequence(txn, num, flags);
 | ||
|       if (likely(pgno))
 | ||
|         goto done;
 | ||
|     }
 | ||
|   } else {
 | ||
|     eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0);
 | ||
|     eASSERT(env, !(flags & ALLOC_RESERVE) || num == 0);
 | ||
|   }
 | ||
| 
 | ||
|   //---------------------------------------------------------------------------
 | ||
| 
 | ||
|   if (unlikely(!is_gc_usable(txn, mc, flags))) {
 | ||
|     eASSERT(env, (txn->flags & txn_gc_drained) || num > 1);
 | ||
|     goto no_gc;
 | ||
|   }
 | ||
| 
 | ||
|   eASSERT(env,
 | ||
|           (flags & (ALLOC_COALESCE | ALLOC_LIFO | ALLOC_SHOULD_SCAN)) == 0);
 | ||
|   flags += (env->flags & MDBX_LIFORECLAIM) ? ALLOC_LIFO : 0;
 | ||
| 
 | ||
  if (/* Do not coalesce records while preparing a reserve for the GC update.
       * Otherwise an attempt to enlarge the reserve may require an even
       * bigger reserve because the list of recycled pages keeps growing. */
      (flags & ALLOC_RESERVE) == 0) {
 | ||
|     if (txn->dbs[FREE_DBI].branch_pages &&
 | ||
|         MDBX_PNL_GETSIZE(txn->tw.relist) < env->maxgc_large1page / 2)
 | ||
|       flags += ALLOC_COALESCE;
 | ||
|   }
 | ||
| 
 | ||
|   MDBX_cursor *const gc = ptr_disp(env->basal_txn, sizeof(MDBX_txn));
 | ||
|   eASSERT(env, mc != gc && gc->next == gc);
 | ||
|   gc->txn = txn;
 | ||
|   gc->dbi_state = txn->dbi_state;
 | ||
|   gc->top_and_flags = z_fresh_mark;
 | ||
| 
 | ||
|   env->prefault_write_activated = env->options.prefault_write;
 | ||
|   if (env->prefault_write_activated) {
 | ||
    /* Probing via mincore() reduces the cost substantially, but in the
     * simplest cases (a trivial benchmark) the overall throughput drops by
     * half, and on platforms without mincore() or with a problematic virtual
     * memory subsystem the situation can be many times worse. So we avoid the
     * cost in situations where prefault-write is most likely not needed. */
 | ||
|     const bool readahead_enabled = env->lck->readahead_anchor & 1;
 | ||
|     const pgno_t readahead_edge = env->lck->readahead_anchor >> 1;
 | ||
    if (/* Don't bother if the GC is nearly empty and the DB is small */
 | ||
|         (txn->dbs[FREE_DBI].branch_pages == 0 && txn->geo.now < 1234) ||
 | ||
        /* Don't bother if the page falls within the enabled readahead zone */
 | ||
|         (readahead_enabled && pgno + num < readahead_edge))
 | ||
|       env->prefault_write_activated = false;
 | ||
|   }
 | ||
| 
 | ||
| retry_gc_refresh_oldest:;
 | ||
|   txnid_t oldest = txn_snapshot_oldest(txn);
 | ||
| retry_gc_have_oldest:
 | ||
|   if (unlikely(oldest >= txn->txnid)) {
 | ||
    ERROR("unexpected/invalid oldest-read txnid %" PRIaTXN
 | ||
|           " for current-txnid %" PRIaTXN,
 | ||
|           oldest, txn->txnid);
 | ||
|     ret.err = MDBX_PROBLEM;
 | ||
|     goto fail;
 | ||
|   }
 | ||
|   const txnid_t detent = oldest + 1;
 | ||
| 
 | ||
|   txnid_t id = 0;
 | ||
|   MDBX_cursor_op op = MDBX_FIRST;
 | ||
|   if (flags & ALLOC_LIFO) {
 | ||
|     if (!txn->tw.gc.reclaimed) {
 | ||
|       txn->tw.gc.reclaimed = txl_alloc();
 | ||
|       if (unlikely(!txn->tw.gc.reclaimed)) {
 | ||
|         ret.err = MDBX_ENOMEM;
 | ||
|         goto fail;
 | ||
|       }
 | ||
|     }
 | ||
|     /* Begin lookup backward from oldest reader */
 | ||
|     id = detent - 1;
 | ||
|     op = MDBX_SET_RANGE;
 | ||
|   } else if (txn->tw.gc.last_reclaimed) {
 | ||
|     /* Continue lookup forward from last-reclaimed */
 | ||
|     id = txn->tw.gc.last_reclaimed + 1;
 | ||
|     if (id >= detent)
 | ||
|       goto depleted_gc;
 | ||
|     op = MDBX_SET_RANGE;
 | ||
|   }
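  /* Traversal order, illustrated (hypothetical GC contents): with GC records
   * for txn ids {3, 5, 8} and detent == 9, the FIFO mode (default) starts with
   * MDBX_FIRST (or MDBX_SET_RANGE from last_reclaimed + 1 on a later pass) and
   * then walks forward with MDBX_NEXT, consuming 3, 5, 8; with MDBX_LIFORECLAIM
   * the cursor is positioned near detent - 1 == 8 via MDBX_SET_RANGE and walks
   * backwards with MDBX_PREV, consuming 8, 5, 3 and skipping ids already
   * recorded in tw.gc.reclaimed. */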
 | ||
| 
 | ||
| next_gc:;
 | ||
|   MDBX_val key;
 | ||
|   key.iov_base = &id;
 | ||
|   key.iov_len = sizeof(id);
 | ||
| 
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|   prof->rsteps += 1;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
| 
 | ||
|   /* Seek first/next GC record */
 | ||
|   ret.err = cursor_ops(gc, &key, nullptr, op);
 | ||
|   if (unlikely(ret.err != MDBX_SUCCESS)) {
 | ||
|     if (unlikely(ret.err != MDBX_NOTFOUND))
 | ||
|       goto fail;
 | ||
|     if ((flags & ALLOC_LIFO) && op == MDBX_SET_RANGE) {
 | ||
|       op = MDBX_PREV;
 | ||
|       goto next_gc;
 | ||
|     }
 | ||
|     goto depleted_gc;
 | ||
|   }
 | ||
|   if (unlikely(key.iov_len != sizeof(txnid_t))) {
 | ||
|     ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED,
 | ||
|           "invalid GC key-length");
 | ||
|     ret.err = MDBX_CORRUPTED;
 | ||
|     goto fail;
 | ||
|   }
 | ||
|   id = unaligned_peek_u64(4, key.iov_base);
 | ||
|   if (flags & ALLOC_LIFO) {
 | ||
|     op = MDBX_PREV;
 | ||
|     if (id >= detent || is_already_reclaimed(txn, id))
 | ||
|       goto next_gc;
 | ||
|   } else {
 | ||
|     op = MDBX_NEXT;
 | ||
|     if (unlikely(id >= detent))
 | ||
|       goto depleted_gc;
 | ||
|   }
 | ||
|   txn->flags &= ~txn_gc_drained;
 | ||
| 
 | ||
|   /* Reading next GC record */
 | ||
|   MDBX_val data;
 | ||
|   page_t *const mp = gc->pg[gc->top];
 | ||
|   if (unlikely((ret.err = node_read(gc, page_node(mp, gc->ki[gc->top]), &data,
 | ||
|                                     mp)) != MDBX_SUCCESS))
 | ||
|     goto fail;
 | ||
| 
 | ||
|   pgno_t *gc_pnl = (pgno_t *)data.iov_base;
 | ||
|   if (unlikely(data.iov_len % sizeof(pgno_t) ||
 | ||
|                data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) ||
 | ||
|                !pnl_check(gc_pnl, txn->geo.first_unallocated))) {
 | ||
|     ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED,
 | ||
|           "invalid GC value-length");
 | ||
|     ret.err = MDBX_CORRUPTED;
 | ||
|     goto fail;
 | ||
|   }
 | ||
| 
 | ||
|   const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl);
 | ||
|   TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len,
 | ||
|         gc_len + MDBX_PNL_GETSIZE(txn->tw.relist));
 | ||
| 
 | ||
|   if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >=
 | ||
|                env->maxgc_large1page)) {
 | ||
|     /* Don't try to coalesce too much. */
 | ||
|     if (flags & ALLOC_SHOULD_SCAN) {
 | ||
|       eASSERT(env, flags & ALLOC_COALESCE);
 | ||
|       eASSERT(env, !(flags & ALLOC_RESERVE));
 | ||
|       eASSERT(env, num > 0);
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|       env->lck->pgops.gc_prof.coalescences += 1;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
|       TRACE("clear %s %s", "ALLOC_COALESCE", "since got threshold");
 | ||
|       if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) {
 | ||
|         eASSERT(env,
 | ||
|                 MDBX_PNL_LAST(txn->tw.relist) < txn->geo.first_unallocated &&
 | ||
|                     MDBX_PNL_FIRST(txn->tw.relist) <
 | ||
|                         txn->geo.first_unallocated);
 | ||
|         if (likely(num == 1)) {
 | ||
|           pgno = relist_get_single(txn);
 | ||
|           goto done;
 | ||
|         }
 | ||
|         pgno = relist_get_sequence(txn, num, flags);
 | ||
|         if (likely(pgno))
 | ||
|           goto done;
 | ||
|       }
 | ||
|       flags -= ALLOC_COALESCE | ALLOC_SHOULD_SCAN;
 | ||
|     }
 | ||
|     if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE(
 | ||
|                      txn->tw.relist) >= env->options.rp_augment_limit) &&
 | ||
|         ((/* not a slot-request from gc-update */ num &&
 | ||
|           /* have enough unallocated space */ txn->geo.upper >=
 | ||
|               txn->geo.first_unallocated + num &&
 | ||
|           monotime_since_cached(monotime_begin, &now_cache) +
 | ||
|                   txn->tw.gc.time_acc >=
 | ||
|               env->options.gc_time_limit) ||
 | ||
|          gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= PAGELIST_LIMIT)) {
 | ||
|       /* Stop reclaiming to avoid large/overflow the page list. This is a rare
 | ||
|        * case while search for a continuously multi-page region in a
 | ||
|        * large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */
 | ||
|       NOTICE("stop reclaiming %s: %zu (current) + %zu "
 | ||
|              "(chunk) -> %zu, rp_augment_limit %u",
 | ||
|              likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) < PAGELIST_LIMIT)
 | ||
|                  ? "since rp_augment_limit was reached"
 | ||
|                  : "to avoid PNL overflow",
 | ||
|              MDBX_PNL_GETSIZE(txn->tw.relist), gc_len,
 | ||
|              gc_len + MDBX_PNL_GETSIZE(txn->tw.relist),
 | ||
|              env->options.rp_augment_limit);
 | ||
|       goto depleted_gc;
 | ||
|     }
 | ||
|   }
 | ||
| 
 | ||
  /* Remember the ID of the GC record just read */
 | ||
|   txn->tw.gc.last_reclaimed = id;
 | ||
|   if (flags & ALLOC_LIFO) {
 | ||
|     ret.err = txl_append(&txn->tw.gc.reclaimed, id);
 | ||
|     if (unlikely(ret.err != MDBX_SUCCESS))
 | ||
|       goto fail;
 | ||
|   }
 | ||
| 
 | ||
|   /* Append PNL from GC record to tw.relist */
 | ||
|   ret.err = pnl_need(&txn->tw.relist, gc_len);
 | ||
|   if (unlikely(ret.err != MDBX_SUCCESS))
 | ||
|     goto fail;
 | ||
| 
 | ||
|   if (LOG_ENABLED(MDBX_LOG_EXTRA)) {
 | ||
|     DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO
 | ||
|                 " len %zu, PNL",
 | ||
|                 id, txn->dbs[FREE_DBI].root, gc_len);
 | ||
|     for (size_t i = gc_len; i; i--)
 | ||
|       DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]);
 | ||
|     DEBUG_EXTRA_PRINT(", first_unallocated %u\n", txn->geo.first_unallocated);
 | ||
|   }
 | ||
| 
 | ||
|   /* Merge in descending sorted order */
 | ||
|   pnl_merge(txn->tw.relist, gc_pnl);
 | ||
|   flags |= ALLOC_SHOULD_SCAN;
 | ||
|   if (AUDIT_ENABLED()) {
 | ||
|     if (unlikely(!pnl_check(txn->tw.relist, txn->geo.first_unallocated))) {
 | ||
|       ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED,
 | ||
|             "invalid txn retired-list");
 | ||
|       ret.err = MDBX_CORRUPTED;
 | ||
|       goto fail;
 | ||
|     }
 | ||
|   } else {
 | ||
|     eASSERT(env,
 | ||
|             pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated));
 | ||
|   }
 | ||
|   eASSERT(env, dpl_check(txn));
 | ||
| 
 | ||
|   eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 ||
 | ||
|                    MDBX_PNL_MOST(txn->tw.relist) < txn->geo.first_unallocated);
 | ||
|   if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) &&
 | ||
|       unlikely(MDBX_PNL_MOST(txn->tw.relist) ==
 | ||
|                txn->geo.first_unallocated - 1)) {
 | ||
|     /* Refund suitable pages into "unallocated" space */
 | ||
|     txn_refund(txn);
 | ||
|   }
 | ||
|   eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
 | ||
|                                                        MDBX_ENABLE_REFUND));
 | ||
| 
 | ||
|   /* Done for a kick-reclaim mode, actually no page needed */
 | ||
|   if (unlikely(num == 0)) {
 | ||
|     eASSERT(env, ret.err == MDBX_SUCCESS);
 | ||
|     TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id,
 | ||
|           MDBX_PNL_GETSIZE(txn->tw.relist));
 | ||
|     goto early_exit;
 | ||
|   }
 | ||
| 
 | ||
|   /* TODO: delete reclaimed records */
 | ||
| 
 | ||
|   eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT);
 | ||
|   if (flags & ALLOC_COALESCE) {
 | ||
|     TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id,
 | ||
|           MDBX_PNL_GETSIZE(txn->tw.relist));
 | ||
|     goto next_gc;
 | ||
|   }
 | ||
| 
 | ||
| scan:
 | ||
|   eASSERT(env, flags & ALLOC_SHOULD_SCAN);
 | ||
|   eASSERT(env, num > 0);
 | ||
|   if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) {
 | ||
|     eASSERT(env,
 | ||
|             MDBX_PNL_LAST(txn->tw.relist) < txn->geo.first_unallocated &&
 | ||
|                 MDBX_PNL_FIRST(txn->tw.relist) < txn->geo.first_unallocated);
 | ||
|     if (likely(num == 1)) {
 | ||
|       eASSERT(env, !(flags & ALLOC_RESERVE));
 | ||
|       pgno = relist_get_single(txn);
 | ||
|       goto done;
 | ||
|     }
 | ||
|     pgno = relist_get_sequence(txn, num, flags);
 | ||
|     if (likely(pgno))
 | ||
|       goto done;
 | ||
|   }
 | ||
|   flags -= ALLOC_SHOULD_SCAN;
 | ||
|   if (ret.err == MDBX_SUCCESS) {
 | ||
|     TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id,
 | ||
|           MDBX_PNL_GETSIZE(txn->tw.relist));
 | ||
|     goto next_gc;
 | ||
|   }
 | ||
| 
 | ||
| depleted_gc:
 | ||
|   TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id,
 | ||
|         MDBX_PNL_GETSIZE(txn->tw.relist));
 | ||
|   ret.err = MDBX_NOTFOUND;
 | ||
|   if (flags & ALLOC_SHOULD_SCAN)
 | ||
|     goto scan;
 | ||
|   txn->flags |= txn_gc_drained;
 | ||
| 
 | ||
|   //-------------------------------------------------------------------------
 | ||
| 
 | ||
  /* There are no suitable pages in the GC, so to be able to allocate
   * we should CHOOSE one of:
   *  - make a new steady checkpoint if reclaiming was stopped by
   *    the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode;
   *  - kick lagging reader(s) if reclaiming was stopped by one of them;
   *  - extend the database file. */
 | ||
| 
 | ||
|   /* Will use new pages from the map if nothing is suitable in the GC. */
 | ||
|   newnext = txn->geo.first_unallocated + num;
 | ||
| 
 | ||
  /* Did reclaiming stop at the last steady point? */
 | ||
|   const meta_ptr_t recent = meta_recent(env, &txn->tw.troika);
 | ||
|   const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika);
 | ||
|   if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady &&
 | ||
|       detent == prefer_steady.txnid + 1) {
 | ||
|     DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN
 | ||
|           "-%s, detent %" PRIaTXN,
 | ||
|           recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid,
 | ||
|           durable_caption(prefer_steady.ptr_c), detent);
 | ||
|     const pgno_t autosync_threshold =
 | ||
|         atomic_load32(&env->lck->autosync_threshold, mo_Relaxed);
 | ||
|     const uint64_t autosync_period =
 | ||
|         atomic_load64(&env->lck->autosync_period, mo_Relaxed);
 | ||
|     uint64_t eoos_timestamp;
 | ||
|     /* wipe the last steady-point if one of:
 | ||
|      *  - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
 | ||
|      *  - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
 | ||
|      * otherwise, make a new steady-point if one of:
 | ||
|      *  - auto-sync threshold is specified and reached;
 | ||
|      *  - upper limit of database size is reached;
 | ||
|      *  - database is full (with the current file size)
 | ||
|      *       AND auto-sync threshold it NOT specified */
 | ||
|     if (F_ISSET(env->flags, MDBX_UTTERLY_NOSYNC) &&
 | ||
|         ((autosync_threshold | autosync_period) == 0 ||
 | ||
|          newnext >= prefer_steady.ptr_c->geometry.now)) {
 | ||
|       /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode
 | ||
|        * without any auto-sync threshold(s). */
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|       env->lck->pgops.gc_prof.wipes += 1;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
|       ret.err = meta_wipe_steady(env, detent);
 | ||
|       DEBUG("gc-wipe-steady, rc %d", ret.err);
 | ||
|       if (unlikely(ret.err != MDBX_SUCCESS))
 | ||
|         goto fail;
 | ||
|       eASSERT(env, prefer_steady.ptr_c !=
 | ||
|                        meta_prefer_steady(env, &txn->tw.troika).ptr_c);
 | ||
|       goto retry_gc_refresh_oldest;
 | ||
|     }
 | ||
|     if ((autosync_threshold &&
 | ||
|          atomic_load64(&env->lck->unsynced_pages, mo_Relaxed) >=
 | ||
|              autosync_threshold) ||
 | ||
|         (autosync_period &&
 | ||
|          (eoos_timestamp =
 | ||
|               atomic_load64(&env->lck->eoos_timestamp, mo_Relaxed)) &&
 | ||
|          osal_monotime() - eoos_timestamp >= autosync_period) ||
 | ||
|         newnext >= txn->geo.upper ||
 | ||
|         ((num == 0 || newnext >= txn->geo.end_pgno) &&
 | ||
|          (autosync_threshold | autosync_period) == 0)) {
 | ||
|       /* make steady checkpoint. */
 | ||
| #if MDBX_ENABLE_PROFGC
 | ||
|       env->lck->pgops.gc_prof.flushes += 1;
 | ||
| #endif /* MDBX_ENABLE_PROFGC */
 | ||
|       meta_t meta = *recent.ptr_c;
 | ||
|       ret.err = dxb_sync_locked(env, env->flags & MDBX_WRITEMAP, &meta,
 | ||
|                                 &txn->tw.troika);
 | ||
|       DEBUG("gc-make-steady, rc %d", ret.err);
 | ||
|       eASSERT(env, ret.err != MDBX_RESULT_TRUE);
 | ||
|       if (unlikely(ret.err != MDBX_SUCCESS))
 | ||
|         goto fail;
 | ||
|       eASSERT(env, prefer_steady.ptr_c !=
 | ||
|                        meta_prefer_steady(env, &txn->tw.troika).ptr_c);
 | ||
|       goto retry_gc_refresh_oldest;
 | ||
|     }
 | ||
|   }
 | ||
| 
 | ||
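  /* If the reader table was flagged for refresh, re-read the oldest readable
   * snapshot; when it has advanced past the detent, reclaiming can be
   * retried. */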
  if (unlikely(true ==
               atomic_load32(&env->lck->rdt_refresh_flag, mo_AcquireRelease))) {
    oldest = txn_snapshot_oldest(txn);
    if (oldest >= detent)
      goto retry_gc_have_oldest;
  }

  /* Avoid kicking lagging reader(s) if there is enough unallocated space
   * at the end of the database file. */
  if (!(flags & ALLOC_RESERVE) && newnext <= txn->geo.end_pgno) {
    eASSERT(env, pgno == 0);
    goto done;
  }

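  /* As a last resort before growing the file, try to kick lagging readers so
   * the oldest snapshot can advance and reclaiming can be retried. */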
  if (oldest < txn->txnid - xMDBX_TXNID_STEP) {
    oldest = mvcc_kick_laggards(env, oldest);
    if (oldest >= detent)
      goto retry_gc_have_oldest;
  }

  //---------------------------------------------------------------------------

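  /* The GC cannot provide pages: allocate from the unallocated tail of the
   * datafile, growing it if the geometry permits. */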
no_gc:
  eASSERT(env, pgno == 0);
#ifndef MDBX_ENABLE_BACKLOG_DEPLETED
#define MDBX_ENABLE_BACKLOG_DEPLETED 0
#endif /* MDBX_ENABLE_BACKLOG_DEPLETED */
  if (MDBX_ENABLE_BACKLOG_DEPLETED &&
      unlikely(!(txn->flags & txn_gc_drained))) {
    ret.err = MDBX_BACKLOG_DEPLETED;
    goto fail;
  }
  if (flags & ALLOC_RESERVE) {
    ret.err = MDBX_NOTFOUND;
    goto fail;
  }

  /* Will use new pages from the map if nothing is suitable in the GC. */
  newnext = txn->geo.first_unallocated + num;
  if (newnext <= txn->geo.end_pgno)
    goto done;

  if (newnext > txn->geo.upper || !txn->geo.grow_pv) {
    NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, newnext, txn->geo.upper);
    ret.err = MDBX_MAP_FULL;
    goto fail;
  }

  eASSERT(env, newnext > txn->geo.end_pgno);
  const size_t grow_step = pv2pages(txn->geo.grow_pv);
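  /* Advance the growth target to the next multiple of the growth step, then
   * align it to the OS page size; e.g. with a growth step of 1024 pages and
   * newnext == 2050, the target below becomes 3072 pages before OS-page
   * alignment (illustrative numbers only). */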
  size_t aligned = pgno_align2os_pgno(
      env, (pgno_t)(newnext + grow_step - newnext % grow_step));

  if (aligned > txn->geo.upper)
    aligned = txn->geo.upper;
  eASSERT(env, aligned >= newnext);

  VERBOSE("try to grow datafile to %zu pages (+%zu)", aligned,
          aligned - txn->geo.end_pgno);
  ret.err = dxb_resize(env, txn->geo.first_unallocated, (pgno_t)aligned,
                       txn->geo.upper, implicit_grow);
  if (ret.err != MDBX_SUCCESS) {
    ERROR("unable to grow datafile to %zu pages (+%zu), errcode %d", aligned,
          aligned - txn->geo.end_pgno, ret.err);
    goto fail;
  }
  env->txn->geo.end_pgno = (pgno_t)aligned;
  eASSERT(env, pgno == 0);

  //---------------------------------------------------------------------------

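  /* At this point a page number was either taken from the relist above, or
   * pgno is still 0 and the pages are carved from the unallocated tail;
   * reserve-only requests return MDBX_SUCCESS without a page. */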
done:
  ret.err = MDBX_SUCCESS;
  if (likely((flags & ALLOC_RESERVE) == 0)) {
    if (pgno) {
      eASSERT(env,
              pgno + num <= txn->geo.first_unallocated && pgno >= NUM_METAS);
      eASSERT(env,
              pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
                                                      MDBX_ENABLE_REFUND));
    } else {
      pgno = txn->geo.first_unallocated;
      txn->geo.first_unallocated += (pgno_t)num;
      eASSERT(env, txn->geo.first_unallocated <= txn->geo.end_pgno);
      eASSERT(env,
              pgno >= NUM_METAS && pgno + num <= txn->geo.first_unallocated);
    }

    ret = page_alloc_finalize(env, txn, mc, pgno, num);
    if (unlikely(ret.err != MDBX_SUCCESS)) {
    fail:
      eASSERT(env, ret.err != MDBX_SUCCESS);
      eASSERT(env,
              pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
                                                      MDBX_ENABLE_REFUND));
      int level;
      const char *what;
      if (flags & ALLOC_RESERVE) {
        level = (flags & ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE;
        what = num ? "reserve-pages" : "fetch-slot";
      } else {
        txn->flags |= MDBX_TXN_ERROR;
        level = MDBX_LOG_ERROR;
        what = "pages";
      }
      if (LOG_ENABLED(level))
        debug_log(level, __func__, __LINE__,
                  "unable to alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags "
                  "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, "
                  "branch %zu, leaf %zu, large %zu, entries %zu\n",
                  num, what, flags, ret.err, txn->flags,
                  MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count,
                  txn->dbs[FREE_DBI].height,
                  (size_t)txn->dbs[FREE_DBI].branch_pages,
                  (size_t)txn->dbs[FREE_DBI].leaf_pages,
                  (size_t)txn->dbs[FREE_DBI].large_pages,
                  (size_t)txn->dbs[FREE_DBI].items);
      ret.page = nullptr;
    }
    if (num > 1)
      txn->tw.gc.time_acc += monotime_since_cached(monotime_begin, &now_cache);
  } else {
  early_exit:
    DEBUG("return nullptr for %zu pages for ALLOC_%s, rc %d", num,
          num ? "RESERVE" : "SLOT", ret.err);
    ret.page = nullptr;
  }

#if MDBX_ENABLE_PROFGC
  prof->rtime_monotonic += osal_monotime() - monotime_begin;
#endif /* MDBX_ENABLE_PROFGC */
  return ret;
}

__hot pgr_t gc_alloc_single(const MDBX_cursor *const mc) {
  MDBX_txn *const txn = mc->txn;
  tASSERT(txn, mc->txn->flags & MDBX_TXN_DIRTY);
  tASSERT(txn,
          F_ISSET(*cursor_dbi_state(mc), DBI_LINDO | DBI_VALID | DBI_DIRTY));

  /* If there are any loose pages, just use them */
  while (likely(txn->tw.loose_pages)) {
#if MDBX_ENABLE_REFUND
    if (unlikely(txn->tw.loose_refund_wl > txn->geo.first_unallocated)) {
      txn_refund(txn);
      if (!txn->tw.loose_pages)
        break;
    }
#endif /* MDBX_ENABLE_REFUND */

    page_t *lp = txn->tw.loose_pages;
    MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->env->ps);
    VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
    txn->tw.loose_pages = page_next(lp);
    txn->tw.loose_count--;
    DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, cursor_dbi_dbg(mc),
                lp->pgno);
    tASSERT(txn, lp->pgno < txn->geo.first_unallocated);
    tASSERT(txn, lp->pgno >= NUM_METAS);
    VALGRIND_MAKE_MEM_UNDEFINED(page_data(lp), page_space(txn->env));
    lp->txnid = txn->front_txnid;
    pgr_t ret = {lp, MDBX_SUCCESS};
    return ret;
  }

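  /* No loose pages: fall back to the relist, and only then to the generic
   * GC search. */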
  if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0))
    return page_alloc_finalize(txn->env, txn, mc, relist_get_single(txn), 1);

  return gc_alloc_ex(mc, 1, ALLOC_DEFAULT);
}