/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024
///
/// \file sort.h
/// \brief Macros implementing sorting and binary search

#pragma once

#define MDBX_RADIXSORT_THRESHOLD 142

/* ---------------------------------------------------------------------------
 * LY: State of the art quicksort-based sorting, with internal stack
 * and network-sort for small chunks.
 * Thanks to John M. Gamble for http://pages.ripco.net/~jgamble/nw.html */

#if MDBX_HAVE_CMOV
#define SORT_CMP_SWAP(TYPE, CMP, a, b) \
  do { \
    const TYPE swap_tmp = (a); \
    const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \
    (a) = swap_cmp ? swap_tmp : b; \
    (b) = swap_cmp ? b : swap_tmp; \
  } while (0)
#else
#define SORT_CMP_SWAP(TYPE, CMP, a, b) \
  do \
    if (expect_with_probability(!CMP(a, b), 0, .5)) { \
      const TYPE swap_tmp = (a); \
      (a) = (b); \
      (b) = swap_tmp; \
    } \
  while (0)
#endif

// 3 comparators, 3 parallel operations
// o-----^--^--o
//       |  |
// o--^--|--v--o
//    |  |
// o--v--v-----o
//
// [[1,2]]
// [[0,2]]
// [[0,1]]
#define SORT_NETWORK_3(TYPE, CMP, begin) \
  do { \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
  } while (0)

// 5 comparators, 3 parallel operations
// o--^--^--------o
//    |  |
// o--v--|--^--^--o
//       |  |  |
// o--^--v--|--v--o
//    |     |
// o--v-----v-----o
//
// [[0,1],[2,3]]
// [[0,2],[1,3]]
// [[1,2]]
#define SORT_NETWORK_4(TYPE, CMP, begin) \
  do { \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
  } while (0)

// 9 comparators, 5 parallel operations
// o--^--^-----^-----------o
//    |  |     |
// o--|--|--^--v-----^--^--o
//    |  |  |        |  |
// o--|--v--|--^--^--|--v--o
//    |     |  |  |  |
// o--|-----v--|--v--|--^--o
//    |        |     |  |
// o--v--------v-----v--v--o
//
// [[0,4],[1,3]]
// [[0,2]]
// [[2,4],[0,1]]
// [[2,3],[1,4]]
// [[1,2],[3,4]]
#define SORT_NETWORK_5(TYPE, CMP, begin) \
  do { \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
  } while (0)

// 12 comparators, 6 parallel operations
// o-----^--^--^-----------------o
//       |  |  |
// o--^--|--v--|--^--------^-----o
//    |  |     |  |        |
// o--v--v-----|--|--^--^--|--^--o
//             |  |  |  |  |  |
// o-----^--^--v--|--|--|--v--v--o
//       |  |     |  |  |
// o--^--|--v-----v--|--v--------o
//    |  |           |
// o--v--v-----------v-----------o
//
// [[1,2],[4,5]]
// [[0,2],[3,5]]
// [[0,1],[3,4],[2,5]]
// [[0,3],[1,4]]
// [[2,4],[1,3]]
// [[2,3]]
#define SORT_NETWORK_6(TYPE, CMP, begin) \
  do { \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
  } while (0)
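/* Illustrative note (not part of the build): SORT_CMP_SWAP is a
 * compare-exchange, i.e. after it executes the pair (a, b) is ordered
 * according to CMP, and chaining such exchanges yields the SORT_NETWORK_*
 * macros. The CMOV variant always performs both conditional assignments,
 * trading a possibly redundant store for the absence of a mispredictable
 * branch. Assuming a hypothetical less-than comparator for unsigned ints:
 *
 *   #define UINT_LESS(a, b) ((a) < (b))
 *   unsigned v[3] = {3, 2, 1};
 *   SORT_NETWORK_3(unsigned, UINT_LESS, v);   // v becomes {1, 2, 3}
 */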
// 16 comparators, 6 parallel operations
// o--^--------^-----^-----------------o
//    |        |     |
// o--|--^-----|--^--v--------^--^-----o
//    |  |     |  |           |  |
// o--|--|--^--v--|--^-----^--|--v-----o
//    |  |  |     |  |     |  |
// o--|--|--|-----v--|--^--v--|--^--^--o
//    |  |  |        |  |     |  |  |
// o--v--|--|--^-----v--|--^--v--|--v--o
//       |  |  |        |  |     |
// o-----v--|--|--------v--v-----|--^--o
//          |  |                 |  |
// o--------v--v-----------------v--v--o
//
// [[0,4],[1,5],[2,6]]
// [[0,2],[1,3],[4,6]]
// [[2,4],[3,5],[0,1]]
// [[2,3],[4,5]]
// [[1,4],[3,6]]
// [[1,2],[3,4],[5,6]]
#define SORT_NETWORK_7(TYPE, CMP, begin) \
  do { \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \
  } while (0)

// 19 comparators, 6 parallel operations
// o--^--------^-----^-----------------o
//    |        |     |
// o--|--^-----|--^--v--------^--^-----o
//    |  |     |  |           |  |
// o--|--|--^--v--|--^-----^--|--v-----o
//    |  |  |     |  |     |  |
// o--|--|--|--^--v--|--^--v--|--^--^--o
//    |  |  |  |     |  |     |  |  |
// o--v--|--|--|--^--v--|--^--v--|--v--o
//       |  |  |  |     |  |     |
// o-----v--|--|--|--^--v--v-----|--^--o
//          |  |  |  |           |  |
// o--------v--|--v--|--^--------v--v--o
//             |     |  |
// o-----------v-----v--v--------------o
//
// [[0,4],[1,5],[2,6],[3,7]]
// [[0,2],[1,3],[4,6],[5,7]]
// [[2,4],[3,5],[0,1],[6,7]]
// [[2,3],[4,5]]
// [[1,4],[3,6]]
// [[1,2],[3,4],[5,6]]
#define SORT_NETWORK_8(TYPE, CMP, begin) \
  do { \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \
    SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \
  } while (0)

#define SORT_INNER(TYPE, CMP, begin, end, len) \
  switch (len) { \
  default: \
    assert(false); \
    __unreachable(); \
  case 0: \
  case 1: \
    break; \
  case 2: \
    SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \
    break; \
  case 3: \
    SORT_NETWORK_3(TYPE, CMP, begin); \
    break; \
  case 4: \
    SORT_NETWORK_4(TYPE, CMP, begin); \
    break; \
  case 5: \
    SORT_NETWORK_5(TYPE, CMP, begin); \
    break; \
  case 6: \
    SORT_NETWORK_6(TYPE, CMP, begin); \
    break; \
  case 7: \
    SORT_NETWORK_7(TYPE, CMP, begin); \
    break; \
  case 8: \
    SORT_NETWORK_8(TYPE, CMP, begin); \
    break; \
  }
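/* Usage sketch (illustrative only; UINT_LESS is a hypothetical comparator,
 * not defined by this header). SORT_INNER dispatches a chunk of at most
 * 8 elements to the matching sorting network above and is what SORT_IMPL
 * below uses for the quicksort leaves; a direct use could look like this:
 *
 *   #define UINT_LESS(a, b) ((a) < (b))
 *   unsigned v[5] = {4, 1, 5, 2, 3};
 *   SORT_INNER(unsigned, UINT_LESS, v, v + 5, 5);  // v becomes {1, 2, 3, 4, 5}
 */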
#define SORT_SWAP(TYPE, a, b) \
  do { \
    const TYPE swap_tmp = (a); \
    (a) = (b); \
    (b) = swap_tmp; \
  } while (0)

#define SORT_PUSH(low, high) \
  do { \
    top->lo = (low); \
    top->hi = (high); \
    ++top; \
  } while (0)

#define SORT_POP(low, high) \
  do { \
    --top; \
    low = top->lo; \
    high = top->hi; \
  } while (0)

#define SORT_IMPL(NAME, EXPECT_LOW_CARDINALITY_OR_PRESORTED, TYPE, CMP) \
  \
  static inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \
    while (++first <= last) \
      if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \
        return false; \
    return true; \
  } \
  \
  typedef struct { \
    TYPE *lo, *hi; \
  } NAME##_stack; \
  \
  __hot static void NAME(TYPE *const __restrict begin, \
                         TYPE *const __restrict end) { \
    NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack; \
    \
    TYPE *__restrict hi = end - 1; \
    TYPE *__restrict lo = begin; \
    while (true) { \
      const ptrdiff_t len = hi - lo; \
      if (len < 8) { \
        SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \
        if (unlikely(top == stack)) \
          break; \
        SORT_POP(lo, hi); \
        continue; \
      } \
      \
      TYPE *__restrict mid = lo + (len >> 1); \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \
      SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \
      SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \
      \
      TYPE *right = hi - 1; \
      TYPE *left = lo + 1; \
      while (1) { \
        while (expect_with_probability(CMP(*left, *mid), 0, .5)) \
          ++left; \
        while (expect_with_probability(CMP(*mid, *right), 0, .5)) \
          --right; \
        if (unlikely(left > right)) { \
          if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \
            if (NAME##_is_sorted(lo, right)) \
              lo = right + 1; \
            if (NAME##_is_sorted(left, hi)) \
              hi = left; \
          } \
          break; \
        } \
        SORT_SWAP(TYPE, *left, *right); \
        mid = (mid == left) ? right : (mid == right) ? left : mid; \
        ++left; \
        --right; \
      } \
      \
      if (right - lo > hi - left) { \
        SORT_PUSH(lo, right); \
        lo = left; \
      } else { \
        SORT_PUSH(left, hi); \
        hi = right; \
      } \
    } \
    \
    if (AUDIT_ENABLED()) { \
      for (TYPE *scan = begin + 1; scan < end; ++scan) \
        assert(CMP(scan[-1], scan[0])); \
    } \
  }

/*------------------------------------------------------------------------------
 * LY: radix sort for large chunks */

#define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \
  \
  __hot static bool NAME##_radixsort(TYPE *const begin, const size_t length) { \
    TYPE *tmp; \
    if (BUFFER_PREALLOCATED) { \
      tmp = begin + length + END_GAP; \
      /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \
    } else { \
      tmp = osal_malloc(sizeof(TYPE) * length); \
      if (unlikely(!tmp)) \
        return false; \
    } \
    \
    size_t key_shift = 0, key_diff_mask; \
    do { \
      struct { \
        pgno_t a[256], b[256]; \
      } counters; \
      memset(&counters, 0, sizeof(counters)); \
      \
      key_diff_mask = 0; \
      size_t prev_key = EXTRACT_KEY(begin) >> key_shift; \
      TYPE *r = begin, *end = begin + length; \
      do { \
        const size_t key = EXTRACT_KEY(r) >> key_shift; \
        counters.a[key & 255]++; \
        counters.b[(key >> 8) & 255]++; \
        key_diff_mask |= prev_key ^ key; \
        prev_key = key; \
      } while (++r != end); \
      \
      pgno_t ta = 0, tb = 0; \
      for (size_t i = 0; i < 256; ++i) { \
        const pgno_t ia = counters.a[i]; \
        counters.a[i] = ta; \
        ta += ia; \
        const pgno_t ib = counters.b[i]; \
        counters.b[i] = tb; \
        tb += ib; \
      } \
      \
      r = begin; \
      do { \
        const size_t key = EXTRACT_KEY(r) >> key_shift; \
        tmp[counters.a[key & 255]++] = *r; \
      } while (++r != end); \
      \
      if (unlikely(key_diff_mask < 256)) { \
        memcpy(begin, tmp, ptr_dist(end, begin)); \
        break; \
      } \
      end = (r = tmp) + length; \
      do { \
        const size_t key = EXTRACT_KEY(r) >> key_shift; \
        begin[counters.b[(key >> 8) & 255]++] = *r; \
      } while (++r != end); \
      \
      key_shift += 16; \
    } while (key_diff_mask >> 16); \
    \
    if (!(BUFFER_PREALLOCATED)) \
      osal_free(tmp); \
    return true; \
  }
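/* Instantiation sketch (illustrative; the names, comparator and key extractor
 * are hypothetical and assume the usual libmdbx environment around this
 * header). SORT_IMPL expands into a sorting routine over [begin, end),
 * RADIXSORT_IMPL into a NAME##_radixsort() that returns false only when the
 * temporary buffer cannot be allocated, and MDBX_RADIXSORT_THRESHOLD marks
 * the length where switching to radix sort is expected to pay off:
 *
 *   #define PGNO_LESS(a, b) ((a) < (b))    // distinct page numbers assumed
 *   #define PGNO_KEY(ptr) ((size_t)*(ptr))
 *   SORT_IMPL(pgno_qsort, false, pgno_t, PGNO_LESS)
 *   RADIXSORT_IMPL(pgno, pgno_t, PGNO_KEY, false, 0)
 *
 *   static void sort_pgnos(pgno_t *vector, const size_t n) {
 *     if (n < 2)
 *       return;
 *     if (n < MDBX_RADIXSORT_THRESHOLD || !pgno_radixsort(vector, n))
 *       pgno_qsort(vector, vector + n);
 *   }
 */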
/*------------------------------------------------------------------------------
 * LY: Binary search */

#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__)
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \
  do \
    __asm __volatile("" \
                     : "+r"(size) \
                     : "r" /* the `b` constraint is more suitable here, but \
                              it causes CLANG to allocate and push/pop one \
                              more register, so `r` is used to avoid this. */ \
                     (flag)); \
  while (0)
#else
#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \
  do { \
    /* no-op for non-clang or non-x86 */; \
  } while (0)
#endif /* Workaround for CLANG */

/* *INDENT-OFF* */
/* clang-format off */

#define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \
  static __always_inline const TYPE_LIST *NAME( \
      const TYPE_LIST *it, size_t length, const TYPE_ARG item) { \
    const TYPE_LIST *const begin = it, *const end = begin + length; \
    \
    if (MDBX_HAVE_CMOV) \
      do { \
        /* An adaptively simplified binary-search step: \
         * - branch-free when cmov (or an equivalent) is available; \
         * - may perform redundant iterations; \
         * - but searches only while size > 2, so the search must be \
         *   finished up among the remaining 0-1-2 elements. */ \
        const TYPE_LIST *const middle = it + (length >> 1); \
        length = (length + 1) >> 1; \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \
        WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \
        it = flag ? middle : it; \
      } while (length > 2); \
    else \
      while (length > 2) { \
        /* A variant that uses a conditional branch. The key difference is \
         * that when the comparator returns true ("not equal"), the step \
         * moves one element closer to the end of the array. Algorithmically \
         * this is correct and gives slightly faster convergence, but it \
         * costs more computation when the comparator returns true. Also \
         * IMPORTANT(!): speculative execution with size == 0 is not \
         * permitted. */ \
        const TYPE_LIST *const middle = it + (length >> 1); \
        length = (length + 1) >> 1; \
        const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \
        if (flag) { \
          it = middle + 1; \
          length -= 1; \
        } \
      } \
    it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \
    it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \
    \
    if (AUDIT_ENABLED()) { \
      for (const TYPE_LIST *scan = begin; scan < it; ++scan) \
        assert(CMP(*scan, item)); \
      for (const TYPE_LIST *scan = it; scan < end; ++scan) \
        assert(!CMP(*scan, item)); \
      (void)begin, (void)end; \
    } \
    \
    return it; \
  }

/* *INDENT-ON* */
/* clang-format on */
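/* Instantiation sketch (illustrative; the names are hypothetical). SEARCH_IMPL
 * expands into a lower-bound style search over an array sorted by CMP: it
 * returns a pointer to the first element for which CMP(element, item) is
 * false, or the end of the array if there is no such element:
 *
 *   #define PGNO_LESS(a, b) ((a) < (b))
 *   SEARCH_IMPL(pgno_lower_bound, pgno_t, pgno_t, PGNO_LESS)
 *
 *   // assumes `vector` holds `n` page numbers sorted ascending
 *   const pgno_t *position = pgno_lower_bound(vector, n, wanted);
 *   const bool found = position < vector + n && *position == wanted;
 */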