mdbx: перенос LRU-отметок в теневые страницы по отрицательному смещению.

Это позволяет избавиться от повторного поиска в "грязном" списке
страниц, уже находящихся в стеке курсора, для обновления LRU-отметок.
This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2023-01-01 01:26:55 +03:00
parent 8519fde741
commit f0c2927fc7
2 changed files with 55 additions and 46 deletions

View File

@ -2768,7 +2768,7 @@ static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) {
dl->length = len; dl->length = len;
dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE;
dl->items[len + 1].pgno = P_INVALID; dl->items[len + 1].pgno = P_INVALID;
dl->items[len + 1].mlru = 0; dl->items[len + 1].npages = 1;
return len; return len;
} }
@ -2783,7 +2783,7 @@ static __always_inline void dpl_clear(MDBX_dpl *dl) {
dl->pages_including_loose = 0; dl->pages_including_loose = 0;
dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB;
dl->items[0].pgno = 0; dl->items[0].pgno = 0;
dl->items[0].mlru = 0; dl->items[0].npages = 1;
assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID);
} }
@ -2954,9 +2954,7 @@ __hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) {
MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned
dpl_npages(const MDBX_dpl *dl, size_t i) { dpl_npages(const MDBX_dpl *dl, size_t i) {
assert(0 <= (intptr_t)i && i <= dl->length); assert(0 <= (intptr_t)i && i <= dl->length);
unsigned n = 1; unsigned n = dl->items[i].npages;
if (unlikely(dl->items[i].mlru & MDBX_dp_multi_mask))
n = dl->items[i].ptr->mp_pages;
assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1));
return n; return n;
} }
@ -3047,13 +3045,14 @@ static void dpl_remove(const MDBX_txn *txn, size_t i) {
static __noinline void txn_lru_reduce(MDBX_txn *txn) { static __noinline void txn_lru_reduce(MDBX_txn *txn) {
NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1);
tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
do { do {
txn->tw.dirtylru >>= 1; txn->tw.dirtylru >>= 1;
MDBX_dpl *dl = txn->tw.dirtylist; MDBX_dpl *dl = txn->tw.dirtylist;
for (size_t i = 1; i <= dl->length; ++i) { for (size_t i = 1; i <= dl->length; ++i) {
uint32_t mlru = dl->items[i].mlru; size_t *const ptr =
mlru = (mlru & MDBX_dp_multi_mask) + ((mlru >> 1) & MDBX_dp_lru_mask); ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
dl->items[i].mlru = mlru; *ptr >>= 1;
} }
txn = txn->mt_parent; txn = txn->mt_parent;
} while (txn); } while (txn);
@ -3061,18 +3060,19 @@ static __noinline void txn_lru_reduce(MDBX_txn *txn) {
MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn, MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn,
size_t i) { size_t i) {
tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const MDBX_dpl *dl = txn->tw.dirtylist; const MDBX_dpl *dl = txn->tw.dirtylist;
assert((intptr_t)i > 0 && i <= dl->length); assert((intptr_t)i > 0 && i <= dl->length);
return (txn->tw.dirtylru >> 1) - (dl->items[i].mlru >> 1); size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
return txn->tw.dirtylru - (uint32_t)*ptr;
} }
static __inline uint32_t txn_lru_turn(MDBX_txn *txn) { static __inline uint32_t txn_lru_turn(MDBX_txn *txn) {
txn->tw.dirtylru += 2; txn->tw.dirtylru += 1;
if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3)) if (unlikely(txn->tw.dirtylru > UINT32_MAX / 3) &&
(txn->mt_flags & MDBX_WRITEMAP) == 0)
txn_lru_reduce(txn); txn_lru_reduce(txn);
return txn->tw.dirtylru & MDBX_dp_lru_mask; return txn->tw.dirtylru;
} }
static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, static __always_inline int __must_check_result dpl_append(MDBX_txn *txn,
@ -3081,7 +3081,12 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn,
size_t npages) { size_t npages) {
tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0);
tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
const MDBX_dp dp = {page, pgno, txn_lru_turn(txn) + (npages > 1)}; const MDBX_dp dp = {page, pgno, (pgno_t)npages};
if ((txn->mt_flags & MDBX_WRITEMAP) == 0) {
size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->tw.dirtylru;
}
MDBX_dpl *dl = txn->tw.dirtylist; MDBX_dpl *dl = txn->tw.dirtylist;
tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE);
tASSERT(txn, dl->items[0].pgno == 0 && tASSERT(txn, dl->items[0].pgno == 0 &&
@ -3166,7 +3171,7 @@ static __always_inline int __must_check_result dpl_append(MDBX_txn *txn,
#else #else
i[1].ptr = i->ptr; i[1].ptr = i->ptr;
i[1].pgno = i->pgno; i[1].pgno = i->pgno;
i[1].mlru = i->mlru; i[1].npages = i->npages;
#endif #endif
--i; --i;
} }
@ -3784,12 +3789,13 @@ static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) {
env->me_dp_reserve_len -= 1; env->me_dp_reserve_len -= 1;
} else { } else {
size = pgno2bytes(env, num); size = pgno2bytes(env, num);
np = osal_malloc(size); void *const ptr = osal_malloc(size + sizeof(size_t));
if (unlikely(!np)) { if (unlikely(!ptr)) {
txn->mt_flags |= MDBX_TXN_ERROR; txn->mt_flags |= MDBX_TXN_ERROR;
return np; return nullptr;
} }
VALGRIND_MEMPOOL_ALLOC(env, np, size); VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t));
np = ptr_disp(ptr, sizeof(size_t));
} }
if ((env->me_flags & MDBX_NOMEMINIT) == 0) { if ((env->me_flags & MDBX_NOMEMINIT) == 0) {
@ -3826,8 +3832,9 @@ static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) {
env->me_dp_reserve_len += 1; env->me_dp_reserve_len += 1;
} else { } else {
/* large pages just get freed directly */ /* large pages just get freed directly */
VALGRIND_MEMPOOL_FREE(env, dp); void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
osal_free(dp); VALGRIND_MEMPOOL_FREE(env, ptr);
osal_free(ptr);
} }
} }
@ -3879,10 +3886,12 @@ MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) {
if (unlikely(dp->mp_pgno != dl->items[i].pgno)) if (unlikely(dp->mp_pgno != dl->items[i].pgno))
return false; return false;
if ((txn->mt_flags & MDBX_WRITEMAP) == 0) {
const uint32_t age = dpl_age(txn, i); const uint32_t age = dpl_age(txn, i);
tASSERT(txn, age < UINT32_MAX / 3); tASSERT(txn, age < UINT32_MAX / 3);
if (unlikely(age > UINT32_MAX / 3)) if (unlikely(age > UINT32_MAX / 3))
return false; return false;
}
tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp));
if (dp->mp_flags == P_LOOSE) { if (dp->mp_flags == P_LOOSE) {
@ -4766,8 +4775,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp,
/* Set unspillable LRU-label for dirty pages watched by txn. /* Set unspillable LRU-label for dirty pages watched by txn.
* Returns the number of pages marked as unspillable. */ * Returns the number of pages marked as unspillable. */
static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) {
tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0);
size_t keep = 0; size_t keep = 0;
while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) {
tASSERT(txn, mc->mc_top == mc->mc_snum - 1); tASSERT(txn, mc->mc_top == mc->mc_snum - 1);
@ -4780,9 +4788,9 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) {
size_t const n = dpl_search(txn, mp->mp_pgno); size_t const n = dpl_search(txn, mp->mp_pgno);
if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno &&
/* не считаем дважды */ dpl_age(txn, n)) { /* не считаем дважды */ dpl_age(txn, n)) {
txn->tw.dirtylist->items[n].mlru = size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr,
(txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + -(ptrdiff_t)sizeof(size_t));
(txn->tw.dirtylru & MDBX_dp_lru_mask); *ptr = txn->tw.dirtylru;
tASSERT(txn, dpl_age(txn, n) == 0); tASSERT(txn, dpl_age(txn, n) == 0);
++keep; ++keep;
} }
@ -4801,8 +4809,7 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) {
} }
static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0);
txn_lru_turn(txn); txn_lru_turn(txn);
size_t keep = m0 ? cursor_keep(txn, m0) : 0; size_t keep = m0 ? cursor_keep(txn, m0) : 0;
for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i)
@ -5085,10 +5092,10 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
for (size_t i = 1; i <= dl->length; ++i) { for (size_t i = 1; i <= dl->length; ++i) {
const unsigned prio = spill_prio(txn, i, reciprocal); const unsigned prio = spill_prio(txn, i, reciprocal);
size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
TRACE("page %" PRIaPGNO TRACE("page %" PRIaPGNO
", lru %u, is_multi %c, npages %u, age %u of %u, prio %u", ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u",
dl->items[i].pgno, dl->items[i].mlru & MDBX_dp_lru_mask, dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 'Y' : 'N',
(dl->items[i].mlru & MDBX_dp_multi_mask) ? 'Y' : 'N',
dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); dpl_npages(dl, i), dpl_age(txn, i), age_max, prio);
if (prio < 256) { if (prio < 256) {
radix_entries[prio] += 1; radix_entries[prio] += 1;
@ -7858,9 +7865,11 @@ __hot static int page_touch(MDBX_cursor *mc) {
tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length);
tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno &&
txn->tw.dirtylist->items[n].ptr == mp); txn->tw.dirtylist->items[n].ptr == mp);
txn->tw.dirtylist->items[n].mlru = if (!MDBX_AVOID_MSYNC || (txn->mt_flags & MDBX_WRITEMAP) == 0) {
(txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + size_t *const ptr =
(txn->tw.dirtylru & MDBX_dp_lru_mask); ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
*ptr = txn->tw.dirtylru;
}
return MDBX_SUCCESS; return MDBX_SUCCESS;
} }
if (IS_SUBP(mp)) { if (IS_SUBP(mp)) {
@ -15280,7 +15289,9 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) {
MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *));
env->me_dp_reserve = mp_next(dp); env->me_dp_reserve = mp_next(dp);
osal_free(dp); void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
VALGRIND_MEMPOOL_FREE(env, ptr);
osal_free(ptr);
} }
VALGRIND_DESTROY_MEMPOOL(env); VALGRIND_DESTROY_MEMPOOL(env);
ENSURE(env, env->me_lcklist_next == nullptr); ENSURE(env, env->me_lcklist_next == nullptr);
@ -24653,8 +24664,9 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option,
MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize);
VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *));
env->me_dp_reserve = mp_next(dp); env->me_dp_reserve = mp_next(dp);
VALGRIND_MEMPOOL_FREE(env, dp); void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t));
osal_free(dp); VALGRIND_MEMPOOL_FREE(env, ptr);
osal_free(ptr);
env->me_dp_reserve_len -= 1; env->me_dp_reserve_len -= 1;
} }
} }

View File

@ -1059,10 +1059,7 @@ typedef txnid_t *MDBX_TXL;
/* An Dirty-Page list item is an pgno/pointer pair. */ /* An Dirty-Page list item is an pgno/pointer pair. */
typedef struct MDBX_dp { typedef struct MDBX_dp {
MDBX_page *ptr; MDBX_page *ptr;
pgno_t pgno; pgno_t pgno, npages;
uint32_t mlru;
#define MDBX_dp_multi_mask 1
#define MDBX_dp_lru_mask UINT32_C(0xffffFFFe)
} MDBX_dp; } MDBX_dp;
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */