/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024

#include "internals.h"

/* Marks entries of the transaction's spilled-page list as removed.
 *
 * Each slot of txn->tw.spilled.list holds a page number shifted left by one
 * bit; the low bit is used here as a "removed" flag. The slot at `idx` is
 * flagged first; then, while `npages > 1`, the neighbouring slot (next index
 * when MDBX_PNL_ASCENDING, previous index otherwise) is flagged too, as long
 * as it holds the next consecutive page number. Flagged slots stay in place
 * (they are dropped later by spill_purge()), except that a flagged slot which
 * is the last one of the list is trimmed immediately.
 *
 * txn    - transaction owning tw.spilled.list.
 * idx    - index of the first slot to remove; asserted to be within
 *          1..MDBX_PNL_GETSIZE(list).
 * npages - number of consecutive pages to remove starting from that slot. */
void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) {
  tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && txn->tw.spilled.least_removed > 0);
  /* Remember the lowest flagged index so spill_purge() knows where to start. */
  txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) ? idx : txn->tw.spilled.least_removed;
  txn->tw.spilled.list[idx] |= 1;
  /* Shrink the list by one only when the flagged slot is the last one. */
  MDBX_PNL_SETSIZE(txn->tw.spilled.list,
                   MDBX_PNL_GETSIZE(txn->tw.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));

  while (unlikely(npages > 1)) {
    /* Page number expected in the neighbouring slot
     * (the shift discards the "removed" flag that was just set). */
    const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1;
    if (MDBX_PNL_ASCENDING) {
      /* Stop at the end of the list or when the next slot is not adjacent. */
      if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || (txn->tw.spilled.list[idx] >> 1) != pgno)
        return;
    } else {
      /* Stop at the start of the list or when the previous slot is not adjacent. */
      if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno)
        return;
      /* Here the index decreases, so the lowest flagged index must be refreshed. */
      txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) ? idx : txn->tw.spilled.least_removed;
    }
    txn->tw.spilled.list[idx] |= 1;
    MDBX_PNL_SETSIZE(txn->tw.spilled.list,
                     MDBX_PNL_GETSIZE(txn->tw.spilled.list) - (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list)));
    --npages;
  }
}

/* Compacts the transaction's spilled-page list by dropping the slots that were
 * flagged as removed (low bit set) by spill_remove().
 *
 * If tw.spilled.least_removed is INT_MAX (the "nothing removed" marker) the
 * list is left untouched and only checked by assertions. Otherwise slots from
 * least_removed up to the end are compacted in place, the list size is
 * updated and least_removed is reset to INT_MAX.
 *
 * Returns txn->tw.spilled.list. */
pnl_t spill_purge(MDBX_txn *txn) {
  tASSERT(txn, txn->tw.spilled.least_removed > 0);
  const pnl_t sl = txn->tw.spilled.list;
  if (txn->tw.spilled.least_removed != INT_MAX) {
    size_t len = MDBX_PNL_GETSIZE(sl), r, w;
    /* r is the read index, w is the write index; w advances only past
     * slots that are not flagged, so flagged slots get overwritten. */
    for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) {
      sl[w] = sl[r];
      w += 1 - (sl[r] & 1);
    }
    /* Debug check: no flagged slots remain in the compacted list. */
    for (size_t i = 1; i < w; ++i)
      tASSERT(txn, (sl[i] & 1) == 0);
    MDBX_PNL_SETSIZE(sl, w - 1);
    txn->tw.spilled.least_removed = INT_MAX;
  } else {
    /* Debug check: nothing should be flagged when least_removed is INT_MAX. */
    for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i)
      tASSERT(txn, (sl[i] & 1) == 0);
  }
  return sl;
}

/*----------------------------------------------------------------------------*/

/* Spills one dirty-list entry: hands the page(s) to the I/O vector context via
 * iov_page() and, on success, records the span in txn->tw.spilled.list via
 * spill_append_span().
 *
 * txn    - transaction; asserted NOT to be in MDBX_WRITEMAP mode.
 * ctx    - I/O vector context the page is passed to.
 * dp     - first page of the entry.
 * npages - number of pages in the entry.
 *
 * Returns MDBX_SUCCESS or the error code from iov_page()/spill_append_span(). */
static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, page_t *dp, const size_t npages) {
  tASSERT(txn, !(txn->flags & MDBX_WRITEMAP));
#if MDBX_ENABLE_PGOP_STAT
  txn->env->lck->pgops.spill.weak += npages;
#endif /* MDBX_ENABLE_PGOP_STAT */
  /* The page number is captured before iov_page().
   * NOTE(review): presumably because dp must not be read afterwards - confirm
   * against iov_page(). */
  const pgno_t pgno = dp->pgno;
  int err = iov_page(txn, ctx, dp, npages);
  if (likely(err == MDBX_SUCCESS))
    err = spill_append_span(&txn->tw.spilled.list, pgno, npages);
  return err;
}

/* Set unspillable LRU-label for dirty pages watched by txn.
 * Returns the number of pages marked as unspillable.
 *
 * Walks the page stack of cursor `mc` (pg[0]..pg[top]). For every modifiable
 * page that is found in the dirty list with a non-zero age, the size_t slot
 * located sizeof(size_t) bytes before the dirty item's pointer is overwritten
 * with txn->tw.dirtylru, after which dpl_age() for that item is asserted to
 * be 0. If the current leaf item carries the N_TREE flag and the cursor has a
 * sub-cursor, the walk continues with that sub-cursor. */
static size_t spill_cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) {
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  size_t keep = 0;
  while (!is_poor(mc)) {
    tASSERT(txn, mc->top >= 0);
    const page_t *mp;
    intptr_t i = 0;
    do {
      mp = mc->pg[i];
      tASSERT(txn, !is_subpage(mp));
      if (is_modifable(txn, mp)) {
        size_t const n = dpl_search(txn, mp->pgno);
        if (txn->tw.dirtylist->items[n].pgno == mp->pgno &&
            /* do not count twice */ dpl_age(txn, n)) {
          /* Refresh the LRU label stored just before the page pointer. */
          size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t));
          *ptr = txn->tw.dirtylru;
          tASSERT(txn, dpl_age(txn, n) == 0);
          ++keep;
        }
      }
    } while (++i <= mc->top);

    /* After the loop mp is the page at the top of the stack. */
    tASSERT(txn, is_leaf(mp));
    if (!mc->subcur || mc->ki[mc->top] >= page_numkeys(mp))
      break;
    if (!(node_flags(page_node(mp, mc->ki[mc->top])) & N_TREE))
      break;
    /* Descend into the nested tree tracked by the sub-cursor. */
    mc = &mc->subcur->cursor;
  }
  return keep;
}

/* Marks as unspillable the dirty pages referenced by the transaction's cursors.
 *
 * Calls dpl_lru_turn() first (NOTE(review): presumably advances the LRU
 * clock - confirm), then applies spill_cursor_keep() to `m0` (if not NULL) and
 * to every other cursor chained in txn->cursors[] for each DBI whose state has
 * both DBI_DIRTY and DBI_VALID set and whose root is not P_INVALID.
 *
 * Returns the total number of pages marked as unspillable. */
static size_t spill_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) {
  tASSERT(txn, (txn->flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0);
  dpl_lru_turn(txn);
  size_t keep = m0 ? spill_cursor_keep(txn, m0) : 0;

  TXN_FOREACH_DBI_ALL(txn, dbi) {
    if (F_ISSET(txn->dbi_state[dbi], DBI_DIRTY | DBI_VALID) && txn->dbs[dbi].root != P_INVALID)
      for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next)
        if (mc != m0) /* m0 has already been handled above */
          keep += spill_cursor_keep(txn, mc);
  }

  return keep;
}

/* Returns the spilling priority (0..255) for a dirty page:
 *      0 = should be spilled;
 *    ...
 *  > 255 = must not be spilled.
 *
 * txn        - transaction owning the dirty list.
 * i          - index of the entry in txn->tw.dirtylist.
 * reciprocal - scale factor mapping an age to the 0..255 range; the caller
 *              computes it as (255 << 24) / (age_max + 1).
 *
 * 256 is returned for entries that must be kept: age 0, pages flagged P_LOOSE
 * or P_SPILLED, and pages intersecting a spill list of a parent transaction.
 *
 * NOTE(review): the function is declared MDBX_NOTHROW_PURE_FUNCTION although
 * it may set P_SPILLED in dp->flags and emits DEBUG logs - confirm that the
 * attribute is intended. */
MDBX_NOTHROW_PURE_FUNCTION static unsigned spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) {
  dpl_t *const dl = txn->tw.dirtylist;
  const uint32_t age = dpl_age(txn, i);
  const size_t npages = dpl_npages(dl, i);
  const pgno_t pgno = dl->items[i].pgno;
  if (age == 0) {
    DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno);
    return 256;
  }

  page_t *const dp = dl->items[i].ptr;
  if (dp->flags & (P_LOOSE | P_SPILLED)) {
    DEBUG("skip %s %zu page %" PRIaPGNO, (dp->flags & P_LOOSE) ? "loose" : "parent-spilled", npages, pgno);
    return 256;
  }

  /* Can't spill twice,
   * make sure it's not already in a parent's spill list(s). */
  MDBX_txn *parent = txn->parent;
  if (parent && (parent->flags & MDBX_TXN_SPILLS)) {
    do
      if (spill_intersect(parent, pgno, npages)) {
        DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno);
        /* Cache the verdict so that the next call exits early above. */
        dp->flags |= P_SPILLED;
        return 256;
      }
    while ((parent = parent->parent) != nullptr);
  }

  /* Quantize the age to 0..255: a greater age gives a greater `prio` here,
   * which is inverted below so that older pages get a lower result. */
  tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX);
  unsigned prio = age * reciprocal >> 24;
  tASSERT(txn, prio < 256);
  if (likely(npages == 1))
    /* NOTE(review): a quantized value of 0 yields 256 here,
     * i.e. such a page is reported as unspillable. */
    return prio = 256 - prio;

  /* make a large/overflow pages be likely to spill */
  /* Smear the highest set bit of npages downwards, so that factor + 1
   * becomes a power of two that is passed to log2n_powerof2(). */
  size_t factor = npages | npages >> 1;
  factor |= factor >> 2;
  factor |= factor >> 4;
  factor |= factor >> 8;
  factor |= factor >> 16;
  factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157;
  /* Invert and saturate: anything >= 256 becomes 0 (spill first). */
  factor = (factor < 256) ? 255 - factor : 0;
  tASSERT(txn, factor < 256 && factor < (256 - prio));
  return prio = (unsigned)factor;
}

/* Clamps the wanted amount `part` to the range allowed by the environment
 * options for a given `total`:
 *   - lower bound: ceil(total / spill_min_denominator),
 *     or 1 when the denominator is zero;
 *   - upper bound: total - total / spill_max_denominator,
 *     or total when the denominator is zero.
 * The lower bound is applied last, so it wins if the bounds cross.
 * The result is asserted to be within 0..total. */
static size_t spill_gate(const MDBX_env *env, intptr_t part, const size_t total) {
  const intptr_t spill_min =
      env->options.spill_min_denominator
          ? (total + env->options.spill_min_denominator - 1) / env->options.spill_min_denominator
          : 1;
  const intptr_t spill_max =
      total - (env->options.spill_max_denominator ? total / env->options.spill_max_denominator : 0);
  part = (part < spill_max) ? part : spill_max;
  part = (part > spill_min) ? part : spill_min;
  eASSERT(env, part >= 0 && (size_t)part <= total);
  return (size_t)part;
}

/* Spills part of the transaction's dirty pages to make dirty-room.
 *
 * txn                 - write transaction (asserted not to be MDBX_TXN_RDONLY).
 * m0                  - cursor whose pages are marked unspillable first;
 *                       may be NULL (see spill_txn_keep()).
 * wanna_spill_entries - wanted number of dirty-list entries to spill.
 * wanna_spill_npages  - wanted number of pages to spill.
 *                       Both amounts are clamped by spill_gate().
 * need                - amount of dirty-room wanted by the caller; used only
 *                       for the final success check (and the debug checks).
 *
 * In MDBX_WRITEMAP mode the function calls osal_msync(..., MDBX_SYNC_KICK) for
 * the range up to txn->geo.first_unallocated and adjusts the accounting.
 * Otherwise it selects dirty-list entries by LRU-based priority, passes them
 * to spill_page(), removes them from the dirty list and sets MDBX_TXN_SPILLS.
 *
 * Returns:
 *  - MDBX_SUCCESS when, at the end, dirtyroom + loose_count is greater than
 *    min(need, CURSOR_STACK_SIZE);
 *  - MDBX_TXN_FULL otherwise;
 *  - an error code from a failed step (or MDBX_ENOMEM), in which case
 *    MDBX_TXN_ERROR is set in txn->flags. */
__cold int spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, const intptr_t wanna_spill_entries,
                          const intptr_t wanna_spill_npages, const size_t need) {
  tASSERT(txn, (txn->flags & MDBX_TXN_RDONLY) == 0);

  int rc = MDBX_SUCCESS;
  /* Nothing to spill when loose pages account for all of the dirty pages. */
  if (unlikely(txn->tw.loose_count >=
               (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages)))
    goto done;

  /* Dirty amounts excluding loose pages; without a dirty list
   * the number of entries is taken as 1. */
  const size_t dirty_entries = txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1;
  const size_t dirty_npages =
      (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose : txn->tw.writemap_dirty_npages) -
      txn->tw.loose_count;
  const size_t need_spill_entries = spill_gate(txn->env, wanna_spill_entries, dirty_entries);
  const size_t need_spill_npages = spill_gate(txn->env, wanna_spill_npages, dirty_npages);

  const size_t need_spill = (need_spill_entries > need_spill_npages) ? need_spill_entries : need_spill_npages;
  if (!need_spill)
    goto done;

  if (txn->flags & MDBX_WRITEMAP) {
    /* MDBX_WRITEMAP mode: no pages are written from here,
     * the mapping is synced and the counters are adjusted instead. */
    NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", dirty_entries, dirty_npages);
    const MDBX_env *env = txn->env;
    tASSERT(txn, txn->tw.spilled.list == nullptr);
    rc = osal_msync(&txn->env->dxb_mmap, 0, pgno_align2os_bytes(env, txn->geo.first_unallocated), MDBX_SYNC_KICK);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;
#if MDBX_AVOID_MSYNC
    /* The dirty list exists in this configuration: account non-loose pages as
     * unsynced, clear the list and re-add only the loose pages. */
    MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
    tASSERT(txn, dpl_check(txn));
    env->lck->unsynced_pages.weak += txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
    dpl_clear(txn->tw.dirtylist);
    txn->tw.dirtyroom = env->options.dp_limit - txn->tw.loose_count;
    for (page_t *lp = txn->tw.loose_pages; lp != nullptr; lp = page_next(lp)) {
      tASSERT(txn, lp->flags == P_LOOSE);
      rc = dpl_append(txn, lp->pgno, lp, 1);
      if (unlikely(rc != MDBX_SUCCESS))
        goto bailout;
      /* Keep the link to the next loose page readable under ASAN/Valgrind. */
      MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
      VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
    }
    tASSERT(txn, dpl_check(txn));
#else
    /* No dirty list in this configuration: move the dirty-pages counter
     * into the unsynced and spilled counters. */
    tASSERT(txn, txn->tw.dirtylist == nullptr);
    env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
    txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages;
    txn->tw.writemap_dirty_npages = 0;
#endif /* MDBX_AVOID_MSYNC */
    goto done;
  }

  NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", need_spill_entries, need_spill_npages);
  MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr);
  tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1);
  tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= need_spill_npages);
  if (!txn->tw.spilled.list) {
    /* First spill in this transaction: create the spilled list. */
    txn->tw.spilled.least_removed = INT_MAX;
    txn->tw.spilled.list = pnl_alloc(need_spill);
    if (unlikely(!txn->tw.spilled.list)) {
      rc = MDBX_ENOMEM;
    /* Common error exit (also reached by `goto bailout` from other places):
     * flags the transaction as broken and returns the error code. */
    bailout:
      txn->flags |= MDBX_TXN_ERROR;
      return rc;
    }
  } else {
    /* purge deleted slots */
    spill_purge(txn);
    rc = pnl_reserve(&txn->tw.spilled.list, need_spill);
    (void)rc /* ignore since the resulting list may be shorter
     and pnl_append() will increase pnl on demand */
        ;
  }

  /* Sort so that writing to disk is more sequential */
  dpl_t *const dl = dpl_sort(txn);

  /* Preserve pages which may soon be dirtied again */
  const size_t unspillable = spill_txn_keep(txn, m0);
  if (unspillable + txn->tw.loose_count >= dl->length) {
#if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */
    if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
      return MDBX_SUCCESS;
#endif /* xMDBX_DEBUG_SPILLING */
    ERROR("all %zu dirty pages are unspillable since referenced "
          "by a cursor(s), use fewer cursors or increase "
          "MDBX_opt_txn_dp_limit",
          unspillable);
    goto done;
  }

  /* Subtask: spill part of the pages to disk according to LRU,
   * while taking some important adjustments into account:
   *  - it is better to spill old large/overflow pages, since this frees
   *    more memory and since they (as currently understood) are modified
   *    again much less often;
   *  - other things being equal, it is better to spill adjacent pages,
   *    since this gives fewer I/O operations;
   *  - preferably spend less time on this than std::partial_sort_copy;
   *
   * Solution:
   *  - Quantize the whole range of LRU labels down to 256 values and use a
   *    single pass of an 8-bit radix-sort. This gives 256 levels of
   *    "freshness", including the LRU-label value beyond which older pages
   *    have to be spilled;
   *  - Move sequentially towards increasing page numbers and spill the pages
   *    whose LRU label is older than the cutoff value,
   *    until enough has been spilled;
   *  - When meeting pages adjacent to the ones being spilled, spill them as
   *    well to reduce the number of I/O operations, provided they fall into
   *    the first half between the spilled and the freshest LRU labels;
   *  - additionally, large/overflow pages are deliberately aged while
   *    sorting, which raises their chances of being spilled. */

  /* get min/max of LRU-labels */
  uint32_t age_max = 0;
  for (size_t i = 1; i <= dl->length; ++i) {
    const uint32_t age = dpl_age(txn, i);
    age_max = (age_max >= age) ? age_max : age;
  }

  VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);

  /* half of 8-bit radix-sort */
  /* Histogram of spillable entries and pages per priority level (0..255). */
  pgno_t radix_entries[256], radix_npages[256];
  memset(&radix_entries, 0, sizeof(radix_entries));
  memset(&radix_npages, 0, sizeof(radix_npages));
  size_t spillable_entries = 0, spillable_npages = 0;
  /* Scale factor used by spill_prio() to map an age into 0..255. */
  const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
  for (size_t i = 1; i <= dl->length; ++i) {
    const unsigned prio = spill_prio(txn, i, reciprocal);
    size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t));
    TRACE("page %" PRIaPGNO ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u", dl->items[i].pgno, *ptr,
          (dl->items[i].npages > 1) ? 'Y' : 'N', dpl_npages(dl, i), dpl_age(txn, i), age_max, prio);
    if (prio < 256) {
      radix_entries[prio] += 1;
      spillable_entries += 1;
      const pgno_t npages = dpl_npages(dl, i);
      radix_npages[prio] += npages;
      spillable_npages += npages;
    }
  }

  tASSERT(txn, spillable_npages >= spillable_entries);
  pgno_t spilled_entries = 0, spilled_npages = 0;
  if (likely(spillable_entries > 0)) {
    /* Choose two thresholds from the histogram:
     *  - prio2spill: the highest priority level that has to be spilled
     *    to reach the needed amounts;
     *  - prio2adjacent: entries below this level are spilled too
     *    when they are adjacent to a spilled one. */
    size_t prio2spill = 0, prio2adjacent = 128, amount_entries = radix_entries[0], amount_npages = radix_npages[0];
    for (size_t i = 1; i < 256; i++) {
      if (amount_entries < need_spill_entries || amount_npages < need_spill_npages) {
        /* Not enough yet: include level i into the must-spill range. */
        prio2spill = i;
        prio2adjacent = i + (257 - i) / 2;
        amount_entries += radix_entries[i];
        amount_npages += radix_npages[i];
      } else if (amount_entries + amount_entries < spillable_entries + need_spill_entries
                 /* EQUIVALENT TO: amount - need_spill < spillable - amount */
                 || amount_npages + amount_npages < spillable_npages + need_spill_npages) {
        /* Enough for the must-spill range: extend only the adjacency range. */
        prio2adjacent = i;
        amount_entries += radix_entries[i];
        amount_npages += radix_npages[i];
      } else
        break;
    }

    VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu,"
            " wanna-spill %zu/%zu, amount %zu/%zu",
            prio2spill, prio2adjacent, spillable_entries, spillable_npages, need_spill_entries, need_spill_npages,
            amount_entries, amount_npages);
    tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);

    iov_ctx_t ctx;
    rc = iov_init(txn, &ctx, amount_entries, amount_npages,
#if defined(_WIN32) || defined(_WIN64)
                  txn->env->ioring.overlapped_fd ? txn->env->ioring.overlapped_fd :
#endif
                                                 txn->env->lazy_fd,
                  true);
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    /* Walk the sorted dirty list, compacting it in place:
     * r is the read index, w is the write index of kept entries. */
    size_t r = 0, w = 0;
    /* Page number right after the most recently selected entry. */
    pgno_t last = 0;
    while (r < dl->length && (spilled_entries < need_spill_entries || spilled_npages < need_spill_npages)) {
      dl->items[++w] = dl->items[++r];
      unsigned prio = spill_prio(txn, w, reciprocal);
      /* Keep the entry unless it is within the must-spill range, or it is
       * below the adjacency threshold and starts right after `last`. */
      if (prio > prio2spill && (prio >= prio2adjacent || last != dl->items[w].pgno))
        continue;

      const size_t e = w;
      last = dpl_endpgno(dl, w);
      /* Step back over already kept entries which are contiguous with the
       * selected one and below the adjacency threshold, to spill them too. */
      while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && spill_prio(txn, w, reciprocal) < prio2adjacent)
        ;

      /* Spill entries w+1..e; since w stays rewound, these entries
       * are dropped from the dirty list. */
      for (size_t i = w; ++i <= e;) {
        const unsigned npages = dpl_npages(dl, i);
        prio = spill_prio(txn, i, reciprocal);
        /* NOTE(review): "%d" is used below for the value returned by
         * dpl_age(), which is stored in uint32_t elsewhere in this file. */
        DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)", (prio > prio2spill) ? "co-" : "", i, npages,
              dl->items[i].pgno, dpl_age(txn, i), prio);
        tASSERT(txn, prio < 256);
        ++spilled_entries;
        spilled_npages += npages;
        rc = spill_page(txn, &ctx, dl->items[i].ptr, npages);
        if (unlikely(rc != MDBX_SUCCESS))
          goto failed;
      }
    }

    VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, spilled_npages);
    tASSERT(txn, spillable_entries == 0 || spilled_entries > 0);
    tASSERT(txn, spilled_npages >= spilled_entries);

  /* Reached both on normal completion and on a spill_page() failure:
   * the remaining entries are moved down to finish the compaction. */
  failed:
    while (r < dl->length)
      dl->items[++w] = dl->items[++r];
    tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS);

    dl->sorted = dpl_setlen(dl, w);
    txn->tw.dirtyroom += spilled_entries;
    txn->tw.dirtylist->pages_including_loose -= spilled_npages;
    tASSERT(txn, dpl_check(txn));

    /* NOTE(review): on the `failed` path this appears to rely on the iov
     * context being empty, otherwise rc from spill_page() would be
     * overwritten by iov_write() - confirm against iov_page(). */
    if (!iov_empty(&ctx)) {
      tASSERT(txn, rc == MDBX_SUCCESS);
      rc = iov_write(&ctx);
    }
    if (unlikely(rc != MDBX_SUCCESS))
      goto bailout;

    txn->env->lck->unsynced_pages.weak += spilled_npages;
    /* Slots of the spilled list hold page numbers shifted left by one bit,
     * hence the limit passed here is shifted too. */
    pnl_sort(txn->tw.spilled.list, (size_t)txn->geo.first_unallocated << 1);
    txn->flags |= MDBX_TXN_SPILLS;
    NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", spilled_entries, spilled_npages,
           txn->tw.dirtyroom);
  } else {
    /* Nothing is spillable: only log every entry for diagnostics. */
    tASSERT(txn, rc == MDBX_SUCCESS);
    for (size_t i = 1; i <= dl->length; ++i) {
      page_t *dp = dl->items[i].ptr;
      VERBOSE("unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, dp->pgno, dpl_npages(dl, i),
              dp->flags, dpl_age(txn, i), spill_prio(txn, i, reciprocal));
    }
  }

#if xMDBX_DEBUG_SPILLING == 2
  if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
    ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; "
          "needed %zu, spillable %zu; "
          "spilled %u dirty-entries, now have %zu dirty-room",
          dl->length + spilled_entries, dl->length,
          (txn->parent && txn->parent->tw.dirtylist) ? (intptr_t)txn->parent->tw.dirtylist->length : -1,
          txn->tw.loose_count, need, spillable_entries, spilled_entries, txn->tw.dirtyroom);
  ENSURE(txn->env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
#endif /* xMDBX_DEBUG_SPILLING */

/* Final verdict: success if the available room (dirty-room plus loose pages)
 * exceeds the requested amount, which is capped by CURSOR_STACK_SIZE. */
done:
  return likely(txn->tw.dirtyroom + txn->tw.loose_count > ((need > CURSOR_STACK_SIZE) ? CURSOR_STACK_SIZE : need))
             ? MDBX_SUCCESS
             : MDBX_TXN_FULL;
}