mirror of
https://github.com/isar/libmdbx.git
synced 2024-10-29 23:19:20 +08:00
mdbx: спиллинг грязных страниц с учетом их суммарного размера.
This commit is contained in:
parent
f6eec7195b
commit
940ef30659
2
TODO.md
2
TODO.md
@ -16,7 +16,6 @@ So currently most of the links are broken due to noted malicious ~~Github~~ sabo
|
||||
- [More flexible support of asynchronous runtime/framework(s)](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/200).
|
||||
- [Migration guide from LMDB to MDBX](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/199).
|
||||
- [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/193).
|
||||
- [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192).
|
||||
- [Support for RAW devices](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/124).
|
||||
- [Support MessagePack for Keys & Values](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/115).
|
||||
- [Engage new terminology](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/137).
|
||||
@ -27,3 +26,4 @@ Done
|
||||
|
||||
- [Simple careful mode for working with corrupted DB](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/223).
|
||||
- [Engage an "overlapped I/O" on Windows](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/224).
|
||||
- [Large/Overflow pages accounting for dirty-room](https://web.archive.org/web/20220414235959/https://github.com/erthink/libmdbx/issues/192).
|
||||
|
175
src/core.c
175
src/core.c
@ -4635,24 +4635,74 @@ static unsigned spill_prio(const MDBX_txn *txn, const size_t i,
|
||||
* parent txn. That would alter the parent txns' data even though
|
||||
* the child hasn't committed yet, and we'd have no way to undo it if
|
||||
* the child aborted. */
|
||||
static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const size_t need) {
|
||||
#if xMDBX_DEBUG_SPILLING != 1
|
||||
/* production mode */
|
||||
if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need))
|
||||
return MDBX_SUCCESS;
|
||||
size_t wanna_spill = need - txn->tw.dirtyroom;
|
||||
#else
|
||||
/* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */
|
||||
size_t wanna_spill =
|
||||
(need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1;
|
||||
#endif /* xMDBX_DEBUG_SPILLING */
|
||||
__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const intptr_t wanna_spill_entries,
|
||||
const intptr_t wanna_spill_npages,
|
||||
const size_t need);
|
||||
|
||||
static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const size_t need) {
|
||||
intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count;
|
||||
intptr_t wanna_spill_npages =
|
||||
need + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count -
|
||||
txn->mt_env->me_options.dp_limit;
|
||||
|
||||
/* production mode */
|
||||
if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1)
|
||||
#if xMDBX_DEBUG_SPILLING == 1
|
||||
/* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */
|
||||
&& txn->mt_txnid % 23 > 11
|
||||
#endif
|
||||
)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
return txn_spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages,
|
||||
need);
|
||||
}
|
||||
|
||||
static size_t spill_gate(const MDBX_env *env, intptr_t part,
|
||||
const size_t total) {
|
||||
const intptr_t spill_min =
|
||||
env->me_options.spill_min_denominator
|
||||
? (total + env->me_options.spill_min_denominator - 1) /
|
||||
env->me_options.spill_min_denominator
|
||||
: 1;
|
||||
const intptr_t spill_max =
|
||||
total - (env->me_options.spill_max_denominator
|
||||
? total / env->me_options.spill_max_denominator
|
||||
: 0);
|
||||
part = (part < spill_max) ? part : spill_max;
|
||||
part = (part > spill_min) ? part : spill_min;
|
||||
eASSERT(env, part > 0 && (size_t)part <= total);
|
||||
return (size_t)part;
|
||||
}
|
||||
|
||||
__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
const intptr_t wanna_spill_entries,
|
||||
const intptr_t wanna_spill_npages,
|
||||
const size_t need) {
|
||||
int rc = MDBX_SUCCESS;
|
||||
if (unlikely(txn->tw.dirtylist->length <= txn->tw.loose_count))
|
||||
goto done;
|
||||
|
||||
const size_t dirty_entries = txn->tw.dirtylist->length - txn->tw.loose_count;
|
||||
const size_t dirty_npages =
|
||||
txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count;
|
||||
const size_t need_spill_entries =
|
||||
spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries);
|
||||
const size_t need_spill_npages =
|
||||
spill_gate(txn->mt_env, wanna_spill_npages, dirty_npages);
|
||||
|
||||
const size_t need_spill = (need_spill_entries > need_spill_npages)
|
||||
? need_spill_entries
|
||||
: need_spill_npages;
|
||||
if (!need_spill)
|
||||
goto done;
|
||||
|
||||
#if !MDBX_AVOID_MSYNC
|
||||
if (txn->mt_flags & MDBX_WRITEMAP) {
|
||||
NOTICE("%s-spilling of %zu dirty-entries (have %zu dirty-room, need %zu)",
|
||||
"msync", wanna_spill, txn->tw.dirtyroom, need);
|
||||
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync",
|
||||
dirty_entries, dirty_npages);
|
||||
tASSERT(txn, txn->tw.spill_pages == nullptr);
|
||||
const MDBX_env *env = txn->mt_env;
|
||||
rc =
|
||||
@ -4671,27 +4721,15 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
}
|
||||
#endif /* MDBX_AVOID_MSYNC */
|
||||
|
||||
const size_t dirty = txn->tw.dirtylist->length;
|
||||
const size_t spill_min =
|
||||
txn->mt_env->me_options.spill_min_denominator
|
||||
? dirty / txn->mt_env->me_options.spill_min_denominator
|
||||
: 0;
|
||||
const size_t spill_max =
|
||||
dirty - (txn->mt_env->me_options.spill_max_denominator
|
||||
? dirty / txn->mt_env->me_options.spill_max_denominator
|
||||
: 0);
|
||||
wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min;
|
||||
wanna_spill = (wanna_spill < spill_max) ? wanna_spill : spill_max;
|
||||
if (!wanna_spill)
|
||||
return MDBX_SUCCESS;
|
||||
|
||||
NOTICE("%s-spilling %zu dirty-entries (have %zu dirty-room, need %zu)",
|
||||
"pwrite", wanna_spill, txn->tw.dirtyroom, need);
|
||||
tASSERT(txn, txn->tw.dirtylist->length >= wanna_spill);
|
||||
NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write",
|
||||
need_spill_entries, need_spill_npages);
|
||||
tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1);
|
||||
tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >=
|
||||
need_spill_npages);
|
||||
if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) {
|
||||
if (!txn->tw.spill_pages) {
|
||||
txn->tw.spill_least_removed = INT_MAX;
|
||||
txn->tw.spill_pages = pnl_alloc(wanna_spill);
|
||||
txn->tw.spill_pages = pnl_alloc(need_spill);
|
||||
if (unlikely(!txn->tw.spill_pages)) {
|
||||
rc = MDBX_ENOMEM;
|
||||
bailout:
|
||||
@ -4701,7 +4739,7 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
} else {
|
||||
/* purge deleted slots */
|
||||
spill_purge(txn);
|
||||
rc = pnl_reserve(&txn->tw.spill_pages, wanna_spill);
|
||||
rc = pnl_reserve(&txn->tw.spill_pages, need_spill);
|
||||
(void)rc /* ignore since the resulting list may be shorter
|
||||
and pnl_append() will increase pnl on demand */
|
||||
;
|
||||
@ -4758,48 +4796,63 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max);
|
||||
|
||||
/* half of 8-bit radix-sort */
|
||||
pgno_t radix_counters[256], spillable = 0;
|
||||
memset(&radix_counters, 0, sizeof(radix_counters));
|
||||
pgno_t radix_entries[256], radix_npages[256];
|
||||
memset(&radix_entries, 0, sizeof(radix_entries));
|
||||
memset(&radix_npages, 0, sizeof(radix_npages));
|
||||
size_t spillable_entries = 0, spillable_npages = 0;
|
||||
const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1);
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
unsigned prio = spill_prio(txn, i, reciprocal);
|
||||
const unsigned prio = spill_prio(txn, i, reciprocal);
|
||||
if (prio < 256) {
|
||||
radix_counters[prio] += 1;
|
||||
spillable += 1;
|
||||
radix_entries[prio] += 1;
|
||||
spillable_entries += 1;
|
||||
const pgno_t npages = dpl_npages(dl, i);
|
||||
radix_npages[prio] += npages;
|
||||
spillable_npages += npages;
|
||||
}
|
||||
}
|
||||
|
||||
if (likely(spillable > 0)) {
|
||||
size_t prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0];
|
||||
tASSERT(txn, spillable_npages >= spillable_entries);
|
||||
pgno_t spilled_entries = 0, spilled_npages = 0;
|
||||
if (likely(spillable_entries > 0)) {
|
||||
size_t prio2spill = 0, prio2adjacent = 128,
|
||||
amount_entries = radix_entries[0], amount_npages = radix_npages[0];
|
||||
for (size_t i = 1; i < 256; i++) {
|
||||
if (amount < wanna_spill) {
|
||||
if (amount_entries < need_spill_entries ||
|
||||
amount_npages < need_spill_npages) {
|
||||
prio2spill = i;
|
||||
prio2adjacent = i + (257 - i) / 2;
|
||||
amount += radix_counters[i];
|
||||
} else if (amount + amount < spillable + wanna_spill
|
||||
/* РАВНОЗНАЧНО: amount - wanna_spill < spillable - amount */) {
|
||||
amount_entries += radix_entries[i];
|
||||
amount_npages += radix_npages[i];
|
||||
} else if (amount_entries + amount_entries <
|
||||
spillable_entries + need_spill_entries
|
||||
/* РАВНОЗНАЧНО: amount - need_spill < spillable - amount */
|
||||
|| amount_npages + amount_npages <
|
||||
spillable_npages + need_spill_npages) {
|
||||
prio2adjacent = i;
|
||||
amount += radix_counters[i];
|
||||
amount_entries += radix_entries[i];
|
||||
amount_npages += radix_npages[i];
|
||||
} else
|
||||
break;
|
||||
}
|
||||
|
||||
VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %u,"
|
||||
" wanna-spill %zu, amount %zu",
|
||||
prio2spill, prio2adjacent, spillable, wanna_spill, amount);
|
||||
VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu,"
|
||||
" wanna-spill %zu/%zu, amount %zu/%zu",
|
||||
prio2spill, prio2adjacent, spillable_entries, spillable_npages,
|
||||
need_spill_entries, need_spill_npages, amount_entries,
|
||||
amount_npages);
|
||||
tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256);
|
||||
|
||||
iov_ctx_t ctx;
|
||||
rc = iov_init(txn, &ctx, amount,
|
||||
txn->tw.dirtylist->pages_including_loose -
|
||||
txn->tw.loose_count);
|
||||
rc = iov_init(txn, &ctx, amount_entries, amount_npages);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
|
||||
unsigned prev_prio = 256, prio;
|
||||
size_t r, w;
|
||||
pgno_t spilled_entries = 0, spilled_npages = 0;
|
||||
for (w = 0, r = 1; r <= dl->length && spilled_entries < wanna_spill;
|
||||
for (w = 0, r = 1;
|
||||
r <= dl->length && (spilled_entries < need_spill_entries ||
|
||||
spilled_npages < need_spill_npages);
|
||||
prev_prio = prio, ++r) {
|
||||
prio = spill_prio(txn, r, reciprocal);
|
||||
MDBX_page *const dp = dl->items[r].ptr;
|
||||
@ -4850,7 +4903,8 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
|
||||
VERBOSE("spilled entries %u, spilled npages %u", spilled_entries,
|
||||
spilled_npages);
|
||||
tASSERT(txn, spillable == 0 || spilled_entries > 0);
|
||||
tASSERT(txn, spillable_entries == 0 || spilled_entries > 0);
|
||||
tASSERT(txn, spilled_npages >= spilled_entries);
|
||||
|
||||
while (r <= dl->length)
|
||||
dl->items[++w] = dl->items[r++];
|
||||
@ -4872,13 +4926,13 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1);
|
||||
txn->mt_flags |= MDBX_TXN_SPILLS;
|
||||
}
|
||||
NOTICE("spilled %u dirty-entries, now have %zu dirty-room", spilled_entries,
|
||||
txn->tw.dirtyroom);
|
||||
NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room",
|
||||
spilled_entries, spilled_npages, txn->tw.dirtyroom);
|
||||
} else {
|
||||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||||
for (size_t i = 1; i <= dl->length; ++i) {
|
||||
MDBX_page *dp = dl->items[i].ptr;
|
||||
NOTICE(
|
||||
VERBOSE(
|
||||
"dirtylist[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u",
|
||||
i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i),
|
||||
spill_prio(txn, i, reciprocal));
|
||||
@ -4888,13 +4942,14 @@ static int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0,
|
||||
#if xMDBX_DEBUG_SPILLING == 2
|
||||
if (txn->tw.loose_count + txn->tw.dirtyroom <= need / 2 + 1)
|
||||
ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; "
|
||||
"needed %zu, spillable %u; "
|
||||
"needed %zu, spillable %zu; "
|
||||
"spilled %u dirty-entries, now have %zu dirty-room",
|
||||
dl->length + spilled, dl->length,
|
||||
dl->length + spilled_entries, dl->length,
|
||||
(txn->mt_parent && txn->mt_parent->tw.dirtylist)
|
||||
? (intptr_t)txn->mt_parent->tw.dirtylist->length
|
||||
: -1,
|
||||
txn->tw.loose_count, need, spillable, spilled, txn->tw.dirtyroom);
|
||||
txn->tw.loose_count, need, spillable_entries, spilled_entries,
|
||||
txn->tw.dirtyroom);
|
||||
ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2);
|
||||
#endif /* xMDBX_DEBUG_SPILLING */
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user