mirror of
https://github.com/isar/libmdbx.git
synced 2025-03-01 04:08:13 +08:00
mdbx: доработка rebalance()
ради уменьшения WAF.
После предыдущей серии доработок весной 2021 года функция `rebalance()` обеспечивала слияние малозаполненной страницы с менее заполненной соседней, одновременно пытаясь не вовлекать соседние страницы, если те еще не были скопированы/клонированы/изменены в текущей транзакции. В целом реализованная тактика представляется успешной. Однако при обновлении GC она иногда приводила к исчерпанию подготовленного резерва извлеченных из GC страниц. Это не является проблемой, если не считать вероятность срабатывания `assert(txn->mt_flags & MDBX_TXN_DRAINED_GC)` в отладочных сборках. Тем не менее, из этой ситуации можно сделать вывод, что поведение `rebalance()`, как минимум, может быть обогащено опцией уменьшения WAF ценой меньшей сбалансированности дерева. Технически при этом слияние выполняется преимущественно с грязной страницей, если на ней достаточно места и соседняя страница с другой стороны еще чистая. Соответствующая опция в `enum MDBX_option_t` будет добавлена чуть позже.
This commit is contained in:
parent
72e51ee370
commit
9480599afa
59
src/core.c
59
src/core.c
@ -10642,7 +10642,9 @@ static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) {
|
|||||||
const size_t for_all_before_touch = for_relist + for_tree_before_touch;
|
const size_t for_all_before_touch = for_relist + for_tree_before_touch;
|
||||||
const size_t for_all_after_touch = for_relist + for_tree_after_touch;
|
const size_t for_all_after_touch = for_relist + for_tree_after_touch;
|
||||||
|
|
||||||
if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch))
|
if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch) &&
|
||||||
|
(ctx->cursor.mc_snum == 0 ||
|
||||||
|
IS_MODIFIABLE(txn, ctx->cursor.mc_pg[ctx->cursor.mc_top])))
|
||||||
return MDBX_SUCCESS;
|
return MDBX_SUCCESS;
|
||||||
|
|
||||||
TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, "
|
TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, "
|
||||||
@ -18867,6 +18869,7 @@ static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) {
|
|||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
MDBX_page *mp = mc->mc_pg[mc->mc_top];
|
MDBX_page *mp = mc->mc_pg[mc->mc_top];
|
||||||
|
cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, mp));
|
||||||
if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
|
if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) {
|
||||||
ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
|
ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor",
|
||||||
mp->mp_pgno, mp->mp_flags);
|
mp->mp_pgno, mp->mp_flags);
|
||||||
@ -20386,7 +20389,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
|
IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1]));
|
||||||
cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth ||
|
cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth ||
|
||||||
IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1]));
|
IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1]));
|
||||||
cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc));
|
cASSERT(cdst, csrc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance ||
|
||||||
|
page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc));
|
||||||
const int pagetype = PAGETYPE_WHOLE(psrc);
|
const int pagetype = PAGETYPE_WHOLE(psrc);
|
||||||
|
|
||||||
/* Move all nodes from src to dst */
|
/* Move all nodes from src to dst */
|
||||||
@ -20397,7 +20401,9 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
size_t j = dst_nkeys;
|
size_t j = dst_nkeys;
|
||||||
if (unlikely(pagetype & P_LEAF2)) {
|
if (unlikely(pagetype & P_LEAF2)) {
|
||||||
/* Mark dst as dirty. */
|
/* Mark dst as dirty. */
|
||||||
if (unlikely(rc = page_touch(cdst)))
|
rc = page_touch(cdst);
|
||||||
|
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
|
||||||
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
key.iov_len = csrc->mc_db->md_xsize;
|
key.iov_len = csrc->mc_db->md_xsize;
|
||||||
@ -20405,6 +20411,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
do {
|
do {
|
||||||
rc = node_add_leaf2(cdst, j++, &key);
|
rc = node_add_leaf2(cdst, j++, &key);
|
||||||
|
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
key.iov_base = ptr_disp(key.iov_base, key.iov_len);
|
key.iov_base = ptr_disp(key.iov_base, key.iov_len);
|
||||||
@ -20418,7 +20425,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
cursor_copy(csrc, &mn);
|
cursor_copy(csrc, &mn);
|
||||||
/* must find the lowest key below src */
|
/* must find the lowest key below src */
|
||||||
rc = page_search_lowest(&mn);
|
rc = page_search_lowest(&mn);
|
||||||
if (unlikely(rc))
|
cASSERT(csrc, rc != MDBX_RESULT_TRUE);
|
||||||
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
const MDBX_page *mp = mn.mc_pg[mn.mc_top];
|
const MDBX_page *mp = mn.mc_pg[mn.mc_top];
|
||||||
@ -20443,7 +20451,9 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Mark dst as dirty. */
|
/* Mark dst as dirty. */
|
||||||
if (unlikely(rc = page_touch(cdst)))
|
rc = page_touch(cdst);
|
||||||
|
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
|
||||||
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
size_t i = 0;
|
size_t i = 0;
|
||||||
@ -20457,6 +20467,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
cASSERT(csrc, node_flags(srcnode) == 0);
|
cASSERT(csrc, node_flags(srcnode) == 0);
|
||||||
rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode));
|
rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode));
|
||||||
}
|
}
|
||||||
|
cASSERT(cdst, rc != MDBX_RESULT_TRUE);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
@ -20483,7 +20494,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
if (csrc->mc_ki[csrc->mc_top] == 0) {
|
if (csrc->mc_ki[csrc->mc_top] == 0) {
|
||||||
const MDBX_val nullkey = {0, 0};
|
const MDBX_val nullkey = {0, 0};
|
||||||
rc = update_key(csrc, &nullkey);
|
rc = update_key(csrc, &nullkey);
|
||||||
if (unlikely(rc)) {
|
cASSERT(csrc, rc != MDBX_RESULT_TRUE);
|
||||||
|
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||||
csrc->mc_top++;
|
csrc->mc_top++;
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -20518,7 +20530,8 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
rc = page_retire(csrc, (MDBX_page *)psrc);
|
rc = page_retire(csrc, (MDBX_page *)psrc);
|
||||||
if (unlikely(rc))
|
cASSERT(csrc, rc != MDBX_RESULT_TRUE);
|
||||||
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
cASSERT(cdst, cdst->mc_db->md_entries > 0);
|
cASSERT(cdst, cdst->mc_db->md_entries > 0);
|
||||||
@ -20531,7 +20544,7 @@ static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
|
|||||||
const uint16_t save_depth = cdst->mc_db->md_depth;
|
const uint16_t save_depth = cdst->mc_db->md_depth;
|
||||||
cursor_pop(cdst);
|
cursor_pop(cdst);
|
||||||
rc = rebalance(cdst);
|
rc = rebalance(cdst);
|
||||||
if (unlikely(rc))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
cASSERT(cdst, cdst->mc_db->md_entries > 0);
|
cASSERT(cdst, cdst->mc_db->md_entries > 0);
|
||||||
@ -20719,11 +20732,9 @@ static int rebalance(MDBX_cursor *mc) {
|
|||||||
mc->mc_snum = 0;
|
mc->mc_snum = 0;
|
||||||
mc->mc_top = 0;
|
mc->mc_top = 0;
|
||||||
mc->mc_flags &= ~C_INITIALIZED;
|
mc->mc_flags &= ~C_INITIALIZED;
|
||||||
|
return page_retire(mc, mp);
|
||||||
rc = page_retire(mc, mp);
|
}
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (IS_BRANCH(mp) && nkeys == 1) {
|
||||||
return rc;
|
|
||||||
} else if (IS_BRANCH(mp) && nkeys == 1) {
|
|
||||||
DEBUG("%s", "collapsing root page!");
|
DEBUG("%s", "collapsing root page!");
|
||||||
mc->mc_db->md_root = node_pgno(page_node(mp, 0));
|
mc->mc_db->md_root = node_pgno(page_node(mp, 0));
|
||||||
rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid);
|
rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid);
|
||||||
@ -20756,15 +20767,10 @@ static int rebalance(MDBX_cursor *mc) {
|
|||||||
PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype);
|
PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype);
|
||||||
cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth ||
|
cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth ||
|
||||||
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
|
IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1]));
|
||||||
|
return page_retire(mc, mp);
|
||||||
rc = page_retire(mc, mp);
|
|
||||||
if (likely(rc == MDBX_SUCCESS))
|
|
||||||
rc = page_touch(mc);
|
|
||||||
return rc;
|
|
||||||
} else {
|
|
||||||
DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)",
|
|
||||||
mp->mp_pgno, mp->mp_flags);
|
|
||||||
}
|
}
|
||||||
|
DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)",
|
||||||
|
mp->mp_pgno, mp->mp_flags);
|
||||||
return MDBX_SUCCESS;
|
return MDBX_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -20813,6 +20819,7 @@ static int rebalance(MDBX_cursor *mc) {
|
|||||||
const size_t right_nkeys = right ? page_numkeys(right) : 0;
|
const size_t right_nkeys = right ? page_numkeys(right) : 0;
|
||||||
bool involve = false;
|
bool involve = false;
|
||||||
retry:
|
retry:
|
||||||
|
cASSERT(mc, mc->mc_snum > 1);
|
||||||
if (left_room > room_threshold && left_room >= right_room &&
|
if (left_room > room_threshold && left_room >= right_room &&
|
||||||
(IS_MODIFIABLE(mc->mc_txn, left) || involve)) {
|
(IS_MODIFIABLE(mc->mc_txn, left) || involve)) {
|
||||||
/* try merge with left */
|
/* try merge with left */
|
||||||
@ -20884,7 +20891,15 @@ retry:
|
|||||||
return MDBX_SUCCESS;
|
return MDBX_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (likely(!involve)) {
|
if (mc->mc_txn->mt_env->me_options.prefer_waf_insteadof_balance &&
|
||||||
|
likely(room_threshold > 0)) {
|
||||||
|
room_threshold = 0;
|
||||||
|
goto retry;
|
||||||
|
}
|
||||||
|
if (likely(!involve) &&
|
||||||
|
(likely(mc->mc_dbi != FREE_DBI) || mc->mc_txn->tw.loose_pages ||
|
||||||
|
MDBX_PNL_GETSIZE(mc->mc_txn->tw.relist) || (mc->mc_flags & C_GCU) ||
|
||||||
|
(mc->mc_txn->mt_flags & MDBX_TXN_DRAINED_GC) || room_threshold)) {
|
||||||
involve = true;
|
involve = true;
|
||||||
goto retry;
|
goto retry;
|
||||||
}
|
}
|
||||||
|
@ -1437,6 +1437,8 @@ struct MDBX_env {
|
|||||||
unsigned writethrough_threshold;
|
unsigned writethrough_threshold;
|
||||||
#endif /* Windows */
|
#endif /* Windows */
|
||||||
bool prefault_write;
|
bool prefault_write;
|
||||||
|
bool prefer_waf_insteadof_balance; /* Strive to minimize WAF instead of
|
||||||
|
balancing page fullness */
|
||||||
union {
|
union {
|
||||||
unsigned all;
|
unsigned all;
|
||||||
/* tracks options with non-auto values but tuned by user */
|
/* tracks options with non-auto values but tuned by user */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user