mdbx: rework copy-with-compactification.

Besides a small refactoring, this commit implements a more regular way of
traversing the tree when copying with compactification. In particular, full
initialization of the cursors allows performing more checks/validation of
the DB structure and getting rid of the CC_COPYING flag.
Леонид Юрьев (Leonid Yuriev) 2022-07-04 13:22:18 +03:00
parent 2d300d807b
commit 1740f8227a

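For orientation, the reworked code below sits behind the public copy API; a
minimal usage sketch follows (the database paths, the flag combination and the
error handling here are illustrative only, not part of this commit):

/* Make a compacted copy of an environment.
 * mdbx_env_copy() with MDBX_CP_COMPACT is the entry point that reaches
 * the compacting-copy code reworked in this commit; paths are examples. */
#include "mdbx.h"
#include <stdio.h>

int main(void) {
  MDBX_env *env = NULL;
  int rc = mdbx_env_create(&env);
  if (rc != MDBX_SUCCESS)
    return rc;
  rc = mdbx_env_open(env, "./source-db", MDBX_RDONLY, 0);
  if (rc == MDBX_SUCCESS)
    rc = mdbx_env_copy(env, "./compacted-copy",
                       MDBX_CP_COMPACT | MDBX_CP_FORCE_DYNAMIC_SIZE);
  if (rc != MDBX_SUCCESS)
    fprintf(stderr, "copy failed: %s\n", mdbx_strerror(rc));
  mdbx_env_close(env);
  return rc;
}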

@@ -19373,7 +19373,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
 /**** COPYING *****************************************************************/
 
 /* State needed for a double-buffering compacting copy. */
-typedef struct mdbx_copy {
+typedef struct mdbx_compacting_ctx {
   MDBX_env *mc_env;
   MDBX_txn *mc_txn;
   mdbx_condpair_t mc_condpair;
@@ -19388,39 +19388,39 @@ typedef struct mdbx_copy {
   pgno_t mc_next_pgno;
   volatile unsigned mc_head;
   volatile unsigned mc_tail;
-} mdbx_copy;
+} mdbx_compacting_ctx;
 
 /* Dedicated writer thread for compacting copy. */
-__cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) {
-  mdbx_copy *my = arg;
+__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
+  mdbx_compacting_ctx *const ctx = arg;
 
 #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
   sigset_t sigset;
   sigemptyset(&sigset);
   sigaddset(&sigset, SIGPIPE);
-  my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
+  ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
 #endif /* EPIPE */
 
-  mdbx_condpair_lock(&my->mc_condpair);
-  while (!my->mc_error) {
-    while (my->mc_tail == my->mc_head && !my->mc_error) {
-      int err = mdbx_condpair_wait(&my->mc_condpair, true);
+  mdbx_condpair_lock(&ctx->mc_condpair);
+  while (!ctx->mc_error) {
+    while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) {
+      int err = mdbx_condpair_wait(&ctx->mc_condpair, true);
       if (err != MDBX_SUCCESS) {
-        my->mc_error = err;
+        ctx->mc_error = err;
         goto bailout;
       }
     }
-    const unsigned toggle = my->mc_tail & 1;
-    size_t wsize = my->mc_wlen[toggle];
+    const unsigned toggle = ctx->mc_tail & 1;
+    size_t wsize = ctx->mc_wlen[toggle];
     if (wsize == 0) {
-      my->mc_tail += 1;
+      ctx->mc_tail += 1;
       break /* EOF */;
     }
-    my->mc_wlen[toggle] = 0;
-    uint8_t *ptr = my->mc_wbuf[toggle];
+    ctx->mc_wlen[toggle] = 0;
+    uint8_t *ptr = ctx->mc_wbuf[toggle];
   again:
-    if (!my->mc_error) {
-      int err = mdbx_write(my->mc_fd, ptr, wsize);
+    if (!ctx->mc_error) {
+      int err = mdbx_write(ctx->mc_fd, ptr, wsize);
       if (err != MDBX_SUCCESS) {
 #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
         if (err == EPIPE) {
@@ -19430,128 +19430,118 @@ __cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) {
           sigwait(&sigset, &unused);
         }
 #endif /* EPIPE */
-        my->mc_error = err;
+        ctx->mc_error = err;
         goto bailout;
       }
     }
 
     /* If there's an overflow page tail, write it too */
-    wsize = my->mc_olen[toggle];
+    wsize = ctx->mc_olen[toggle];
     if (wsize) {
-      my->mc_olen[toggle] = 0;
-      ptr = my->mc_over[toggle];
+      ctx->mc_olen[toggle] = 0;
+      ptr = ctx->mc_over[toggle];
       goto again;
     }
-    my->mc_tail += 1;
-    mdbx_condpair_signal(&my->mc_condpair, false);
+    ctx->mc_tail += 1;
+    mdbx_condpair_signal(&ctx->mc_condpair, false);
   }
 bailout:
-  mdbx_condpair_unlock(&my->mc_condpair);
+  mdbx_condpair_unlock(&ctx->mc_condpair);
   return (THREAD_RESULT)0;
 }
 
 /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */
-__cold static int mdbx_env_cthr_toggle(mdbx_copy *my) {
-  mdbx_condpair_lock(&my->mc_condpair);
-  mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error);
-  my->mc_head += 1;
-  mdbx_condpair_signal(&my->mc_condpair, true);
-  while (!my->mc_error &&
-         my->mc_head - my->mc_tail == 2 /* both buffers in use */) {
-    int err = mdbx_condpair_wait(&my->mc_condpair, false);
+__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) {
+  mdbx_condpair_lock(&ctx->mc_condpair);
+  mdbx_assert(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error);
+  ctx->mc_head += 1;
+  mdbx_condpair_signal(&ctx->mc_condpair, true);
+  while (!ctx->mc_error &&
+         ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) {
+    int err = mdbx_condpair_wait(&ctx->mc_condpair, false);
     if (err != MDBX_SUCCESS)
-      my->mc_error = err;
+      ctx->mc_error = err;
   }
-  mdbx_condpair_unlock(&my->mc_condpair);
-  return my->mc_error;
+  mdbx_condpair_unlock(&ctx->mc_condpair);
+  return ctx->mc_error;
 }
 
-/* Depth-first tree traversal for compacting copy. */
-__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg,
-                                 const unsigned hive_flags) {
-  MDBX_cursor_couple couple;
-  MDBX_page *copy;
-
-  /* Empty DB, nothing to do */
-  if (*pg == P_INVALID)
-    return MDBX_SUCCESS;
-
-  memset(&couple, 0, sizeof(couple));
-  couple.outer.mc_snum = 1;
-  couple.outer.mc_txn = my->mc_txn;
-  couple.outer.mc_checking = couple.inner.mx_cursor.mc_checking =
-      (hive_flags & MDBX_DUPFIXED)
-          ? CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF | CC_LEAF2
-          : CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF;
-  int rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0],
-                         my->mc_txn->mt_txnid);
+__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb);
+
+__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
+                                       MDBX_cursor *mc, pgno_t *root,
+                                       txnid_t parent_txnid) {
+  mc->mc_snum = 1;
+  int rc = mdbx_page_get(mc, *root, &mc->mc_pg[0], parent_txnid);
   if (unlikely(rc != MDBX_SUCCESS))
     return rc;
-  rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST);
+
+  rc = mdbx_page_search_root(mc, nullptr, MDBX_PS_FIRST);
   if (unlikely(rc != MDBX_SUCCESS))
     return rc;
 
   /* Make cursor pages writable */
-  char *const buf = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum));
+  char *const buf = mdbx_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum));
   if (buf == NULL)
     return MDBX_ENOMEM;
 
   char *ptr = buf;
-  for (unsigned i = 0; i < couple.outer.mc_top; i++) {
-    mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i],
-                   my->mc_env->me_psize);
-    couple.outer.mc_pg[i] = (MDBX_page *)ptr;
-    ptr += my->mc_env->me_psize;
+  for (unsigned i = 0; i < mc->mc_top; i++) {
+    mdbx_page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize);
+    mc->mc_pg[i] = (MDBX_page *)ptr;
+    ptr += ctx->mc_env->me_psize;
   }
 
   /* This is writable space for a leaf page. Usually not needed. */
   MDBX_page *const leaf = (MDBX_page *)ptr;
+  MDBX_page *copy;
 
-  while (couple.outer.mc_snum > 0) {
-    MDBX_page *mp = couple.outer.mc_pg[couple.outer.mc_top];
+  while (mc->mc_snum > 0) {
+    MDBX_page *mp = mc->mc_pg[mc->mc_top];
     unsigned n = page_numkeys(mp);
     if (IS_LEAF(mp)) {
-      if (hive_flags == 0 /* may have nested F_SUBDATA or F_BIGDATA nodes */) {
+      if (!(mc->mc_flags &
+            C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) {
         for (unsigned i = 0; i < n; i++) {
           MDBX_node *node = page_node(mp, i);
-          if (node_flags(node) & F_BIGDATA) {
-            MDBX_page *osrc;
+          if (node_flags(node) == F_BIGDATA) {
             /* Need writable leaf */
             if (mp != leaf) {
-              couple.outer.mc_pg[couple.outer.mc_top] = leaf;
-              mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
+              mc->mc_pg[mc->mc_top] = leaf;
+              mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize);
               mp = leaf;
               node = page_node(mp, i);
             }
 
             const pgno_t pgno = node_largedata_pgno(node);
-            poke_pgno(node_data(node), my->mc_next_pgno);
-            rc = mdbx_page_get(&couple.outer, pgno, &osrc, mp->mp_txnid);
+            poke_pgno(node_data(node), ctx->mc_next_pgno);
+            MDBX_page *osrc;
+            rc = mdbx_page_get(mc, pgno, &osrc, mp->mp_txnid);
             if (unlikely(rc != MDBX_SUCCESS))
               goto done;
 
-            unsigned toggle = my->mc_head & 1;
-            if (my->mc_wlen[toggle] + my->mc_env->me_psize >
-                ((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
-              rc = mdbx_env_cthr_toggle(my);
+            unsigned side = ctx->mc_head & 1;
+            if (ctx->mc_wlen[side] + ctx->mc_env->me_psize >
+                (size_t)MDBX_ENVCOPY_WRITEBUF) {
+              rc = compacting_toggle_write_buffers(ctx);
               if (unlikely(rc != MDBX_SUCCESS))
                 goto done;
-              toggle = my->mc_head & 1;
+              side = ctx->mc_head & 1;
             }
-            copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
-            memcpy(copy, osrc, my->mc_env->me_psize);
-            copy->mp_pgno = my->mc_next_pgno;
-            my->mc_next_pgno += osrc->mp_pages;
-            my->mc_wlen[toggle] += my->mc_env->me_psize;
+            copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]);
+            memcpy(copy, osrc, ctx->mc_env->me_psize);
+            copy->mp_pgno = ctx->mc_next_pgno;
+            ctx->mc_next_pgno += osrc->mp_pages;
+            ctx->mc_wlen[side] += ctx->mc_env->me_psize;
             if (osrc->mp_pages > 1) {
-              my->mc_olen[toggle] = pgno2bytes(my->mc_env, osrc->mp_pages - 1);
-              my->mc_over[toggle] = (uint8_t *)osrc + my->mc_env->me_psize;
-              rc = mdbx_env_cthr_toggle(my);
+              ctx->mc_olen[side] = pgno2bytes(ctx->mc_env, osrc->mp_pages - 1);
+              ctx->mc_over[side] = (uint8_t *)osrc + ctx->mc_env->me_psize;
+              rc = compacting_toggle_write_buffers(ctx);
               if (unlikely(rc != MDBX_SUCCESS))
                 goto done;
-              toggle = my->mc_head & 1;
+              side = ctx->mc_head & 1;
             }
           } else if (node_flags(node) & F_SUBDATA) {
             if (!MDBX_DISABLE_VALIDATION &&
@@ -19562,77 +19552,84 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg,
             /* Need writable leaf */
             if (mp != leaf) {
-              couple.outer.mc_pg[couple.outer.mc_top] = leaf;
-              mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
+              mc->mc_pg[mc->mc_top] = leaf;
+              mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize);
               mp = leaf;
               node = page_node(mp, i);
             }
 
-            MDBX_db db;
-            memcpy(&db, node_data(node), sizeof(MDBX_db));
-            STATIC_ASSERT(F_DUPDATA == MDBX_DUPSORT);
-            rc = mdbx_env_cwalk(my, &db.md_root,
-                                (node_flags(node) & F_DUPDATA)
-                                    ? MDBX_DUPSORT |
-                                          (db.md_flags & MDBX_DUPFIXED)
-                                    : 0);
+            MDBX_db *nested = nullptr;
+            if (node_flags(node) & F_DUPDATA) {
+              rc = mdbx_xcursor_init1(mc, node, mp);
+              if (likely(rc == MDBX_SUCCESS)) {
+                nested = &mc->mc_xcursor->mx_db;
+                rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor,
+                                          &nested->md_root, mp->mp_txnid);
+              }
+            } else {
+              MDBX_cursor_couple *couple =
+                  container_of(mc, MDBX_cursor_couple, inner.mx_cursor);
+              nested = &couple->inner.mx_db;
+              memcpy(nested, node_data(node), sizeof(MDBX_db));
+              rc = compacting_walk_sdb(ctx, nested);
+            }
             if (unlikely(rc != MDBX_SUCCESS))
               goto done;
-            memcpy(node_data(node), &db, sizeof(MDBX_db));
+            memcpy(node_data(node), nested, sizeof(MDBX_db));
           }
         }
       }
     } else {
-      couple.outer.mc_ki[couple.outer.mc_top]++;
-      if (couple.outer.mc_ki[couple.outer.mc_top] < n) {
+      mc->mc_ki[mc->mc_top]++;
+      if (mc->mc_ki[mc->mc_top] < n) {
       again:;
-        const MDBX_node *node =
-            page_node(mp, couple.outer.mc_ki[couple.outer.mc_top]);
-        if (unlikely(node->mn_flags)) {
+        const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
+        if (unlikely(node_flags(node))) {
           mdbx_error("unexpected type 0x%x of node #%u on page #%" PRIaPGNO,
-                     node->mn_flags, couple.outer.mc_ki[couple.outer.mc_top],
-                     couple.outer.mc_pg[couple.outer.mc_top]->mp_pgno);
+                     node_flags(node), mc->mc_ki[mc->mc_top],
+                     mc->mc_pg[mc->mc_top]->mp_pgno);
           rc = MDBX_CORRUPTED;
           goto done;
         }
-        rc = mdbx_page_get(&couple.outer, node_pgno(node), &mp, mp->mp_txnid);
+        rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
         if (unlikely(rc != MDBX_SUCCESS))
          goto done;
-        couple.outer.mc_top++;
-        couple.outer.mc_snum++;
-        couple.outer.mc_ki[couple.outer.mc_top] = 0;
+        mc->mc_top++;
+        mc->mc_snum++;
+        mc->mc_ki[mc->mc_top] = 0;
        if (IS_BRANCH(mp)) {
           /* Whenever we advance to a sibling branch page,
            * we must proceed all the way down to its first leaf. */
-          mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp,
-                         my->mc_env->me_psize);
+          mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize);
           goto again;
         } else
-          couple.outer.mc_pg[couple.outer.mc_top] = mp;
+          mc->mc_pg[mc->mc_top] = mp;
         continue;
       }
     }
 
-    unsigned toggle = my->mc_head & 1;
-    if (my->mc_wlen[toggle] + my->mc_wlen[toggle] >
-        ((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
-      rc = mdbx_env_cthr_toggle(my);
+    unsigned side = ctx->mc_head & 1;
+    if (ctx->mc_wlen[side] + ctx->mc_env->me_psize >
+        (size_t)MDBX_ENVCOPY_WRITEBUF) {
+      rc = compacting_toggle_write_buffers(ctx);
       if (unlikely(rc != MDBX_SUCCESS))
         goto done;
-      toggle = my->mc_head & 1;
+      side = ctx->mc_head & 1;
     }
-    copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
-    mdbx_page_copy(copy, mp, my->mc_env->me_psize);
-    copy->mp_pgno = my->mc_next_pgno++;
-    my->mc_wlen[toggle] += my->mc_env->me_psize;
-    if (couple.outer.mc_top) {
+    copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]);
+    mdbx_page_copy(copy, mp, ctx->mc_env->me_psize);
+    copy->mp_pgno = ctx->mc_next_pgno++;
+    ctx->mc_wlen[side] += ctx->mc_env->me_psize;
+    if (mc->mc_top) {
       /* Update parent if there is one */
-      node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1],
-                              couple.outer.mc_ki[couple.outer.mc_top - 1]),
-                    copy->mp_pgno);
-      mdbx_cursor_pop(&couple.outer);
+      node_set_pgno(
+          page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]),
+          copy->mp_pgno);
+      mdbx_cursor_pop(mc);
     } else {
       /* Otherwise we're done */
-      *pg = copy->mp_pgno;
+      *root = copy->mp_pgno;
       break;
     }
   }
@@ -19641,7 +19638,25 @@ done:
   return rc;
 }
 
-__cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
+__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) {
+  if (unlikely(sdb->md_root == P_INVALID))
+    return MDBX_SUCCESS; /* empty db */
+
+  MDBX_cursor_couple couple;
+  MDBX_dbx dbx = {.md_klen_min = INT_MAX};
+  uint8_t dbistate = DBI_VALID | DBI_AUDITED;
+  int rc = mdbx_couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate);
+  if (unlikely(rc != MDBX_SUCCESS))
+    return rc;
+
+  couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK;
+  couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK;
+  return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root,
+                              sdb->md_mod_txnid ? sdb->md_mod_txnid
+                                                : ctx->mc_txn->mt_txnid);
+}
+
+__cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
   /* Calculate filesize taking in account shrink/growing thresholds */
   if (meta->mm_geo.next != meta->mm_geo.now) {
     meta->mm_geo.now = meta->mm_geo.next;
@@ -19665,7 +19680,7 @@ __cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
 }
 
 /* Make resizeable */
-__cold static void make_sizeable(MDBX_meta *meta) {
+__cold static void meta_make_sizeable(MDBX_meta *meta) {
   meta->mm_geo.lower = MIN_PAGENO;
   if (meta->mm_geo.grow_pv == 0) {
     const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42;
@@ -19688,7 +19703,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
   meta_set_txnid(env, meta, read_txn->mt_txnid);
 
   if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
-    make_sizeable(meta);
+    meta_make_sizeable(meta);
 
   /* copy canary sequences if present */
   if (read_txn->mt_canary.v) {
@@ -19696,67 +19711,96 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
     meta->mm_canary.v = constmeta_txnid(env, meta);
   }
 
-  /* Set metapage 1 with current main DB */
-  pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root;
-  if ((new_root = root) == P_INVALID) {
+  if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) {
     /* When the DB is empty, handle it specially to
      * fix any breakage like page leaks from ITS#8174. */
     meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags;
-    compact_fixup_meta(env, meta);
+    compacting_fixup_meta(env, meta);
     if (dest_is_pipe) {
       int rc = mdbx_write(fd, buffer, meta_bytes);
-      if (rc != MDBX_SUCCESS)
+      if (unlikely(rc != MDBX_SUCCESS))
         return rc;
     }
   } else {
-    /* Count free pages + GC pages. Subtract from last_pg
-     * to find the new last_pg, which also becomes the new root. */
-    pgno_t freecount = 0;
+    /* Count free pages + GC pages. */
     MDBX_cursor_couple couple;
-    MDBX_val key, data;
     int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI);
     if (unlikely(rc != MDBX_SUCCESS))
       return rc;
-    while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0)
-      freecount += *(pgno_t *)data.iov_base;
+    pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages +
+                read_txn->mt_dbs[FREE_DBI].md_leaf_pages +
+                read_txn->mt_dbs[FREE_DBI].md_overflow_pages;
+    MDBX_val key, data;
+    while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) ==
+           MDBX_SUCCESS) {
+      const MDBX_PNL pnl = data.iov_base;
+      if (unlikely(data.iov_len % sizeof(pgno_t) ||
+                   data.iov_len < MDBX_PNL_SIZEOF(pnl) ||
+                   !(mdbx_pnl_check(pnl, read_txn->mt_next_pgno))))
+        return MDBX_CORRUPTED;
+      gc += MDBX_PNL_SIZE(pnl);
+    }
     if (unlikely(rc != MDBX_NOTFOUND))
       return rc;
 
-    freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages +
-                 read_txn->mt_dbs[FREE_DBI].md_leaf_pages +
-                 read_txn->mt_dbs[FREE_DBI].md_overflow_pages;
-    new_root = read_txn->mt_next_pgno - 1 - freecount;
-    meta->mm_geo.next = new_root + 1;
+    /* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. */
+    meta->mm_geo.next = read_txn->mt_next_pgno - gc;
+
+    /* Set with current main DB */
     meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI];
-    meta->mm_dbs[MAIN_DBI].md_root = new_root;
 
-    mdbx_copy ctx;
+    mdbx_compacting_ctx ctx;
     memset(&ctx, 0, sizeof(ctx));
     rc = mdbx_condpair_init(&ctx.mc_condpair);
     if (unlikely(rc != MDBX_SUCCESS))
       return rc;
 
-    memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2);
+    memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF);
     ctx.mc_wbuf[0] = data_buffer;
-    ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF));
+    ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF;
     ctx.mc_next_pgno = NUM_METAS;
     ctx.mc_env = env;
     ctx.mc_fd = fd;
     ctx.mc_txn = read_txn;
 
     mdbx_thread_t thread;
-    int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx);
+    int thread_err = mdbx_thread_create(&thread, compacting_write_thread, &ctx);
     if (likely(thread_err == MDBX_SUCCESS)) {
       if (dest_is_pipe) {
-        compact_fixup_meta(env, meta);
+        compacting_fixup_meta(env, meta);
         rc = mdbx_write(fd, buffer, meta_bytes);
       }
-      if (rc == MDBX_SUCCESS)
-        rc = mdbx_env_cwalk(&ctx, &root, 0);
-      mdbx_env_cthr_toggle(&ctx);
-      mdbx_env_cthr_toggle(&ctx);
+      if (likely(rc == MDBX_SUCCESS))
+        rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]);
+      if (ctx.mc_wlen[ctx.mc_head & 1])
+        /* toggle to flush non-empty buffers */
+        compacting_toggle_write_buffers(&ctx);
+
+      if (likely(rc == MDBX_SUCCESS) &&
+          unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) {
+        if (ctx.mc_next_pgno > meta->mm_geo.next) {
+          mdbx_error(
+              "the source DB %s: post-compactification used pages %" PRIaPGNO
+              " %c expected %" PRIaPGNO,
+              "has double-used pages or other corruption", ctx.mc_next_pgno,
+              '>', meta->mm_geo.next);
+          rc = MDBX_CORRUPTED; /* corrupted DB */
+        }
+        if (ctx.mc_next_pgno < meta->mm_geo.next) {
+          mdbx_warning(
+              "the source DB %s: post-compactification used pages %" PRIaPGNO
+              " %c expected %" PRIaPGNO,
+              "has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next);
+          if (dest_is_pipe)
+            /* the root within already written meta-pages is wrong */
+            rc = MDBX_CORRUPTED;
+        }
+        /* fixup meta */
+        meta->mm_geo.next = ctx.mc_next_pgno;
+      }
+
+      /* toggle with empty buffers to exit thread's loop */
+      mdbx_assert(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0);
+      compacting_toggle_write_buffers(&ctx);
       thread_err = mdbx_thread_join(thread);
       mdbx_assert(env, (ctx.mc_tail == ctx.mc_head &&
                         ctx.mc_wlen[ctx.mc_head & 1] == 0) ||
@@ -19769,32 +19813,8 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
       return rc;
     if (unlikely(ctx.mc_error != MDBX_SUCCESS))
       return ctx.mc_error;
-
-    if (dest_is_pipe) {
-      if (unlikely(root != new_root)) {
-        mdbx_error("post-compactification root %" PRIaPGNO
-                   " NE expected %" PRIaPGNO
-                   " (source DB corrupted or has a page leak(s))",
-                   root, new_root);
-        return MDBX_CORRUPTED; /* page leak or corrupt DB */
-      }
-    } else {
-      if (unlikely(root > new_root)) {
-        mdbx_error("post-compactification root %" PRIaPGNO
-                   " GT expected %" PRIaPGNO " (source DB corrupted)",
-                   root, new_root);
-        return MDBX_CORRUPTED; /* page leak or corrupt DB */
-      }
-      if (unlikely(root < new_root)) {
-        mdbx_warning("post-compactification root %" PRIaPGNO
-                     " LT expected %" PRIaPGNO " (page leak(s) in source DB)",
-                     root, new_root);
-        /* fixup meta */
-        meta->mm_dbs[MAIN_DBI].md_root = root;
-        meta->mm_geo.next = root + 1;
-      }
-      compact_fixup_meta(env, meta);
-    }
+    if (!dest_is_pipe)
+      compacting_fixup_meta(env, meta);
   }
 
   /* Extend file if required */
@@ -19804,12 +19824,11 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
     return mdbx_ftruncate(fd, whole_size);
 
   const size_t used_size = pgno2bytes(env, meta->mm_geo.next);
-  memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
+  memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
   for (size_t offset = used_size; offset < whole_size;) {
-    const size_t chunk =
-        (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
-            ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
-            : whole_size - offset;
+    const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
+                             ? (size_t)MDBX_ENVCOPY_WRITEBUF
+                             : whole_size - offset;
     /* copy to avoid EFAULT in case swapped-out */
     int rc = mdbx_write(fd, data_buffer, chunk);
     if (unlikely(rc != MDBX_SUCCESS))
@@ -19850,7 +19869,7 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
   mdbx_txn_unlock(env);
 
   if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
-    make_sizeable(headcopy);
+    meta_make_sizeable(headcopy);
 
   /* Update signature to steady */
   unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy));
@@ -19910,10 +19929,9 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
 #endif /* MDBX_USE_COPYFILERANGE */
 
     /* fallback to portable */
-    const size_t chunk =
-        (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset)
-            ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
-            : used_size - offset;
+    const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset)
+                             ? (size_t)MDBX_ENVCOPY_WRITEBUF
+                             : used_size - offset;
     /* copy to avoid EFAULT in case swapped-out */
     memcpy(data_buffer, env->me_map + offset, chunk);
     rc = mdbx_write(fd, data_buffer, chunk);
@@ -19925,12 +19943,12 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
   if (!dest_is_pipe)
     rc = mdbx_ftruncate(fd, whole_size);
   else {
-    memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
+    memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
     for (size_t offset = used_size;
          rc == MDBX_SUCCESS && offset < whole_size;) {
       const size_t chunk =
-          (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
-              ? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
+          ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
+              ? (size_t)MDBX_ENVCOPY_WRITEBUF
               : whole_size - offset;
       /* copy to avoid EFAULT in case swapped-out */
       rc = mdbx_write(fd, data_buffer, chunk);
@@ -19961,8 +19979,8 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
   const size_t buffer_size =
       pgno_align2os_bytes(env, NUM_METAS) +
       ceil_powerof2(((flags & MDBX_CP_COMPACT)
-                         ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2
-                         : ((size_t)(MDBX_ENVCOPY_WRITEBUF))),
+                         ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF
+                         : (size_t)MDBX_ENVCOPY_WRITEBUF),
                     env->me_os_psize);
 
   uint8_t *buffer = NULL;
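
The double-buffering hand-off implemented by compacting_write_thread() and
compacting_toggle_write_buffers() above can be hard to follow inside a diff.
Below is a stripped-down, self-contained sketch of the same idea using plain
pthreads (all names, the printf stand-in for mdbx_write() and the fixed buffer
size are illustrative, not libmdbx code): mc_head/mc_tail act as sequence
counters, "head & 1" selects the buffer the tree-walker currently fills, and
handing over an empty buffer signals EOF to the writer thread.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define BUF_SIZE 4096

static struct {
  pthread_mutex_t lock;
  pthread_cond_t changed;
  char buf[2][BUF_SIZE]; /* the two ping-pong buffers */
  size_t len[2];         /* bytes queued in each buffer */
  unsigned head, tail;   /* producer/consumer sequence numbers */
} q = {.lock = PTHREAD_MUTEX_INITIALIZER, .changed = PTHREAD_COND_INITIALIZER};

/* Consumer: flush queued buffers in order; an empty buffer means EOF,
 * mirroring the wsize == 0 check in compacting_write_thread(). */
static void *writer_thread(void *arg) {
  (void)arg;
  pthread_mutex_lock(&q.lock);
  for (;;) {
    while (q.tail == q.head) /* nothing queued yet */
      pthread_cond_wait(&q.changed, &q.lock);
    const unsigned side = q.tail & 1;
    const size_t wsize = q.len[side];
    if (wsize == 0) { /* EOF */
      q.tail += 1;
      break;
    }
    printf("flushing %zu bytes from buffer %u\n", wsize, side);
    q.len[side] = 0;
    q.tail += 1;
    pthread_cond_broadcast(&q.changed); /* a buffer became free */
  }
  pthread_mutex_unlock(&q.lock);
  return NULL;
}

/* Producer: queue the currently filled buffer and wait until at least one
 * buffer is free again, mirroring compacting_toggle_write_buffers(). */
static void toggle_buffers(void) {
  pthread_mutex_lock(&q.lock);
  q.head += 1;
  pthread_cond_broadcast(&q.changed);
  while (q.head - q.tail == 2 /* both buffers in use */)
    pthread_cond_wait(&q.changed, &q.lock);
  pthread_mutex_unlock(&q.lock);
}

int main(void) {
  pthread_t thread;
  if (pthread_create(&thread, NULL, writer_thread, NULL))
    return 1;
  for (int i = 0; i < 5; i++) {
    const unsigned side = q.head & 1; /* buffer owned by the producer */
    memset(q.buf[side], i, BUF_SIZE); /* stand-in for filling pages */
    q.len[side] = BUF_SIZE;
    toggle_buffers(); /* hand it to the writer, switch to the other one */
  }
  toggle_buffers(); /* current buffer is still empty: signals EOF */
  pthread_join(thread, NULL);
  return 0;
}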