mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-06 18:44:13 +08:00
mdbx: rework copy-with-compactification.
Кроме небольшого рефакторинга здесь реализуется более регулярный способ обхода дерева при копировании с компактификаций. В частности, полная инициализация курсоров позволяет выполнять больше проверок/контроля структуры БД и избавиться от флажка CC_COPYING. Beside a small refactoring, a more regular way of traversing the tree when copying with compactification is implemented here. In particular, full initialization of cursors allows to perform more checks/control of the DB structure and get rid of the CC_COPYING flag.
This commit is contained in:
parent
2d300d807b
commit
1740f8227a
412
src/core.c
412
src/core.c
@ -19373,7 +19373,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
|
|||||||
/**** COPYING *****************************************************************/
|
/**** COPYING *****************************************************************/
|
||||||
|
|
||||||
/* State needed for a double-buffering compacting copy. */
|
/* State needed for a double-buffering compacting copy. */
|
||||||
typedef struct mdbx_copy {
|
typedef struct mdbx_compacting_ctx {
|
||||||
MDBX_env *mc_env;
|
MDBX_env *mc_env;
|
||||||
MDBX_txn *mc_txn;
|
MDBX_txn *mc_txn;
|
||||||
mdbx_condpair_t mc_condpair;
|
mdbx_condpair_t mc_condpair;
|
||||||
@ -19388,39 +19388,39 @@ typedef struct mdbx_copy {
|
|||||||
pgno_t mc_next_pgno;
|
pgno_t mc_next_pgno;
|
||||||
volatile unsigned mc_head;
|
volatile unsigned mc_head;
|
||||||
volatile unsigned mc_tail;
|
volatile unsigned mc_tail;
|
||||||
} mdbx_copy;
|
} mdbx_compacting_ctx;
|
||||||
|
|
||||||
/* Dedicated writer thread for compacting copy. */
|
/* Dedicated writer thread for compacting copy. */
|
||||||
__cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) {
|
__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
|
||||||
mdbx_copy *my = arg;
|
mdbx_compacting_ctx *const ctx = arg;
|
||||||
|
|
||||||
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
|
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
|
||||||
sigset_t sigset;
|
sigset_t sigset;
|
||||||
sigemptyset(&sigset);
|
sigemptyset(&sigset);
|
||||||
sigaddset(&sigset, SIGPIPE);
|
sigaddset(&sigset, SIGPIPE);
|
||||||
my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
|
ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL);
|
||||||
#endif /* EPIPE */
|
#endif /* EPIPE */
|
||||||
|
|
||||||
mdbx_condpair_lock(&my->mc_condpair);
|
mdbx_condpair_lock(&ctx->mc_condpair);
|
||||||
while (!my->mc_error) {
|
while (!ctx->mc_error) {
|
||||||
while (my->mc_tail == my->mc_head && !my->mc_error) {
|
while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) {
|
||||||
int err = mdbx_condpair_wait(&my->mc_condpair, true);
|
int err = mdbx_condpair_wait(&ctx->mc_condpair, true);
|
||||||
if (err != MDBX_SUCCESS) {
|
if (err != MDBX_SUCCESS) {
|
||||||
my->mc_error = err;
|
ctx->mc_error = err;
|
||||||
goto bailout;
|
goto bailout;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
const unsigned toggle = my->mc_tail & 1;
|
const unsigned toggle = ctx->mc_tail & 1;
|
||||||
size_t wsize = my->mc_wlen[toggle];
|
size_t wsize = ctx->mc_wlen[toggle];
|
||||||
if (wsize == 0) {
|
if (wsize == 0) {
|
||||||
my->mc_tail += 1;
|
ctx->mc_tail += 1;
|
||||||
break /* EOF */;
|
break /* EOF */;
|
||||||
}
|
}
|
||||||
my->mc_wlen[toggle] = 0;
|
ctx->mc_wlen[toggle] = 0;
|
||||||
uint8_t *ptr = my->mc_wbuf[toggle];
|
uint8_t *ptr = ctx->mc_wbuf[toggle];
|
||||||
again:
|
again:
|
||||||
if (!my->mc_error) {
|
if (!ctx->mc_error) {
|
||||||
int err = mdbx_write(my->mc_fd, ptr, wsize);
|
int err = mdbx_write(ctx->mc_fd, ptr, wsize);
|
||||||
if (err != MDBX_SUCCESS) {
|
if (err != MDBX_SUCCESS) {
|
||||||
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
|
#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64))
|
||||||
if (err == EPIPE) {
|
if (err == EPIPE) {
|
||||||
@ -19430,128 +19430,118 @@ __cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) {
|
|||||||
sigwait(&sigset, &unused);
|
sigwait(&sigset, &unused);
|
||||||
}
|
}
|
||||||
#endif /* EPIPE */
|
#endif /* EPIPE */
|
||||||
my->mc_error = err;
|
ctx->mc_error = err;
|
||||||
goto bailout;
|
goto bailout;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If there's an overflow page tail, write it too */
|
/* If there's an overflow page tail, write it too */
|
||||||
wsize = my->mc_olen[toggle];
|
wsize = ctx->mc_olen[toggle];
|
||||||
if (wsize) {
|
if (wsize) {
|
||||||
my->mc_olen[toggle] = 0;
|
ctx->mc_olen[toggle] = 0;
|
||||||
ptr = my->mc_over[toggle];
|
ptr = ctx->mc_over[toggle];
|
||||||
goto again;
|
goto again;
|
||||||
}
|
}
|
||||||
my->mc_tail += 1;
|
ctx->mc_tail += 1;
|
||||||
mdbx_condpair_signal(&my->mc_condpair, false);
|
mdbx_condpair_signal(&ctx->mc_condpair, false);
|
||||||
}
|
}
|
||||||
bailout:
|
bailout:
|
||||||
mdbx_condpair_unlock(&my->mc_condpair);
|
mdbx_condpair_unlock(&ctx->mc_condpair);
|
||||||
return (THREAD_RESULT)0;
|
return (THREAD_RESULT)0;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */
|
/* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */
|
||||||
__cold static int mdbx_env_cthr_toggle(mdbx_copy *my) {
|
__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) {
|
||||||
mdbx_condpair_lock(&my->mc_condpair);
|
mdbx_condpair_lock(&ctx->mc_condpair);
|
||||||
mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error);
|
mdbx_assert(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error);
|
||||||
my->mc_head += 1;
|
ctx->mc_head += 1;
|
||||||
mdbx_condpair_signal(&my->mc_condpair, true);
|
mdbx_condpair_signal(&ctx->mc_condpair, true);
|
||||||
while (!my->mc_error &&
|
while (!ctx->mc_error &&
|
||||||
my->mc_head - my->mc_tail == 2 /* both buffers in use */) {
|
ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) {
|
||||||
int err = mdbx_condpair_wait(&my->mc_condpair, false);
|
int err = mdbx_condpair_wait(&ctx->mc_condpair, false);
|
||||||
if (err != MDBX_SUCCESS)
|
if (err != MDBX_SUCCESS)
|
||||||
my->mc_error = err;
|
ctx->mc_error = err;
|
||||||
}
|
}
|
||||||
mdbx_condpair_unlock(&my->mc_condpair);
|
mdbx_condpair_unlock(&ctx->mc_condpair);
|
||||||
return my->mc_error;
|
return ctx->mc_error;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Depth-first tree traversal for compacting copy. */
|
__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb);
|
||||||
__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg,
|
|
||||||
const unsigned hive_flags) {
|
|
||||||
MDBX_cursor_couple couple;
|
|
||||||
MDBX_page *copy;
|
|
||||||
|
|
||||||
/* Empty DB, nothing to do */
|
__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
|
||||||
if (*pg == P_INVALID)
|
MDBX_cursor *mc, pgno_t *root,
|
||||||
return MDBX_SUCCESS;
|
txnid_t parent_txnid) {
|
||||||
|
mc->mc_snum = 1;
|
||||||
memset(&couple, 0, sizeof(couple));
|
int rc = mdbx_page_get(mc, *root, &mc->mc_pg[0], parent_txnid);
|
||||||
couple.outer.mc_snum = 1;
|
|
||||||
couple.outer.mc_txn = my->mc_txn;
|
|
||||||
couple.outer.mc_checking = couple.inner.mx_cursor.mc_checking =
|
|
||||||
(hive_flags & MDBX_DUPFIXED)
|
|
||||||
? CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF | CC_LEAF2
|
|
||||||
: CC_PAGECHECK | CC_COPYING | CC_SKIPORD | CC_LEAF;
|
|
||||||
|
|
||||||
int rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0],
|
|
||||||
my->mc_txn->mt_txnid);
|
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST);
|
|
||||||
|
rc = mdbx_page_search_root(mc, nullptr, MDBX_PS_FIRST);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
/* Make cursor pages writable */
|
/* Make cursor pages writable */
|
||||||
char *const buf = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum));
|
char *const buf = mdbx_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum));
|
||||||
if (buf == NULL)
|
if (buf == NULL)
|
||||||
return MDBX_ENOMEM;
|
return MDBX_ENOMEM;
|
||||||
|
|
||||||
char *ptr = buf;
|
char *ptr = buf;
|
||||||
for (unsigned i = 0; i < couple.outer.mc_top; i++) {
|
for (unsigned i = 0; i < mc->mc_top; i++) {
|
||||||
mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i],
|
mdbx_page_copy((MDBX_page *)ptr, mc->mc_pg[i], ctx->mc_env->me_psize);
|
||||||
my->mc_env->me_psize);
|
mc->mc_pg[i] = (MDBX_page *)ptr;
|
||||||
couple.outer.mc_pg[i] = (MDBX_page *)ptr;
|
ptr += ctx->mc_env->me_psize;
|
||||||
ptr += my->mc_env->me_psize;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This is writable space for a leaf page. Usually not needed. */
|
/* This is writable space for a leaf page. Usually not needed. */
|
||||||
MDBX_page *const leaf = (MDBX_page *)ptr;
|
MDBX_page *const leaf = (MDBX_page *)ptr;
|
||||||
|
MDBX_page *copy;
|
||||||
|
|
||||||
while (couple.outer.mc_snum > 0) {
|
while (mc->mc_snum > 0) {
|
||||||
MDBX_page *mp = couple.outer.mc_pg[couple.outer.mc_top];
|
MDBX_page *mp = mc->mc_pg[mc->mc_top];
|
||||||
unsigned n = page_numkeys(mp);
|
unsigned n = page_numkeys(mp);
|
||||||
|
|
||||||
if (IS_LEAF(mp)) {
|
if (IS_LEAF(mp)) {
|
||||||
if (hive_flags == 0 /* may have nested F_SUBDATA or F_BIGDATA nodes */) {
|
if (!(mc->mc_flags &
|
||||||
|
C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) {
|
||||||
for (unsigned i = 0; i < n; i++) {
|
for (unsigned i = 0; i < n; i++) {
|
||||||
MDBX_node *node = page_node(mp, i);
|
MDBX_node *node = page_node(mp, i);
|
||||||
if (node_flags(node) & F_BIGDATA) {
|
if (node_flags(node) == F_BIGDATA) {
|
||||||
MDBX_page *osrc;
|
|
||||||
|
|
||||||
/* Need writable leaf */
|
/* Need writable leaf */
|
||||||
if (mp != leaf) {
|
if (mp != leaf) {
|
||||||
couple.outer.mc_pg[couple.outer.mc_top] = leaf;
|
mc->mc_pg[mc->mc_top] = leaf;
|
||||||
mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
|
mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize);
|
||||||
mp = leaf;
|
mp = leaf;
|
||||||
node = page_node(mp, i);
|
node = page_node(mp, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
const pgno_t pgno = node_largedata_pgno(node);
|
const pgno_t pgno = node_largedata_pgno(node);
|
||||||
poke_pgno(node_data(node), my->mc_next_pgno);
|
poke_pgno(node_data(node), ctx->mc_next_pgno);
|
||||||
rc = mdbx_page_get(&couple.outer, pgno, &osrc, mp->mp_txnid);
|
MDBX_page *osrc;
|
||||||
|
rc = mdbx_page_get(mc, pgno, &osrc, mp->mp_txnid);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
unsigned toggle = my->mc_head & 1;
|
|
||||||
if (my->mc_wlen[toggle] + my->mc_env->me_psize >
|
unsigned side = ctx->mc_head & 1;
|
||||||
((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
|
if (ctx->mc_wlen[side] + ctx->mc_env->me_psize >
|
||||||
rc = mdbx_env_cthr_toggle(my);
|
(size_t)MDBX_ENVCOPY_WRITEBUF) {
|
||||||
|
rc = compacting_toggle_write_buffers(ctx);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
toggle = my->mc_head & 1;
|
side = ctx->mc_head & 1;
|
||||||
}
|
}
|
||||||
copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
|
copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]);
|
||||||
memcpy(copy, osrc, my->mc_env->me_psize);
|
memcpy(copy, osrc, ctx->mc_env->me_psize);
|
||||||
copy->mp_pgno = my->mc_next_pgno;
|
copy->mp_pgno = ctx->mc_next_pgno;
|
||||||
my->mc_next_pgno += osrc->mp_pages;
|
ctx->mc_next_pgno += osrc->mp_pages;
|
||||||
my->mc_wlen[toggle] += my->mc_env->me_psize;
|
ctx->mc_wlen[side] += ctx->mc_env->me_psize;
|
||||||
|
|
||||||
if (osrc->mp_pages > 1) {
|
if (osrc->mp_pages > 1) {
|
||||||
my->mc_olen[toggle] = pgno2bytes(my->mc_env, osrc->mp_pages - 1);
|
ctx->mc_olen[side] = pgno2bytes(ctx->mc_env, osrc->mp_pages - 1);
|
||||||
my->mc_over[toggle] = (uint8_t *)osrc + my->mc_env->me_psize;
|
ctx->mc_over[side] = (uint8_t *)osrc + ctx->mc_env->me_psize;
|
||||||
rc = mdbx_env_cthr_toggle(my);
|
rc = compacting_toggle_write_buffers(ctx);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
toggle = my->mc_head & 1;
|
side = ctx->mc_head & 1;
|
||||||
}
|
}
|
||||||
} else if (node_flags(node) & F_SUBDATA) {
|
} else if (node_flags(node) & F_SUBDATA) {
|
||||||
if (!MDBX_DISABLE_VALIDATION &&
|
if (!MDBX_DISABLE_VALIDATION &&
|
||||||
@ -19562,77 +19552,84 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg,
|
|||||||
|
|
||||||
/* Need writable leaf */
|
/* Need writable leaf */
|
||||||
if (mp != leaf) {
|
if (mp != leaf) {
|
||||||
couple.outer.mc_pg[couple.outer.mc_top] = leaf;
|
mc->mc_pg[mc->mc_top] = leaf;
|
||||||
mdbx_page_copy(leaf, mp, my->mc_env->me_psize);
|
mdbx_page_copy(leaf, mp, ctx->mc_env->me_psize);
|
||||||
mp = leaf;
|
mp = leaf;
|
||||||
node = page_node(mp, i);
|
node = page_node(mp, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
MDBX_db db;
|
MDBX_db *nested = nullptr;
|
||||||
memcpy(&db, node_data(node), sizeof(MDBX_db));
|
if (node_flags(node) & F_DUPDATA) {
|
||||||
STATIC_ASSERT(F_DUPDATA == MDBX_DUPSORT);
|
rc = mdbx_xcursor_init1(mc, node, mp);
|
||||||
rc = mdbx_env_cwalk(my, &db.md_root,
|
if (likely(rc == MDBX_SUCCESS)) {
|
||||||
(node_flags(node) & F_DUPDATA)
|
nested = &mc->mc_xcursor->mx_db;
|
||||||
? MDBX_DUPSORT |
|
rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor,
|
||||||
(db.md_flags & MDBX_DUPFIXED)
|
&nested->md_root, mp->mp_txnid);
|
||||||
: 0);
|
}
|
||||||
|
} else {
|
||||||
|
MDBX_cursor_couple *couple =
|
||||||
|
container_of(mc, MDBX_cursor_couple, inner.mx_cursor);
|
||||||
|
nested = &couple->inner.mx_db;
|
||||||
|
memcpy(nested, node_data(node), sizeof(MDBX_db));
|
||||||
|
rc = compacting_walk_sdb(ctx, nested);
|
||||||
|
}
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
memcpy(node_data(node), &db, sizeof(MDBX_db));
|
memcpy(node_data(node), nested, sizeof(MDBX_db));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
couple.outer.mc_ki[couple.outer.mc_top]++;
|
mc->mc_ki[mc->mc_top]++;
|
||||||
if (couple.outer.mc_ki[couple.outer.mc_top] < n) {
|
if (mc->mc_ki[mc->mc_top] < n) {
|
||||||
again:;
|
again:;
|
||||||
const MDBX_node *node =
|
const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
|
||||||
page_node(mp, couple.outer.mc_ki[couple.outer.mc_top]);
|
if (unlikely(node_flags(node))) {
|
||||||
if (unlikely(node->mn_flags)) {
|
|
||||||
mdbx_error("unexpected type 0x%x of node #%u on page #%" PRIaPGNO,
|
mdbx_error("unexpected type 0x%x of node #%u on page #%" PRIaPGNO,
|
||||||
node->mn_flags, couple.outer.mc_ki[couple.outer.mc_top],
|
node_flags(node), mc->mc_ki[mc->mc_top],
|
||||||
couple.outer.mc_pg[couple.outer.mc_top]->mp_pgno);
|
mc->mc_pg[mc->mc_top]->mp_pgno);
|
||||||
rc = MDBX_CORRUPTED;
|
rc = MDBX_CORRUPTED;
|
||||||
goto done;
|
goto done;
|
||||||
}
|
}
|
||||||
rc = mdbx_page_get(&couple.outer, node_pgno(node), &mp, mp->mp_txnid);
|
rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
couple.outer.mc_top++;
|
mc->mc_top++;
|
||||||
couple.outer.mc_snum++;
|
mc->mc_snum++;
|
||||||
couple.outer.mc_ki[couple.outer.mc_top] = 0;
|
mc->mc_ki[mc->mc_top] = 0;
|
||||||
if (IS_BRANCH(mp)) {
|
if (IS_BRANCH(mp)) {
|
||||||
/* Whenever we advance to a sibling branch page,
|
/* Whenever we advance to a sibling branch page,
|
||||||
* we must proceed all the way down to its first leaf. */
|
* we must proceed all the way down to its first leaf. */
|
||||||
mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp,
|
mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize);
|
||||||
my->mc_env->me_psize);
|
|
||||||
goto again;
|
goto again;
|
||||||
} else
|
} else
|
||||||
couple.outer.mc_pg[couple.outer.mc_top] = mp;
|
mc->mc_pg[mc->mc_top] = mp;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
unsigned toggle = my->mc_head & 1;
|
|
||||||
if (my->mc_wlen[toggle] + my->mc_wlen[toggle] >
|
unsigned side = ctx->mc_head & 1;
|
||||||
((size_t)(MDBX_ENVCOPY_WRITEBUF))) {
|
if (ctx->mc_wlen[side] + ctx->mc_env->me_psize >
|
||||||
rc = mdbx_env_cthr_toggle(my);
|
(size_t)MDBX_ENVCOPY_WRITEBUF) {
|
||||||
|
rc = compacting_toggle_write_buffers(ctx);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
toggle = my->mc_head & 1;
|
side = ctx->mc_head & 1;
|
||||||
}
|
}
|
||||||
copy = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]);
|
copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]);
|
||||||
mdbx_page_copy(copy, mp, my->mc_env->me_psize);
|
mdbx_page_copy(copy, mp, ctx->mc_env->me_psize);
|
||||||
copy->mp_pgno = my->mc_next_pgno++;
|
copy->mp_pgno = ctx->mc_next_pgno++;
|
||||||
my->mc_wlen[toggle] += my->mc_env->me_psize;
|
ctx->mc_wlen[side] += ctx->mc_env->me_psize;
|
||||||
if (couple.outer.mc_top) {
|
|
||||||
|
if (mc->mc_top) {
|
||||||
/* Update parent if there is one */
|
/* Update parent if there is one */
|
||||||
node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1],
|
node_set_pgno(
|
||||||
couple.outer.mc_ki[couple.outer.mc_top - 1]),
|
page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]),
|
||||||
copy->mp_pgno);
|
copy->mp_pgno);
|
||||||
mdbx_cursor_pop(&couple.outer);
|
mdbx_cursor_pop(mc);
|
||||||
} else {
|
} else {
|
||||||
/* Otherwise we're done */
|
/* Otherwise we're done */
|
||||||
*pg = copy->mp_pgno;
|
*root = copy->mp_pgno;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -19641,7 +19638,25 @@ done:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
__cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
|
__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) {
|
||||||
|
if (unlikely(sdb->md_root == P_INVALID))
|
||||||
|
return MDBX_SUCCESS; /* empty db */
|
||||||
|
|
||||||
|
MDBX_cursor_couple couple;
|
||||||
|
MDBX_dbx dbx = {.md_klen_min = INT_MAX};
|
||||||
|
uint8_t dbistate = DBI_VALID | DBI_AUDITED;
|
||||||
|
int rc = mdbx_couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate);
|
||||||
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
|
return rc;
|
||||||
|
|
||||||
|
couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK;
|
||||||
|
couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK;
|
||||||
|
return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root,
|
||||||
|
sdb->md_mod_txnid ? sdb->md_mod_txnid
|
||||||
|
: ctx->mc_txn->mt_txnid);
|
||||||
|
}
|
||||||
|
|
||||||
|
__cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
|
||||||
/* Calculate filesize taking in account shrink/growing thresholds */
|
/* Calculate filesize taking in account shrink/growing thresholds */
|
||||||
if (meta->mm_geo.next != meta->mm_geo.now) {
|
if (meta->mm_geo.next != meta->mm_geo.now) {
|
||||||
meta->mm_geo.now = meta->mm_geo.next;
|
meta->mm_geo.now = meta->mm_geo.next;
|
||||||
@ -19665,7 +19680,7 @@ __cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Make resizeable */
|
/* Make resizeable */
|
||||||
__cold static void make_sizeable(MDBX_meta *meta) {
|
__cold static void meta_make_sizeable(MDBX_meta *meta) {
|
||||||
meta->mm_geo.lower = MIN_PAGENO;
|
meta->mm_geo.lower = MIN_PAGENO;
|
||||||
if (meta->mm_geo.grow_pv == 0) {
|
if (meta->mm_geo.grow_pv == 0) {
|
||||||
const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42;
|
const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42;
|
||||||
@ -19688,7 +19703,7 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
meta_set_txnid(env, meta, read_txn->mt_txnid);
|
meta_set_txnid(env, meta, read_txn->mt_txnid);
|
||||||
|
|
||||||
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
|
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
|
||||||
make_sizeable(meta);
|
meta_make_sizeable(meta);
|
||||||
|
|
||||||
/* copy canary sequences if present */
|
/* copy canary sequences if present */
|
||||||
if (read_txn->mt_canary.v) {
|
if (read_txn->mt_canary.v) {
|
||||||
@ -19696,67 +19711,96 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
meta->mm_canary.v = constmeta_txnid(env, meta);
|
meta->mm_canary.v = constmeta_txnid(env, meta);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Set metapage 1 with current main DB */
|
if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) {
|
||||||
pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root;
|
|
||||||
if ((new_root = root) == P_INVALID) {
|
|
||||||
/* When the DB is empty, handle it specially to
|
/* When the DB is empty, handle it specially to
|
||||||
* fix any breakage like page leaks from ITS#8174. */
|
* fix any breakage like page leaks from ITS#8174. */
|
||||||
meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags;
|
meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags;
|
||||||
compact_fixup_meta(env, meta);
|
compacting_fixup_meta(env, meta);
|
||||||
if (dest_is_pipe) {
|
if (dest_is_pipe) {
|
||||||
int rc = mdbx_write(fd, buffer, meta_bytes);
|
int rc = mdbx_write(fd, buffer, meta_bytes);
|
||||||
if (rc != MDBX_SUCCESS)
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
/* Count free pages + GC pages. Subtract from last_pg
|
/* Count free pages + GC pages. */
|
||||||
* to find the new last_pg, which also becomes the new root. */
|
|
||||||
pgno_t freecount = 0;
|
|
||||||
MDBX_cursor_couple couple;
|
MDBX_cursor_couple couple;
|
||||||
MDBX_val key, data;
|
|
||||||
|
|
||||||
int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI);
|
int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0)
|
pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages +
|
||||||
freecount += *(pgno_t *)data.iov_base;
|
read_txn->mt_dbs[FREE_DBI].md_leaf_pages +
|
||||||
|
read_txn->mt_dbs[FREE_DBI].md_overflow_pages;
|
||||||
|
MDBX_val key, data;
|
||||||
|
while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) ==
|
||||||
|
MDBX_SUCCESS) {
|
||||||
|
const MDBX_PNL pnl = data.iov_base;
|
||||||
|
if (unlikely(data.iov_len % sizeof(pgno_t) ||
|
||||||
|
data.iov_len < MDBX_PNL_SIZEOF(pnl) ||
|
||||||
|
!(mdbx_pnl_check(pnl, read_txn->mt_next_pgno))))
|
||||||
|
return MDBX_CORRUPTED;
|
||||||
|
gc += MDBX_PNL_SIZE(pnl);
|
||||||
|
}
|
||||||
if (unlikely(rc != MDBX_NOTFOUND))
|
if (unlikely(rc != MDBX_NOTFOUND))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages +
|
/* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. */
|
||||||
read_txn->mt_dbs[FREE_DBI].md_leaf_pages +
|
meta->mm_geo.next = read_txn->mt_next_pgno - gc;
|
||||||
read_txn->mt_dbs[FREE_DBI].md_overflow_pages;
|
/* Set with current main DB */
|
||||||
|
|
||||||
new_root = read_txn->mt_next_pgno - 1 - freecount;
|
|
||||||
meta->mm_geo.next = new_root + 1;
|
|
||||||
meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI];
|
meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI];
|
||||||
meta->mm_dbs[MAIN_DBI].md_root = new_root;
|
|
||||||
|
|
||||||
mdbx_copy ctx;
|
mdbx_compacting_ctx ctx;
|
||||||
memset(&ctx, 0, sizeof(ctx));
|
memset(&ctx, 0, sizeof(ctx));
|
||||||
rc = mdbx_condpair_init(&ctx.mc_condpair);
|
rc = mdbx_condpair_init(&ctx.mc_condpair);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
return rc;
|
return rc;
|
||||||
|
|
||||||
memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2);
|
memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF);
|
||||||
ctx.mc_wbuf[0] = data_buffer;
|
ctx.mc_wbuf[0] = data_buffer;
|
||||||
ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF));
|
ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF;
|
||||||
ctx.mc_next_pgno = NUM_METAS;
|
ctx.mc_next_pgno = NUM_METAS;
|
||||||
ctx.mc_env = env;
|
ctx.mc_env = env;
|
||||||
ctx.mc_fd = fd;
|
ctx.mc_fd = fd;
|
||||||
ctx.mc_txn = read_txn;
|
ctx.mc_txn = read_txn;
|
||||||
|
|
||||||
mdbx_thread_t thread;
|
mdbx_thread_t thread;
|
||||||
int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx);
|
int thread_err = mdbx_thread_create(&thread, compacting_write_thread, &ctx);
|
||||||
if (likely(thread_err == MDBX_SUCCESS)) {
|
if (likely(thread_err == MDBX_SUCCESS)) {
|
||||||
if (dest_is_pipe) {
|
if (dest_is_pipe) {
|
||||||
compact_fixup_meta(env, meta);
|
compacting_fixup_meta(env, meta);
|
||||||
rc = mdbx_write(fd, buffer, meta_bytes);
|
rc = mdbx_write(fd, buffer, meta_bytes);
|
||||||
}
|
}
|
||||||
if (rc == MDBX_SUCCESS)
|
if (likely(rc == MDBX_SUCCESS))
|
||||||
rc = mdbx_env_cwalk(&ctx, &root, 0);
|
rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]);
|
||||||
mdbx_env_cthr_toggle(&ctx);
|
if (ctx.mc_wlen[ctx.mc_head & 1])
|
||||||
mdbx_env_cthr_toggle(&ctx);
|
/* toggle to flush non-empty buffers */
|
||||||
|
compacting_toggle_write_buffers(&ctx);
|
||||||
|
|
||||||
|
if (likely(rc == MDBX_SUCCESS) &&
|
||||||
|
unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) {
|
||||||
|
if (ctx.mc_next_pgno > meta->mm_geo.next) {
|
||||||
|
mdbx_error(
|
||||||
|
"the source DB %s: post-compactification used pages %" PRIaPGNO
|
||||||
|
" %c expected %" PRIaPGNO,
|
||||||
|
"has double-used pages or other corruption", ctx.mc_next_pgno,
|
||||||
|
'>', meta->mm_geo.next);
|
||||||
|
rc = MDBX_CORRUPTED; /* corrupted DB */
|
||||||
|
}
|
||||||
|
if (ctx.mc_next_pgno < meta->mm_geo.next) {
|
||||||
|
mdbx_warning(
|
||||||
|
"the source DB %s: post-compactification used pages %" PRIaPGNO
|
||||||
|
" %c expected %" PRIaPGNO,
|
||||||
|
"has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next);
|
||||||
|
if (dest_is_pipe)
|
||||||
|
/* the root within already written meta-pages is wrong */
|
||||||
|
rc = MDBX_CORRUPTED;
|
||||||
|
}
|
||||||
|
/* fixup meta */
|
||||||
|
meta->mm_geo.next = ctx.mc_next_pgno;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* toggle with empty buffers to exit thread's loop */
|
||||||
|
mdbx_assert(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0);
|
||||||
|
compacting_toggle_write_buffers(&ctx);
|
||||||
thread_err = mdbx_thread_join(thread);
|
thread_err = mdbx_thread_join(thread);
|
||||||
mdbx_assert(env, (ctx.mc_tail == ctx.mc_head &&
|
mdbx_assert(env, (ctx.mc_tail == ctx.mc_head &&
|
||||||
ctx.mc_wlen[ctx.mc_head & 1] == 0) ||
|
ctx.mc_wlen[ctx.mc_head & 1] == 0) ||
|
||||||
@ -19769,32 +19813,8 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
return rc;
|
return rc;
|
||||||
if (unlikely(ctx.mc_error != MDBX_SUCCESS))
|
if (unlikely(ctx.mc_error != MDBX_SUCCESS))
|
||||||
return ctx.mc_error;
|
return ctx.mc_error;
|
||||||
|
if (!dest_is_pipe)
|
||||||
if (dest_is_pipe) {
|
compacting_fixup_meta(env, meta);
|
||||||
if (unlikely(root != new_root)) {
|
|
||||||
mdbx_error("post-compactification root %" PRIaPGNO
|
|
||||||
" NE expected %" PRIaPGNO
|
|
||||||
" (source DB corrupted or has a page leak(s))",
|
|
||||||
root, new_root);
|
|
||||||
return MDBX_CORRUPTED; /* page leak or corrupt DB */
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
if (unlikely(root > new_root)) {
|
|
||||||
mdbx_error("post-compactification root %" PRIaPGNO
|
|
||||||
" GT expected %" PRIaPGNO " (source DB corrupted)",
|
|
||||||
root, new_root);
|
|
||||||
return MDBX_CORRUPTED; /* page leak or corrupt DB */
|
|
||||||
}
|
|
||||||
if (unlikely(root < new_root)) {
|
|
||||||
mdbx_warning("post-compactification root %" PRIaPGNO
|
|
||||||
" LT expected %" PRIaPGNO " (page leak(s) in source DB)",
|
|
||||||
root, new_root);
|
|
||||||
/* fixup meta */
|
|
||||||
meta->mm_dbs[MAIN_DBI].md_root = root;
|
|
||||||
meta->mm_geo.next = root + 1;
|
|
||||||
}
|
|
||||||
compact_fixup_meta(env, meta);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Extend file if required */
|
/* Extend file if required */
|
||||||
@ -19804,12 +19824,11 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
return mdbx_ftruncate(fd, whole_size);
|
return mdbx_ftruncate(fd, whole_size);
|
||||||
|
|
||||||
const size_t used_size = pgno2bytes(env, meta->mm_geo.next);
|
const size_t used_size = pgno2bytes(env, meta->mm_geo.next);
|
||||||
memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
|
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
|
||||||
for (size_t offset = used_size; offset < whole_size;) {
|
for (size_t offset = used_size; offset < whole_size;) {
|
||||||
const size_t chunk =
|
const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
|
||||||
(((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
|
? (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||||
? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
|
: whole_size - offset;
|
||||||
: whole_size - offset;
|
|
||||||
/* copy to avoid EFAULT in case swapped-out */
|
/* copy to avoid EFAULT in case swapped-out */
|
||||||
int rc = mdbx_write(fd, data_buffer, chunk);
|
int rc = mdbx_write(fd, data_buffer, chunk);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
@ -19850,7 +19869,7 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
mdbx_txn_unlock(env);
|
mdbx_txn_unlock(env);
|
||||||
|
|
||||||
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
|
if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE)
|
||||||
make_sizeable(headcopy);
|
meta_make_sizeable(headcopy);
|
||||||
/* Update signature to steady */
|
/* Update signature to steady */
|
||||||
unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy));
|
unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy));
|
||||||
|
|
||||||
@ -19910,10 +19929,9 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
#endif /* MDBX_USE_COPYFILERANGE */
|
#endif /* MDBX_USE_COPYFILERANGE */
|
||||||
|
|
||||||
/* fallback to portable */
|
/* fallback to portable */
|
||||||
const size_t chunk =
|
const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset)
|
||||||
(((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset)
|
? (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||||
? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
|
: used_size - offset;
|
||||||
: used_size - offset;
|
|
||||||
/* copy to avoid EFAULT in case swapped-out */
|
/* copy to avoid EFAULT in case swapped-out */
|
||||||
memcpy(data_buffer, env->me_map + offset, chunk);
|
memcpy(data_buffer, env->me_map + offset, chunk);
|
||||||
rc = mdbx_write(fd, data_buffer, chunk);
|
rc = mdbx_write(fd, data_buffer, chunk);
|
||||||
@ -19925,12 +19943,12 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
|
|||||||
if (!dest_is_pipe)
|
if (!dest_is_pipe)
|
||||||
rc = mdbx_ftruncate(fd, whole_size);
|
rc = mdbx_ftruncate(fd, whole_size);
|
||||||
else {
|
else {
|
||||||
memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)));
|
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
|
||||||
for (size_t offset = used_size;
|
for (size_t offset = used_size;
|
||||||
rc == MDBX_SUCCESS && offset < whole_size;) {
|
rc == MDBX_SUCCESS && offset < whole_size;) {
|
||||||
const size_t chunk =
|
const size_t chunk =
|
||||||
(((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset)
|
((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset)
|
||||||
? ((size_t)(MDBX_ENVCOPY_WRITEBUF))
|
? (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||||
: whole_size - offset;
|
: whole_size - offset;
|
||||||
/* copy to avoid EFAULT in case swapped-out */
|
/* copy to avoid EFAULT in case swapped-out */
|
||||||
rc = mdbx_write(fd, data_buffer, chunk);
|
rc = mdbx_write(fd, data_buffer, chunk);
|
||||||
@ -19961,8 +19979,8 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd,
|
|||||||
const size_t buffer_size =
|
const size_t buffer_size =
|
||||||
pgno_align2os_bytes(env, NUM_METAS) +
|
pgno_align2os_bytes(env, NUM_METAS) +
|
||||||
ceil_powerof2(((flags & MDBX_CP_COMPACT)
|
ceil_powerof2(((flags & MDBX_CP_COMPACT)
|
||||||
? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2
|
? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF
|
||||||
: ((size_t)(MDBX_ENVCOPY_WRITEBUF))),
|
: (size_t)MDBX_ENVCOPY_WRITEBUF),
|
||||||
env->me_os_psize);
|
env->me_os_psize);
|
||||||
|
|
||||||
uint8_t *buffer = NULL;
|
uint8_t *buffer = NULL;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user