mirror of
https://github.com/isar/libmdbx.git
synced 2024-10-30 11:29:19 +08:00
mdbx: refine copy-with-compactification to clear/zero unused gaps on DB pages.
This commit is contained in:
parent
6eefa05f3d
commit
b31b270ffd
151
src/core.c
151
src/core.c
@ -19368,9 +19368,7 @@ typedef struct mdbx_compacting_ctx {
|
|||||||
MDBX_txn *mc_txn;
|
MDBX_txn *mc_txn;
|
||||||
mdbx_condpair_t mc_condpair;
|
mdbx_condpair_t mc_condpair;
|
||||||
uint8_t *mc_wbuf[2];
|
uint8_t *mc_wbuf[2];
|
||||||
uint8_t *mc_over[2];
|
|
||||||
size_t mc_wlen[2];
|
size_t mc_wlen[2];
|
||||||
size_t mc_olen[2];
|
|
||||||
mdbx_filehandle_t mc_fd;
|
mdbx_filehandle_t mc_fd;
|
||||||
/* Error code. Never cleared if set. Both threads can set nonzero
|
/* Error code. Never cleared if set. Both threads can set nonzero
|
||||||
* to fail the copy. Not mutex-protected, MDBX expects atomic int. */
|
* to fail the copy. Not mutex-protected, MDBX expects atomic int. */
|
||||||
@ -19408,7 +19406,6 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
|
|||||||
}
|
}
|
||||||
ctx->mc_wlen[toggle] = 0;
|
ctx->mc_wlen[toggle] = 0;
|
||||||
uint8_t *ptr = ctx->mc_wbuf[toggle];
|
uint8_t *ptr = ctx->mc_wbuf[toggle];
|
||||||
again:
|
|
||||||
if (!ctx->mc_error) {
|
if (!ctx->mc_error) {
|
||||||
int err = mdbx_write(ctx->mc_fd, ptr, wsize);
|
int err = mdbx_write(ctx->mc_fd, ptr, wsize);
|
||||||
if (err != MDBX_SUCCESS) {
|
if (err != MDBX_SUCCESS) {
|
||||||
@ -19424,14 +19421,6 @@ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) {
|
|||||||
goto bailout;
|
goto bailout;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If there's an overflow page tail, write it too */
|
|
||||||
wsize = ctx->mc_olen[toggle];
|
|
||||||
if (wsize) {
|
|
||||||
ctx->mc_olen[toggle] = 0;
|
|
||||||
ptr = ctx->mc_over[toggle];
|
|
||||||
goto again;
|
|
||||||
}
|
|
||||||
ctx->mc_tail += 1;
|
ctx->mc_tail += 1;
|
||||||
mdbx_condpair_signal(&ctx->mc_condpair, false);
|
mdbx_condpair_signal(&ctx->mc_condpair, false);
|
||||||
}
|
}
|
||||||
@ -19458,6 +19447,69 @@ __cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) {
|
|||||||
|
|
||||||
__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb);
|
__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb);
|
||||||
|
|
||||||
|
/* Appends `bytes` bytes to the currently selected write buffer
 * (ctx->mc_wbuf[ctx->mc_head & 1]), calling
 * compacting_toggle_write_buffers() whenever that buffer lacks room.
 *
 *  src    - bytes to copy; when it is a null pointer the span is
 *           zero-filled instead of copied.
 *  bytes  - how many bytes to emit in total.
 *  pgno   - when non-zero, the first copied chunk is treated as starting
 *           with an MDBX_page header: its mp_pgno is overwritten with this
 *           value, and at least PAGEHDRSZ bytes of room are demanded before
 *           anything is copied.
 *  npages - stored into mp_pages of that header when its mp_flags is
 *           exactly P_OVERFLOW.
 *
 * Returns MDBX_SUCCESS, or the error reported by the buffer toggle. */
static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src,
                                size_t bytes, pgno_t pgno, pgno_t npages) {
  assert(pgno == 0 || bytes > PAGEHDRSZ);

  while (bytes != 0) {
    const unsigned active = ctx->mc_head & 1;
    const size_t room = (size_t)MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[active];
    const size_t min_room = pgno ? PAGEHDRSZ : 1;

    if (room < min_room) {
      /* Too little space remains: toggle the buffers, then re-read the
       * selected side and its free room on the next pass. */
      const int rc = compacting_toggle_write_buffers(ctx);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      continue;
    }

    const size_t portion = (room > bytes) ? bytes : room;
    void *const out = ctx->mc_wbuf[active] + ctx->mc_wlen[active];

    if (!src) {
      /* No source data: this span is a gap that must read back as zeros. */
      memset(out, 0, portion);
    } else {
      memcpy(out, src, portion);
      if (pgno != 0) {
        /* Patch the header of the copy, never the source page. */
        assert(portion > PAGEHDRSZ);
        MDBX_page *const hdr = out;
        hdr->mp_pgno = pgno;
        if (hdr->mp_flags == P_OVERFLOW) {
          assert(bytes <= pgno2bytes(ctx->mc_env, npages));
          hdr->mp_pages = npages;
        }
        /* The header is patched once; later chunks are plain payload. */
        pgno = 0;
      }
      src = (const char *)src + portion;
    }

    ctx->mc_wlen[active] += portion;
    bytes -= portion;
  }

  return MDBX_SUCCESS;
}
|
||||||
|
|
||||||
|
static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp,
|
||||||
|
const size_t head_bytes, const size_t tail_bytes,
|
||||||
|
const pgno_t npages) {
|
||||||
|
if (tail_bytes) {
|
||||||
|
assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize);
|
||||||
|
assert(npages == 1 &&
|
||||||
|
(PAGETYPE_EXTRA(mp) == P_BRANCH || PAGETYPE_EXTRA(mp) == P_LEAF));
|
||||||
|
} else {
|
||||||
|
assert(head_bytes <= pgno2bytes(ctx->mc_env, npages));
|
||||||
|
assert((npages == 1 && PAGETYPE_EXTRA(mp) == (P_LEAF | P_LEAF2)) ||
|
||||||
|
PAGETYPE_EXTRA(mp) == P_OVERFLOW);
|
||||||
|
}
|
||||||
|
|
||||||
|
const pgno_t pgno = ctx->mc_next_pgno;
|
||||||
|
ctx->mc_next_pgno += npages;
|
||||||
|
int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages);
|
||||||
|
if (unlikely(err != MDBX_SUCCESS))
|
||||||
|
return err;
|
||||||
|
err = compacting_put_bytes(
|
||||||
|
ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes),
|
||||||
|
0, 0);
|
||||||
|
if (unlikely(err != MDBX_SUCCESS))
|
||||||
|
return err;
|
||||||
|
return compacting_put_bytes(
|
||||||
|
ctx, (const char *)mp + ctx->mc_env->me_psize - tail_bytes, tail_bytes, 0,
|
||||||
|
0);
|
||||||
|
}
|
||||||
|
|
||||||
__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
|
__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
|
||||||
MDBX_cursor *mc, pgno_t *root,
|
MDBX_cursor *mc, pgno_t *root,
|
||||||
txnid_t parent_txnid) {
|
txnid_t parent_txnid) {
|
||||||
@ -19481,10 +19533,8 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
|
|||||||
mc->mc_pg[i] = (MDBX_page *)ptr;
|
mc->mc_pg[i] = (MDBX_page *)ptr;
|
||||||
ptr += ctx->mc_env->me_psize;
|
ptr += ctx->mc_env->me_psize;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* This is writable space for a leaf page. Usually not needed. */
|
/* This is writable space for a leaf page. Usually not needed. */
|
||||||
MDBX_page *const leaf = (MDBX_page *)ptr;
|
MDBX_page *const leaf = (MDBX_page *)ptr;
|
||||||
MDBX_page *copy;
|
|
||||||
|
|
||||||
while (mc->mc_snum > 0) {
|
while (mc->mc_snum > 0) {
|
||||||
MDBX_page *mp = mc->mc_pg[mc->mc_top];
|
MDBX_page *mp = mc->mc_pg[mc->mc_top];
|
||||||
@ -19504,35 +19554,17 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
|
|||||||
node = page_node(mp, i);
|
node = page_node(mp, i);
|
||||||
}
|
}
|
||||||
|
|
||||||
const pgno_t pgno = node_largedata_pgno(node);
|
const struct page_result lp =
|
||||||
|
mdbx_page_get_ex(mc, node_largedata_pgno(node), mp->mp_txnid);
|
||||||
|
if (unlikely((rc = lp.err) != MDBX_SUCCESS))
|
||||||
|
goto done;
|
||||||
|
const size_t datasize = node_ds(node);
|
||||||
|
const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize);
|
||||||
poke_pgno(node_data(node), ctx->mc_next_pgno);
|
poke_pgno(node_data(node), ctx->mc_next_pgno);
|
||||||
MDBX_page *osrc;
|
rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0,
|
||||||
rc = mdbx_page_get(mc, pgno, &osrc, mp->mp_txnid);
|
npages);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
|
|
||||||
unsigned side = ctx->mc_head & 1;
|
|
||||||
if (ctx->mc_wlen[side] + ctx->mc_env->me_psize >
|
|
||||||
(size_t)MDBX_ENVCOPY_WRITEBUF) {
|
|
||||||
rc = compacting_toggle_write_buffers(ctx);
|
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
|
||||||
goto done;
|
|
||||||
side = ctx->mc_head & 1;
|
|
||||||
}
|
|
||||||
copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]);
|
|
||||||
memcpy(copy, osrc, ctx->mc_env->me_psize);
|
|
||||||
copy->mp_pgno = ctx->mc_next_pgno;
|
|
||||||
ctx->mc_next_pgno += osrc->mp_pages;
|
|
||||||
ctx->mc_wlen[side] += ctx->mc_env->me_psize;
|
|
||||||
|
|
||||||
if (osrc->mp_pages > 1) {
|
|
||||||
ctx->mc_olen[side] = pgno2bytes(ctx->mc_env, osrc->mp_pages - 1);
|
|
||||||
ctx->mc_over[side] = (uint8_t *)osrc + ctx->mc_env->me_psize;
|
|
||||||
rc = compacting_toggle_write_buffers(ctx);
|
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
|
||||||
goto done;
|
|
||||||
side = ctx->mc_head & 1;
|
|
||||||
}
|
|
||||||
} else if (node_flags(node) & F_SUBDATA) {
|
} else if (node_flags(node) & F_SUBDATA) {
|
||||||
if (!MDBX_DISABLE_VALIDATION &&
|
if (!MDBX_DISABLE_VALIDATION &&
|
||||||
unlikely(node_ds(node) != sizeof(MDBX_db))) {
|
unlikely(node_ds(node) != sizeof(MDBX_db))) {
|
||||||
@ -19572,54 +19604,47 @@ __cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx,
|
|||||||
} else {
|
} else {
|
||||||
mc->mc_ki[mc->mc_top]++;
|
mc->mc_ki[mc->mc_top]++;
|
||||||
if (mc->mc_ki[mc->mc_top] < n) {
|
if (mc->mc_ki[mc->mc_top] < n) {
|
||||||
again:;
|
while (1) {
|
||||||
const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
|
const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]);
|
||||||
if (unlikely(node_flags(node))) {
|
|
||||||
mdbx_error("unexpected type 0x%x of node #%u on page #%" PRIaPGNO,
|
|
||||||
node_flags(node), mc->mc_ki[mc->mc_top],
|
|
||||||
mc->mc_pg[mc->mc_top]->mp_pgno);
|
|
||||||
rc = MDBX_CORRUPTED;
|
|
||||||
goto done;
|
|
||||||
}
|
|
||||||
rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
|
rc = mdbx_page_get(mc, node_pgno(node), &mp, mp->mp_txnid);
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
mc->mc_top++;
|
mc->mc_top++;
|
||||||
mc->mc_snum++;
|
mc->mc_snum++;
|
||||||
mc->mc_ki[mc->mc_top] = 0;
|
mc->mc_ki[mc->mc_top] = 0;
|
||||||
if (IS_BRANCH(mp)) {
|
if (!IS_BRANCH(mp)) {
|
||||||
|
mc->mc_pg[mc->mc_top] = mp;
|
||||||
|
break;
|
||||||
|
}
|
||||||
/* Whenever we advance to a sibling branch page,
|
/* Whenever we advance to a sibling branch page,
|
||||||
* we must proceed all the way down to its first leaf. */
|
* we must proceed all the way down to its first leaf. */
|
||||||
mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize);
|
mdbx_page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize);
|
||||||
goto again;
|
}
|
||||||
} else
|
|
||||||
mc->mc_pg[mc->mc_top] = mp;
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned side = ctx->mc_head & 1;
|
const pgno_t pgno = ctx->mc_next_pgno;
|
||||||
if (ctx->mc_wlen[side] + ctx->mc_env->me_psize >
|
if (likely(!IS_LEAF2(mp))) {
|
||||||
(size_t)MDBX_ENVCOPY_WRITEBUF) {
|
rc = compacting_put_page(
|
||||||
rc = compacting_toggle_write_buffers(ctx);
|
ctx, mp, PAGEHDRSZ + mp->mp_lower,
|
||||||
|
ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1);
|
||||||
|
} else {
|
||||||
|
rc = compacting_put_page(
|
||||||
|
ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1);
|
||||||
|
}
|
||||||
if (unlikely(rc != MDBX_SUCCESS))
|
if (unlikely(rc != MDBX_SUCCESS))
|
||||||
goto done;
|
goto done;
|
||||||
side = ctx->mc_head & 1;
|
|
||||||
}
|
|
||||||
copy = (MDBX_page *)(ctx->mc_wbuf[side] + ctx->mc_wlen[side]);
|
|
||||||
mdbx_page_copy(copy, mp, ctx->mc_env->me_psize);
|
|
||||||
copy->mp_pgno = ctx->mc_next_pgno++;
|
|
||||||
ctx->mc_wlen[side] += ctx->mc_env->me_psize;
|
|
||||||
|
|
||||||
if (mc->mc_top) {
|
if (mc->mc_top) {
|
||||||
/* Update parent if there is one */
|
/* Update parent if there is one */
|
||||||
node_set_pgno(
|
node_set_pgno(
|
||||||
page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]),
|
page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]),
|
||||||
copy->mp_pgno);
|
pgno);
|
||||||
mdbx_cursor_pop(mc);
|
mdbx_cursor_pop(mc);
|
||||||
} else {
|
} else {
|
||||||
/* Otherwise we're done */
|
/* Otherwise we're done */
|
||||||
*root = copy->mp_pgno;
|
*root = pgno;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user