/// \copyright SPDX-License-Identifier: Apache-2.0 /// \note Please refer to the COPYRIGHT file for explanations license change, /// credits and acknowledgments. /// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 #include "internals.h" typedef struct compacting_context { MDBX_env *env; MDBX_txn *txn; MDBX_copy_flags_t flags; pgno_t first_unallocated; osal_condpair_t condpair; volatile unsigned head; volatile unsigned tail; uint8_t *write_buf[2]; size_t write_len[2]; /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, expects atomic int. */ volatile int error; mdbx_filehandle_t fd; } ctx_t; __cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree); /* Dedicated writer thread for compacting copy. */ __cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { ctx_t *const ctx = arg; #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) sigset_t sigset; sigemptyset(&sigset); sigaddset(&sigset, SIGPIPE); ctx->error = pthread_sigmask(SIG_BLOCK, &sigset, nullptr); #endif /* EPIPE */ osal_condpair_lock(&ctx->condpair); while (!ctx->error) { while (ctx->tail == ctx->head && !ctx->error) { int err = osal_condpair_wait(&ctx->condpair, true); if (err != MDBX_SUCCESS) { ctx->error = err; goto bailout; } } const unsigned toggle = ctx->tail & 1; size_t wsize = ctx->write_len[toggle]; if (wsize == 0) { ctx->tail += 1; break /* EOF */; } ctx->write_len[toggle] = 0; uint8_t *ptr = ctx->write_buf[toggle]; if (!ctx->error) { int err = osal_write(ctx->fd, ptr, wsize); if (err != MDBX_SUCCESS) { #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) if (err == EPIPE) { /* Collect the pending SIGPIPE, * otherwise at least OS X gives it to the process on thread-exit. */ int unused; sigwait(&sigset, &unused); } #endif /* EPIPE */ ctx->error = err; goto bailout; } } ctx->tail += 1; osal_condpair_signal(&ctx->condpair, false); } bailout: osal_condpair_unlock(&ctx->condpair); return (THREAD_RESULT)0; } /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ __cold static int compacting_toggle_write_buffers(ctx_t *ctx) { osal_condpair_lock(&ctx->condpair); eASSERT(ctx->env, ctx->head - ctx->tail < 2 || ctx->error); ctx->head += 1; osal_condpair_signal(&ctx->condpair, true); while (!ctx->error && ctx->head - ctx->tail == 2 /* both buffers in use */) { if (ctx->flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(ctx->txn, false); int err = osal_condpair_wait(&ctx->condpair, false); if (err == MDBX_SUCCESS && (ctx->flags & MDBX_CP_THROTTLE_MVCC) != 0) err = mdbx_txn_unpark(ctx->txn, false); if (err != MDBX_SUCCESS) ctx->error = err; } osal_condpair_unlock(&ctx->condpair); return ctx->error; } static int compacting_put_bytes(ctx_t *ctx, const void *src, size_t bytes, pgno_t pgno, pgno_t npages) { assert(pgno == 0 || bytes > PAGEHDRSZ); while (bytes > 0) { const size_t side = ctx->head & 1; const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->write_len[side]; if (left < (pgno ? PAGEHDRSZ : 1)) { int err = compacting_toggle_write_buffers(ctx); if (unlikely(err != MDBX_SUCCESS)) return err; continue; } const size_t chunk = (bytes < left) ? bytes : left; void *const dst = ctx->write_buf[side] + ctx->write_len[side]; if (src) { memcpy(dst, src, chunk); if (pgno) { assert(chunk > PAGEHDRSZ); page_t *mp = dst; mp->pgno = pgno; if (mp->txnid == 0) mp->txnid = ctx->txn->txnid; if (mp->flags == P_LARGE) { assert(bytes <= pgno2bytes(ctx->env, npages)); mp->pages = npages; } pgno = 0; } src = ptr_disp(src, chunk); } else memset(dst, 0, chunk); bytes -= chunk; ctx->write_len[side] += chunk; } return MDBX_SUCCESS; } static int compacting_put_page(ctx_t *ctx, const page_t *mp, const size_t head_bytes, const size_t tail_bytes, const pgno_t npages) { if (tail_bytes) { assert(head_bytes + tail_bytes <= ctx->env->ps); assert(npages == 1 && (page_type(mp) == P_BRANCH || page_type(mp) == P_LEAF)); } else { assert(head_bytes <= pgno2bytes(ctx->env, npages)); assert((npages == 1 && page_type(mp) == (P_LEAF | P_DUPFIX)) || page_type(mp) == P_LARGE); } const pgno_t pgno = ctx->first_unallocated; ctx->first_unallocated += npages; int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); if (unlikely(err != MDBX_SUCCESS)) return err; err = compacting_put_bytes( ctx, nullptr, pgno2bytes(ctx->env, npages) - (head_bytes + tail_bytes), 0, 0); if (unlikely(err != MDBX_SUCCESS)) return err; return compacting_put_bytes(ctx, ptr_disp(mp, ctx->env->ps - tail_bytes), tail_bytes, 0, 0); } __cold static int compacting_walk(ctx_t *ctx, MDBX_cursor *mc, pgno_t *const parent_pgno, txnid_t parent_txnid) { mc->top = 0; mc->ki[0] = 0; int rc = page_get(mc, *parent_pgno, &mc->pg[0], parent_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; rc = tree_search_finalize(mc, nullptr, Z_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Make cursor pages writable */ const intptr_t deep_limit = mc->top + 1; void *const buf = osal_malloc(pgno2bytes(ctx->env, deep_limit + 1)); if (buf == nullptr) return MDBX_ENOMEM; void *ptr = buf; for (intptr_t i = 0; i <= mc->top; i++) { page_copy(ptr, mc->pg[i], ctx->env->ps); mc->pg[i] = ptr; ptr = ptr_disp(ptr, ctx->env->ps); } /* This is writable space for a leaf page. Usually not needed. */ page_t *const leaf = ptr; while (mc->top >= 0) { page_t *mp = mc->pg[mc->top]; const size_t nkeys = page_numkeys(mp); if (is_leaf(mp)) { if (!(mc->flags & z_inner) /* may have nested N_TREE or N_BIG nodes */) { for (size_t i = 0; i < nkeys; i++) { node_t *node = page_node(mp, i); if (node_flags(node) == N_BIG) { /* Need writable leaf */ if (mp != leaf) { mc->pg[mc->top] = leaf; page_copy(leaf, mp, ctx->env->ps); mp = leaf; node = page_node(mp, i); } const pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->txnid); if (unlikely((rc = lp.err) != MDBX_SUCCESS)) goto bailout; const size_t datasize = node_ds(node); const pgno_t npages = largechunk_npages(ctx->env, datasize); poke_pgno(node_data(node), ctx->first_unallocated); rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, npages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else if (node_flags(node) & N_TREE) { if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(tree_t))) { ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid dupsort sub-tree node size", (unsigned)node_ds(node)); rc = MDBX_CORRUPTED; goto bailout; } /* Need writable leaf */ if (mp != leaf) { mc->pg[mc->top] = leaf; page_copy(leaf, mp, ctx->env->ps); mp = leaf; node = page_node(mp, i); } tree_t *nested = nullptr; if (node_flags(node) & N_DUP) { rc = cursor_dupsort_setup(mc, node, mp); if (likely(rc == MDBX_SUCCESS)) { nested = &mc->subcur->nested_tree; rc = compacting_walk(ctx, &mc->subcur->cursor, &nested->root, mp->txnid); } } else { cASSERT(mc, (mc->flags & z_inner) == 0 && mc->subcur == 0); cursor_couple_t *couple = container_of(mc, cursor_couple_t, outer); nested = &couple->inner.nested_tree; memcpy(nested, node_data(node), sizeof(tree_t)); rc = compacting_walk_tree(ctx, nested); } if (unlikely(rc != MDBX_SUCCESS)) goto bailout; memcpy(node_data(node), nested, sizeof(tree_t)); } } } } else { mc->ki[mc->top]++; if (mc->ki[mc->top] < nkeys) { for (;;) { const node_t *node = page_node(mp, mc->ki[mc->top]); rc = page_get(mc, node_pgno(node), &mp, mp->txnid); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; mc->top += 1; if (unlikely(mc->top >= deep_limit)) { rc = MDBX_CURSOR_FULL; goto bailout; } mc->ki[mc->top] = 0; if (!is_branch(mp)) { mc->pg[mc->top] = mp; break; } /* Whenever we advance to a sibling branch page, * we must proceed all the way down to its first leaf. */ page_copy(mc->pg[mc->top], mp, ctx->env->ps); } continue; } } const pgno_t pgno = ctx->first_unallocated; if (likely(!is_dupfix_leaf(mp))) { rc = compacting_put_page(ctx, mp, PAGEHDRSZ + mp->lower, ctx->env->ps - (PAGEHDRSZ + mp->upper), 1); } else { rc = compacting_put_page( ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->dupfix_ksize, 0, 1); } if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (mc->top) { /* Update parent if there is one */ node_set_pgno(page_node(mc->pg[mc->top - 1], mc->ki[mc->top - 1]), pgno); cursor_pop(mc); } else { /* Otherwise we're done */ *parent_pgno = pgno; break; } } bailout: osal_free(buf); return rc; } __cold static int compacting_walk_tree(ctx_t *ctx, tree_t *tree) { if (unlikely(tree->root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ cursor_couple_t couple; memset(&couple, 0, sizeof(couple)); couple.inner.cursor.signature = ~cur_signature_live; kvx_t kvx = {.clc = {.k = {.lmin = INT_MAX}, .v = {.lmin = INT_MAX}}}; int rc = cursor_init4walk(&couple, ctx->txn, tree, &kvx); if (unlikely(rc != MDBX_SUCCESS)) return rc; couple.outer.checking |= z_ignord | z_pagecheck; couple.inner.cursor.checking |= z_ignord | z_pagecheck; if (!tree->mod_txnid) tree->mod_txnid = ctx->txn->txnid; return compacting_walk(ctx, &couple.outer, &tree->root, tree->mod_txnid); } __cold static void compacting_fixup_meta(MDBX_env *env, meta_t *meta) { eASSERT(env, meta->trees.gc.mod_txnid || meta->trees.gc.root == P_INVALID); eASSERT(env, meta->trees.main.mod_txnid || meta->trees.main.root == P_INVALID); /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->geometry.first_unallocated != meta->geometry.now) { meta->geometry.now = meta->geometry.first_unallocated; const size_t aligner = pv2pages(meta->geometry.grow_pv ? meta->geometry.grow_pv : meta->geometry.shrink_pv); if (aligner) { const pgno_t aligned = pgno_align2os_pgno( env, meta->geometry.first_unallocated + aligner - meta->geometry.first_unallocated % aligner); meta->geometry.now = aligned; } } if (meta->geometry.now < meta->geometry.lower) meta->geometry.now = meta->geometry.lower; if (meta->geometry.now > meta->geometry.upper) meta->geometry.now = meta->geometry.upper; /* Update signature */ assert(meta->geometry.now >= meta->geometry.first_unallocated); meta_sign_as_steady(meta); } /* Make resizable */ __cold static void meta_make_sizeable(meta_t *meta) { meta->geometry.lower = MIN_PAGENO; if (meta->geometry.grow_pv == 0) { const pgno_t step = 1 + (meta->geometry.upper - meta->geometry.lower) / 42; meta->geometry.grow_pv = pages2pv(step); } if (meta->geometry.shrink_pv == 0) { const pgno_t step = pv2pages(meta->geometry.grow_pv) << 1; meta->geometry.shrink_pv = pages2pv(step); } } __cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *txn, mdbx_filehandle_t fd, uint8_t *buffer, const bool dest_is_pipe, const MDBX_copy_flags_t flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize); meta_t *const meta = meta_init_triplet(env, buffer); meta_set_txnid(env, meta, txn->txnid); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) meta_make_sizeable(meta); /* copy canary sequences if present */ if (txn->canary.v) { meta->canary = txn->canary; meta->canary.v = constmeta_txnid(meta); } if (txn->dbs[MAIN_DBI].root == P_INVALID) { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. */ meta->trees.main.flags = txn->dbs[MAIN_DBI].flags; compacting_fixup_meta(env, meta); if (dest_is_pipe) { if (flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(txn, false); int rc = osal_write(fd, buffer, meta_bytes); if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_THROTTLE_MVCC) != 0) rc = mdbx_txn_unpark(txn, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; } } else { /* Count free pages + GC pages. */ cursor_couple_t couple; int rc = cursor_init(&couple.outer, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; pgno_t gc_npages = txn->dbs[FREE_DBI].branch_pages + txn->dbs[FREE_DBI].leaf_pages + txn->dbs[FREE_DBI].large_pages; MDBX_val key, data; rc = outer_first(&couple.outer, &key, &data); while (rc == MDBX_SUCCESS) { const pnl_t pnl = data.iov_base; if (unlikely(data.iov_len % sizeof(pgno_t) || data.iov_len < MDBX_PNL_SIZEOF(pnl))) { ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-record length", data.iov_len); return MDBX_CORRUPTED; } if (unlikely(!pnl_check(pnl, txn->geo.first_unallocated))) { ERROR("%s/%d: %s", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid GC-record content"); return MDBX_CORRUPTED; } gc_npages += MDBX_PNL_GETSIZE(pnl); rc = outer_next(&couple.outer, &key, &data, MDBX_NEXT); } if (unlikely(rc != MDBX_NOTFOUND)) return rc; meta->geometry.first_unallocated = txn->geo.first_unallocated - gc_npages; meta->trees.main = txn->dbs[MAIN_DBI]; ctx_t ctx; memset(&ctx, 0, sizeof(ctx)); rc = osal_condpair_init(&ctx.condpair); if (unlikely(rc != MDBX_SUCCESS)) return rc; memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); ctx.write_buf[0] = data_buffer; ctx.write_buf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; ctx.first_unallocated = NUM_METAS; ctx.env = env; ctx.fd = fd; ctx.txn = txn; ctx.flags = flags; osal_thread_t thread; int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { if (dest_is_pipe) { if (!meta->trees.main.mod_txnid) meta->trees.main.mod_txnid = txn->txnid; compacting_fixup_meta(env, meta); if (flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(txn, false); rc = osal_write(fd, buffer, meta_bytes); if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_THROTTLE_MVCC) != 0) rc = mdbx_txn_unpark(txn, false); } if (likely(rc == MDBX_SUCCESS)) rc = compacting_walk_tree(&ctx, &meta->trees.main); if (ctx.write_len[ctx.head & 1]) /* toggle to flush non-empty buffers */ compacting_toggle_write_buffers(&ctx); if (likely(rc == MDBX_SUCCESS) && unlikely(meta->geometry.first_unallocated != ctx.first_unallocated)) { if (ctx.first_unallocated > meta->geometry.first_unallocated) { ERROR("the source DB %s: post-compactification used pages %" PRIaPGNO " %c expected %" PRIaPGNO, "has double-used pages or other corruption", ctx.first_unallocated, '>', meta->geometry.first_unallocated); rc = MDBX_CORRUPTED; /* corrupted DB */ } if (ctx.first_unallocated < meta->geometry.first_unallocated) { WARNING( "the source DB %s: post-compactification used pages %" PRIaPGNO " %c expected %" PRIaPGNO, "has page leak(s)", ctx.first_unallocated, '<', meta->geometry.first_unallocated); if (dest_is_pipe) /* the root within already written meta-pages is wrong */ rc = MDBX_CORRUPTED; } /* fixup meta */ meta->geometry.first_unallocated = ctx.first_unallocated; } /* toggle with empty buffers to exit thread's loop */ eASSERT(env, (ctx.write_len[ctx.head & 1]) == 0); compacting_toggle_write_buffers(&ctx); thread_err = osal_thread_join(thread); eASSERT(env, (ctx.tail == ctx.head && ctx.write_len[ctx.head & 1] == 0) || ctx.error); osal_condpair_destroy(&ctx.condpair); } if (unlikely(thread_err != MDBX_SUCCESS)) return thread_err; if (unlikely(rc != MDBX_SUCCESS)) return rc; if (unlikely(ctx.error != MDBX_SUCCESS)) return ctx.error; if (!dest_is_pipe) compacting_fixup_meta(env, meta); } if (flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(txn, false); /* Extend file if required */ if (meta->geometry.now != meta->geometry.first_unallocated) { const size_t whole_size = pgno2bytes(env, meta->geometry.now); if (!dest_is_pipe) return osal_ftruncate(fd, whole_size); const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated); memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; offset < whole_size;) { const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; int rc = osal_write(fd, data_buffer, chunk); if (unlikely(rc != MDBX_SUCCESS)) return rc; offset += chunk; } } return MDBX_SUCCESS; } //---------------------------------------------------------------------------- __cold static int copy_asis(MDBX_env *env, MDBX_txn *txn, mdbx_filehandle_t fd, uint8_t *buffer, const bool dest_is_pipe, const MDBX_copy_flags_t flags) { bool should_unlock = false; if ((txn->flags & MDBX_TXN_RDONLY) != 0 && (flags & MDBX_CP_RENEW_TXN) != 0) { /* Try temporarily block writers until we snapshot the meta pages */ int err = lck_txn_lock(env, true); if (likely(err == MDBX_SUCCESS)) should_unlock = true; else if (unlikely(err != MDBX_BUSY)) return err; } jitter4testing(false); int rc = MDBX_SUCCESS; const size_t meta_bytes = pgno2bytes(env, NUM_METAS); troika_t troika = meta_tap(env); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ retry_snap_meta: memcpy(buffer, env->dxb_mmap.base, meta_bytes); const meta_ptr_t recent = meta_recent(env, &troika); meta_t *headcopy = /* LY: get pointer to the snapshot copy */ ptr_disp(buffer, ptr_dist(recent.ptr_c, env->dxb_mmap.base)); jitter4testing(false); if (txn->flags & MDBX_TXN_RDONLY) { if (recent.txnid != txn->txnid) { if (flags & MDBX_CP_RENEW_TXN) rc = mdbx_txn_renew(txn); else { rc = MDBX_MVCC_RETARDED; for (size_t n = 0; n < NUM_METAS; ++n) { meta_t *const meta = page_meta(ptr_disp(buffer, pgno2bytes(env, n))); if (troika.txnid[n] == txn->txnid && ((/* is_steady */ (troika.fsm >> n) & 1) || rc != MDBX_SUCCESS)) { rc = MDBX_SUCCESS; headcopy = meta; } else if (troika.txnid[n] > txn->txnid) meta_set_txnid(env, meta, 0); } } } if (should_unlock) lck_txn_unlock(env); else { troika_t snap = meta_tap(env); if (memcmp(&troika, &snap, sizeof(troika_t)) && rc == MDBX_SUCCESS) { troika = snap; goto retry_snap_meta; } } } if (unlikely(rc != MDBX_SUCCESS)) return rc; if (txn->flags & MDBX_TXN_RDONLY) eASSERT(env, meta_txnid(headcopy) == txn->txnid); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) meta_make_sizeable(headcopy); /* Update signature to steady */ meta_sign_as_steady(headcopy); /* Copy the data */ const size_t whole_size = pgno_align2os_bytes(env, txn->geo.end_pgno); const size_t used_size = pgno2bytes(env, txn->geo.first_unallocated); jitter4testing(false); if (flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(txn, false); if (dest_is_pipe) rc = osal_write(fd, buffer, meta_bytes); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, globals.sys_pagesize); #if MDBX_USE_COPYFILERANGE static bool copyfilerange_unavailable; bool not_the_same_filesystem = false; struct statfs statfs_info; if (fstatfs(fd, &statfs_info) || statfs_info.f_type == /* ECRYPTFS_SUPER_MAGIC */ 0xf15f) /* avoid use copyfilerange_unavailable() to ecryptfs due bugs */ not_the_same_filesystem = true; #endif /* MDBX_USE_COPYFILERANGE */ for (size_t offset = meta_bytes; rc == MDBX_SUCCESS && offset < used_size;) { if (flags & MDBX_CP_THROTTLE_MVCC) { rc = mdbx_txn_unpark(txn, false); if (unlikely(rc != MDBX_SUCCESS)) break; } #if MDBX_USE_SENDFILE static bool sendfile_unavailable; if (dest_is_pipe && likely(!sendfile_unavailable)) { off_t in_offset = offset; const ssize_t written = sendfile(fd, env->lazy_fd, &in_offset, used_size - offset); if (likely(written > 0)) { offset = in_offset; if (flags & MDBX_CP_THROTTLE_MVCC) rc = mdbx_txn_park(txn, false); continue; } rc = MDBX_ENODATA; if (written == 0 || ignore_enosys(rc = errno) != MDBX_RESULT_TRUE) break; sendfile_unavailable = true; } #endif /* MDBX_USE_SENDFILE */ #if MDBX_USE_COPYFILERANGE if (!dest_is_pipe && !not_the_same_filesystem && likely(!copyfilerange_unavailable)) { off_t in_offset = offset, out_offset = offset; ssize_t bytes_copied = copy_file_range( env->lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0); if (likely(bytes_copied > 0)) { offset = in_offset; if (flags & MDBX_CP_THROTTLE_MVCC) rc = mdbx_txn_park(txn, false); continue; } rc = MDBX_ENODATA; if (bytes_copied == 0) break; rc = errno; if (rc == EXDEV || rc == /* workaround for ecryptfs bug(s), maybe useful for others FS */ EINVAL) not_the_same_filesystem = true; else if (ignore_enosys(rc) == MDBX_RESULT_TRUE) copyfilerange_unavailable = true; else break; } #endif /* MDBX_USE_COPYFILERANGE */ /* fallback to portable */ const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ memcpy(data_buffer, ptr_disp(env->dxb_mmap.base, offset), chunk); if (flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(txn, false); rc = osal_write(fd, data_buffer, chunk); offset += chunk; } /* Extend file if required */ if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { if (!dest_is_pipe) rc = osal_ftruncate(fd, whole_size); else { memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) { const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) ? (size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; rc = osal_write(fd, data_buffer, chunk); offset += chunk; } } } return rc; } //---------------------------------------------------------------------------- __cold static int copy2fd(MDBX_txn *txn, mdbx_filehandle_t fd, MDBX_copy_flags_t flags) { if (unlikely(txn->flags & MDBX_TXN_DIRTY)) return MDBX_BAD_TXN; int rc = MDBX_SUCCESS; if (txn->flags & MDBX_TXN_RDONLY) { if (flags & MDBX_CP_THROTTLE_MVCC) { rc = mdbx_txn_park(txn, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; } } else if (unlikely(flags & (MDBX_CP_THROTTLE_MVCC | MDBX_CP_RENEW_TXN))) return MDBX_EINVAL; const int dest_is_pipe = osal_is_pipe(fd); if (MDBX_IS_ERROR(dest_is_pipe)) return dest_is_pipe; if (!dest_is_pipe) { rc = osal_fseek(fd, 0); if (unlikely(rc != MDBX_SUCCESS)) return rc; } MDBX_env *const env = txn->env; const size_t buffer_size = pgno_align2os_bytes(env, NUM_METAS) + ceil_powerof2(((flags & MDBX_CP_COMPACT) ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF : (size_t)MDBX_ENVCOPY_WRITEBUF), globals.sys_pagesize); uint8_t *buffer = nullptr; rc = osal_memalign_alloc(globals.sys_pagesize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (!dest_is_pipe) { /* Firstly write a stub to meta-pages. * Now we sure to incomplete copy will not be used. */ memset(buffer, -1, pgno2bytes(env, NUM_METAS)); rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); } if (likely(rc == MDBX_SUCCESS)) rc = mdbx_txn_unpark(txn, false); if (likely(rc == MDBX_SUCCESS)) { memset(buffer, 0, pgno2bytes(env, NUM_METAS)); rc = ((flags & MDBX_CP_COMPACT) ? copy_with_compacting : copy_asis)( env, txn, fd, buffer, dest_is_pipe, flags); if (likely(rc == MDBX_SUCCESS)) rc = mdbx_txn_unpark(txn, false); } if (txn->flags & MDBX_TXN_RDONLY) { if (flags & MDBX_CP_THROTTLE_MVCC) mdbx_txn_park(txn, true); else if (flags & MDBX_CP_DISPOSE_TXN) mdbx_txn_reset(txn); } if (!dest_is_pipe) { if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_DONT_FLUSH) == 0) rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); /* Write actual meta */ if (likely(rc == MDBX_SUCCESS)) rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); if (likely(rc == MDBX_SUCCESS) && (flags & MDBX_CP_DONT_FLUSH) == 0) rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } osal_memalign_free(buffer); return rc; } __cold static int copy2pathname(MDBX_txn *txn, const pathchar_t *dest_path, MDBX_copy_flags_t flags) { if (unlikely(!dest_path || *dest_path == '\0')) return MDBX_EINVAL; /* The destination path must exist, but the destination file must not. * We don't want the OS to cache the writes, since the source data is * already in the OS cache. */ mdbx_filehandle_t newfd = INVALID_HANDLE_VALUE; int rc = osal_openfile(MDBX_OPEN_COPY, txn->env, dest_path, &newfd, #if defined(_WIN32) || defined(_WIN64) (mdbx_mode_t)-1 #else S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP #endif ); #if defined(_WIN32) || defined(_WIN64) /* no locking required since the file opened with ShareMode == 0 */ #else if (rc == MDBX_SUCCESS) { MDBX_STRUCT_FLOCK lock_op; memset(&lock_op, 0, sizeof(lock_op)); lock_op.l_type = F_WRLCK; lock_op.l_whence = SEEK_SET; lock_op.l_start = 0; lock_op.l_len = OFF_T_MAX; if (MDBX_FCNTL(newfd, MDBX_F_SETLK, &lock_op) #if (defined(__linux__) || defined(__gnu_linux__)) && defined(LOCK_EX) && \ (!defined(__ANDROID_API__) || __ANDROID_API__ >= 24) || flock(newfd, LOCK_EX | LOCK_NB) #endif /* Linux */ ) rc = errno; } #endif /* Windows / POSIX */ if (rc == MDBX_SUCCESS) rc = copy2fd(txn, newfd, flags); if (newfd != INVALID_HANDLE_VALUE) { int err = osal_closefile(newfd); if (rc == MDBX_SUCCESS && err != rc) rc = err; if (rc != MDBX_SUCCESS) (void)osal_removefile(dest_path); } return rc; } //---------------------------------------------------------------------------- __cold int mdbx_txn_copy2fd(MDBX_txn *txn, mdbx_filehandle_t fd, MDBX_copy_flags_t flags) { int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (likely(rc == MDBX_SUCCESS)) rc = copy2fd(txn, fd, flags); if (flags & MDBX_CP_DISPOSE_TXN) mdbx_txn_abort(txn); return LOG_IFERR(rc); } __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, MDBX_copy_flags_t flags) { if (unlikely(flags & (MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN))) return LOG_IFERR(MDBX_EINVAL); int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); MDBX_txn *txn = nullptr; rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); rc = copy2fd(txn, fd, flags | MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN); mdbx_txn_abort(txn); return LOG_IFERR(rc); } __cold int mdbx_txn_copy2pathname(MDBX_txn *txn, const char *dest_path, MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) wchar_t *dest_pathW = nullptr; int rc = osal_mb2w(dest_path, &dest_pathW); if (likely(rc == MDBX_SUCCESS)) { rc = mdbx_txn_copy2pathnameW(txn, dest_pathW, flags); osal_free(dest_pathW); } return LOG_IFERR(rc); } __cold int mdbx_txn_copy2pathnameW(MDBX_txn *txn, const wchar_t *dest_path, MDBX_copy_flags_t flags) { #endif /* Windows */ int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (likely(rc == MDBX_SUCCESS)) rc = copy2pathname(txn, dest_path, flags); if (flags & MDBX_CP_DISPOSE_TXN) mdbx_txn_abort(txn); return LOG_IFERR(rc); } __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { #if defined(_WIN32) || defined(_WIN64) wchar_t *dest_pathW = nullptr; int rc = osal_mb2w(dest_path, &dest_pathW); if (likely(rc == MDBX_SUCCESS)) { rc = mdbx_env_copyW(env, dest_pathW, flags); osal_free(dest_pathW); } return LOG_IFERR(rc); } __cold int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, MDBX_copy_flags_t flags) { #endif /* Windows */ if (unlikely(flags & (MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN))) return LOG_IFERR(MDBX_EINVAL); int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); MDBX_txn *txn = nullptr; rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); rc = copy2pathname(txn, dest_path, flags | MDBX_CP_DISPOSE_TXN | MDBX_CP_RENEW_TXN); mdbx_txn_abort(txn); return LOG_IFERR(rc); }