From ba6df2bb6d2c8ddb133fb73ec6bf52a561682456 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 17 Dec 2024 17:47:45 +0300 Subject: [PATCH] =?UTF-8?q?mdbx:=20=D0=B2=D1=8B=D0=B4=D0=B5=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20API-=D1=84=D1=83=D0=BD=D0=BA=D1=86=D0=B8?= =?UTF-8?q?=D0=B9=20=D0=B2=20api-=D1=84=D0=B0=D0=B9=D0=BB=D1=8B.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CMakeLists.txt | 36 +- src/alloy.c | 16 +- src/{cold.c => api-cold.c} | 105 -- src/{copy.c => api-copy.c} | 0 src/api-dbi.c | 324 +++++ src/api-env.c | 105 ++ src/api-extra.c | 59 + src/{misc.c => api-misc.c} | 0 src/{env-opts.c => api-opts.c} | 0 ...{range-estimate.c => api-range-estimate.c} | 3 + src/api-txn-data.c | 449 ++++++ src/api-txn.c | 1217 +++++++++++------ src/dbi.c | 333 +---- src/dbi.h | 14 + src/internals.h | 5 - src/mvcc-readers.c | 137 -- src/proto.h | 3 + src/{tree.c => tree-ops.c} | 0 src/{page-search.c => tree-search.c} | 0 src/txn.c | 942 ++----------- 20 files changed, 1884 insertions(+), 1864 deletions(-) rename src/{cold.c => api-cold.c} (83%) rename src/{copy.c => api-copy.c} (100%) create mode 100644 src/api-dbi.c rename src/{misc.c => api-misc.c} (100%) rename src/{env-opts.c => api-opts.c} (100%) rename src/{range-estimate.c => api-range-estimate.c} (99%) create mode 100644 src/api-txn-data.c rename src/{tree.c => tree-ops.c} (100%) rename src/{page-search.c => tree-search.c} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc22f2c7..8f8d1b02 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -60,12 +60,18 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/README.md" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/mdbx.h++" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/CMakeLists.txt" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/alloy.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-cold.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-copy.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-cursor.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-dbi.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-env.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-extra.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-key-transform.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-misc.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-opts.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-range-estimate.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-txn-data.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-txn.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/atomics-ops.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/atomics-types.h" @@ -74,9 +80,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cogs.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cogs.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/coherency.c" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cold.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/config.h.in" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/copy.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cursor.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/cursor.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dbi.c" @@ -86,7 +90,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dpl.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dpl.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/dxb.c" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/env-opts.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/env.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/essentials.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/gc-get.c" @@ -111,7 +114,6 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mdbx.c++" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/meta.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/meta.h" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/misc.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/mvcc-readers.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/node.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/node.h" @@ -124,12 +126,11 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-iov.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-ops.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-ops.h" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/page-search.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree-search.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/pnl.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/pnl.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/preface.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/proto.h" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/range-estimate.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/refund.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/sort.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/spill.c" @@ -145,7 +146,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/stat.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/wingetopt.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tools/wingetopt.h" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree.c" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/tree-ops.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txl.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/txn.c" @@ -156,7 +157,8 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/walk.c" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/walk.h" AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.c" - AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.h") + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/windows-import.h" + AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/CMakeLists.txt") set(MDBX_AMALGAMATED_SOURCE FALSE) find_program(GIT git) if(NOT GIT) @@ -755,10 +757,17 @@ else() list( APPEND LIBMDBX_SOURCES + "${MDBX_SOURCE_DIR}/api-cold.c" + "${MDBX_SOURCE_DIR}/api-copy.c" "${MDBX_SOURCE_DIR}/api-cursor.c" + "${MDBX_SOURCE_DIR}/api-dbi.c" "${MDBX_SOURCE_DIR}/api-env.c" "${MDBX_SOURCE_DIR}/api-extra.c" "${MDBX_SOURCE_DIR}/api-key-transform.c" + "${MDBX_SOURCE_DIR}/api-misc.c" + "${MDBX_SOURCE_DIR}/api-opts.c" + "${MDBX_SOURCE_DIR}/api-range-estimate.c" + "${MDBX_SOURCE_DIR}/api-txn-data.c" "${MDBX_SOURCE_DIR}/api-txn.c" "${MDBX_SOURCE_DIR}/atomics-ops.h" "${MDBX_SOURCE_DIR}/atomics-types.h" @@ -767,8 +776,6 @@ else() "${MDBX_SOURCE_DIR}/cogs.c" "${MDBX_SOURCE_DIR}/cogs.h" "${MDBX_SOURCE_DIR}/coherency.c" - "${MDBX_SOURCE_DIR}/cold.c" - "${MDBX_SOURCE_DIR}/copy.c" "${MDBX_SOURCE_DIR}/cursor.c" "${MDBX_SOURCE_DIR}/cursor.h" "${MDBX_SOURCE_DIR}/dbi.c" @@ -776,7 +783,6 @@ else() "${MDBX_SOURCE_DIR}/dpl.c" "${MDBX_SOURCE_DIR}/dpl.h" "${MDBX_SOURCE_DIR}/dxb.c" - "${MDBX_SOURCE_DIR}/env-opts.c" "${MDBX_SOURCE_DIR}/env.c" "${MDBX_SOURCE_DIR}/essentials.h" "${MDBX_SOURCE_DIR}/gc-get.c" @@ -792,7 +798,6 @@ else() "${MDBX_SOURCE_DIR}/logging_and_debug.h" "${MDBX_SOURCE_DIR}/meta.c" "${MDBX_SOURCE_DIR}/meta.h" - "${MDBX_SOURCE_DIR}/misc.c" "${MDBX_SOURCE_DIR}/mvcc-readers.c" "${MDBX_SOURCE_DIR}/node.c" "${MDBX_SOURCE_DIR}/node.h" @@ -804,12 +809,11 @@ else() "${MDBX_SOURCE_DIR}/page-iov.h" "${MDBX_SOURCE_DIR}/page-ops.c" "${MDBX_SOURCE_DIR}/page-ops.h" - "${MDBX_SOURCE_DIR}/page-search.c" + "${MDBX_SOURCE_DIR}/tree-search.c" "${MDBX_SOURCE_DIR}/pnl.c" "${MDBX_SOURCE_DIR}/pnl.h" "${MDBX_SOURCE_DIR}/preface.h" "${MDBX_SOURCE_DIR}/proto.h" - "${MDBX_SOURCE_DIR}/range-estimate.c" "${MDBX_SOURCE_DIR}/refund.c" "${MDBX_SOURCE_DIR}/sort.h" "${MDBX_SOURCE_DIR}/spill.c" @@ -817,7 +821,7 @@ else() "${MDBX_SOURCE_DIR}/table.c" "${MDBX_SOURCE_DIR}/tls.c" "${MDBX_SOURCE_DIR}/tls.h" - "${MDBX_SOURCE_DIR}/tree.c" + "${MDBX_SOURCE_DIR}/tree-ops.c" "${MDBX_SOURCE_DIR}/txl.c" "${MDBX_SOURCE_DIR}/txl.h" "${MDBX_SOURCE_DIR}/txn.c" diff --git a/src/alloy.c b/src/alloy.c index f2cce532..076d39da 100644 --- a/src/alloy.c +++ b/src/alloy.c @@ -4,22 +4,26 @@ #define xMDBX_ALLOY 1 /* alloyed build */ #include "internals.h" /* must be included first */ +#include "api-cold.c" +#include "api-copy.c" #include "api-cursor.c" +#include "api-dbi.c" #include "api-env.c" #include "api-extra.c" #include "api-key-transform.c" +#include "api-misc.c" +#include "api-opts.c" +#include "api-range-estimate.c" +#include "api-txn-data.c" #include "api-txn.c" #include "audit.c" #include "chk.c" #include "cogs.c" #include "coherency.c" -#include "cold.c" -#include "copy.c" #include "cursor.c" #include "dbi.c" #include "dpl.c" #include "dxb.c" -#include "env-opts.c" #include "env.c" #include "gc-get.c" #include "gc-put.c" @@ -29,21 +33,19 @@ #include "lck.c" #include "logging_and_debug.c" #include "meta.c" -#include "misc.c" #include "mvcc-readers.c" #include "node.c" #include "osal.c" #include "page-get.c" #include "page-iov.c" #include "page-ops.c" -#include "page-search.c" #include "pnl.c" -#include "range-estimate.c" #include "refund.c" #include "spill.c" #include "table.c" #include "tls.c" -#include "tree.c" +#include "tree-ops.c" +#include "tree-search.c" #include "txl.c" #include "txn.c" #include "utils.c" diff --git a/src/cold.c b/src/api-cold.c similarity index 83% rename from src/cold.c rename to src/api-cold.c index 11260ace..dfa082db 100644 --- a/src/cold.c +++ b/src/api-cold.c @@ -128,111 +128,6 @@ __cold int mdbx_env_get_valsize4page_max(const MDBX_env *env, MDBX_db_flags_t fl /*----------------------------------------------------------------------------*/ -__cold static void stat_add(const tree_t *db, MDBX_stat *const st, const size_t bytes) { - st->ms_depth += db->height; - st->ms_branch_pages += db->branch_pages; - st->ms_leaf_pages += db->leaf_pages; - st->ms_overflow_pages += db->large_pages; - st->ms_entries += db->items; - if (likely(bytes >= offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) - st->ms_mod_txnid = (st->ms_mod_txnid > db->mod_txnid) ? st->ms_mod_txnid : db->mod_txnid; -} - -__cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { - memset(st, 0, bytes); - - int err = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(err != MDBX_SUCCESS)) - return err; - - cursor_couple_t cx; - err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); - if (unlikely(err != MDBX_SUCCESS)) - return err; - - const MDBX_env *const env = txn->env; - st->ms_psize = env->ps; - TXN_FOREACH_DBI_FROM(txn, dbi, - /* assuming GC is internal and not subject for accounting */ MAIN_DBI) { - if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) - stat_add(txn->dbs + dbi, st, bytes); - } - - if (!(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT) && txn->dbs[MAIN_DBI].items /* TODO: use `md_subs` field */) { - - /* scan and account not opened named tables */ - err = tree_search(&cx.outer, nullptr, Z_FIRST); - while (err == MDBX_SUCCESS) { - const page_t *mp = cx.outer.pg[cx.outer.top]; - for (size_t i = 0; i < page_numkeys(mp); i++) { - const node_t *node = page_node(mp, i); - if (node_flags(node) != N_TREE) - continue; - if (unlikely(node_ds(node) != sizeof(tree_t))) { - ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid table node size", node_ds(node)); - return MDBX_CORRUPTED; - } - - /* skip opened and already accounted */ - const MDBX_val name = {node_key(node), node_ks(node)}; - TXN_FOREACH_DBI_USER(txn, dbi) { - if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && - env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name) == 0) { - node = nullptr; - break; - } - } - - if (node) { - tree_t db; - memcpy(&db, node_data(node), sizeof(db)); - stat_add(&db, st, bytes); - } - } - err = cursor_sibling_right(&cx.outer); - } - if (unlikely(err != MDBX_NOTFOUND)) - return err; - } - - return MDBX_SUCCESS; -} - -__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_stat *dest, size_t bytes) { - if (unlikely(!dest)) - return LOG_IFERR(MDBX_EINVAL); - const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); - if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) - return LOG_IFERR(MDBX_EINVAL); - - if (likely(txn)) { - if (env && unlikely(txn->env != env)) - return LOG_IFERR(MDBX_EINVAL); - return LOG_IFERR(stat_acc(txn, dest, bytes)); - } - - int err = check_env(env, true); - if (unlikely(err != MDBX_SUCCESS)) - return LOG_IFERR(err); - - if (env->txn && env_txn0_owned(env)) - /* inside write-txn */ - return LOG_IFERR(stat_acc(env->txn, dest, bytes)); - - MDBX_txn *tmp_txn; - err = mdbx_txn_begin((MDBX_env *)env, nullptr, MDBX_TXN_RDONLY, &tmp_txn); - if (unlikely(err != MDBX_SUCCESS)) - return LOG_IFERR(err); - - const int rc = stat_acc(tmp_txn, dest, bytes); - err = mdbx_txn_abort(tmp_txn); - if (unlikely(err != MDBX_SUCCESS)) - return LOG_IFERR(err); - return LOG_IFERR(rc); -} - -/*----------------------------------------------------------------------------*/ - static size_t estimate_rss(size_t database_bytes) { return database_bytes + database_bytes / 64 + (512 + MDBX_WORDBITS * 16) * MEGABYTE; } diff --git a/src/copy.c b/src/api-copy.c similarity index 100% rename from src/copy.c rename to src/api-copy.c diff --git a/src/api-dbi.c b/src/api-dbi.c new file mode 100644 index 00000000..5f52d770 --- /dev/null +++ b/src/api-dbi.c @@ -0,0 +1,324 @@ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + +#include "internals.h" + +int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi) { + return LOG_IFERR(dbi_open(txn, name, flags, dbi, nullptr, nullptr)); +} + +int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + return LOG_IFERR(dbi_open(txn, name, flags, dbi, keycmp, datacmp)); +} + +static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || name_cstr == MDBX_CHK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); +} + +int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi) { + return LOG_IFERR(dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr)); +} + +int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + return LOG_IFERR(dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp)); +} + +__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (txn->dbs[dbi].height) { + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + rc = tree_drop(&cx.outer, dbi == MAIN_DBI || (cx.outer.tree->flags & MDBX_DUPSORT)); + txn->cursors[dbi] = cx.outer.next; + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + } + + /* Invalidate the dropped DB's cursors */ + for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next) + be_poor(mc); + + if (!del || dbi < CORE_DBS) { + /* reset the DB record, mark it dirty */ + txn->dbi_state[dbi] |= DBI_DIRTY; + txn->dbs[dbi].height = 0; + txn->dbs[dbi].branch_pages = 0; + txn->dbs[dbi].leaf_pages = 0; + txn->dbs[dbi].large_pages = 0; + txn->dbs[dbi].items = 0; + txn->dbs[dbi].root = P_INVALID; + txn->dbs[dbi].sequence = 0; + /* txn->dbs[dbi].mod_txnid = txn->txnid; */ + txn->flags |= MDBX_TXN_DIRTY; + return MDBX_SUCCESS; + } + + MDBX_env *const env = txn->env; + MDBX_val name = env->kvs[dbi].name; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (likely(rc == MDBX_SUCCESS)) { + rc = cursor_seek(&cx.outer, &name, nullptr, MDBX_SET).err; + if (likely(rc == MDBX_SUCCESS)) { + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; + rc = cursor_del(&cx.outer, N_TREE); + txn->cursors[MAIN_DBI] = cx.outer.next; + if (likely(rc == MDBX_SUCCESS)) { + tASSERT(txn, txn->dbi_state[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->flags & MDBX_TXN_DIRTY); + txn->dbi_state[dbi] = DBI_LINDO | DBI_OLDEN; + rc = osal_fastmutex_acquire(&env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) + return LOG_IFERR(dbi_close_release(env, dbi)); + } + } + } + + txn->flags |= MDBX_TXN_ERROR; + return LOG_IFERR(rc); +} + +__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || name_cstr == MDBX_CHK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return mdbx_dbi_rename2(txn, dbi, name); +} + +__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *new_name) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(new_name == MDBX_CHK_MAIN || new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC || + new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META || new_name->iov_base == MDBX_CHK_META)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(dbi < CORE_DBS)) + return LOG_IFERR(MDBX_EINVAL); + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + rc = osal_fastmutex_acquire(&txn->env->dbi_lock); + if (likely(rc == MDBX_SUCCESS)) { + struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name); + if (pair.defer) + pair.defer->next = nullptr; + dbi_defer_release(txn->env, pair.defer); + rc = pair.err; + } + return LOG_IFERR(rc); +} + +int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(dbi < CORE_DBS)) + return (dbi == MAIN_DBI) ? MDBX_SUCCESS : LOG_IFERR(MDBX_BAD_DBI); + + if (unlikely(dbi >= env->max_dbi)) + return LOG_IFERR(MDBX_BAD_DBI); + + if (unlikely(dbi < CORE_DBS || dbi >= env->max_dbi)) + return LOG_IFERR(MDBX_BAD_DBI); + + rc = osal_fastmutex_acquire(&env->dbi_lock); + if (likely(rc == MDBX_SUCCESS && dbi < env->n_dbi)) { + retry: + if (env->basal_txn && (env->dbs_flags[dbi] & DB_VALID) && (env->basal_txn->flags & MDBX_TXN_FINISHED) == 0) { + /* LY: Опасный код, так как env->txn может быть изменено в другом потоке. + * К сожалению тут нет надежного решения и может быть падение при неверном + * использовании API (вызове mdbx_dbi_close конкурентно с завершением + * пишущей транзакции). + * + * Для минимизации вероятности падения сначала проверяем dbi-флаги + * в basal_txn, а уже после в env->txn. Таким образом, падение может быть + * только при коллизии с завершением вложенной транзакции. + * + * Альтернативно можно попробовать выполнять обновление/put записи в + * mainDb соответствующей таблице закрываемого хендла. Семантически это + * верный путь, но проблема в текущем API, в котором исторически dbi-хендл + * живет и закрывается вне транзакции. Причем проблема не только в том, + * что нет указателя на текущую пишущую транзакцию, а в том что + * пользователь точно не ожидает что закрытие хендла приведет к + * скрытой/непрозрачной активности внутри транзакции потенциально + * выполняемой в другом потоке. Другими словами, проблема может быть + * только при неверном использовании API и если пользователь это + * допускает, то точно не будет ожидать скрытых действий внутри + * транзакции, и поэтому этот путь потенциально более опасен. */ + const MDBX_txn *const hazard = env->txn; + osal_compiler_barrier(); + if ((dbi_state(env->basal_txn, dbi) & (DBI_LINDO | DBI_DIRTY | DBI_CREAT)) > DBI_LINDO) { + bailout_dirty_dbi: + osal_fastmutex_release(&env->dbi_lock); + return LOG_IFERR(MDBX_DANGLING_DBI); + } + osal_memory_barrier(); + if (unlikely(hazard != env->txn)) + goto retry; + if (hazard != env->basal_txn && hazard && (hazard->flags & MDBX_TXN_FINISHED) == 0 && + hazard->signature == txn_signature && + (dbi_state(hazard, dbi) & (DBI_LINDO | DBI_DIRTY | DBI_CREAT)) > DBI_LINDO) + goto bailout_dirty_dbi; + osal_compiler_barrier(); + if (unlikely(hazard != env->txn)) + goto retry; + } + rc = dbi_close_release(env, dbi); + } + return LOG_IFERR(rc); +} + +int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, unsigned *state) { + if (unlikely(!flags || !state)) + return LOG_IFERR(MDBX_EINVAL); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR - MDBX_TXN_PARKED); + if (unlikely(rc != MDBX_SUCCESS)) { + *flags = 0; + *state = 0; + return LOG_IFERR(rc); + } + + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) { + *flags = 0; + *state = 0; + return LOG_IFERR(rc); + } + + *flags = txn->dbs[dbi].flags & DB_PERSISTENT_FLAGS; + *state = txn->dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); + return MDBX_SUCCESS; +} + +static void stat_get(const tree_t *db, MDBX_stat *st, size_t bytes) { + st->ms_depth = db->height; + st->ms_branch_pages = db->branch_pages; + st->ms_leaf_pages = db->leaf_pages; + st->ms_overflow_pages = db->large_pages; + st->ms_entries = db->items; + if (likely(bytes >= offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) + st->ms_mod_txnid = db->mod_txnid; +} + +__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { + if (unlikely(!dest)) + return LOG_IFERR(MDBX_EINVAL); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + rc = dbi_check(txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) { + rc = MDBX_EINVAL; + goto bailout; + } + + if (unlikely(txn->flags & MDBX_TXN_BLOCKED)) { + rc = MDBX_BAD_TXN; + goto bailout; + } + + if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) { + rc = tbl_fetch((MDBX_txn *)txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + + dest->ms_psize = txn->env->ps; + stat_get(&txn->dbs[dbi], dest, bytes); + return MDBX_SUCCESS; + +bailout: + memset(dest, 0, bytes); + return LOG_IFERR(rc); +} + +__cold int mdbx_enumerate_tables(const MDBX_txn *txn, MDBX_table_enum_func *func, void *ctx) { + if (unlikely(!func)) + return LOG_IFERR(MDBX_EINVAL); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; + for (rc = outer_first(&cx.outer, nullptr, nullptr); rc == MDBX_SUCCESS; + rc = outer_next(&cx.outer, nullptr, nullptr, MDBX_NEXT_NODUP)) { + node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); + if (node_flags(node) != N_TREE) + continue; + if (unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid dupsort sub-tree node size", + (unsigned)node_ds(node)); + rc = MDBX_CORRUPTED; + break; + } + + tree_t reside; + const tree_t *tree = memcpy(&reside, node_data(node), sizeof(reside)); + const MDBX_val name = {node_key(node), node_ks(node)}; + const MDBX_env *const env = txn->env; + MDBX_dbi dbi = 0; + for (size_t i = CORE_DBS; i < env->n_dbi; ++i) { + if (i >= txn->n_dbi || !(env->dbs_flags[i] & DB_VALID)) + continue; + if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[i].name)) + continue; + + tree = dbi_dig(txn, i, &reside); + dbi = (MDBX_dbi)i; + break; + } + + MDBX_stat stat; + stat_get(tree, &stat, sizeof(stat)); + rc = func(ctx, txn, &name, tree->flags, &stat, dbi); + if (rc != MDBX_SUCCESS) + goto bailout; + } + rc = (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; + +bailout: + txn->cursors[MAIN_DBI] = cx.outer.next; + return LOG_IFERR(rc); +} diff --git a/src/api-env.c b/src/api-env.c index b9d0488c..cbfba423 100644 --- a/src/api-env.c +++ b/src/api-env.c @@ -1315,3 +1315,108 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { return LOG_IFERR(env_sync(env, force, nonblock)); } + +/*----------------------------------------------------------------------------*/ + +static void stat_add(const tree_t *db, MDBX_stat *const st, const size_t bytes) { + st->ms_depth += db->height; + st->ms_branch_pages += db->branch_pages; + st->ms_leaf_pages += db->leaf_pages; + st->ms_overflow_pages += db->large_pages; + st->ms_entries += db->items; + if (likely(bytes >= offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) + st->ms_mod_txnid = (st->ms_mod_txnid > db->mod_txnid) ? st->ms_mod_txnid : db->mod_txnid; +} + +static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { + memset(st, 0, bytes); + + int err = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + cursor_couple_t cx; + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + if (unlikely(err != MDBX_SUCCESS)) + return err; + + const MDBX_env *const env = txn->env; + st->ms_psize = env->ps; + TXN_FOREACH_DBI_FROM(txn, dbi, + /* assuming GC is internal and not subject for accounting */ MAIN_DBI) { + if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID) + stat_add(txn->dbs + dbi, st, bytes); + } + + if (!(txn->dbs[MAIN_DBI].flags & MDBX_DUPSORT) && txn->dbs[MAIN_DBI].items /* TODO: use `md_subs` field */) { + + /* scan and account not opened named tables */ + err = tree_search(&cx.outer, nullptr, Z_FIRST); + while (err == MDBX_SUCCESS) { + const page_t *mp = cx.outer.pg[cx.outer.top]; + for (size_t i = 0; i < page_numkeys(mp); i++) { + const node_t *node = page_node(mp, i); + if (node_flags(node) != N_TREE) + continue; + if (unlikely(node_ds(node) != sizeof(tree_t))) { + ERROR("%s/%d: %s %zu", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid table node size", node_ds(node)); + return MDBX_CORRUPTED; + } + + /* skip opened and already accounted */ + const MDBX_val name = {node_key(node), node_ks(node)}; + TXN_FOREACH_DBI_USER(txn, dbi) { + if ((txn->dbi_state[dbi] & (DBI_VALID | DBI_STALE)) == DBI_VALID && + env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[dbi].name) == 0) { + node = nullptr; + break; + } + } + + if (node) { + tree_t db; + memcpy(&db, node_data(node), sizeof(db)); + stat_add(&db, st, bytes); + } + } + err = cursor_sibling_right(&cx.outer); + } + if (unlikely(err != MDBX_NOTFOUND)) + return err; + } + + return MDBX_SUCCESS; +} + +__cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, MDBX_stat *dest, size_t bytes) { + if (unlikely(!dest)) + return LOG_IFERR(MDBX_EINVAL); + const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); + if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) + return LOG_IFERR(MDBX_EINVAL); + + if (likely(txn)) { + if (env && unlikely(txn->env != env)) + return LOG_IFERR(MDBX_EINVAL); + return LOG_IFERR(stat_acc(txn, dest, bytes)); + } + + int err = check_env(env, true); + if (unlikely(err != MDBX_SUCCESS)) + return LOG_IFERR(err); + + if (env->txn && env_txn0_owned(env)) + /* inside write-txn */ + return LOG_IFERR(stat_acc(env->txn, dest, bytes)); + + MDBX_txn *tmp_txn; + err = mdbx_txn_begin((MDBX_env *)env, nullptr, MDBX_TXN_RDONLY, &tmp_txn); + if (unlikely(err != MDBX_SUCCESS)) + return LOG_IFERR(err); + + const int rc = stat_acc(tmp_txn, dest, bytes); + err = mdbx_txn_abort(tmp_txn); + if (unlikely(err != MDBX_SUCCESS)) + return LOG_IFERR(err); + return LOG_IFERR(rc); +} diff --git a/src/api-extra.c b/src/api-extra.c index e74c3bbc..dd394f7b 100644 --- a/src/api-extra.c +++ b/src/api-extra.c @@ -72,6 +72,65 @@ __cold int mdbx_reader_check(MDBX_env *env, int *dead) { return LOG_IFERR(mvcc_cleanup_dead(env, false, dead)); } +__cold int mdbx_thread_register(const MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!env->lck_mmap.lck)) + return LOG_IFERR((env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM); + + if (unlikely((env->flags & ENV_TXKEY) == 0)) { + eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS); + return LOG_IFERR(MDBX_EINVAL) /* MDBX_NOSTICKYTHREADS mode */; + } + + eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY); + reader_slot_t *r = thread_rthc_get(env->me_txkey); + if (unlikely(r != nullptr)) { + eASSERT(env, r->pid.weak == env->pid); + eASSERT(env, r->tid.weak == osal_thread_self()); + if (unlikely(r->pid.weak != env->pid)) + return LOG_IFERR(MDBX_BAD_RSLOT); + return MDBX_RESULT_TRUE /* already registered */; + } + + return LOG_IFERR(mvcc_bind_slot((MDBX_env *)env).err); +} + +__cold int mdbx_thread_unregister(const MDBX_env *env) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!env->lck_mmap.lck)) + return MDBX_RESULT_TRUE; + + if (unlikely((env->flags & ENV_TXKEY) == 0)) { + eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS); + return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */; + } + + eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY); + reader_slot_t *r = thread_rthc_get(env->me_txkey); + if (unlikely(r == nullptr)) + return MDBX_RESULT_TRUE /* not registered */; + + eASSERT(env, r->pid.weak == env->pid); + eASSERT(env, r->tid.weak == osal_thread_self()); + if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self())) + return LOG_IFERR(MDBX_BAD_RSLOT); + + eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD); + if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD)) + return LOG_IFERR(MDBX_BUSY) /* transaction is still active */; + + atomic_store32(&r->pid, 0, mo_Relaxed); + atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease); + thread_rthc_set(env->me_txkey, nullptr); + return MDBX_SUCCESS; +} + /*------------------------------------------------------------------------------ * Locking API */ diff --git a/src/misc.c b/src/api-misc.c similarity index 100% rename from src/misc.c rename to src/api-misc.c diff --git a/src/env-opts.c b/src/api-opts.c similarity index 100% rename from src/env-opts.c rename to src/api-opts.c diff --git a/src/range-estimate.c b/src/api-range-estimate.c similarity index 99% rename from src/range-estimate.c rename to src/api-range-estimate.c index ea093088..de3e1d12 100644 --- a/src/range-estimate.c +++ b/src/api-range-estimate.c @@ -142,6 +142,9 @@ __hot static ptrdiff_t estimate(const tree_t *tree, diff_t *const __restrict dr) } } +/*------------------------------------------------------------------------------ + * Range-Estimation API */ + __hot int mdbx_estimate_distance(const MDBX_cursor *first, const MDBX_cursor *last, ptrdiff_t *distance_items) { if (unlikely(first == nullptr || last == nullptr || distance_items == nullptr)) return LOG_IFERR(MDBX_EINVAL); diff --git a/src/api-txn-data.c b/src/api-txn-data.c new file mode 100644 index 00000000..f12b06d3 --- /dev/null +++ b/src/api-txn-data.c @@ -0,0 +1,449 @@ +/// \copyright SPDX-License-Identifier: Apache-2.0 +/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024 + +#include "internals.h" + +__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, uint32_t *mask) { + if (unlikely(!mask)) + return LOG_IFERR(MDBX_EINVAL); + + *mask = 0; + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + if ((cx.outer.tree->flags & MDBX_DUPSORT) == 0) + return MDBX_RESULT_TRUE; + + MDBX_val key, data; + rc = outer_first(&cx.outer, &key, &data); + while (rc == MDBX_SUCCESS) { + const node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); + const tree_t *db = node_data(node); + const unsigned flags = node_flags(node); + switch (flags) { + case N_BIG: + case 0: + /* single-value entry, deep = 0 */ + *mask |= 1 << 0; + break; + case N_DUP: + /* single sub-page, deep = 1 */ + *mask |= 1 << 1; + break; + case N_DUP | N_TREE: + /* sub-tree */ + *mask |= 1 << UNALIGNED_PEEK_16(db, tree_t, height); + break; + default: + ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid node-size", flags); + return LOG_IFERR(MDBX_CORRUPTED); + } + rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + } + + return LOG_IFERR((rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc); +} + +int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) { + if (unlikely(canary == nullptr)) + return LOG_IFERR(MDBX_EINVAL); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) { + memset(canary, 0, sizeof(*canary)); + return LOG_IFERR(rc); + } + + *canary = txn->canary; + return MDBX_SUCCESS; +} + +int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { + DKBUF_DEBUG; + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!key || !data)) + return LOG_IFERR(MDBX_EINVAL); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + return LOG_IFERR(cursor_seek(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err); +} + +int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!key || !data)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(txn->flags & MDBX_TXN_BLOCKED)) + return LOG_IFERR(MDBX_BAD_TXN); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + return LOG_IFERR(cursor_ops(&cx.outer, key, data, MDBX_SET_LOWERBOUND)); +} + +int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, size_t *values_count) { + DKBUF_DEBUG; + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!key || !data)) + return LOG_IFERR(MDBX_EINVAL); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + rc = cursor_seek(&cx.outer, key, data, MDBX_SET_KEY).err; + if (unlikely(rc != MDBX_SUCCESS)) { + if (values_count) + *values_count = 0; + return LOG_IFERR(rc); + } + + if (values_count) { + *values_count = 1; + if (inner_pointed(&cx.outer)) + *values_count = + (sizeof(*values_count) >= sizeof(cx.inner.nested_tree.items) || cx.inner.nested_tree.items <= PTRDIFF_MAX) + ? (size_t)cx.inner.nested_tree.items + : PTRDIFF_MAX; + } + return MDBX_SUCCESS; +} + +/*----------------------------------------------------------------------------*/ + +int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (likely(canary)) { + if (txn->canary.x == canary->x && txn->canary.y == canary->y && txn->canary.z == canary->z) + return MDBX_SUCCESS; + txn->canary.x = canary->x; + txn->canary.y = canary->y; + txn->canary.z = canary->z; + } + txn->canary.v = txn->txnid; + txn->flags |= MDBX_TXN_DIRTY; + + return MDBX_SUCCESS; +} + +/* Функция сообщает находится ли указанный адрес в "грязной" странице у + * заданной пишущей транзакции. В конечном счете это позволяет избавиться от + * лишнего копирования данных из НЕ-грязных страниц. + * + * "Грязные" страницы - это те, которые уже были изменены в ходе пишущей + * транзакции. Соответственно, какие-либо дальнейшие изменения могут привести + * к перезаписи таких страниц. Поэтому все функции, выполняющие изменения, в + * качестве аргументов НЕ должны получать указатели на данные в таких + * страницах. В свою очередь "НЕ грязные" страницы перед модификацией будут + * скопированы. + * + * Другими словами, данные из "грязных" страниц должны быть либо скопированы + * перед передачей в качестве аргументов для дальнейших модификаций, либо + * отвергнуты на стадии проверки корректности аргументов. + * + * Таким образом, функция позволяет как избавится от лишнего копирования, + * так и выполнить более полную проверку аргументов. + * + * ВАЖНО: Передаваемый указатель должен указывать на начало данных. Только + * так гарантируется что актуальный заголовок страницы будет физически + * расположен в той-же странице памяти, в том числе для многостраничных + * P_LARGE страниц с длинными данными. */ +int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { + int rc = check_txn(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + const MDBX_env *env = txn->env; + const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base); + if (offset >= 0) { + const pgno_t pgno = bytes2pgno(env, offset); + if (likely(pgno < txn->geo.first_unallocated)) { + const page_t *page = pgno2page(env, pgno); + if (unlikely(page->pgno != pgno || (page->flags & P_ILL_BITS) != 0)) { + /* The ptr pointed into middle of a large page, + * not to the beginning of a data. */ + return LOG_IFERR(MDBX_EINVAL); + } + return ((txn->flags & MDBX_TXN_RDONLY) || !is_modifable(txn, page)) ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; + } + if ((size_t)offset < env->dxb_mmap.limit) { + /* Указатель адресует что-то в пределах mmap, но за границей + * распределенных страниц. Такое может случится если mdbx_is_dirty() + * вызывается после операции, в ходе которой грязная страница была + * возвращена в нераспределенное пространство. */ + return (txn->flags & MDBX_TXN_RDONLY) ? LOG_IFERR(MDBX_EINVAL) : MDBX_RESULT_TRUE; + } + } + + /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был + * передан некорректный адрес, либо адрес в теневой странице, которая была + * выделена посредством malloc(). + * + * Для режима MDBX_WRITE_MAP режима страница однозначно "не грязная", + * а для режимов без MDBX_WRITE_MAP однозначно "не чистая". */ + return (txn->flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? LOG_IFERR(MDBX_EINVAL) : MDBX_RESULT_TRUE; +} + +int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!key)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(dbi <= FREE_DBI)) + return LOG_IFERR(MDBX_BAD_DBI); + + if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return LOG_IFERR((txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + MDBX_val proxy; + MDBX_cursor_op op = MDBX_SET; + unsigned flags = MDBX_ALLDUPS; + if (data) { + proxy = *data; + data = &proxy; + op = MDBX_GET_BOTH; + flags = 0; + } + rc = cursor_seek(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + rc = cursor_del(&cx.outer, flags); + txn->cursors[dbi] = cx.outer.next; + return LOG_IFERR(rc); +} + +int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, MDBX_put_flags_t flags) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!key || !data)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(dbi <= FREE_DBI)) + return LOG_IFERR(MDBX_BAD_DBI); + + if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | + MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE))) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return LOG_IFERR((txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + + /* LY: support for update (explicit overwrite) */ + if (flags & MDBX_CURRENT) { + rc = cursor_seek(&cx.outer, (MDBX_val *)key, nullptr, MDBX_SET).err; + if (likely(rc == MDBX_SUCCESS) && (txn->dbs[dbi].flags & MDBX_DUPSORT) && (flags & MDBX_ALLDUPS) == 0) { + /* LY: allows update (explicit overwrite) only for unique keys */ + node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); + if (node_flags(node) & N_DUP) { + tASSERT(txn, inner_pointed(&cx.outer) && cx.outer.subcur->nested_tree.items > 1); + rc = MDBX_EMULTIVAL; + if ((flags & MDBX_NOOVERWRITE) == 0) { + flags -= MDBX_CURRENT; + rc = cursor_del(&cx.outer, MDBX_ALLDUPS); + } + } + } + } + + if (likely(rc == MDBX_SUCCESS)) + rc = cursor_put_checklen(&cx.outer, key, data, flags); + txn->cursors[dbi] = cx.outer.next; + + return LOG_IFERR(rc); +} + +//------------------------------------------------------------------------------ + +/* Позволяет обновить или удалить существующую запись с получением + * в old_data предыдущего значения данных. При этом если new_data равен + * нулю, то выполняется удаление, иначе обновление/вставка. + * + * Текущее значение может находиться в уже измененной (грязной) странице. + * В этом случае страница будет перезаписана при обновлении, а само старое + * значение утрачено. Поэтому исходно в old_data должен быть передан + * дополнительный буфер для копирования старого значения. + * Если переданный буфер слишком мал, то функция вернет -1, установив + * old_data->iov_len в соответствующее значение. + * + * Для не-уникальных ключей также возможен второй сценарий использования, + * когда посредством old_data из записей с одинаковым ключом для + * удаления/обновления выбирается конкретная. Для выбора этого сценария + * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE. + * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет + * идентифицировать запрос такого сценария. + * + * Функция может быть замещена соответствующими операциями с курсорами + * после двух доработок (TODO): + * - внешняя аллокация курсоров, в том числе на стеке (без malloc). + * - получения dirty-статуса страницы по адресу (знать о MUTABLE/WRITEABLE). + */ + +int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, + MDBX_put_flags_t flags, MDBX_preserve_func preserver, void *preserver_context) { + int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!key || !old_data || old_data == new_data)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(old_data->iov_base == nullptr && old_data->iov_len)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(new_data == nullptr && (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(dbi <= FREE_DBI)) + return LOG_IFERR(MDBX_BAD_DBI); + + if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | + MDBX_APPENDDUP | MDBX_CURRENT))) + return LOG_IFERR(MDBX_EINVAL); + + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, dbi); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + cx.outer.next = txn->cursors[dbi]; + txn->cursors[dbi] = &cx.outer; + + MDBX_val present_key = *key; + if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { + /* в old_data значение для выбора конкретного дубликата */ + if (unlikely(!(txn->dbs[dbi].flags & MDBX_DUPSORT))) { + rc = MDBX_EINVAL; + goto bailout; + } + + /* убираем лишний бит, он был признаком запрошенного режима */ + flags -= MDBX_NOOVERWRITE; + + rc = cursor_seek(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; + if (rc != MDBX_SUCCESS) + goto bailout; + } else { + /* в old_data буфер для сохранения предыдущего значения */ + if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) + return LOG_IFERR(MDBX_EINVAL); + MDBX_val present_data; + rc = cursor_seek(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; + if (unlikely(rc != MDBX_SUCCESS)) { + old_data->iov_base = nullptr; + old_data->iov_len = 0; + if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) + goto bailout; + } else if (flags & MDBX_NOOVERWRITE) { + rc = MDBX_KEYEXIST; + *old_data = present_data; + goto bailout; + } else { + page_t *page = cx.outer.pg[cx.outer.top]; + if (txn->dbs[dbi].flags & MDBX_DUPSORT) { + if (flags & MDBX_CURRENT) { + /* disallow update/delete for multi-values */ + node_t *node = page_node(page, cx.outer.ki[cx.outer.top]); + if (node_flags(node) & N_DUP) { + tASSERT(txn, inner_pointed(&cx.outer) && cx.outer.subcur->nested_tree.items > 1); + if (cx.outer.subcur->nested_tree.items > 1) { + rc = MDBX_EMULTIVAL; + goto bailout; + } + } + /* В LMDB флажок MDBX_CURRENT здесь приведет + * к замене данных без учета MDBX_DUPSORT сортировки, + * но здесь это в любом случае допустимо, так как мы + * проверили что для ключа есть только одно значение. */ + } + } + + if (is_modifable(txn, page)) { + if (new_data && cmp_lenfast(&present_data, new_data) == 0) { + /* если данные совпадают, то ничего делать не надо */ + *old_data = *new_data; + goto bailout; + } + rc = preserver ? preserver(preserver_context, old_data, present_data.iov_base, present_data.iov_len) + : MDBX_SUCCESS; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else { + *old_data = present_data; + } + flags |= MDBX_CURRENT; + } + } + + if (likely(new_data)) + rc = cursor_put_checklen(&cx.outer, key, new_data, flags); + else + rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); + +bailout: + txn->cursors[dbi] = cx.outer.next; + return LOG_IFERR(rc); +} + +static int default_value_preserver(void *context, MDBX_val *target, const void *src, size_t bytes) { + (void)context; + if (unlikely(target->iov_len < bytes)) { + target->iov_base = nullptr; + target->iov_len = bytes; + return MDBX_RESULT_TRUE; + } + memcpy(target->iov_base, src, target->iov_len = bytes); + return MDBX_SUCCESS; +} + +int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, + MDBX_put_flags_t flags) { + return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags, default_value_preserver, nullptr); +} diff --git a/src/api-txn.c b/src/api-txn.c index 2b3ba0d8..508f37d5 100644 --- a/src/api-txn.c +++ b/src/api-txn.c @@ -34,447 +34,858 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) return (lag > INT_MAX) ? INT_MAX : (int)lag; } -__cold int mdbx_dbi_dupsort_depthmask(const MDBX_txn *txn, MDBX_dbi dbi, uint32_t *mask) { - if (unlikely(!mask)) +MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { + if (unlikely(!txn || txn->signature != txn_signature || txn->env->signature.weak != env_signature)) + return nullptr; + return txn->env; +} + +uint64_t mdbx_txn_id(const MDBX_txn *txn) { + if (unlikely(!txn || txn->signature != txn_signature)) + return 0; + return txn->txnid; +} + +MDBX_txn_flags_t mdbx_txn_flags(const MDBX_txn *txn) { + STATIC_ASSERT( + (MDBX_TXN_INVALID & (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD | + txn_gc_drained | txn_shrink_allowed | txn_rw_begin_flags | txn_ro_begin_flags)) == 0); + if (unlikely(!txn || txn->signature != txn_signature)) + return MDBX_TXN_INVALID; + assert(0 == (int)(txn->flags & MDBX_TXN_INVALID)); + + MDBX_txn_flags_t flags = txn->flags; + if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->to.reader && + safe64_read(&txn->to.reader->tid) == MDBX_TID_TXN_OUSTED) + flags |= MDBX_TXN_OUSTED; + return flags; +} + +int mdbx_txn_reset(MDBX_txn *txn) { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + /* This call is only valid for read-only txns */ + if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) return LOG_IFERR(MDBX_EINVAL); - *mask = 0; - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - if ((cx.outer.tree->flags & MDBX_DUPSORT) == 0) - return MDBX_RESULT_TRUE; - - MDBX_val key, data; - rc = outer_first(&cx.outer, &key, &data); - while (rc == MDBX_SUCCESS) { - const node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); - const tree_t *db = node_data(node); - const unsigned flags = node_flags(node); - switch (flags) { - case N_BIG: - case 0: - /* single-value entry, deep = 0 */ - *mask |= 1 << 0; - break; - case N_DUP: - /* single sub-page, deep = 1 */ - *mask |= 1 << 1; - break; - case N_DUP | N_TREE: - /* sub-tree */ - *mask |= 1 << UNALIGNED_PEEK_16(db, tree_t, height); - break; - default: - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid node-size", flags); - return LOG_IFERR(MDBX_CORRUPTED); - } - rc = outer_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + /* LY: don't close DBI-handles */ + rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); + if (rc == MDBX_SUCCESS) { + tASSERT(txn, txn->signature == txn_signature); + tASSERT(txn, txn->owner == 0); } - - return LOG_IFERR((rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc); -} - -int mdbx_canary_get(const MDBX_txn *txn, MDBX_canary *canary) { - if (unlikely(canary == nullptr)) - return LOG_IFERR(MDBX_EINVAL); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) { - memset(canary, 0, sizeof(*canary)); - return LOG_IFERR(rc); - } - - *canary = txn->canary; - return MDBX_SUCCESS; -} - -int mdbx_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { - DKBUF_DEBUG; - DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!key || !data)) - return LOG_IFERR(MDBX_EINVAL); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - return LOG_IFERR(cursor_seek(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err); -} - -int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!key || !data)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(txn->flags & MDBX_TXN_BLOCKED)) - return LOG_IFERR(MDBX_BAD_TXN); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - return LOG_IFERR(cursor_ops(&cx.outer, key, data, MDBX_SET_LOWERBOUND)); -} - -int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, size_t *values_count) { - DKBUF_DEBUG; - DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!key || !data)) - return LOG_IFERR(MDBX_EINVAL); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - rc = cursor_seek(&cx.outer, key, data, MDBX_SET_KEY).err; - if (unlikely(rc != MDBX_SUCCESS)) { - if (values_count) - *values_count = 0; - return LOG_IFERR(rc); - } - - if (values_count) { - *values_count = 1; - if (inner_pointed(&cx.outer)) - *values_count = - (sizeof(*values_count) >= sizeof(cx.inner.nested_tree.items) || cx.inner.nested_tree.items <= PTRDIFF_MAX) - ? (size_t)cx.inner.nested_tree.items - : PTRDIFF_MAX; - } - return MDBX_SUCCESS; -} - -/*----------------------------------------------------------------------------*/ - -int mdbx_canary_put(MDBX_txn *txn, const MDBX_canary *canary) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (likely(canary)) { - if (txn->canary.x == canary->x && txn->canary.y == canary->y && txn->canary.z == canary->z) - return MDBX_SUCCESS; - txn->canary.x = canary->x; - txn->canary.y = canary->y; - txn->canary.z = canary->z; - } - txn->canary.v = txn->txnid; - txn->flags |= MDBX_TXN_DIRTY; - - return MDBX_SUCCESS; -} - -/* Функция сообщает находится ли указанный адрес в "грязной" странице у - * заданной пишущей транзакции. В конечном счете это позволяет избавиться от - * лишнего копирования данных из НЕ-грязных страниц. - * - * "Грязные" страницы - это те, которые уже были изменены в ходе пишущей - * транзакции. Соответственно, какие-либо дальнейшие изменения могут привести - * к перезаписи таких страниц. Поэтому все функции, выполняющие изменения, в - * качестве аргументов НЕ должны получать указатели на данные в таких - * страницах. В свою очередь "НЕ грязные" страницы перед модификацией будут - * скопированы. - * - * Другими словами, данные из "грязных" страниц должны быть либо скопированы - * перед передачей в качестве аргументов для дальнейших модификаций, либо - * отвергнуты на стадии проверки корректности аргументов. - * - * Таким образом, функция позволяет как избавится от лишнего копирования, - * так и выполнить более полную проверку аргументов. - * - * ВАЖНО: Передаваемый указатель должен указывать на начало данных. Только - * так гарантируется что актуальный заголовок страницы будет физически - * расположен в той-же странице памяти, в том числе для многостраничных - * P_LARGE страниц с длинными данными. */ -int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - const MDBX_env *env = txn->env; - const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base); - if (offset >= 0) { - const pgno_t pgno = bytes2pgno(env, offset); - if (likely(pgno < txn->geo.first_unallocated)) { - const page_t *page = pgno2page(env, pgno); - if (unlikely(page->pgno != pgno || (page->flags & P_ILL_BITS) != 0)) { - /* The ptr pointed into middle of a large page, - * not to the beginning of a data. */ - return LOG_IFERR(MDBX_EINVAL); - } - return ((txn->flags & MDBX_TXN_RDONLY) || !is_modifable(txn, page)) ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; - } - if ((size_t)offset < env->dxb_mmap.limit) { - /* Указатель адресует что-то в пределах mmap, но за границей - * распределенных страниц. Такое может случится если mdbx_is_dirty() - * вызывается после операции, в ходе которой грязная страница была - * возвращена в нераспределенное пространство. */ - return (txn->flags & MDBX_TXN_RDONLY) ? LOG_IFERR(MDBX_EINVAL) : MDBX_RESULT_TRUE; - } - } - - /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был - * передан некорректный адрес, либо адрес в теневой странице, которая была - * выделена посредством malloc(). - * - * Для режима MDBX_WRITE_MAP режима страница однозначно "не грязная", - * а для режимов без MDBX_WRITE_MAP однозначно "не чистая". */ - return (txn->flags & (MDBX_WRITEMAP | MDBX_TXN_RDONLY)) ? LOG_IFERR(MDBX_EINVAL) : MDBX_RESULT_TRUE; -} - -int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, const MDBX_val *data) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!key)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(dbi <= FREE_DBI)) - return LOG_IFERR(MDBX_BAD_DBI); - - if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return LOG_IFERR((txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - MDBX_val proxy; - MDBX_cursor_op op = MDBX_SET; - unsigned flags = MDBX_ALLDUPS; - if (data) { - proxy = *data; - data = &proxy; - op = MDBX_GET_BOTH; - flags = 0; - } - rc = cursor_seek(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - cx.outer.next = txn->cursors[dbi]; - txn->cursors[dbi] = &cx.outer; - rc = cursor_del(&cx.outer, flags); - txn->cursors[dbi] = cx.outer.next; return LOG_IFERR(rc); } -int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, MDBX_put_flags_t flags) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); +int mdbx_txn_break(MDBX_txn *txn) { + do { + int rc = check_txn(txn, 0); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + txn->flags |= MDBX_TXN_ERROR; + if (txn->flags & MDBX_TXN_RDONLY) + break; + txn = txn->nested; + } while (txn); + return MDBX_SUCCESS; +} + +int mdbx_txn_abort(MDBX_txn *txn) { + int rc = check_txn(txn, 0); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); - if (unlikely(!key || !data)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(dbi <= FREE_DBI)) - return LOG_IFERR(MDBX_BAD_DBI); - - if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | - MDBX_APPENDDUP | MDBX_CURRENT | MDBX_MULTIPLE))) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(txn->flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return LOG_IFERR((txn->flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); + rc = check_env(txn->env, true); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); - cx.outer.next = txn->cursors[dbi]; - txn->cursors[dbi] = &cx.outer; - /* LY: support for update (explicit overwrite) */ - if (flags & MDBX_CURRENT) { - rc = cursor_seek(&cx.outer, (MDBX_val *)key, nullptr, MDBX_SET).err; - if (likely(rc == MDBX_SUCCESS) && (txn->dbs[dbi].flags & MDBX_DUPSORT) && (flags & MDBX_ALLDUPS) == 0) { - /* LY: allows update (explicit overwrite) only for unique keys */ - node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); - if (node_flags(node) & N_DUP) { - tASSERT(txn, inner_pointed(&cx.outer) && cx.outer.subcur->nested_tree.items > 1); - rc = MDBX_EMULTIVAL; - if ((flags & MDBX_NOOVERWRITE) == 0) { - flags -= MDBX_CURRENT; - rc = cursor_del(&cx.outer, MDBX_ALLDUPS); - } - } - } + if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == MDBX_NOSTICKYTHREADS && + unlikely(txn->owner != osal_thread_self())) { + mdbx_txn_break(txn); + return LOG_IFERR(MDBX_THREAD_MISMATCH); } - if (likely(rc == MDBX_SUCCESS)) - rc = cursor_put_checklen(&cx.outer, key, data, flags); - txn->cursors[dbi] = cx.outer.next; + return LOG_IFERR(txn_abort(txn)); +} +int mdbx_txn_park(MDBX_txn *txn, bool autounpark) { + STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_ERROR); + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) + return LOG_IFERR(MDBX_TXN_INVALID); + + if (unlikely((txn->flags & MDBX_TXN_ERROR))) { + rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); + return LOG_IFERR(rc ? rc : MDBX_OUSTED); + } + + return LOG_IFERR(txn_park(txn, autounpark)); +} + +int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) { + STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_PARKED + MDBX_TXN_ERROR); + int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED - MDBX_TXN_ERROR); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + if (unlikely(!F_ISSET(txn->flags, MDBX_TXN_RDONLY | MDBX_TXN_PARKED))) + return MDBX_SUCCESS; + + rc = txn_unpark(txn); + if (likely(rc != MDBX_OUSTED) || !restart_if_ousted) + return LOG_IFERR(rc); + + tASSERT(txn, txn->flags & MDBX_TXN_FINISHED); + rc = txn_renew(txn, MDBX_TXN_RDONLY); + return (rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : LOG_IFERR(rc); +} + +int mdbx_txn_renew(MDBX_txn *txn) { + if (unlikely(!txn)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(txn->signature != txn_signature)) + return LOG_IFERR(MDBX_EBADSIGN); + + if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) + return LOG_IFERR(MDBX_EINVAL); + + if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) { + int rc = mdbx_txn_reset(txn); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } + + int rc = txn_renew(txn, MDBX_TXN_RDONLY); + if (rc == MDBX_SUCCESS) { + tASSERT(txn, txn->owner == (txn->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()); + DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, + (txn->flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->env, txn->dbs[MAIN_DBI].root, + txn->dbs[FREE_DBI].root); + } return LOG_IFERR(rc); } -//------------------------------------------------------------------------------ - -/* Позволяет обновить или удалить существующую запись с получением - * в old_data предыдущего значения данных. При этом если new_data равен - * нулю, то выполняется удаление, иначе обновление/вставка. - * - * Текущее значение может находиться в уже измененной (грязной) странице. - * В этом случае страница будет перезаписана при обновлении, а само старое - * значение утрачено. Поэтому исходно в old_data должен быть передан - * дополнительный буфер для копирования старого значения. - * Если переданный буфер слишком мал, то функция вернет -1, установив - * old_data->iov_len в соответствующее значение. - * - * Для не-уникальных ключей также возможен второй сценарий использования, - * когда посредством old_data из записей с одинаковым ключом для - * удаления/обновления выбирается конкретная. Для выбора этого сценария - * во flags следует одновременно указать MDBX_CURRENT и MDBX_NOOVERWRITE. - * Именно эта комбинация выбрана, так как она лишена смысла, и этим позволяет - * идентифицировать запрос такого сценария. - * - * Функция может быть замещена соответствующими операциями с курсорами - * после двух доработок (TODO): - * - внешняя аллокация курсоров, в том числе на стеке (без malloc). - * - получения dirty-статуса страницы по адресу (знать о MUTABLE/WRITEABLE). - */ - -int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, - MDBX_put_flags_t flags, MDBX_preserve_func preserver, void *preserver_context) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); +int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { + int rc = check_txn(txn, MDBX_TXN_FINISHED); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); - if (unlikely(!key || !old_data || old_data == new_data)) + txn->userctx = ctx; + return MDBX_SUCCESS; +} + +void *mdbx_txn_get_userctx(const MDBX_txn *txn) { return check_txn(txn, MDBX_TXN_FINISHED) ? nullptr : txn->userctx; } + +int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) { + if (unlikely(!ret)) + return LOG_IFERR(MDBX_EINVAL); + *ret = nullptr; + + if (unlikely((flags & ~txn_rw_begin_flags) && (parent || (flags & ~txn_ro_begin_flags)))) return LOG_IFERR(MDBX_EINVAL); - if (unlikely(old_data->iov_base == nullptr && old_data->iov_len)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(new_data == nullptr && (flags & (MDBX_CURRENT | MDBX_RESERVE)) != MDBX_CURRENT)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(dbi <= FREE_DBI)) - return LOG_IFERR(MDBX_BAD_DBI); - - if (unlikely(flags & ~(MDBX_NOOVERWRITE | MDBX_NODUPDATA | MDBX_ALLDUPS | MDBX_RESERVE | MDBX_APPEND | - MDBX_APPENDDUP | MDBX_CURRENT))) - return LOG_IFERR(MDBX_EINVAL); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); + int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return LOG_IFERR(rc); - cx.outer.next = txn->cursors[dbi]; - txn->cursors[dbi] = &cx.outer; - MDBX_val present_key = *key; - if (F_ISSET(flags, MDBX_CURRENT | MDBX_NOOVERWRITE)) { - /* в old_data значение для выбора конкретного дубликата */ - if (unlikely(!(txn->dbs[dbi].flags & MDBX_DUPSORT))) { - rc = MDBX_EINVAL; - goto bailout; + if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */ + return LOG_IFERR(MDBX_EACCESS); + + MDBX_txn *txn = nullptr; + if (parent) { + /* Nested transactions: Max 1 child, write txns only, no writemap */ + rc = check_txn_rw(parent, MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (env->options.spill_parent4child_denominator) { + /* Spill dirty-pages of parent to provide dirtyroom for child txn */ + rc = txn_spill(parent, nullptr, parent->tw.dirtylist->length / env->options.spill_parent4child_denominator); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); } + tASSERT(parent, audit_ex(parent, 0, false) == 0); - /* убираем лишний бит, он был признаком запрошенного режима */ - flags -= MDBX_NOOVERWRITE; + flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS | MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); + } else if ((flags & MDBX_TXN_RDONLY) == 0) { + /* Reuse preallocated write txn. However, do not touch it until + * txn_renew() succeeds, since it currently may be active. */ + txn = env->basal_txn; + goto renew; + } - rc = cursor_seek(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; - if (rc != MDBX_SUCCESS) - goto bailout; - } else { - /* в old_data буфер для сохранения предыдущего значения */ - if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) - return LOG_IFERR(MDBX_EINVAL); - MDBX_val present_data; - rc = cursor_seek(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; + const intptr_t bitmap_bytes = +#if MDBX_ENABLE_DBI_SPARSE + ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT; +#else + 0; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to)); + const size_t base = + (flags & MDBX_TXN_RDONLY) ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) : sizeof(MDBX_txn); + const size_t size = base + + ((flags & MDBX_TXN_RDONLY) ? (size_t)bitmap_bytes + env->max_dbi * sizeof(txn->dbi_seqs[0]) : 0) + + env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_state[0])); + txn = osal_malloc(size); + if (unlikely(txn == nullptr)) + return LOG_IFERR(MDBX_ENOMEM); +#if MDBX_DEBUG + memset(txn, 0xCD, size); + VALGRIND_MAKE_MEM_UNDEFINED(txn, size); +#endif /* MDBX_DEBUG */ + MDBX_ANALYSIS_ASSUME(size > base); + memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base); + txn->dbs = ptr_disp(txn, base); + txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0])); +#if MDBX_DEBUG + txn->cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ +#endif + txn->dbi_state = ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0])); + txn->flags = flags; + txn->env = env; + + if (parent) { + tASSERT(parent, dpl_check(parent)); +#if MDBX_ENABLE_DBI_SPARSE + txn->dbi_sparse = parent->dbi_sparse; +#endif /* MDBX_ENABLE_DBI_SPARSE */ + txn->dbi_seqs = parent->dbi_seqs; + txn->geo = parent->geo; + rc = dpl_alloc(txn); + if (likely(rc == MDBX_SUCCESS)) { + const size_t len = MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; + txn->tw.relist = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.relist)) + rc = MDBX_ENOMEM; + } if (unlikely(rc != MDBX_SUCCESS)) { - old_data->iov_base = nullptr; - old_data->iov_len = 0; - if (rc != MDBX_NOTFOUND || (flags & MDBX_CURRENT)) - goto bailout; - } else if (flags & MDBX_NOOVERWRITE) { - rc = MDBX_KEYEXIST; - *old_data = present_data; - goto bailout; - } else { - page_t *page = cx.outer.pg[cx.outer.top]; - if (txn->dbs[dbi].flags & MDBX_DUPSORT) { - if (flags & MDBX_CURRENT) { - /* disallow update/delete for multi-values */ - node_t *node = page_node(page, cx.outer.ki[cx.outer.top]); - if (node_flags(node) & N_DUP) { - tASSERT(txn, inner_pointed(&cx.outer) && cx.outer.subcur->nested_tree.items > 1); - if (cx.outer.subcur->nested_tree.items > 1) { - rc = MDBX_EMULTIVAL; - goto bailout; + nested_failed: + pnl_free(txn->tw.relist); + dpl_free(txn); + osal_free(txn); + return LOG_IFERR(rc); + } + + /* Move loose pages to reclaimed list */ + if (parent->tw.loose_count) { + do { + page_t *lp = parent->tw.loose_pages; + tASSERT(parent, lp->flags == P_LOOSE); + rc = pnl_insert_span(&parent->tw.relist, lp->pgno, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto nested_failed; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + parent->tw.loose_pages = page_next(lp); + /* Remove from dirty list */ + page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1); + } while (parent->tw.loose_pages); + parent->tw.loose_count = 0; +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + tASSERT(parent, dpl_check(parent)); + } + txn->tw.dirtyroom = parent->tw.dirtyroom; + txn->tw.dirtylru = parent->tw.dirtylru; + + dpl_sort(parent); + if (parent->tw.spilled.list) + spill_purge(parent); + + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= MDBX_PNL_GETSIZE(parent->tw.relist)); + memcpy(txn->tw.relist, parent->tw.relist, MDBX_PNL_SIZEOF(parent->tw.relist)); + eASSERT(env, pnl_check_allocated(txn->tw.relist, (txn->geo.first_unallocated /* LY: intentional assignment + here, only for assertion */ + = parent->geo.first_unallocated) - + MDBX_ENABLE_REFUND)); + + txn->tw.gc.time_acc = parent->tw.gc.time_acc; + txn->tw.gc.last_reclaimed = parent->tw.gc.last_reclaimed; + if (parent->tw.gc.reclaimed) { + txn->tw.gc.reclaimed = parent->tw.gc.reclaimed; + parent->tw.gc.reclaimed = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.gc.reclaimed); + } + + txn->tw.retired_pages = parent->tw.retired_pages; + parent->tw.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages); + + txn->txnid = parent->txnid; + txn->front_txnid = parent->front_txnid + 1; +#if MDBX_ENABLE_REFUND + txn->tw.loose_refund_wl = 0; +#endif /* MDBX_ENABLE_REFUND */ + txn->canary = parent->canary; + parent->flags |= MDBX_TXN_HAS_CHILD; + parent->nested = txn; + txn->parent = parent; + txn->owner = parent->owner; + txn->tw.troika = parent->tw.troika; + + txn->cursors[FREE_DBI] = nullptr; + txn->cursors[MAIN_DBI] = nullptr; + txn->dbi_state[FREE_DBI] = parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + txn->dbi_state[MAIN_DBI] = parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); + memset(txn->dbi_state + CORE_DBS, 0, (txn->n_dbi = parent->n_dbi) - CORE_DBS); + memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS); + + tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->parent ? parent->parent->tw.dirtyroom : parent->env->options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit)); + env->txn = txn; + tASSERT(parent, parent->cursors[FREE_DBI] == nullptr); + rc = parent->cursors[MAIN_DBI] ? cursor_shadow(parent->cursors[MAIN_DBI], txn, MAIN_DBI) : MDBX_SUCCESS; + if (AUDIT_ENABLED() && ASSERT_ENABLED()) { + txn->signature = txn_signature; + tASSERT(txn, audit_ex(txn, 0, false) == 0); + } + if (unlikely(rc != MDBX_SUCCESS)) + txn_end(txn, TXN_END_FAIL_BEGINCHILD); + } else { /* MDBX_TXN_RDONLY */ + txn->dbi_seqs = ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0])); +#if MDBX_ENABLE_DBI_SPARSE + txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes); +#endif /* MDBX_ENABLE_DBI_SPARSE */ + renew: + rc = txn_renew(txn, flags); + } + + if (unlikely(rc != MDBX_SUCCESS)) { + if (txn != env->basal_txn) + osal_free(txn); + } else { + if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) + eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); + else if (flags & MDBX_TXN_RDONLY) + eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + /* Win32: SRWL flag */ txn_shrink_allowed)) == 0); + else { + eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | MDBX_NOMETASYNC | + MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); + assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); + } + txn->signature = txn_signature; + txn->userctx = context; + *ret = txn; + DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, + (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root, + txn->dbs[FREE_DBI].root); + } + + return LOG_IFERR(rc); +} + +int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { + STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR - MDBX_TXN_PARKED); + const uint64_t ts_0 = latency ? osal_monotime() : 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; + + int rc = check_txn(txn, MDBX_TXN_FINISHED); + if (unlikely(rc != MDBX_SUCCESS)) { + if (rc == MDBX_BAD_TXN && (txn->flags & MDBX_TXN_RDONLY)) { + rc = MDBX_RESULT_TRUE; + goto fail; + } + bailout: + if (latency) + memset(latency, 0, sizeof(*latency)); + return LOG_IFERR(rc); + } + + MDBX_env *const env = txn->env; + if (MDBX_ENV_CHECKPID && unlikely(env->pid != osal_getpid())) { + env->flags |= ENV_FATAL_ERROR; + rc = MDBX_PANIC; + goto bailout; + } + + if (unlikely(txn->flags & MDBX_TXN_ERROR)) { + rc = MDBX_RESULT_TRUE; + goto fail; + } + + /* txn_end() mode for a commit which writes nothing */ + unsigned end_mode = TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE; + if (unlikely(txn->flags & MDBX_TXN_RDONLY)) + goto done; + + if ((txn->flags & MDBX_NOSTICKYTHREADS) && unlikely(txn->owner != osal_thread_self())) { + rc = MDBX_THREAD_MISMATCH; + goto fail; + } + + if (txn->nested) { + rc = mdbx_txn_commit_ex(txn->nested, nullptr); + tASSERT(txn, txn->nested == nullptr); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + + if (unlikely(txn != env->txn)) { + DEBUG("%s", "attempt to commit unknown transaction"); + rc = MDBX_EINVAL; + goto fail; + } + + if (txn->parent) { + tASSERT(txn, audit_ex(txn, 0, false) == 0); + eASSERT(env, txn != env->basal_txn); + MDBX_txn *const parent = txn->parent; + eASSERT(env, parent->signature == txn_signature); + eASSERT(env, parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, dpl_check(txn)); + + if (txn->tw.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && parent->n_dbi == txn->n_dbi) { + TXN_FOREACH_DBI_ALL(txn, i) { + tASSERT(txn, (txn->dbi_state[i] & DBI_DIRTY) == 0); + if ((txn->dbi_state[i] & DBI_STALE) && !(parent->dbi_state[i] & DBI_STALE)) + tASSERT(txn, memcmp(&parent->dbs[i], &txn->dbs[i], sizeof(tree_t)) == 0); + } + + tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0); + tASSERT(txn, memcmp(&parent->canary, &txn->canary, sizeof(parent->canary)) == 0); + tASSERT(txn, !txn->tw.spilled.list || MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); + tASSERT(txn, txn->tw.loose_count == 0); + + /* fast completion of pure nested transaction */ + VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid); + end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE; + goto done; + } + + /* Preserve space for spill list to avoid parent's state corruption + * if allocation fails. */ + const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages; + tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + const size_t retired_delta = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; + if (retired_delta) { + rc = pnl_need(&txn->tw.relist, retired_delta); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { + rc = pnl_need(&parent->tw.spilled.list, MDBX_PNL_GETSIZE(txn->tw.spilled.list)); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + spill_purge(txn); + } + + if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent && + !dpl_reserve(parent, txn->tw.dirtylist->length + parent->tw.dirtylist->length))) { + rc = MDBX_ENOMEM; + goto fail; + } + + //------------------------------------------------------------------------- + + parent->tw.gc.reclaimed = txn->tw.gc.reclaimed; + txn->tw.gc.reclaimed = nullptr; + + parent->tw.retired_pages = txn->tw.retired_pages; + txn->tw.retired_pages = nullptr; + + pnl_free(parent->tw.relist); + parent->tw.relist = txn->tw.relist; + txn->tw.relist = nullptr; + parent->tw.gc.time_acc = txn->tw.gc.time_acc; + parent->tw.gc.last_reclaimed = txn->tw.gc.last_reclaimed; + + parent->geo = txn->geo; + parent->canary = txn->canary; + parent->flags |= txn->flags & MDBX_TXN_DIRTY; + + /* Move loose pages to parent */ +#if MDBX_ENABLE_REFUND + parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; +#endif /* MDBX_ENABLE_REFUND */ + parent->tw.loose_count = txn->tw.loose_count; + parent->tw.loose_pages = txn->tw.loose_pages; + + /* Merge our cursors into parent's and close them */ + txn_done_cursors(txn, true); + end_mode |= TXN_END_EOTDONE; + + /* Update parent's DBs array */ + eASSERT(env, parent->n_dbi == txn->n_dbi); + TXN_FOREACH_DBI_ALL(txn, dbi) { + if (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) { + parent->dbs[dbi] = txn->dbs[dbi]; + /* preserve parent's status */ + const uint8_t state = txn->dbi_state[dbi] | (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); + DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still", + parent->dbi_state[dbi], state); + parent->dbi_state[dbi] = state; + } else { + eASSERT(env, txn->dbi_state[dbi] == (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))); + } + } + + if (latency) { + ts_1 = osal_monotime(); + ts_2 = /* no gc-update */ ts_1; + ts_3 = /* no audit */ ts_2; + ts_4 = /* no write */ ts_3; + ts_5 = /* no sync */ ts_4; + } + txn_merge(parent, txn, parent_retired_len); + env->txn = parent; + parent->nested = nullptr; + tASSERT(parent, dpl_check(parent)); + +#if MDBX_ENABLE_REFUND + txn_refund(parent); + if (ASSERT_ENABLED()) { + /* Check parent's loose pages not suitable for refund */ + for (page_t *lp = parent->tw.loose_pages; lp; lp = page_next(lp)) { + tASSERT(parent, lp->pgno < parent->tw.loose_refund_wl && lp->pgno + 1 < parent->geo.first_unallocated); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); + VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); + } + /* Check parent's reclaimed pages not suitable for refund */ + if (MDBX_PNL_GETSIZE(parent->tw.relist)) + tASSERT(parent, MDBX_PNL_MOST(parent->tw.relist) + 1 < parent->geo.first_unallocated); + } +#endif /* MDBX_ENABLE_REFUND */ + + txn->signature = 0; + osal_free(txn); + tASSERT(parent, audit_ex(parent, 0, false) == 0); + rc = MDBX_SUCCESS; + goto provide_latency; + } + + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->parent ? txn->parent->tw.dirtyroom : env->options.dp_limit)); + } + txn_done_cursors(txn, false); + end_mode |= TXN_END_EOTDONE; + + if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && + (txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { + TXN_FOREACH_DBI_ALL(txn, i) { tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); } +#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT + rc = txn_end(txn, end_mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = MDBX_RESULT_TRUE; + goto provide_latency; +#else + goto done; +#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ + } + + DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, (void *)txn, + (void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root); + + if (txn->n_dbi > CORE_DBS) { + /* Update table root pointers */ + cursor_couple_t cx; + rc = cursor_init(&cx.outer, txn, MAIN_DBI); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + cx.outer.next = txn->cursors[MAIN_DBI]; + txn->cursors[MAIN_DBI] = &cx.outer; + TXN_FOREACH_DBI_USER(txn, i) { + if ((txn->dbi_state[i] & DBI_DIRTY) == 0) + continue; + tree_t *const db = &txn->dbs[i]; + DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->mod_txnid, txn->txnid); + /* Может быть mod_txnid > front после коммита вложенных тразакций */ + db->mod_txnid = txn->txnid; + MDBX_val data = {db, sizeof(tree_t)}; + rc = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE); + if (unlikely(rc != MDBX_SUCCESS)) { + txn->cursors[MAIN_DBI] = cx.outer.next; + goto fail; + } + } + txn->cursors[MAIN_DBI] = cx.outer.next; + } + + ts_1 = latency ? osal_monotime() : 0; + + gcu_t gcu_ctx; + gc_cputime = latency ? osal_cputime(nullptr) : 0; + rc = gc_update_init(txn, &gcu_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = gc_update(txn, &gcu_ctx); + gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + tASSERT(txn, txn->tw.loose_count == 0); + txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[FREE_DBI].mod_txnid; + + txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[MAIN_DBI].mod_txnid; + + ts_2 = latency ? osal_monotime() : 0; + ts_3 = ts_2; + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); + ts_3 = osal_monotime(); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + + bool need_flush_for_nometasync = false; + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + const uint32_t meta_sync_txnid = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed); + /* sync prev meta */ + if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { + /* Исправление унаследованного от LMDB недочета: + * + * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате fdatasync() при записи данных этой транзакции. + * + * Всё хорошо, если все процессы работающие с БД используют WRITEMAP + * без MDBX_AVOID_MSYNC. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате msync() при записи данных этой транзакции. + * + * Если же в процессах работающих с БД используется оба метода, как sync() + * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то + * становится невозможным обеспечить фиксацию на диске мета-страницы + * предыдущей транзакции и данных текущей транзакции, за счет одной + * sync-операцией выполняемой после записи данных текущей транзакции. + * Соответственно, требуется явно обновлять мета-страницу, что полностью + * уничтожает выгоду от NOMETASYNC. */ + const uint32_t txnid_dist = ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) ? MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; + /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() + * или msync() для гарантированной фиксации на диске мета-страницы, + * которая была "лениво" отправлена на запись в предыдущей транзакции, + * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ + if ( +#if defined(_WIN32) || defined(_WIN64) + !env->ioring.overlapped_fd && +#endif + meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) + need_flush_for_nometasync = true; + else { + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "presync-meta", rc); + goto fail; + } + } + } + + if (txn->tw.dirtylist) { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.loose_count == 0); + + mdbx_filehandle_t fd = +#if defined(_WIN32) || defined(_WIN64) + env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd; + (void)need_flush_for_nometasync; +#else + (need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE || + txn->tw.dirtylist->length > env->options.writethrough_threshold || + atomic_load64(&env->lck->unsynced_pages, mo_Relaxed)) + ? env->lazy_fd + : env->dsync_fd; +#endif /* Windows */ + + iov_ctx_t write_ctx; + rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, txn->tw.dirtylist->pages_including_loose, fd, false); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "iov-init", rc); + goto fail; + } + + rc = txn_write(txn, &write_ctx); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "write", rc); + goto fail; + } + } else { + tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages; + if (!env->lck->eoos_timestamp.weak) + env->lck->eoos_timestamp.weak = osal_monotime(); + } + + /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ + ts_4 = latency ? osal_monotime() : 0; + + meta_t meta; + memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8); + meta.reserve16 = head.ptr_c->reserve16; + meta.validator_id = head.ptr_c->validator_id; + meta.extra_pagehdr = head.ptr_c->extra_pagehdr; + unaligned_poke_u64(4, meta.pages_retired, + unaligned_peek_u64(4, head.ptr_c->pages_retired) + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + meta.geometry = txn->geo; + meta.trees.gc = txn->dbs[FREE_DBI]; + meta.trees.main = txn->dbs[MAIN_DBI]; + meta.canary = txn->canary; + memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid)); + + txnid_t commit_txnid = txn->txnid; +#if MDBX_ENABLE_BIGFOOT + if (gcu_ctx.bigfoot > txn->txnid) { + commit_txnid = gcu_ctx.bigfoot; + TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid)); + } +#endif + meta.unsafe_sign = DATASIGN_NONE; + meta_set_txnid(env, &meta, commit_txnid); + + rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, &txn->tw.troika); + + ts_5 = latency ? osal_monotime() : 0; + if (unlikely(rc != MDBX_SUCCESS)) { + env->flags |= ENV_FATAL_ERROR; + ERROR("txn-%s: error %d", "sync", rc); + goto fail; + } + + end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE; + +done: + if (latency) + txn_take_gcprof(txn, latency); + rc = txn_end(txn, end_mode); + +provide_latency: + if (latency) { + latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc_wallclock = (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0; + latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + const uint64_t ts_6 = osal_monotime(); + latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; + latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); + } + return LOG_IFERR(rc); + +fail: + txn->flags |= MDBX_TXN_ERROR; + if (latency) + txn_take_gcprof(txn, latency); + txn_abort(txn); + goto provide_latency; +} + +int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { + int rc = check_txn(txn, MDBX_TXN_FINISHED); + if (unlikely(rc != MDBX_SUCCESS)) + return LOG_IFERR(rc); + + if (unlikely(!info)) + return LOG_IFERR(MDBX_EINVAL); + + MDBX_env *const env = txn->env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->pid != osal_getpid())) { + env->flags |= ENV_FATAL_ERROR; + return LOG_IFERR(MDBX_PANIC); + } +#endif /* MDBX_ENV_CHECKPID */ + + info->txn_id = txn->txnid; + info->txn_space_used = pgno2bytes(env, txn->geo.first_unallocated); + + if (txn->flags & MDBX_TXN_RDONLY) { + meta_ptr_t head; + uint64_t head_retired; + troika_t troika = meta_tap(env); + do { + /* fetch info from volatile head */ + head = meta_recent(env, &troika); + head_retired = unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired); + info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->geometry.now); + info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->geometry.upper); + info->txn_space_leftover = pgno2bytes(env, head.ptr_v->geometry.now - head.ptr_v->geometry.first_unallocated); + } while (unlikely(meta_should_retry(env, &troika))); + + info->txn_reader_lag = head.txnid - info->txn_id; + info->txn_space_dirty = info->txn_space_retired = 0; + uint64_t reader_snapshot_pages_retired = 0; + if (txn->to.reader && + ((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->to.reader->tid) != MDBX_TID_TXN_OUSTED) && + head_retired > + (reader_snapshot_pages_retired = atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed))) { + info->txn_space_dirty = info->txn_space_retired = + pgno2bytes(env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); + + size_t retired_next_reader = 0; + lck_t *const lck = env->lck_mmap.lck; + if (scan_rlt && info->txn_reader_lag > 1 && lck) { + /* find next more recent reader */ + txnid_t next_reader = head.txnid; + const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease); + for (size_t i = 0; i < snap_nreaders; ++i) { + retry: + if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) { + jitter4testing(true); + const uint64_t snap_tid = safe64_read(&lck->rdt[i].tid); + const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid); + const uint64_t snap_retired = atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_AcquireRelease); + if (unlikely(snap_retired != atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed)) || + snap_txnid != safe64_read(&lck->rdt[i].txnid) || snap_tid != safe64_read(&lck->rdt[i].tid)) + goto retry; + if (snap_txnid <= txn->txnid) { + retired_next_reader = 0; + break; + } + if (snap_txnid < next_reader && snap_tid >= MDBX_TID_TXN_OUSTED) { + next_reader = snap_txnid; + retired_next_reader = pgno2bytes( + env, (pgno_t)(snap_retired - atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed))); } } - /* В LMDB флажок MDBX_CURRENT здесь приведет - * к замене данных без учета MDBX_DUPSORT сортировки, - * но здесь это в любом случае допустимо, так как мы - * проверили что для ключа есть только одно значение. */ } } - - if (is_modifable(txn, page)) { - if (new_data && cmp_lenfast(&present_data, new_data) == 0) { - /* если данные совпадают, то ничего делать не надо */ - *old_data = *new_data; - goto bailout; + info->txn_space_dirty = retired_next_reader; + } + } else { + info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now); + info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper); + info->txn_space_retired = + pgno2bytes(env, txn->nested ? (size_t)txn->tw.retired_pages : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); + info->txn_space_dirty = + pgno2bytes(env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : (txn->tw.writemap_dirty_npages + txn->tw.writemap_spilled_npages)); + info->txn_reader_lag = INT64_MAX; + lck_t *const lck = env->lck_mmap.lck; + if (scan_rlt && lck) { + txnid_t oldest_snapshot = txn->txnid; + const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease); + if (snap_nreaders) { + oldest_snapshot = txn_snapshot_oldest(txn); + if (oldest_snapshot == txn->txnid - 1) { + /* check if there is at least one reader */ + bool exists = false; + for (size_t i = 0; i < snap_nreaders; ++i) { + if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->txnid > safe64_read(&lck->rdt[i].txnid)) { + exists = true; + break; + } + } + oldest_snapshot += !exists; } - rc = preserver ? preserver(preserver_context, old_data, present_data.iov_base, present_data.iov_len) - : MDBX_SUCCESS; - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } else { - *old_data = present_data; } - flags |= MDBX_CURRENT; + info->txn_reader_lag = txn->txnid - oldest_snapshot; } } - if (likely(new_data)) - rc = cursor_put_checklen(&cx.outer, key, new_data, flags); - else - rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); - -bailout: - txn->cursors[dbi] = cx.outer.next; - return LOG_IFERR(rc); -} - -static int default_value_preserver(void *context, MDBX_val *target, const void *src, size_t bytes) { - (void)context; - if (unlikely(target->iov_len < bytes)) { - target->iov_base = nullptr; - target->iov_len = bytes; - return MDBX_RESULT_TRUE; - } - memcpy(target->iov_base, src, target->iov_len = bytes); return MDBX_SUCCESS; } - -int mdbx_replace(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *new_data, MDBX_val *old_data, - MDBX_put_flags_t flags) { - return mdbx_replace_ex(txn, dbi, key, new_data, old_data, flags, default_value_preserver, nullptr); -} diff --git a/src/dbi.c b/src/dbi.c index 60b32d8a..7716e6d2 100644 --- a/src/dbi.c +++ b/src/dbi.c @@ -132,7 +132,7 @@ __noinline int dbi_import(MDBX_txn *txn, const size_t dbi) { return MDBX_BAD_DBI; } -static int defer_and_release(MDBX_env *const env, defer_free_item_t *const chain) { +int dbi_defer_release(MDBX_env *const env, defer_free_item_t *const chain) { size_t length = 0; defer_free_item_t *obsolete_chain = nullptr; #if MDBX_ENABLE_DBI_LOCKFREE @@ -229,7 +229,7 @@ int dbi_update(MDBX_txn *txn, int keep) { eASSERT(env, !env->dbs_flags[i] && !env->kvs[i].name.iov_len && !env->kvs[i].name.iov_base); } env->n_dbi = (unsigned)i; - defer_and_release(env, defer_chain); + dbi_defer_release(env, defer_chain); } return MDBX_SUCCESS; } @@ -594,25 +594,7 @@ int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned user_flags, MDB return rc; } -static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, MDBX_db_flags_t flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - MDBX_val thunk, *name; - if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || name_cstr == MDBX_CHK_META) - name = (void *)name_cstr; - else { - thunk.iov_len = strlen(name_cstr); - thunk.iov_base = (void *)name_cstr; - name = &thunk; - } - return dbi_open(txn, name, flags, dbi, keycmp, datacmp); -} - -struct dbi_rename_result { - defer_free_item_t *defer; - int err; -}; - -__cold static struct dbi_rename_result dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) { +__cold struct dbi_rename_result dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name) { struct dbi_rename_result pair; pair.defer = nullptr; pair.err = dbi_check(txn, dbi); @@ -690,259 +672,6 @@ static defer_free_item_t *dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { return defer_item; } -/*----------------------------------------------------------------------------*/ -/* API */ - -int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi) { - return LOG_IFERR(dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr)); -} - -int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi) { - return LOG_IFERR(dbi_open(txn, name, flags, dbi, nullptr, nullptr)); -} - -int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, - MDBX_cmp_func *datacmp) { - return LOG_IFERR(dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp)); -} - -int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, - MDBX_cmp_func *datacmp) { - return LOG_IFERR(dbi_open(txn, name, flags, dbi, keycmp, datacmp)); -} - -__cold int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (txn->dbs[dbi].height) { - cx.outer.next = txn->cursors[dbi]; - txn->cursors[dbi] = &cx.outer; - rc = tree_drop(&cx.outer, dbi == MAIN_DBI || (cx.outer.tree->flags & MDBX_DUPSORT)); - txn->cursors[dbi] = cx.outer.next; - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - } - - /* Invalidate the dropped DB's cursors */ - for (MDBX_cursor *mc = txn->cursors[dbi]; mc; mc = mc->next) - be_poor(mc); - - if (!del || dbi < CORE_DBS) { - /* reset the DB record, mark it dirty */ - txn->dbi_state[dbi] |= DBI_DIRTY; - txn->dbs[dbi].height = 0; - txn->dbs[dbi].branch_pages = 0; - txn->dbs[dbi].leaf_pages = 0; - txn->dbs[dbi].large_pages = 0; - txn->dbs[dbi].items = 0; - txn->dbs[dbi].root = P_INVALID; - txn->dbs[dbi].sequence = 0; - /* txn->dbs[dbi].mod_txnid = txn->txnid; */ - txn->flags |= MDBX_TXN_DIRTY; - return MDBX_SUCCESS; - } - - MDBX_env *const env = txn->env; - MDBX_val name = env->kvs[dbi].name; - rc = cursor_init(&cx.outer, txn, MAIN_DBI); - if (likely(rc == MDBX_SUCCESS)) { - rc = cursor_seek(&cx.outer, &name, nullptr, MDBX_SET).err; - if (likely(rc == MDBX_SUCCESS)) { - cx.outer.next = txn->cursors[MAIN_DBI]; - txn->cursors[MAIN_DBI] = &cx.outer; - rc = cursor_del(&cx.outer, N_TREE); - txn->cursors[MAIN_DBI] = cx.outer.next; - if (likely(rc == MDBX_SUCCESS)) { - tASSERT(txn, txn->dbi_state[MAIN_DBI] & DBI_DIRTY); - tASSERT(txn, txn->flags & MDBX_TXN_DIRTY); - txn->dbi_state[dbi] = DBI_LINDO | DBI_OLDEN; - rc = osal_fastmutex_acquire(&env->dbi_lock); - if (likely(rc == MDBX_SUCCESS)) - return LOG_IFERR(defer_and_release(env, dbi_close_locked(env, dbi))); - } - } - } - txn->flags |= MDBX_TXN_ERROR; - return LOG_IFERR(rc); -} - -__cold int mdbx_dbi_rename(MDBX_txn *txn, MDBX_dbi dbi, const char *name_cstr) { - MDBX_val thunk, *name; - if (name_cstr == MDBX_CHK_MAIN || name_cstr == MDBX_CHK_GC || name_cstr == MDBX_CHK_META) - name = (void *)name_cstr; - else { - thunk.iov_len = strlen(name_cstr); - thunk.iov_base = (void *)name_cstr; - name = &thunk; - } - return LOG_IFERR(mdbx_dbi_rename2(txn, dbi, name)); -} - -int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(dbi < CORE_DBS)) - return (dbi == MAIN_DBI) ? MDBX_SUCCESS : LOG_IFERR(MDBX_BAD_DBI); - - if (unlikely(dbi >= env->max_dbi)) - return LOG_IFERR(MDBX_BAD_DBI); - - if (unlikely(dbi < CORE_DBS || dbi >= env->max_dbi)) - return LOG_IFERR(MDBX_BAD_DBI); - - rc = osal_fastmutex_acquire(&env->dbi_lock); - if (likely(rc == MDBX_SUCCESS && dbi < env->n_dbi)) { - retry: - if (env->basal_txn && (env->dbs_flags[dbi] & DB_VALID) && (env->basal_txn->flags & MDBX_TXN_FINISHED) == 0) { - /* LY: Опасный код, так как env->txn может быть изменено в другом потоке. - * К сожалению тут нет надежного решения и может быть падение при неверном - * использовании API (вызове mdbx_dbi_close конкурентно с завершением - * пишущей транзакции). - * - * Для минимизации вероятности падения сначала проверяем dbi-флаги - * в basal_txn, а уже после в env->txn. Таким образом, падение может быть - * только при коллизии с завершением вложенной транзакции. - * - * Альтернативно можно попробовать выполнять обновление/put записи в - * mainDb соответствующей таблице закрываемого хендла. Семантически это - * верный путь, но проблема в текущем API, в котором исторически dbi-хендл - * живет и закрывается вне транзакции. Причем проблема не только в том, - * что нет указателя на текущую пишущую транзакцию, а в том что - * пользователь точно не ожидает что закрытие хендла приведет к - * скрытой/непрозрачной активности внутри транзакции потенциально - * выполняемой в другом потоке. Другими словами, проблема может быть - * только при неверном использовании API и если пользователь это - * допускает, то точно не будет ожидать скрытых действий внутри - * транзакции, и поэтому этот путь потенциально более опасен. */ - const MDBX_txn *const hazard = env->txn; - osal_compiler_barrier(); - if ((dbi_state(env->basal_txn, dbi) & (DBI_LINDO | DBI_DIRTY | DBI_CREAT)) > DBI_LINDO) { - bailout_dirty_dbi: - osal_fastmutex_release(&env->dbi_lock); - return LOG_IFERR(MDBX_DANGLING_DBI); - } - osal_memory_barrier(); - if (unlikely(hazard != env->txn)) - goto retry; - if (hazard != env->basal_txn && hazard && (hazard->flags & MDBX_TXN_FINISHED) == 0 && - hazard->signature == txn_signature && - (dbi_state(hazard, dbi) & (DBI_LINDO | DBI_DIRTY | DBI_CREAT)) > DBI_LINDO) - goto bailout_dirty_dbi; - osal_compiler_barrier(); - if (unlikely(hazard != env->txn)) - goto retry; - } - rc = defer_and_release(env, dbi_close_locked(env, dbi)); - } - return LOG_IFERR(rc); -} - -int mdbx_dbi_flags_ex(const MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags, unsigned *state) { - if (unlikely(!flags || !state)) - return LOG_IFERR(MDBX_EINVAL); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR - MDBX_TXN_PARKED); - if (unlikely(rc != MDBX_SUCCESS)) { - *flags = 0; - *state = 0; - return LOG_IFERR(rc); - } - - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) { - *flags = 0; - *state = 0; - return LOG_IFERR(rc); - } - - *flags = txn->dbs[dbi].flags & DB_PERSISTENT_FLAGS; - *state = txn->dbi_state[dbi] & (DBI_FRESH | DBI_CREAT | DBI_DIRTY | DBI_STALE); - return MDBX_SUCCESS; -} - -__cold int mdbx_dbi_rename2(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *new_name) { - int rc = check_txn_rw(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(new_name == MDBX_CHK_MAIN || new_name->iov_base == MDBX_CHK_MAIN || new_name == MDBX_CHK_GC || - new_name->iov_base == MDBX_CHK_GC || new_name == MDBX_CHK_META || new_name->iov_base == MDBX_CHK_META)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(dbi < CORE_DBS)) - return LOG_IFERR(MDBX_EINVAL); - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - rc = osal_fastmutex_acquire(&txn->env->dbi_lock); - if (likely(rc == MDBX_SUCCESS)) { - struct dbi_rename_result pair = dbi_rename_locked(txn, dbi, *new_name); - if (pair.defer) - pair.defer->next = nullptr; - defer_and_release(txn->env, pair.defer); - rc = pair.err; - } - return LOG_IFERR(rc); -} - -static void stat_get(const tree_t *db, MDBX_stat *st, size_t bytes) { - st->ms_depth = db->height; - st->ms_branch_pages = db->branch_pages; - st->ms_leaf_pages = db->leaf_pages; - st->ms_overflow_pages = db->large_pages; - st->ms_entries = db->items; - if (likely(bytes >= offsetof(MDBX_stat, ms_mod_txnid) + sizeof(st->ms_mod_txnid))) - st->ms_mod_txnid = db->mod_txnid; -} - -__cold int mdbx_dbi_stat(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, size_t bytes) { - if (unlikely(!dest)) - return LOG_IFERR(MDBX_EINVAL); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - rc = dbi_check(txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - const size_t size_before_modtxnid = offsetof(MDBX_stat, ms_mod_txnid); - if (unlikely(bytes != sizeof(MDBX_stat)) && bytes != size_before_modtxnid) { - rc = MDBX_EINVAL; - goto bailout; - } - - if (unlikely(txn->flags & MDBX_TXN_BLOCKED)) { - rc = MDBX_BAD_TXN; - goto bailout; - } - - if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) { - rc = tbl_fetch((MDBX_txn *)txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - - dest->ms_psize = txn->env->ps; - stat_get(&txn->dbs[dbi], dest, bytes); - return MDBX_SUCCESS; - -bailout: - memset(dest, 0, bytes); - return LOG_IFERR(rc); -} - __cold const tree_t *dbi_dig(const MDBX_txn *txn, const size_t dbi, tree_t *fallback) { const MDBX_txn *dig = txn; do { @@ -966,58 +695,4 @@ __cold const tree_t *dbi_dig(const MDBX_txn *txn, const size_t dbi, tree_t *fall return fallback; } -__cold int mdbx_enumerate_tables(const MDBX_txn *txn, MDBX_table_enum_func *func, void *ctx) { - if (unlikely(!func)) - return LOG_IFERR(MDBX_EINVAL); - - int rc = check_txn(txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - cx.outer.next = txn->cursors[MAIN_DBI]; - txn->cursors[MAIN_DBI] = &cx.outer; - for (rc = outer_first(&cx.outer, nullptr, nullptr); rc == MDBX_SUCCESS; - rc = outer_next(&cx.outer, nullptr, nullptr, MDBX_NEXT_NODUP)) { - node_t *node = page_node(cx.outer.pg[cx.outer.top], cx.outer.ki[cx.outer.top]); - if (node_flags(node) != N_TREE) - continue; - if (unlikely(node_ds(node) != sizeof(tree_t))) { - ERROR("%s/%d: %s %u", "MDBX_CORRUPTED", MDBX_CORRUPTED, "invalid dupsort sub-tree node size", - (unsigned)node_ds(node)); - rc = MDBX_CORRUPTED; - break; - } - - tree_t reside; - const tree_t *tree = memcpy(&reside, node_data(node), sizeof(reside)); - const MDBX_val name = {node_key(node), node_ks(node)}; - const MDBX_env *const env = txn->env; - MDBX_dbi dbi = 0; - for (size_t i = CORE_DBS; i < env->n_dbi; ++i) { - if (i >= txn->n_dbi || !(env->dbs_flags[i] & DB_VALID)) - continue; - if (env->kvs[MAIN_DBI].clc.k.cmp(&name, &env->kvs[i].name)) - continue; - - tree = dbi_dig(txn, i, &reside); - dbi = (MDBX_dbi)i; - break; - } - - MDBX_stat stat; - stat_get(tree, &stat, sizeof(stat)); - rc = func(ctx, txn, &name, tree->flags, &stat, dbi); - if (rc != MDBX_SUCCESS) - goto bailout; - } - rc = (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; - -bailout: - txn->cursors[MAIN_DBI] = cx.outer.next; - return LOG_IFERR(rc); -} +int dbi_close_release(MDBX_env *env, MDBX_dbi dbi) { return dbi_defer_release(env, dbi_close_locked(env, dbi)); } diff --git a/src/dbi.h b/src/dbi.h index c06f5bd1..c654a0fa 100644 --- a/src/dbi.h +++ b/src/dbi.h @@ -124,4 +124,18 @@ MDBX_INTERNAL int dbi_open(MDBX_txn *txn, const MDBX_val *const name, unsigned u MDBX_INTERNAL int dbi_bind(MDBX_txn *txn, const size_t dbi, unsigned user_flags, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +typedef struct defer_free_item { + struct defer_free_item *next; + uint64_t timestamp; +} defer_free_item_t; + +MDBX_INTERNAL int dbi_defer_release(MDBX_env *const env, defer_free_item_t *const chain); +MDBX_INTERNAL int dbi_close_release(MDBX_env *env, MDBX_dbi dbi); MDBX_INTERNAL const tree_t *dbi_dig(const MDBX_txn *txn, const size_t dbi, tree_t *fallback); + +struct dbi_rename_result { + defer_free_item_t *defer; + int err; +}; + +MDBX_INTERNAL struct dbi_rename_result dbi_rename_locked(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val new_name); diff --git a/src/internals.h b/src/internals.h index 2dca2d23..aae09e42 100644 --- a/src/internals.h +++ b/src/internals.h @@ -315,11 +315,6 @@ struct cursor_couple { subcur_t inner; }; -struct defer_free_item { - struct defer_free_item *next; - uint64_t timestamp; -}; - enum env_flags { /* Failed to update the meta page. Probably an I/O error. */ ENV_FATAL_ERROR = INT32_MIN /* 0x80000000 */, diff --git a/src/mvcc-readers.c b/src/mvcc-readers.c index d51576da..be0220a9 100644 --- a/src/mvcc-readers.c +++ b/src/mvcc-readers.c @@ -300,82 +300,6 @@ __cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, int *d return rc; } -int txn_park(MDBX_txn *txn, bool autounpark) { - reader_slot_t *const rslot = txn->to.reader; - tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) == MDBX_TXN_RDONLY); - tASSERT(txn, txn->to.reader->tid.weak < MDBX_TID_TXN_OUSTED); - if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != MDBX_TXN_RDONLY)) - return MDBX_BAD_TXN; - - const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed); - const uint64_t tid = atomic_load64(&rslot->tid, mo_Relaxed); - const uint64_t txnid = atomic_load64(&rslot->txnid, mo_Relaxed); - if (unlikely(pid != txn->env->pid)) { - ERROR("unexpected pid %u%s%u", pid, " != must ", txn->env->pid); - return MDBX_PROBLEM; - } - if (unlikely(tid != txn->owner || txnid != txn->txnid)) { - ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%0zx" - " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, - tid, " != must ", txn->owner, txnid, " != must ", txn->txnid); - return MDBX_BAD_RSLOT; - } - - atomic_store64(&rslot->tid, MDBX_TID_TXN_PARKED, mo_AcquireRelease); - atomic_store32(&txn->env->lck->rdt_refresh_flag, true, mo_Relaxed); - txn->flags += autounpark ? MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK : MDBX_TXN_PARKED; - return MDBX_SUCCESS; -} - -int txn_unpark(MDBX_txn *txn) { - if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != - (MDBX_TXN_RDONLY | MDBX_TXN_PARKED))) - return MDBX_BAD_TXN; - - for (reader_slot_t *const rslot = txn->to.reader; rslot; atomic_yield()) { - const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed); - uint64_t tid = safe64_read(&rslot->tid); - uint64_t txnid = safe64_read(&rslot->txnid); - if (unlikely(pid != txn->env->pid)) { - ERROR("unexpected pid %u%s%u", pid, " != expected ", txn->env->pid); - return MDBX_PROBLEM; - } - if (unlikely(tid == MDBX_TID_TXN_OUSTED || txnid >= SAFE64_INVALID_THRESHOLD)) - break; - if (unlikely(tid != MDBX_TID_TXN_PARKED || txnid != txn->txnid)) { - ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%" PRIx64 " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, tid, " != must ", - MDBX_TID_TXN_OUSTED, txnid, " != must ", txn->txnid); - break; - } - if (unlikely((txn->flags & MDBX_TXN_ERROR))) - break; - -#if MDBX_64BIT_CAS - if (unlikely(!atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, txn->owner))) - continue; -#else - atomic_store32(&rslot->tid.high, (uint32_t)((uint64_t)txn->owner >> 32), mo_Relaxed); - if (unlikely(!atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)txn->owner))) { - atomic_store32(&rslot->tid.high, (uint32_t)(MDBX_TID_TXN_PARKED >> 32), mo_AcquireRelease); - continue; - } -#endif - txnid = safe64_read(&rslot->txnid); - tid = safe64_read(&rslot->tid); - if (unlikely(txnid != txn->txnid || tid != txn->owner)) { - ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%zx" - " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, - tid, " != must ", txn->owner, txnid, " != must ", txn->txnid); - break; - } - txn->flags &= ~(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK); - return MDBX_SUCCESS; - } - - int err = txn_end(txn, TXN_END_OUSTED | TXN_END_RESET | TXN_END_UPDATE); - return err ? err : MDBX_OUSTED; -} - __cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) { DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler); osal_memory_fence(mo_AcquireRelease, false); @@ -488,64 +412,3 @@ __cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) { } return oldest; } - -/*----------------------------------------------------------------------------*/ - -__cold int mdbx_thread_register(const MDBX_env *env) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!env->lck_mmap.lck)) - return LOG_IFERR((env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM); - - if (unlikely((env->flags & ENV_TXKEY) == 0)) { - eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS); - return LOG_IFERR(MDBX_EINVAL) /* MDBX_NOSTICKYTHREADS mode */; - } - - eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY); - reader_slot_t *r = thread_rthc_get(env->me_txkey); - if (unlikely(r != nullptr)) { - eASSERT(env, r->pid.weak == env->pid); - eASSERT(env, r->tid.weak == osal_thread_self()); - if (unlikely(r->pid.weak != env->pid)) - return LOG_IFERR(MDBX_BAD_RSLOT); - return MDBX_RESULT_TRUE /* already registered */; - } - - return LOG_IFERR(mvcc_bind_slot((MDBX_env *)env).err); -} - -__cold int mdbx_thread_unregister(const MDBX_env *env) { - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!env->lck_mmap.lck)) - return MDBX_RESULT_TRUE; - - if (unlikely((env->flags & ENV_TXKEY) == 0)) { - eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS); - return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */; - } - - eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY); - reader_slot_t *r = thread_rthc_get(env->me_txkey); - if (unlikely(r == nullptr)) - return MDBX_RESULT_TRUE /* not registered */; - - eASSERT(env, r->pid.weak == env->pid); - eASSERT(env, r->tid.weak == osal_thread_self()); - if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self())) - return LOG_IFERR(MDBX_BAD_RSLOT); - - eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD); - if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD)) - return LOG_IFERR(MDBX_BUSY) /* transaction is still active */; - - atomic_store32(&r->pid, 0, mo_Relaxed); - atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease); - thread_rthc_set(env->me_txkey, nullptr); - return MDBX_SUCCESS; -} diff --git a/src/proto.h b/src/proto.h index d4cc67f4..bb8d1386 100644 --- a/src/proto.h +++ b/src/proto.h @@ -46,6 +46,7 @@ MDBX_INTERNAL int txn_renew(MDBX_txn *txn, unsigned flags); MDBX_INTERNAL int txn_park(MDBX_txn *txn, bool autounpark); MDBX_INTERNAL int txn_unpark(MDBX_txn *txn); MDBX_INTERNAL int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits); +MDBX_INTERNAL void txn_done_cursors(MDBX_txn *txn, const bool merge); #define TXN_END_NAMES \ {"committed", "empty-commit", "abort", "reset", "fail-begin", "fail-beginchild", "ousted", nullptr} @@ -67,6 +68,8 @@ enum { }; MDBX_INTERNAL int txn_end(MDBX_txn *txn, unsigned mode); MDBX_INTERNAL int txn_write(MDBX_txn *txn, iov_ctx_t *ctx); +MDBX_INTERNAL void txn_take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency); +MDBX_INTERNAL void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len); /* env.c */ MDBX_INTERNAL int env_open(MDBX_env *env, mdbx_mode_t mode); diff --git a/src/tree.c b/src/tree-ops.c similarity index 100% rename from src/tree.c rename to src/tree-ops.c diff --git a/src/page-search.c b/src/tree-search.c similarity index 100% rename from src/page-search.c rename to src/tree-search.c diff --git a/src/txn.c b/src/txn.c index 38e53e5a..3ac333fc 100644 --- a/src/txn.c +++ b/src/txn.c @@ -7,7 +7,7 @@ __hot txnid_t txn_snapshot_oldest(const MDBX_txn *const txn) { return mvcc_shapshot_oldest(txn->env, txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); } -static void done_cursors(MDBX_txn *txn, const bool merge) { +void txn_done_cursors(MDBX_txn *txn, const bool merge) { tASSERT(txn, txn->cursors[FREE_DBI] == nullptr); TXN_FOREACH_DBI_FROM(txn, i, /* skip FREE_DBI */ 1) { MDBX_cursor *mc = txn->cursors[i]; @@ -65,7 +65,7 @@ int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { } /* Merge child txn into parent */ -static void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) { +void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t parent_retired_len) { tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0); dpl_t *const src = dpl_sort(txn); @@ -395,7 +395,7 @@ static void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, const size_t } } -static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { +void txn_take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_env *const env = txn->env; if (MDBX_ENABLE_PROFGC) { pgop_stat_t *const ptr = &env->lck->pgops; @@ -432,409 +432,6 @@ static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); } -int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { - STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR - MDBX_TXN_PARKED); - const uint64_t ts_0 = latency ? osal_monotime() : 0; - uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; - - int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_BAD_TXN && (txn->flags & MDBX_TXN_RDONLY)) { - rc = MDBX_RESULT_TRUE; - goto fail; - } - bailout: - if (latency) - memset(latency, 0, sizeof(*latency)); - return LOG_IFERR(rc); - } - - MDBX_env *const env = txn->env; - if (MDBX_ENV_CHECKPID && unlikely(env->pid != osal_getpid())) { - env->flags |= ENV_FATAL_ERROR; - rc = MDBX_PANIC; - goto bailout; - } - - if (unlikely(txn->flags & MDBX_TXN_ERROR)) { - rc = MDBX_RESULT_TRUE; - goto fail; - } - - /* txn_end() mode for a commit which writes nothing */ - unsigned end_mode = TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE; - if (unlikely(txn->flags & MDBX_TXN_RDONLY)) - goto done; - - if ((txn->flags & MDBX_NOSTICKYTHREADS) && unlikely(txn->owner != osal_thread_self())) { - rc = MDBX_THREAD_MISMATCH; - goto fail; - } - - if (txn->nested) { - rc = mdbx_txn_commit_ex(txn->nested, nullptr); - tASSERT(txn, txn->nested == nullptr); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - - if (unlikely(txn != env->txn)) { - DEBUG("%s", "attempt to commit unknown transaction"); - rc = MDBX_EINVAL; - goto fail; - } - - if (txn->parent) { - tASSERT(txn, audit_ex(txn, 0, false) == 0); - eASSERT(env, txn != env->basal_txn); - MDBX_txn *const parent = txn->parent; - eASSERT(env, parent->signature == txn_signature); - eASSERT(env, parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0); - eASSERT(env, dpl_check(txn)); - - if (txn->tw.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) && parent->n_dbi == txn->n_dbi) { - TXN_FOREACH_DBI_ALL(txn, i) { - tASSERT(txn, (txn->dbi_state[i] & DBI_DIRTY) == 0); - if ((txn->dbi_state[i] & DBI_STALE) && !(parent->dbi_state[i] & DBI_STALE)) - tASSERT(txn, memcmp(&parent->dbs[i], &txn->dbs[i], sizeof(tree_t)) == 0); - } - - tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0); - tASSERT(txn, memcmp(&parent->canary, &txn->canary, sizeof(parent->canary)) == 0); - tASSERT(txn, !txn->tw.spilled.list || MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); - tASSERT(txn, txn->tw.loose_count == 0); - - /* fast completion of pure nested transaction */ - VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid); - end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE; - goto done; - } - - /* Preserve space for spill list to avoid parent's state corruption - * if allocation fails. */ - const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages; - tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - const size_t retired_delta = MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; - if (retired_delta) { - rc = pnl_need(&txn->tw.relist, retired_delta); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - - if (txn->tw.spilled.list) { - if (parent->tw.spilled.list) { - rc = pnl_need(&parent->tw.spilled.list, MDBX_PNL_GETSIZE(txn->tw.spilled.list)); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - spill_purge(txn); - } - - if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent && - !dpl_reserve(parent, txn->tw.dirtylist->length + parent->tw.dirtylist->length))) { - rc = MDBX_ENOMEM; - goto fail; - } - - //------------------------------------------------------------------------- - - parent->tw.gc.reclaimed = txn->tw.gc.reclaimed; - txn->tw.gc.reclaimed = nullptr; - - parent->tw.retired_pages = txn->tw.retired_pages; - txn->tw.retired_pages = nullptr; - - pnl_free(parent->tw.relist); - parent->tw.relist = txn->tw.relist; - txn->tw.relist = nullptr; - parent->tw.gc.time_acc = txn->tw.gc.time_acc; - parent->tw.gc.last_reclaimed = txn->tw.gc.last_reclaimed; - - parent->geo = txn->geo; - parent->canary = txn->canary; - parent->flags |= txn->flags & MDBX_TXN_DIRTY; - - /* Move loose pages to parent */ -#if MDBX_ENABLE_REFUND - parent->tw.loose_refund_wl = txn->tw.loose_refund_wl; -#endif /* MDBX_ENABLE_REFUND */ - parent->tw.loose_count = txn->tw.loose_count; - parent->tw.loose_pages = txn->tw.loose_pages; - - /* Merge our cursors into parent's and close them */ - done_cursors(txn, true); - end_mode |= TXN_END_EOTDONE; - - /* Update parent's DBs array */ - eASSERT(env, parent->n_dbi == txn->n_dbi); - TXN_FOREACH_DBI_ALL(txn, dbi) { - if (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) { - parent->dbs[dbi] = txn->dbs[dbi]; - /* preserve parent's status */ - const uint8_t state = txn->dbi_state[dbi] | (parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi, (parent->dbi_state[dbi] != state) ? "update" : "still", - parent->dbi_state[dbi], state); - parent->dbi_state[dbi] = state; - } else { - eASSERT(env, txn->dbi_state[dbi] == (parent->dbi_state[dbi] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY))); - } - } - - if (latency) { - ts_1 = osal_monotime(); - ts_2 = /* no gc-update */ ts_1; - ts_3 = /* no audit */ ts_2; - ts_4 = /* no write */ ts_3; - ts_5 = /* no sync */ ts_4; - } - txn_merge(parent, txn, parent_retired_len); - env->txn = parent; - parent->nested = nullptr; - tASSERT(parent, dpl_check(parent)); - -#if MDBX_ENABLE_REFUND - txn_refund(parent); - if (ASSERT_ENABLED()) { - /* Check parent's loose pages not suitable for refund */ - for (page_t *lp = parent->tw.loose_pages; lp; lp = page_next(lp)) { - tASSERT(parent, lp->pgno < parent->tw.loose_refund_wl && lp->pgno + 1 < parent->geo.first_unallocated); - MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); - VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); - } - /* Check parent's reclaimed pages not suitable for refund */ - if (MDBX_PNL_GETSIZE(parent->tw.relist)) - tASSERT(parent, MDBX_PNL_MOST(parent->tw.relist) + 1 < parent->geo.first_unallocated); - } -#endif /* MDBX_ENABLE_REFUND */ - - txn->signature = 0; - osal_free(txn); - tASSERT(parent, audit_ex(parent, 0, false) == 0); - rc = MDBX_SUCCESS; - goto provide_latency; - } - - if (!txn->tw.dirtylist) { - tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - } else { - tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->parent ? txn->parent->tw.dirtyroom : env->options.dp_limit)); - } - done_cursors(txn, false); - end_mode |= TXN_END_EOTDONE; - - if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && - (txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { - TXN_FOREACH_DBI_ALL(txn, i) { tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY)); } -#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT - rc = txn_end(txn, end_mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = MDBX_RESULT_TRUE; - goto provide_latency; -#else - goto done; -#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ - } - - DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, (void *)txn, - (void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root); - - if (txn->n_dbi > CORE_DBS) { - /* Update table root pointers */ - cursor_couple_t cx; - rc = cursor_init(&cx.outer, txn, MAIN_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - cx.outer.next = txn->cursors[MAIN_DBI]; - txn->cursors[MAIN_DBI] = &cx.outer; - TXN_FOREACH_DBI_USER(txn, i) { - if ((txn->dbi_state[i] & DBI_DIRTY) == 0) - continue; - tree_t *const db = &txn->dbs[i]; - DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN " -> %" PRIaTXN, i, db->mod_txnid, txn->txnid); - /* Может быть mod_txnid > front после коммита вложенных тразакций */ - db->mod_txnid = txn->txnid; - MDBX_val data = {db, sizeof(tree_t)}; - rc = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE); - if (unlikely(rc != MDBX_SUCCESS)) { - txn->cursors[MAIN_DBI] = cx.outer.next; - goto fail; - } - } - txn->cursors[MAIN_DBI] = cx.outer.next; - } - - ts_1 = latency ? osal_monotime() : 0; - - gcu_t gcu_ctx; - gc_cputime = latency ? osal_cputime(nullptr) : 0; - rc = gc_update_init(txn, &gcu_ctx); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - rc = gc_update(txn, &gcu_ctx); - gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - - tASSERT(txn, txn->tw.loose_count == 0); - txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[FREE_DBI].mod_txnid; - - txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY) ? txn->txnid : txn->dbs[MAIN_DBI].mod_txnid; - - ts_2 = latency ? osal_monotime() : 0; - ts_3 = ts_2; - if (AUDIT_ENABLED()) { - rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); - ts_3 = osal_monotime(); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; - } - - bool need_flush_for_nometasync = false; - const meta_ptr_t head = meta_recent(env, &txn->tw.troika); - const uint32_t meta_sync_txnid = atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed); - /* sync prev meta */ - if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { - /* Исправление унаследованного от LMDB недочета: - * - * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. - * Тогда мета-страница (обновленная, но не сброшенная на диск) будет - * сохранена в результате fdatasync() при записи данных этой транзакции. - * - * Всё хорошо, если все процессы работающие с БД используют WRITEMAP - * без MDBX_AVOID_MSYNC. - * Тогда мета-страница (обновленная, но не сброшенная на диск) будет - * сохранена в результате msync() при записи данных этой транзакции. - * - * Если же в процессах работающих с БД используется оба метода, как sync() - * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то - * становится невозможным обеспечить фиксацию на диске мета-страницы - * предыдущей транзакции и данных текущей транзакции, за счет одной - * sync-операцией выполняемой после записи данных текущей транзакции. - * Соответственно, требуется явно обновлять мета-страницу, что полностью - * уничтожает выгоду от NOMETASYNC. */ - const uint32_t txnid_dist = ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) ? MDBX_NOMETASYNC_LAZY_FD - : MDBX_NOMETASYNC_LAZY_WRITEMAP; - /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() - * или msync() для гарантированной фиксации на диске мета-страницы, - * которая была "лениво" отправлена на запись в предыдущей транзакции, - * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ - if ( -#if defined(_WIN32) || defined(_WIN64) - !env->ioring.overlapped_fd && -#endif - meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) - need_flush_for_nometasync = true; - else { - rc = meta_sync(env, head); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "presync-meta", rc); - goto fail; - } - } - } - - if (txn->tw.dirtylist) { - tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - tASSERT(txn, txn->tw.loose_count == 0); - - mdbx_filehandle_t fd = -#if defined(_WIN32) || defined(_WIN64) - env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd; - (void)need_flush_for_nometasync; -#else - (need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE || - txn->tw.dirtylist->length > env->options.writethrough_threshold || - atomic_load64(&env->lck->unsynced_pages, mo_Relaxed)) - ? env->lazy_fd - : env->dsync_fd; -#endif /* Windows */ - - iov_ctx_t write_ctx; - rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, txn->tw.dirtylist->pages_including_loose, fd, false); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "iov-init", rc); - goto fail; - } - - rc = txn_write(txn, &write_ctx); - if (unlikely(rc != MDBX_SUCCESS)) { - ERROR("txn-%s: error %d", "write", rc); - goto fail; - } - } else { - tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); - env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages; - if (!env->lck->eoos_timestamp.weak) - env->lck->eoos_timestamp.weak = osal_monotime(); - } - - /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ - ts_4 = latency ? osal_monotime() : 0; - - meta_t meta; - memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8); - meta.reserve16 = head.ptr_c->reserve16; - meta.validator_id = head.ptr_c->validator_id; - meta.extra_pagehdr = head.ptr_c->extra_pagehdr; - unaligned_poke_u64(4, meta.pages_retired, - unaligned_peek_u64(4, head.ptr_c->pages_retired) + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - meta.geometry = txn->geo; - meta.trees.gc = txn->dbs[FREE_DBI]; - meta.trees.main = txn->dbs[MAIN_DBI]; - meta.canary = txn->canary; - memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid)); - - txnid_t commit_txnid = txn->txnid; -#if MDBX_ENABLE_BIGFOOT - if (gcu_ctx.bigfoot > txn->txnid) { - commit_txnid = gcu_ctx.bigfoot; - TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, (size_t)(commit_txnid - txn->txnid)); - } -#endif - meta.unsafe_sign = DATASIGN_NONE; - meta_set_txnid(env, &meta, commit_txnid); - - rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta, &txn->tw.troika); - - ts_5 = latency ? osal_monotime() : 0; - if (unlikely(rc != MDBX_SUCCESS)) { - env->flags |= ENV_FATAL_ERROR; - ERROR("txn-%s: error %d", "sync", rc); - goto fail; - } - - end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE; - -done: - if (latency) - take_gcprof(txn, latency); - rc = txn_end(txn, end_mode); - -provide_latency: - if (latency) { - latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc_wallclock = (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0; - latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - const uint64_t ts_6 = osal_monotime(); - latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; - latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); - } - return LOG_IFERR(rc); - -fail: - txn->flags |= MDBX_TXN_ERROR; - if (latency) - take_gcprof(txn, latency); - txn_abort(txn); - goto provide_latency; -} - int txn_abort(MDBX_txn *txn) { if (txn->flags & MDBX_TXN_RDONLY) /* LY: don't close DBI-handles */ @@ -1260,7 +857,7 @@ int txn_end(MDBX_txn *txn, unsigned mode) { txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root); if (!(mode & TXN_END_EOTDONE)) /* !(already closed cursors) */ - done_cursors(txn, false); + txn_done_cursors(txn, false); int rc = MDBX_SUCCESS; if (txn->flags & MDBX_TXN_RDONLY) { @@ -1388,461 +985,6 @@ int txn_end(MDBX_txn *txn, unsigned mode) { return rc; } -/*----------------------------------------------------------------------------*/ - -int mdbx_txn_renew(MDBX_txn *txn) { - if (unlikely(!txn)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(txn->signature != txn_signature)) - return LOG_IFERR(MDBX_EBADSIGN); - - if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) - return LOG_IFERR(MDBX_EINVAL); - - if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) { - int rc = mdbx_txn_reset(txn); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - - int rc = txn_renew(txn, MDBX_TXN_RDONLY); - if (rc == MDBX_SUCCESS) { - tASSERT(txn, txn->owner == (txn->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self()); - DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, - (txn->flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)txn->env, txn->dbs[MAIN_DBI].root, - txn->dbs[FREE_DBI].root); - } - return LOG_IFERR(rc); -} - -int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) { - int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - txn->userctx = ctx; - return MDBX_SUCCESS; -} - -void *mdbx_txn_get_userctx(const MDBX_txn *txn) { return check_txn(txn, MDBX_TXN_FINISHED) ? nullptr : txn->userctx; } - -int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) { - if (unlikely(!ret)) - return LOG_IFERR(MDBX_EINVAL); - *ret = nullptr; - - if (unlikely((flags & ~txn_rw_begin_flags) && (parent || (flags & ~txn_ro_begin_flags)))) - return LOG_IFERR(MDBX_EINVAL); - - int rc = check_env(env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */ - return LOG_IFERR(MDBX_EACCESS); - - MDBX_txn *txn = nullptr; - if (parent) { - /* Nested transactions: Max 1 child, write txns only, no writemap */ - rc = check_txn_rw(parent, MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (env->options.spill_parent4child_denominator) { - /* Spill dirty-pages of parent to provide dirtyroom for child txn */ - rc = txn_spill(parent, nullptr, parent->tw.dirtylist->length / env->options.spill_parent4child_denominator); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - } - tASSERT(parent, audit_ex(parent, 0, false) == 0); - - flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS | MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP); - } else if ((flags & MDBX_TXN_RDONLY) == 0) { - /* Reuse preallocated write txn. However, do not touch it until - * txn_renew() succeeds, since it currently may be active. */ - txn = env->basal_txn; - goto renew; - } - - const intptr_t bitmap_bytes = -#if MDBX_ENABLE_DBI_SPARSE - ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) / CHAR_BIT; -#else - 0; -#endif /* MDBX_ENABLE_DBI_SPARSE */ - STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to)); - const size_t base = - (flags & MDBX_TXN_RDONLY) ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) : sizeof(MDBX_txn); - const size_t size = base + - ((flags & MDBX_TXN_RDONLY) ? (size_t)bitmap_bytes + env->max_dbi * sizeof(txn->dbi_seqs[0]) : 0) + - env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) + sizeof(txn->dbi_state[0])); - txn = osal_malloc(size); - if (unlikely(txn == nullptr)) - return LOG_IFERR(MDBX_ENOMEM); -#if MDBX_DEBUG - memset(txn, 0xCD, size); - VALGRIND_MAKE_MEM_UNDEFINED(txn, size); -#endif /* MDBX_DEBUG */ - MDBX_ANALYSIS_ASSUME(size > base); - memset(txn, 0, (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base); - txn->dbs = ptr_disp(txn, base); - txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0])); -#if MDBX_DEBUG - txn->cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */ -#endif - txn->dbi_state = ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0])); - txn->flags = flags; - txn->env = env; - - if (parent) { - tASSERT(parent, dpl_check(parent)); -#if MDBX_ENABLE_DBI_SPARSE - txn->dbi_sparse = parent->dbi_sparse; -#endif /* MDBX_ENABLE_DBI_SPARSE */ - txn->dbi_seqs = parent->dbi_seqs; - txn->geo = parent->geo; - rc = dpl_alloc(txn); - if (likely(rc == MDBX_SUCCESS)) { - const size_t len = MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; - txn->tw.relist = pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.relist)) - rc = MDBX_ENOMEM; - } - if (unlikely(rc != MDBX_SUCCESS)) { - nested_failed: - pnl_free(txn->tw.relist); - dpl_free(txn); - osal_free(txn); - return LOG_IFERR(rc); - } - - /* Move loose pages to reclaimed list */ - if (parent->tw.loose_count) { - do { - page_t *lp = parent->tw.loose_pages; - tASSERT(parent, lp->flags == P_LOOSE); - rc = pnl_insert_span(&parent->tw.relist, lp->pgno, 1); - if (unlikely(rc != MDBX_SUCCESS)) - goto nested_failed; - MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *)); - VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *)); - parent->tw.loose_pages = page_next(lp); - /* Remove from dirty list */ - page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1); - } while (parent->tw.loose_pages); - parent->tw.loose_count = 0; -#if MDBX_ENABLE_REFUND - parent->tw.loose_refund_wl = 0; -#endif /* MDBX_ENABLE_REFUND */ - tASSERT(parent, dpl_check(parent)); - } - txn->tw.dirtyroom = parent->tw.dirtyroom; - txn->tw.dirtylru = parent->tw.dirtylru; - - dpl_sort(parent); - if (parent->tw.spilled.list) - spill_purge(parent); - - tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= MDBX_PNL_GETSIZE(parent->tw.relist)); - memcpy(txn->tw.relist, parent->tw.relist, MDBX_PNL_SIZEOF(parent->tw.relist)); - eASSERT(env, pnl_check_allocated(txn->tw.relist, (txn->geo.first_unallocated /* LY: intentional assignment - here, only for assertion */ - = parent->geo.first_unallocated) - - MDBX_ENABLE_REFUND)); - - txn->tw.gc.time_acc = parent->tw.gc.time_acc; - txn->tw.gc.last_reclaimed = parent->tw.gc.last_reclaimed; - if (parent->tw.gc.reclaimed) { - txn->tw.gc.reclaimed = parent->tw.gc.reclaimed; - parent->tw.gc.reclaimed = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.gc.reclaimed); - } - - txn->tw.retired_pages = parent->tw.retired_pages; - parent->tw.retired_pages = (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages); - - txn->txnid = parent->txnid; - txn->front_txnid = parent->front_txnid + 1; -#if MDBX_ENABLE_REFUND - txn->tw.loose_refund_wl = 0; -#endif /* MDBX_ENABLE_REFUND */ - txn->canary = parent->canary; - parent->flags |= MDBX_TXN_HAS_CHILD; - parent->nested = txn; - txn->parent = parent; - txn->owner = parent->owner; - txn->tw.troika = parent->tw.troika; - - txn->cursors[FREE_DBI] = nullptr; - txn->cursors[MAIN_DBI] = nullptr; - txn->dbi_state[FREE_DBI] = parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - txn->dbi_state[MAIN_DBI] = parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - memset(txn->dbi_state + CORE_DBS, 0, (txn->n_dbi = parent->n_dbi) - CORE_DBS); - memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS); - - tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->parent ? parent->parent->tw.dirtyroom : parent->env->options.dp_limit)); - tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->parent ? txn->parent->tw.dirtyroom : txn->env->options.dp_limit)); - env->txn = txn; - tASSERT(parent, parent->cursors[FREE_DBI] == nullptr); - rc = parent->cursors[MAIN_DBI] ? cursor_shadow(parent->cursors[MAIN_DBI], txn, MAIN_DBI) : MDBX_SUCCESS; - if (AUDIT_ENABLED() && ASSERT_ENABLED()) { - txn->signature = txn_signature; - tASSERT(txn, audit_ex(txn, 0, false) == 0); - } - if (unlikely(rc != MDBX_SUCCESS)) - txn_end(txn, TXN_END_FAIL_BEGINCHILD); - } else { /* MDBX_TXN_RDONLY */ - txn->dbi_seqs = ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0])); -#if MDBX_ENABLE_DBI_SPARSE - txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes); -#endif /* MDBX_ENABLE_DBI_SPARSE */ - renew: - rc = txn_renew(txn, flags); - } - - if (unlikely(rc != MDBX_SUCCESS)) { - if (txn != env->basal_txn) - osal_free(txn); - } else { - if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) - eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); - else if (flags & MDBX_TXN_RDONLY) - eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | - /* Win32: SRWL flag */ txn_shrink_allowed)) == 0); - else { - eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP | txn_shrink_allowed | MDBX_NOMETASYNC | - MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); - assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); - } - txn->signature = txn_signature; - txn->userctx = context; - *ret = txn; - DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO "/%" PRIaPGNO, txn->txnid, - (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root, - txn->dbs[FREE_DBI].root); - } - - return LOG_IFERR(rc); -} - -int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { - int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if (unlikely(!info)) - return LOG_IFERR(MDBX_EINVAL); - - MDBX_env *const env = txn->env; -#if MDBX_ENV_CHECKPID - if (unlikely(env->pid != osal_getpid())) { - env->flags |= ENV_FATAL_ERROR; - return LOG_IFERR(MDBX_PANIC); - } -#endif /* MDBX_ENV_CHECKPID */ - - info->txn_id = txn->txnid; - info->txn_space_used = pgno2bytes(env, txn->geo.first_unallocated); - - if (txn->flags & MDBX_TXN_RDONLY) { - meta_ptr_t head; - uint64_t head_retired; - troika_t troika = meta_tap(env); - do { - /* fetch info from volatile head */ - head = meta_recent(env, &troika); - head_retired = unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired); - info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->geometry.now); - info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->geometry.upper); - info->txn_space_leftover = pgno2bytes(env, head.ptr_v->geometry.now - head.ptr_v->geometry.first_unallocated); - } while (unlikely(meta_should_retry(env, &troika))); - - info->txn_reader_lag = head.txnid - info->txn_id; - info->txn_space_dirty = info->txn_space_retired = 0; - uint64_t reader_snapshot_pages_retired = 0; - if (txn->to.reader && - ((txn->flags & MDBX_TXN_PARKED) == 0 || safe64_read(&txn->to.reader->tid) != MDBX_TID_TXN_OUSTED) && - head_retired > - (reader_snapshot_pages_retired = atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed))) { - info->txn_space_dirty = info->txn_space_retired = - pgno2bytes(env, (pgno_t)(head_retired - reader_snapshot_pages_retired)); - - size_t retired_next_reader = 0; - lck_t *const lck = env->lck_mmap.lck; - if (scan_rlt && info->txn_reader_lag > 1 && lck) { - /* find next more recent reader */ - txnid_t next_reader = head.txnid; - const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease); - for (size_t i = 0; i < snap_nreaders; ++i) { - retry: - if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) { - jitter4testing(true); - const uint64_t snap_tid = safe64_read(&lck->rdt[i].tid); - const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid); - const uint64_t snap_retired = atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_AcquireRelease); - if (unlikely(snap_retired != atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed)) || - snap_txnid != safe64_read(&lck->rdt[i].txnid) || snap_tid != safe64_read(&lck->rdt[i].tid)) - goto retry; - if (snap_txnid <= txn->txnid) { - retired_next_reader = 0; - break; - } - if (snap_txnid < next_reader && snap_tid >= MDBX_TID_TXN_OUSTED) { - next_reader = snap_txnid; - retired_next_reader = pgno2bytes( - env, (pgno_t)(snap_retired - atomic_load64(&txn->to.reader->snapshot_pages_retired, mo_Relaxed))); - } - } - } - } - info->txn_space_dirty = retired_next_reader; - } - } else { - info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now); - info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper); - info->txn_space_retired = - pgno2bytes(env, txn->nested ? (size_t)txn->tw.retired_pages : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); - info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); - info->txn_space_dirty = - pgno2bytes(env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose - : (txn->tw.writemap_dirty_npages + txn->tw.writemap_spilled_npages)); - info->txn_reader_lag = INT64_MAX; - lck_t *const lck = env->lck_mmap.lck; - if (scan_rlt && lck) { - txnid_t oldest_snapshot = txn->txnid; - const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease); - if (snap_nreaders) { - oldest_snapshot = txn_snapshot_oldest(txn); - if (oldest_snapshot == txn->txnid - 1) { - /* check if there is at least one reader */ - bool exists = false; - for (size_t i = 0; i < snap_nreaders; ++i) { - if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) && txn->txnid > safe64_read(&lck->rdt[i].txnid)) { - exists = true; - break; - } - } - oldest_snapshot += !exists; - } - } - info->txn_reader_lag = txn->txnid - oldest_snapshot; - } - } - - return MDBX_SUCCESS; -} - -MDBX_env *mdbx_txn_env(const MDBX_txn *txn) { - if (unlikely(!txn || txn->signature != txn_signature || txn->env->signature.weak != env_signature)) - return nullptr; - return txn->env; -} - -uint64_t mdbx_txn_id(const MDBX_txn *txn) { - if (unlikely(!txn || txn->signature != txn_signature)) - return 0; - return txn->txnid; -} - -MDBX_txn_flags_t mdbx_txn_flags(const MDBX_txn *txn) { - STATIC_ASSERT( - (MDBX_TXN_INVALID & (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | MDBX_TXN_HAS_CHILD | - txn_gc_drained | txn_shrink_allowed | txn_rw_begin_flags | txn_ro_begin_flags)) == 0); - if (unlikely(!txn || txn->signature != txn_signature)) - return MDBX_TXN_INVALID; - assert(0 == (int)(txn->flags & MDBX_TXN_INVALID)); - - MDBX_txn_flags_t flags = txn->flags; - if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->to.reader && - safe64_read(&txn->to.reader->tid) == MDBX_TID_TXN_OUSTED) - flags |= MDBX_TXN_OUSTED; - return flags; -} - -int mdbx_txn_reset(MDBX_txn *txn) { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - /* This call is only valid for read-only txns */ - if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) - return LOG_IFERR(MDBX_EINVAL); - - /* LY: don't close DBI-handles */ - rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); - if (rc == MDBX_SUCCESS) { - tASSERT(txn, txn->signature == txn_signature); - tASSERT(txn, txn->owner == 0); - } - return LOG_IFERR(rc); -} - -int mdbx_txn_break(MDBX_txn *txn) { - do { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - txn->flags |= MDBX_TXN_ERROR; - if (txn->flags & MDBX_TXN_RDONLY) - break; - txn = txn->nested; - } while (txn); - return MDBX_SUCCESS; -} - -int mdbx_txn_abort(MDBX_txn *txn) { - int rc = check_txn(txn, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - rc = check_env(txn->env, true); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - - if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) == MDBX_NOSTICKYTHREADS && - unlikely(txn->owner != osal_thread_self())) { - mdbx_txn_break(txn); - return LOG_IFERR(MDBX_THREAD_MISMATCH); - } - - return LOG_IFERR(txn_abort(txn)); -} - -int mdbx_txn_park(MDBX_txn *txn, bool autounpark) { - STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_ERROR); - int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0)) - return LOG_IFERR(MDBX_TXN_INVALID); - - if (unlikely((txn->flags & MDBX_TXN_ERROR))) { - rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE); - return LOG_IFERR(rc ? rc : MDBX_OUSTED); - } - - return LOG_IFERR(txn_park(txn, autounpark)); -} - -int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) { - STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_PARKED + MDBX_TXN_ERROR); - int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED - MDBX_TXN_ERROR); - if (unlikely(rc != MDBX_SUCCESS)) - return LOG_IFERR(rc); - if (unlikely(!F_ISSET(txn->flags, MDBX_TXN_RDONLY | MDBX_TXN_PARKED))) - return MDBX_SUCCESS; - - rc = txn_unpark(txn); - if (likely(rc != MDBX_OUSTED) || !restart_if_ousted) - return LOG_IFERR(rc); - - tASSERT(txn, txn->flags & MDBX_TXN_FINISHED); - rc = txn_renew(txn, MDBX_TXN_RDONLY); - return (rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : LOG_IFERR(rc); -} - int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits) { tASSERT(txn, (bad_bits & MDBX_TXN_PARKED) && (txn->flags & bad_bits)); /* Здесь осознано заложено отличие в поведении припаркованных транзакций: @@ -1859,3 +1001,79 @@ int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits) { tASSERT(txn, bad_bits == MDBX_TXN_BLOCKED || bad_bits == MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); return mdbx_txn_unpark((MDBX_txn *)txn, false); } + +int txn_park(MDBX_txn *txn, bool autounpark) { + reader_slot_t *const rslot = txn->to.reader; + tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) == MDBX_TXN_RDONLY); + tASSERT(txn, txn->to.reader->tid.weak < MDBX_TID_TXN_OUSTED); + if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != MDBX_TXN_RDONLY)) + return MDBX_BAD_TXN; + + const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed); + const uint64_t tid = atomic_load64(&rslot->tid, mo_Relaxed); + const uint64_t txnid = atomic_load64(&rslot->txnid, mo_Relaxed); + if (unlikely(pid != txn->env->pid)) { + ERROR("unexpected pid %u%s%u", pid, " != must ", txn->env->pid); + return MDBX_PROBLEM; + } + if (unlikely(tid != txn->owner || txnid != txn->txnid)) { + ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%0zx" + " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, + tid, " != must ", txn->owner, txnid, " != must ", txn->txnid); + return MDBX_BAD_RSLOT; + } + + atomic_store64(&rslot->tid, MDBX_TID_TXN_PARKED, mo_AcquireRelease); + atomic_store32(&txn->env->lck->rdt_refresh_flag, true, mo_Relaxed); + txn->flags += autounpark ? MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK : MDBX_TXN_PARKED; + return MDBX_SUCCESS; +} + +int txn_unpark(MDBX_txn *txn) { + if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != + (MDBX_TXN_RDONLY | MDBX_TXN_PARKED))) + return MDBX_BAD_TXN; + + for (reader_slot_t *const rslot = txn->to.reader; rslot; atomic_yield()) { + const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed); + uint64_t tid = safe64_read(&rslot->tid); + uint64_t txnid = safe64_read(&rslot->txnid); + if (unlikely(pid != txn->env->pid)) { + ERROR("unexpected pid %u%s%u", pid, " != expected ", txn->env->pid); + return MDBX_PROBLEM; + } + if (unlikely(tid == MDBX_TID_TXN_OUSTED || txnid >= SAFE64_INVALID_THRESHOLD)) + break; + if (unlikely(tid != MDBX_TID_TXN_PARKED || txnid != txn->txnid)) { + ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%" PRIx64 " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, tid, " != must ", + MDBX_TID_TXN_OUSTED, txnid, " != must ", txn->txnid); + break; + } + if (unlikely((txn->flags & MDBX_TXN_ERROR))) + break; + +#if MDBX_64BIT_CAS + if (unlikely(!atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, txn->owner))) + continue; +#else + atomic_store32(&rslot->tid.high, (uint32_t)((uint64_t)txn->owner >> 32), mo_Relaxed); + if (unlikely(!atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)txn->owner))) { + atomic_store32(&rslot->tid.high, (uint32_t)(MDBX_TID_TXN_PARKED >> 32), mo_AcquireRelease); + continue; + } +#endif + txnid = safe64_read(&rslot->txnid); + tid = safe64_read(&rslot->tid); + if (unlikely(txnid != txn->txnid || tid != txn->owner)) { + ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%zx" + " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, + tid, " != must ", txn->owner, txnid, " != must ", txn->txnid); + break; + } + txn->flags &= ~(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK); + return MDBX_SUCCESS; + } + + int err = txn_end(txn, TXN_END_OUSTED | TXN_END_RESET | TXN_END_UPDATE); + return err ? err : MDBX_OUSTED; +}