diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt index ae692c27..28a0b2b4 100644 --- a/.github/actions/spelling/expect.txt +++ b/.github/actions/spelling/expect.txt @@ -423,6 +423,7 @@ eot EOTDONE EOWNERDEAD EPERM +EPIPE erasevolume EREMOTE EROFS @@ -1429,12 +1430,14 @@ sigemptyset SIGHUP SIGINT SIGKILL +sigmask SIGPIPE sigprocmask SIGSEGV sigset SIGTERM sigusr +sigwait singlemode singleprocess sizeof diff --git a/ChangeLog.md b/ChangeLog.md index bfa3eadb..2613bd34 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -27,6 +27,8 @@ Fixes: - Fixed copy&paste typos. - Fixed minor false-positive GCC warning. - Added workaround for broken `DEFINE_ENUM_FLAG_OPERATORS` from Windows SDK. + - Fixed cursor state after multimap/dupsort repeated deletes (https://github.com/erthink/libmdbx/issues/121). + - Added `SIGPIPE` suppression for internal thread during `mdbx_env_copy()`. ## v0.9.1 2020-09-30 diff --git a/mdbx.h b/mdbx.h index 2093522d..08790666 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2833,9 +2833,9 @@ struct MDBX_txn_info { uint64_t txn_id; /** For READ-ONLY transaction: the lag from a recent MVCC-snapshot, i.e. the - number of committed transaction since read transaction started. For WRITE - transaction (provided if `scan_rlt=true`): the lag of the oldest reader - from current transaction (i.e. at least 1 if any reader running). */ + number of committed transaction since read transaction started. + For WRITE transaction (provided if `scan_rlt=true`): the lag of the oldest + reader from current transaction (i.e. at least 1 if any reader running). */ uint64_t txn_reader_lag; /** Used space by this transaction, i.e. corresponding to the last used @@ -2859,7 +2859,8 @@ struct MDBX_txn_info { /** For READ-ONLY transaction: the space available for writer(s) and that must be exhausted for reason to call the Handle-Slow-Readers callback for - this read transaction. For WRITE transaction: the space inside transaction + this read transaction. 
+ For WRITE transaction: the space inside transaction that left to `MDBX_TXN_FULL` error. */ uint64_t txn_space_leftover; diff --git a/src/bits.md b/src/bits.md index 4a18c00a..ba7e4eaa 100644 --- a/src/bits.md +++ b/src/bits.md @@ -1,8 +1,8 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | --|---------|-----------|--------------|----------|-----------|------------|---------|----------| -0 |0000 0001| |TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH -1 |0000 0002| |TXN_ERROR |REVERSEKEY| |DBI_STALE |F_SUBDATA|P_LEAF -2 |0000 0004| |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW +0 |0000 0001|ALLOC_CACHE|TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH +1 |0000 0002|ALLOC_GC |TXN_ERROR |REVERSEKEY| |DBI_STALE |F_SUBDATA|P_LEAF +2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW 3 |0000 0008| |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META 4 |0000 0010| |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_DIRTY 5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 diff --git a/src/core.c b/src/core.c index 806c77d6..b6dee5c0 100644 --- a/src/core.c +++ b/src/core.c @@ -4166,7 +4166,7 @@ static int mdbx_pages_xkeep(MDBX_cursor *mc, unsigned pflags, bool all) { MDBX_txn *txn = mc->mc_txn; MDBX_cursor *m3, *m0 = mc; MDBX_xcursor *mx; - MDBX_page *dp, *mp; + MDBX_page *mp; unsigned i, j; int rc = MDBX_SUCCESS; @@ -4204,11 +4204,8 @@ mark_done: pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - int level; - if (unlikely((rc = mdbx_page_get(m0, pgno, &dp, &level, - txn->mt_txnid)) != MDBX_SUCCESS)) - break; - if ((dp->mp_flags & Mask) == pflags && level <= 1) + MDBX_page *dp = mdbx_dpl_find(txn->tw.dirtylist, pgno); + if (dp && (dp->mp_flags & Mask) == pflags) dp->mp_flags ^= P_KEEP; } } @@ -5108,12 +5105,6 @@ skip_cache: const unsigned wanna_range = num - 1; while (true) { /* hsr-kick retry loop */ - /* If our dirty list is already full, we can't do anything */ - if (unlikely(txn->tw.dirtyroom == 0)) { - 
rc = MDBX_TXN_FULL; - goto fail; - } - MDBX_cursor_couple recur; for (MDBX_cursor_op op = MDBX_FIRST;; op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { @@ -5154,6 +5145,11 @@ skip_cache: } if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ + if (unlikely(txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth) && + !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)) { + /* If our dirty list is already full, we can't touch GC */ + flags &= ~MDBX_ALLOC_GC; + } if (unlikely(!(flags & MDBX_ALLOC_GC))) break /* reclaiming is prohibited for now */; @@ -5264,6 +5260,21 @@ skip_cache: goto fail; } const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); + /* TODO: provide a user-configurable threshold */ + const unsigned threshold_2_stop_gc_reclaiming = MDBX_PNL_MAX / 2; + if (unlikely(gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > + threshold_2_stop_gc_reclaiming)) { + /* Stop reclaiming to avoid overflow the page list. + * This is a rare case while search for a continuously multi-page region + * in a large database. https://github.com/erthink/libmdbx/issues/123 */ + flags -= MDBX_ALLOC_GC; + if (unlikely(flags == 0)) { + /* Oh, we can't do anything */ + rc = MDBX_TXN_FULL; + goto fail; + } + break; + } rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -6606,7 +6617,8 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); /* Copy parent's mt_dbistate, but clear DB_NEW */ for (unsigned i = 0; i < txn->mt_numdbs; i++) - txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT); + txn->mt_dbistate[i] = + parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); mdbx_tassert(parent, parent->mt_parent || parent->tw.dirtyroom + parent->tw.dirtylist->length == @@ -7491,9 +7503,12 @@ retry_noaccount: env->me_maxgc_ov1page) { /* LY: need just a txn-id for save page list. 
*/ - couple.outer.mc_flags &= ~C_RECLAIMING; bool need_cleanup = false; + txnid_t snap_oldest; + retry_rid: + couple.outer.mc_flags &= ~C_RECLAIMING; do { + snap_oldest = mdbx_find_oldest(txn); rc = mdbx_page_alloc(&couple.outer, 0, NULL, MDBX_ALLOC_GC); if (likely(rc == MDBX_SUCCESS)) { mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, @@ -7521,9 +7536,13 @@ retry_noaccount: gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { mdbx_tassert(txn, txn->tw.last_reclaimed == 0); + if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) + /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + * if the oldest reader changes since the last attempt */ + goto retry_rid; /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ - txn->tw.last_reclaimed = gc_rid = mdbx_find_oldest(txn) - 1; + txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, gc_rid); } @@ -8074,6 +8093,26 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_txn *const parent = txn->mt_parent; mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + if (txn->tw.dirtylist->length == 0 && + (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { + for (int i = txn->mt_numdbs; --i >= 0;) { + mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + if ((txn->mt_dbistate[i] & DBI_STALE) && + !(parent->mt_dbistate[i] & DBI_STALE)) + mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); + } + + mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + mdbx_tassert(txn, parent->mt_numdbs == txn->mt_numdbs); + + end_mode = MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE; + goto done; + } + /* Preserve space for spill list to avoid parent's state corruption * if allocation fails. 
*/ if (txn->tw.spill_pages && parent->tw.spill_pages) { @@ -11649,9 +11688,6 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data) { - DKBUF; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY(key)); - int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -11670,20 +11706,13 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, if (unlikely(rc != MDBX_SUCCESS)) return rc; - MDBX_val save_data = *data; int exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_RANGE, &exact); + rc = mdbx_cursor_set( + &cx.outer, key, data, + cx.outer.mc_xcursor ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE, &exact); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (exact && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) != 0) { - *data = save_data; - exact = 0; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_GET_BOTH_RANGE, &exact); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - return exact ? 
MDBX_SUCCESS : MDBX_RESULT_TRUE; } @@ -11872,18 +11901,14 @@ skip: rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (data) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { if (unlikely((rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn))) != MDBX_SUCCESS)) return rc; - - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } } get_key_optional(node, key); @@ -11951,6 +11976,9 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, " with %u keys, key index %u", mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + if (unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; + if (IS_LEAF2(mp)) { if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; @@ -11959,25 +11987,20 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_SUCCESS; } - mdbx_cassert(mc, IS_LEAF(mp)); node = page_node(mp, mc->mc_ki[mc->mc_top]); if (F_ISSET(node_flags(node), F_DUPDATA)) { rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (data) { + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { if (unlikely((rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn))) != MDBX_SUCCESS)) return rc; - - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } } get_key_optional(node, key); @@ -12106,7 +12129,7 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (!mc->mc_top) { /* There are no other pages */ mc->mc_ki[mc->mc_top] = 0; - if (op == MDBX_SET_RANGE && exactp == &stub_exactp) { + if (op == MDBX_SET_RANGE) { 
rc = 0; goto set1; } else @@ -12125,7 +12148,7 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, set2: node = mdbx_node_search(mc, &aligned_key, exactp); - if (exactp != &stub_exactp && !*exactp) { + if (!*exactp && !(op == MDBX_SET_RANGE || op == MDBX_GET_BOTH_RANGE)) { /* MDBX_SET specified and not an exact match. */ return MDBX_NOTFOUND; } @@ -12158,19 +12181,18 @@ set1: rc = mdbx_xcursor_init1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - } - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - } else { - int ex2 = 0, *ex2p = (op == MDBX_GET_BOTH) ? &ex2 : NULL; - rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE, ex2p); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } - } else if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { + if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + } else { + int dummy = 0; + rc = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_SET_RANGE, + (op == MDBX_GET_BOTH_RANGE) ? 
exactp : &dummy); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + } else if (likely(data)) { + if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { mdbx_cassert(mc, !"Invalid data-size"); @@ -12207,6 +12229,7 @@ set1: if (rc) { if (op != MDBX_GET_BOTH_RANGE || rc > 0) return MDBX_NOTFOUND; + *exactp = 0; rc = 0; } *data = olddata; @@ -12250,28 +12273,29 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_ki[mc->mc_top] = 0; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + } return MDBX_SUCCESS; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } + get_key_optional(node, key); return MDBX_SUCCESS; } @@ -12283,12 +12307,10 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { if (mc->mc_xcursor) 
mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - if (likely((mc->mc_flags & (C_EOF | C_DEL)) != C_EOF)) { - if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - } + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; } if (unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) @@ -12298,28 +12320,28 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { mc->mc_flags |= C_INITIALIZED | C_EOF; if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], - key->iov_len); + if (likely(key)) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], + mc->mc_ki[mc->mc_top], key->iov_len); + } return MDBX_SUCCESS; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (likely(data)) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); - if (unlikely(rc)) - return rc; - } else { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) - return rc; - } + if (F_ISSET(node_flags(node), F_DUPDATA)) { + rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + if (unlikely(rc)) + return rc; + } else if (likely(data)) { + if (unlikely((rc = mdbx_node_read( + mc, node, data, + pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != + MDBX_SUCCESS)) + return rc; } get_key_optional(node, key); @@ -12369,14 +12391,17 @@ int mdbx_cursor_get(MDBX_cursor *mc, 
MDBX_val *key, MDBX_val *data, rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; + } else { + rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); + if (unlikely(rc)) + return rc; } - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); } else { rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + if (unlikely(rc)) + return rc; } - if (unlikely(rc)) - return rc; } } break; @@ -12394,8 +12419,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = mdbx_cursor_set(mc, key, data, op, - op == MDBX_SET_RANGE ? NULL : &exact); + rc = mdbx_cursor_set(mc, key, data, op, &exact); break; case MDBX_GET_MULTIPLE: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) @@ -12403,8 +12427,8 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; rc = MDBX_SUCCESS; - if (!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) || - (mc->mc_xcursor->mx_cursor.mc_flags & C_EOF)) + if ((mc->mc_xcursor->mx_cursor.mc_flags & (C_INITIALIZED | C_EOF)) != + C_INITIALIZED) break; goto fetchm; case MDBX_NEXT_MULTIPLE: @@ -12518,6 +12542,7 @@ static int mdbx_cursor_touch(MDBX_cursor *mc) { if (unlikely(rc)) return rc; *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; } mc->mc_top = 0; if (mc->mc_snum) { @@ -12822,6 +12847,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, data->iov_len); } *mc->mc_dbistate |= DBI_DIRTY; + mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) np->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; @@ -13360,6 +13386,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if 
(unlikely(!IS_LEAF(mp))) + return MDBX_CORRUPTED; if (IS_LEAF2(mp)) goto del_key; @@ -13370,9 +13398,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(node_flags(node), F_SUBDATA)) { + if (!F_ISSET(node_flags(node), F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); if (unlikely(rc)) return rc; @@ -15433,7 +15460,6 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { MDBX_page *mp; indx_t ki; unsigned nkeys; - MDBX_cursor *m2, *m3; MDBX_dbi dbi = mc->mc_dbi; mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); @@ -15441,127 +15467,108 @@ static int mdbx_cursor_del0(MDBX_cursor *mc) { mp = mc->mc_pg[mc->mc_top]; mdbx_node_del(mc, mc->mc_db->md_xsize); mc->mc_db->md_entries--; - { - /* Adjust other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; + + /* Adjust other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } - rc = mdbx_rebalance(mc); - if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (unlikely(!mc->mc_snum)) { /* DB is totally empty now, just bail out. * Other cursors adjustments were already done * by mdbx_rebalance and aren't needed here. */ - if (!mc->mc_snum) { - mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_DEL | C_EOF; - return rc; - } - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && - mc->mc_db->md_entries == 0 && nkeys == 0)); - - /* Adjust THIS and other cursors pointing to mp */ - for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } else if (unlikely(rc != MDBX_SUCCESS)) - break; - } - if (m3->mc_ki[mc->mc_top] >= ki || m3->mc_pg[mc->mc_top] != mp) { - if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && - (m3->mc_flags & C_EOF) == 0) { - MDBX_node *node = - page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. */ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - break; - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - } - } - } - } - } - - if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { - rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); - if (unlikely(rc == MDBX_NOTFOUND)) { - mc->mc_flags |= C_EOF; - return MDBX_SUCCESS; - } - } - if ((mc->mc_db->md_flags & MDBX_DUPSORT) != 0 && - (mc->mc_flags & C_EOF) == 0) { - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not initd it must be reinited. - * Else if node points to a subDB, nothing is needed. 
*/ - if (node_flags(node) & F_DUPDATA) { - if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); - if (likely(rc != MDBX_SUCCESS)) - mc->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - } - } - mc->mc_flags |= C_DEL; + mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; } - if (unlikely(rc)) - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - else if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + nkeys = page_numkeys(mp); + mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; + } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + mdbx_cassert(m3, (m3->mc_flags & C_EOF) == 0); + if (m3->mc_xcursor) { + MDBX_node *node = + page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. 
+ * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. */ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + m3->mc_flags |= C_DEL; + } + } + } + + mdbx_cassert(mc, rc == MDBX_SUCCESS); + if (mdbx_audit_enabled()) + rc = mdbx_cursor_check(mc, 0); + return rc; + +bailout: + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return rc; } @@ -16189,6 +16196,13 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { uint8_t *ptr; int toggle = 0; +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + sigset_t sigset; + sigemptyset(&sigset); + sigaddset(&sigset, SIGPIPE); + my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); +#endif /* EPIPE */ + mdbx_condpair_lock(&my->mc_condpair); while (!my->mc_error) { while (!my->mc_new && !my->mc_error) { @@ -16206,6 +16220,14 @@ static THREAD_RESULT __cold THREAD_CALL mdbx_env_copythr(void *arg) { if (wsize > 0 && !my->mc_error) { int err = mdbx_write(my->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { +#if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) + if (err == EPIPE) { + /* Collect the pending SIGPIPE, + * otherwise at least OS X gives it to the process on thread-exit. 
*/ + int unused; + sigwait(&sigset, &unused); + } +#endif /* EPIPE */ my->mc_error = err; goto bailout; } @@ -17405,6 +17427,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, goto later_bailout; dbiflags |= DBI_DIRTY | DBI_CREAT; + txn->mt_flags |= MDBX_TXN_DIRTY; } /* Got info, register DBI in this txn */ diff --git a/src/debug_begin.h b/src/debug_begin.h index f3306b5d..b2b2d0bb 100644 --- a/src/debug_begin.h +++ b/src/debug_begin.h @@ -32,3 +32,5 @@ #undef mdbx_assert #define mdbx_assert(env, expr) mdbx_ensure(env, expr) + +#pragma GCC optimize("-O0") diff --git a/src/debug_end.h b/src/debug_end.h index e361119b..3d840e68 100644 --- a/src/debug_end.h +++ b/src/debug_end.h @@ -5,3 +5,5 @@ #pragma pop_macro("mdbx_warning") #pragma pop_macro("mdbx_error") #pragma pop_macro("mdbx_assert") + +#pragma GCC reset_options diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 5cd3a39c..75b7c79e 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -209,6 +209,8 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { rc = MDBX_SUCCESS; if (unlikely(rc != MDBX_SUCCESS)) error("mdbx_cursor_get", rc); + + mdbx_cursor_close(cursor); return rc; } @@ -243,7 +245,7 @@ int main(int argc, char *argv[]) { MDBX_dbi dbi; prog = argv[0]; char *envname; - char *subname = nullptr; + char *subname = nullptr, *buf4free = nullptr; unsigned envflags = 0; bool alldbs = false, list = false; @@ -389,7 +391,13 @@ int main(int argc, char *argv[]) { if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_malloc(key.iov_len + 1); + subname = mdbx_realloc(buf4free, key.iov_len + 1); + if (!subname) { + rc = MDBX_ENOMEM; + break; + } + + buf4free = subname; memcpy(subname, key.iov_base, key.iov_len); subname[key.iov_len] = '\0'; @@ -442,7 +450,6 @@ int main(int argc, char *argv[]) { break; } } - mdbx_free(subname); } mdbx_cursor_close(cursor); cursor = nullptr; @@ -476,6 +483,7 @@ txn_abort: mdbx_txn_abort(txn); env_close: 
mdbx_env_close(env); + free(buf4free); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/src/mdbx_load.c b/src/mdbx_load.c index 2c59ede6..c54049bb 100644 --- a/src/mdbx_load.c +++ b/src/mdbx_load.c @@ -118,7 +118,6 @@ static MDBX_envinfo envinfo; static int mode = GLOBAL; static MDBX_val kbuf, dbuf; -static MDBX_val k0buf; #define STRLENOF(s) (sizeof(s) - 1) @@ -481,11 +480,9 @@ int main(int argc, char *argv[]) { MDBX_cursor *mc = nullptr; MDBX_dbi dbi; char *envname = nullptr; - int envflags = MDBX_UTTERLY_NOSYNC, putflags = 0; - bool append = false; + int envflags = MDBX_UTTERLY_NOSYNC, putflags = MDBX_UPSERT; bool quiet = false; bool rescue = false; - MDBX_val prevk; prog = argv[0]; if (argc < 2) @@ -508,7 +505,7 @@ int main(int argc, char *argv[]) { mdbx_build.options); return EXIT_SUCCESS; case 'a': - append = true; + putflags |= MDBX_APPEND; break; case 'f': if (freopen(optarg, "r", stdin) == nullptr) { @@ -524,7 +521,7 @@ int main(int argc, char *argv[]) { subname = mdbx_strdup(optarg); break; case 'N': - putflags = MDBX_NOOVERWRITE | MDBX_NODUPDATA; + putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA; break; case 'T': mode |= NOHDR | PRINT; @@ -565,6 +562,11 @@ int main(int argc, char *argv[]) { dbuf.iov_len = 4096; dbuf.iov_base = mdbx_malloc(dbuf.iov_len); + if (!dbuf.iov_base) { + rc = MDBX_ENOMEM; + error("value-buffer", rc); + goto env_close; + } /* read first header for mapsize= */ if (!(mode & NOHDR)) { @@ -625,17 +627,19 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, MDBX_DUPSORT); - if (kbuf.iov_len >= INTPTR_MAX / 4) { + kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + 1; + if (kbuf.iov_len >= INTPTR_MAX / 2) { fprintf(stderr, "mdbx_env_get_maxkeysize() failed, returns %zu\n", kbuf.iov_len); goto env_close; } - kbuf.iov_len = (kbuf.iov_len + 1) * 2; - kbuf.iov_base = malloc(kbuf.iov_len * 2); - k0buf.iov_len = kbuf.iov_len; - k0buf.iov_base = (char *)kbuf.iov_base + kbuf.iov_len; - 
prevk.iov_base = k0buf.iov_base; + + kbuf.iov_base = malloc(kbuf.iov_len); + if (!kbuf.iov_base) { + rc = MDBX_ENOMEM; + error("key-buffer", rc); + goto env_close; + } while (rc == MDBX_SUCCESS) { if (user_break) { @@ -661,9 +665,10 @@ int main(int argc, char *argv[]) { } const char *const dbi_name = subname ? subname : "@MAIN"; - rc = mdbx_dbi_open_ex(txn, subname, dbi_flags | MDBX_CREATE, &dbi, - append ? equal_or_greater : nullptr, - append ? equal_or_greater : nullptr); + rc = + mdbx_dbi_open_ex(txn, subname, dbi_flags | MDBX_CREATE, &dbi, + (putflags & MDBX_APPEND) ? equal_or_greater : nullptr, + (putflags & MDBX_APPEND) ? equal_or_greater : nullptr); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_dbi_open_ex", rc); goto txn_abort; @@ -691,19 +696,17 @@ int main(int argc, char *argv[]) { } } + if (putflags & MDBX_APPEND) + putflags = (dbi_flags & MDBX_DUPSORT) ? putflags | MDBX_APPENDDUP + : putflags & ~MDBX_APPENDDUP; + rc = mdbx_cursor_open(txn, dbi, &mc); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_cursor_open", rc); goto txn_abort; } - /* if (append) { - mc->mc_flags |= C_SKIPORD; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; - } */ int batch = 0; - prevk.iov_len = 0; while (rc == MDBX_SUCCESS) { MDBX_val key, data; rc = readline(&key, &kbuf); @@ -718,18 +721,7 @@ int main(int argc, char *argv[]) { goto txn_abort; } - int appflag = 0; - if (append) { - appflag = MDBX_APPEND; - if (dbi_flags & MDBX_DUPSORT) { - if (prevk.iov_len == key.iov_len && - memcmp(prevk.iov_base, key.iov_base, key.iov_len) == 0) - appflag = MDBX_APPEND | MDBX_APPENDDUP; - else - memcpy(prevk.iov_base, key.iov_base, prevk.iov_len = key.iov_len); - } - } - rc = mdbx_cursor_put(mc, &key, &data, putflags | appflag); + rc = mdbx_cursor_put(mc, &key, &data, putflags); if (rc == MDBX_KEYEXIST && putflags) continue; if (rc == MDBX_BAD_VALSIZE && rescue) { @@ -770,11 +762,6 @@ int main(int argc, char *argv[]) { error("mdbx_cursor_open", rc); goto txn_abort; } 
- /* if (append) { - mc->mc_flags |= C_SKIPORD; - if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; - } */ } } @@ -815,6 +802,8 @@ txn_abort: mdbx_txn_abort(txn); env_close: mdbx_env_close(env); + free(kbuf.iov_base); + free(dbuf.iov_base); return rc ? EXIT_FAILURE : EXIT_SUCCESS; } diff --git a/test/append.cc b/test/append.cc index c7469b4a..cf5c8279 100644 --- a/test/append.cc +++ b/test/append.cc @@ -21,22 +21,18 @@ bool testcase_append::run() { return true; } + cursor_open(dbi); keyvalue_maker.setup(config.params, config.actor_id, 0 /* thread_number */); /* LY: тест наполнения таблиц в append-режиме, * при котором записи добавляются строго в конец (в порядке сортировки) */ - const MDBX_put_flags_t flags = (config.params.table_flags & MDBX_DUPSORT) - ? MDBX_APPEND | MDBX_APPENDDUP - : MDBX_APPEND; - keyvalue_maker.make_ordered(); + const MDBX_put_flags_t flags = + (config.params.table_flags & MDBX_DUPSORT) + ? (flipcoin() ? MDBX_APPEND | MDBX_APPENDDUP : MDBX_APPENDDUP) + : MDBX_APPEND; + keyvalue_maker.make_linear(); key = keygen::alloc(config.params.keylen_max); data = keygen::alloc(config.params.datalen_max); - keygen::buffer last_key = keygen::alloc(config.params.keylen_max); - keygen::buffer last_data = keygen::alloc(config.params.datalen_max); - last_key->value.iov_base = last_key->bytes; - last_key->value.iov_len = 0; - last_data->value.iov_base = last_data->bytes; - last_data->value.iov_len = 0; simple_checksum inserted_checksum; uint64_t inserted_number = 0; @@ -47,20 +43,78 @@ bool testcase_append::run() { simple_checksum committed_inserted_checksum = inserted_checksum; while (should_continue()) { const keygen::serial_t serial = serial_count; - if (!keyvalue_maker.increment(serial_count, 1)) { + const bool turn_key = (config.params.table_flags & MDBX_DUPSORT) == 0 || + flipcoin_n(config.params.keygen.split); + if (turn_key ? 
!keyvalue_maker.increment_key_part(serial_count, 1) + : !keyvalue_maker.increment(serial_count, 1)) { // дошли до границы пространства ключей break; } log_trace("append: append-a %" PRIu64, serial); generate_pair(serial); - int cmp = inserted_number ? mdbx_cmp(txn_guard.get(), dbi, &key->value, - &last_key->value) - : 1; - if (cmp == 0 && (config.params.table_flags & MDBX_DUPSORT)) - cmp = mdbx_dcmp(txn_guard.get(), dbi, &data->value, &last_data->value); + // keygen::log_pair(logging::verbose, "append.", key, data); - err = mdbx_put(txn_guard.get(), dbi, &key->value, &data->value, flags); + MDBX_val ge_key = key->value; + MDBX_val ge_data = data->value; + err = mdbx_get_equal_or_great(txn_guard.get(), dbi, &ge_key, &ge_data); + + bool expect_key_mismatch; + if (err == MDBX_SUCCESS /* exact match */) { + expect_key_mismatch = true; + assert(inserted_number > 0); + assert(mdbx_cmp(txn_guard.get(), dbi, &key->value, &ge_key) == 0); + assert((config.params.table_flags & MDBX_DUPSORT) == 0 || + mdbx_dcmp(txn_guard.get(), dbi, &data->value, &ge_data) == 0); + assert(inserted_number > 0); + } else if (err == MDBX_RESULT_TRUE /* have key-value pair great than */) { + assert(mdbx_cmp(txn_guard.get(), dbi, &key->value, &ge_key) < 0 || + ((config.params.table_flags & MDBX_DUPSORT) && + mdbx_cmp(txn_guard.get(), dbi, &key->value, &ge_key) == 0 && + mdbx_dcmp(txn_guard.get(), dbi, &data->value, &ge_data) < 0)); + switch (int(flags)) { + default: + abort(); +#if CONSTEXPR_ENUM_FLAGS_OPERATIONS + case MDBX_APPEND | MDBX_APPENDDUP: +#else + case int(MDBX_APPEND) | int(MDBX_APPENDDUP): +#endif + assert((config.params.table_flags & MDBX_DUPSORT) != 0); + __fallthrough; + // fall through + case MDBX_APPEND: + expect_key_mismatch = true; + break; + case MDBX_APPENDDUP: + assert((config.params.table_flags & MDBX_DUPSORT) != 0); + expect_key_mismatch = + mdbx_cmp(txn_guard.get(), dbi, &key->value, &ge_key) == 0; + break; + } + } else if (err == MDBX_NOTFOUND /* all pair are less than 
*/) { + switch (int(flags)) { + default: + abort(); + case MDBX_APPENDDUP: +#if CONSTEXPR_ENUM_FLAGS_OPERATIONS + case MDBX_APPEND | MDBX_APPENDDUP: +#else + case int(MDBX_APPEND) | int(MDBX_APPENDDUP): +#endif + assert((config.params.table_flags & MDBX_DUPSORT) != 0); + __fallthrough; + // fall through + case MDBX_APPEND: + expect_key_mismatch = false; + break; + } + } else + failure_perror("mdbx_get_equal_or_great()", err); + + assert(!expect_key_mismatch); + + err = mdbx_cursor_put(cursor_guard.get(), &key->value, &data->value, flags); if (err == MDBX_MAP_FULL && config.params.ignore_dbfull) { log_notice("append: bailout-insert due '%s'", mdbx_strerror(err)); txn_end(true); @@ -69,21 +123,14 @@ bool testcase_append::run() { break; } - if (cmp > 0) { + if (!expect_key_mismatch) { if (unlikely(err != MDBX_SUCCESS)) - failure_perror("mdbx_put(appenda-a)", err); - - memcpy(last_key->value.iov_base, key->value.iov_base, - last_key->value.iov_len = key->value.iov_len); - memcpy(last_data->value.iov_base, data->value.iov_base, - last_data->value.iov_len = data->value.iov_len); + failure_perror("mdbx_cursor_put(appenda-a)", err); ++inserted_number; inserted_checksum.push((uint32_t)inserted_number, key->value); inserted_checksum.push(10639, data->value); - } else { - if (unlikely(err != MDBX_EKEYMISMATCH)) - failure_perror("mdbx_put(appenda-a) != MDBX_EKEYMISMATCH", err); - } + } else if (unlikely(err != MDBX_EKEYMISMATCH)) + failure_perror("mdbx_cursor_put(appenda-a) != MDBX_EKEYMISMATCH", err); if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); @@ -111,7 +158,7 @@ bool testcase_append::run() { } //---------------------------------------------------------------------------- txn_begin(true); - cursor_open(dbi); + cursor_renew(); MDBX_val check_key, check_data; err = diff --git a/test/cases.cc b/test/cases.cc index 98255f52..9730b10c 100644 --- a/test/cases.cc +++ b/test/cases.cc @@ -15,7 +15,15 @@ #include "test.h" void configure_actor(unsigned 
&last_space_id, const actor_testcase testcase, - const char *space_id_cstr, const actor_params &params) { + const char *space_id_cstr, actor_params params) { + // silently fix key/data length for fixed-length modes + if ((params.table_flags & MDBX_INTEGERKEY) && + params.keylen_min != params.keylen_max) + params.keylen_min = params.keylen_max; + if ((params.table_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && + params.datalen_min != params.datalen_max) + params.datalen_min = params.datalen_max; + unsigned wait4id = 0; if (params.waitfor_nops) { for (auto i = global::actors.rbegin(); i != global::actors.rend(); ++i) { @@ -56,7 +64,7 @@ void configure_actor(unsigned &last_space_id, const actor_testcase testcase, global::databases.insert(params.pathname_db); } -void testcase_setup(const char *casename, actor_params &params, +void testcase_setup(const char *casename, const actor_params &params, unsigned &last_space_id) { if (strcmp(casename, "basic") == 0) { log_notice(">>> testcase_setup(%s)", casename); diff --git a/test/keygen.cc b/test/keygen.cc index 2e8641b7..411bf623 100644 --- a/test/keygen.cc +++ b/test/keygen.cc @@ -25,7 +25,7 @@ static inline MDBX_PURE_FUNCTION serial_t mask(unsigned bits) { serial_t injective(const serial_t serial, const unsigned bits /* at least serial_minwith (8) */, const serial_t salt) { - assert(bits > serial_minwith && bits <= serial_maxwith); + assert(bits >= serial_minwith && bits <= serial_maxwith); /* LY: All these "magic" prime numbers were found * and verified with a bit of brute force. 
*/ @@ -124,7 +124,8 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, * Поэтому key_serial не трогаем, а в value_serial нелинейно вмешиваем * запрошенное количество бит из serial */ value_serial += - (serial ^ (serial >> mapping.split)) & mask(mapping.split); + (serial ^ (serial >> mapping.split) * UINT64_C(57035339200100753)) & + mask(mapping.split); } value_serial |= value_age << mapping.split; @@ -187,13 +188,7 @@ void __hot maker::pair(serial_t serial, const buffer &key, buffer &value, mk_continue(key_serial, key_essentials, *key); mk_continue(value_serial, value_essentials, *value); - - if (log_enabled(logging::trace)) { - char dump_key[4096], dump_value[4096]; - log_trace("keygen-pair: key %s, value %s", - mdbx_dump_val(&key->value, dump_key, sizeof(dump_key)), - mdbx_dump_val(&value->value, dump_value, sizeof(dump_value))); - } + log_pair(logging::trace, "kv", key, value); } void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, @@ -213,9 +208,9 @@ void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, key_essentials.minlen = (uint16_t)actor.keylen_min; assert(actor.keylen_max <= UINT32_MAX); key_essentials.maxlen = - std::min((uint32_t)actor.keylen_max, - (uint32_t)mdbx_limits_keysize_max( - actor.pagesize, MDBX_db_flags_t(key_essentials.flags))); + std::min(uint32_t(actor.keylen_max), + uint32_t(mdbx_limits_keysize_max( + actor.pagesize, MDBX_db_flags_t(key_essentials.flags)))); value_essentials.flags = actor.table_flags & uint16_t(MDBX_INTEGERDUP | MDBX_REVERSEDUP); @@ -223,9 +218,9 @@ void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, value_essentials.minlen = (uint16_t)actor.datalen_min; assert(actor.datalen_max <= UINT32_MAX); value_essentials.maxlen = - std::min((uint32_t)actor.datalen_max, - (uint32_t)mdbx_limits_valsize_max( - actor.pagesize, MDBX_db_flags_t(key_essentials.flags))); + std::min(uint32_t(actor.datalen_max), + uint32_t(mdbx_limits_valsize_max( 
+ actor.pagesize, MDBX_db_flags_t(key_essentials.flags)))); if (!actor.keygen.zero_fill) { key_essentials.flags |= essentials::prng_fill_flag; @@ -240,13 +235,45 @@ void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, base = 0; } -void maker::make_ordered() { - mapping.mesh = 0; +void maker::make_linear() { + mapping.mesh = (key_essentials.flags & MDBX_DUPSORT) ? 0 : mapping.split; mapping.rotate = 0; + mapping.offset = 0; + const auto max_serial = mask(mapping.width) + base; + const auto max_key_serial = + (mapping.split && (key_essentials.flags & MDBX_DUPSORT)) + ? max_serial >> mapping.split + : max_serial; + const auto max_value_serial = + (mapping.split && (key_essentials.flags & MDBX_DUPSORT)) + ? mask(mapping.split) + : 0; + + while (key_essentials.minlen < 8 && + (key_essentials.minlen == 0 || + mask(key_essentials.minlen * 8) < max_key_serial)) { + key_essentials.minlen += + (key_essentials.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) ? 4 : 1; + if (key_essentials.maxlen < key_essentials.minlen) + key_essentials.maxlen = key_essentials.minlen; + } + + if ((key_essentials.flags | value_essentials.flags) & MDBX_DUPSORT) + while (value_essentials.minlen < 8 && + (value_essentials.minlen == 0 || + mask(value_essentials.minlen * 8) < max_value_serial)) { + value_essentials.minlen += + (value_essentials.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) ? 4 + : 1; + if (value_essentials.maxlen < value_essentials.minlen) + value_essentials.maxlen = value_essentials.minlen; + } } bool maker::is_unordered() const { - return (mapping.mesh >= serial_minwith || mapping.rotate) != 0; + return mapping.rotate || + mapping.mesh > + ((key_essentials.flags & MDBX_DUPSORT) ? 
0 : mapping.split); } bool maker::increment(serial_t &serial, int delta) const { @@ -272,8 +299,9 @@ bool maker::increment(serial_t &serial, int delta) const { //----------------------------------------------------------------------------- -static size_t length(serial_t serial) { - size_t n = 0; +MDBX_NOTHROW_PURE_FUNCTION static inline unsigned length(serial_t serial) { +#if defined(__clang__) && __clang__ > 8 + unsigned n = 0; if (serial > UINT32_MAX) { n = 4; serial >>= 32; @@ -286,16 +314,26 @@ static size_t length(serial_t serial) { n += 1; serial >>= 8; } - return (serial > 0) ? n + 1 : n; +#else + unsigned n = (serial > UINT32_MAX) ? 4 : 0; + serial = (serial > UINT32_MAX) ? serial >> 32 : serial; + + n += (serial > UINT16_MAX) ? 2 : 0; + serial = (serial > UINT16_MAX) ? serial >> 16 : serial; + + n += (serial > UINT8_MAX); + serial = (serial > UINT8_MAX) ? serial >> 8 : serial; +#endif + return n + (serial > 0); } buffer alloc(size_t limit) { - result *ptr = (result *)malloc(sizeof(result) + limit); + result *ptr = (result *)malloc(sizeof(result) + limit + 8); if (unlikely(ptr == nullptr)) failure_perror("malloc(keyvalue_buffer)", errno); ptr->value.iov_base = ptr->bytes; ptr->value.iov_len = 0; - ptr->limit = limit; + ptr->limit = limit + 8; return buffer(ptr); } @@ -305,14 +343,20 @@ void __hot maker::mk_begin(const serial_t serial, const essentials &params, assert(params.maxlen >= params.minlen); assert(params.maxlen >= length(serial)); - out.value.iov_len = - (params.maxlen > params.minlen) - ? 
params.minlen + serial % (params.maxlen - params.minlen) - : params.minlen; + out.value.iov_len = std::max(unsigned(params.minlen), length(serial)); + const auto variation = params.maxlen - params.minlen; + if (variation) { + if (serial % (variation + 1)) { + auto refix = serial * UINT64_C(48835288005252737); + refix ^= refix >> 32; + out.value.iov_len = std::max( + out.value.iov_len, params.minlen + 1 + size_t(refix) % variation); + } + } - if ((params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) == 0 && - out.value.iov_len < 8) - out.value.iov_len = std::max(length(serial), out.value.iov_len); + assert(length(serial) <= out.value.iov_len); + assert(out.value.iov_len >= params.minlen); + assert(out.value.iov_len <= params.maxlen); } void __hot maker::mk_continue(const serial_t serial, const essentials &params, @@ -328,36 +372,30 @@ void __hot maker::mk_continue(const serial_t serial, const essentials &params, unsigned(MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERKEY | MDBX_INTEGERDUP | MDBX_REVERSEKEY | MDBX_REVERSEDUP)) == 0); #endif + assert(length(serial) <= out.value.iov_len); out.value.iov_base = out.bytes; if (params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) { assert(params.maxlen == params.minlen); - assert(params.minlen == 4 || params.minlen == 8); - if (is_byteorder_le() || params.minlen == 8) - out.u64 = serial; - else - out.u32 = (uint32_t)serial; - } else if (params.flags & unsigned(MDBX_REVERSEKEY | MDBX_REVERSEDUP)) { - if (out.value.iov_len > 8) { - if (params.flags & essentials::prng_fill_flag) { - uint64_t state = serial ^ UINT64_C(0x41803711c9b75f19); - prng_fill(state, out.bytes, out.value.iov_len - 8); - } else - memset(out.bytes, '\0', out.value.iov_len - 8); - unaligned::store(out.bytes + out.value.iov_len - 8, htobe64(serial)); - } else { - out.u64 = htobe64(serial); - if (out.value.iov_len < 8) - out.value.iov_base = out.bytes + 8 - out.value.iov_len; - } + if (params.flags & (MDBX_INTEGERKEY | MDBX_INTEGERDUP)) + assert(params.minlen == 4 || 
params.minlen == 8); + out.u64 = serial; + if (!is_byteorder_le() && out.value.iov_len != 8) + out.u32 = uint32_t(serial); } else { - out.u64 = htole64(serial); - if (out.value.iov_len > 8) { + const auto prefix = + std::max(std::min(unsigned(params.minlen), 8u), length(serial)); + out.u64 = htobe64(serial); + out.value.iov_base = out.bytes + 8 - prefix; + if (out.value.iov_len > prefix) { if (params.flags & essentials::prng_fill_flag) { uint64_t state = serial ^ UINT64_C(0x923ab47b7ee6f6e4); - prng_fill(state, out.bytes + 8, out.value.iov_len - 8); + prng_fill(state, out.bytes + 8, out.value.iov_len - prefix); } else - memset(out.bytes + 8, '\0', out.value.iov_len - 8); + memset(out.bytes + 8, '\0', out.value.iov_len - prefix); } + if (unlikely(params.flags & (MDBX_REVERSEKEY | MDBX_REVERSEDUP))) + std::reverse((char *)out.value.iov_base, + (char *)out.value.iov_base + out.value.iov_len); } assert(out.value.iov_len >= params.minlen); @@ -368,4 +406,15 @@ void __hot maker::mk_continue(const serial_t serial, const essentials &params, out.bytes + out.limit); } +void log_pair(logging::loglevel level, const char *prefix, const buffer &key, + buffer &value) { + if (log_enabled(level)) { + char dump_key[4096], dump_value[4096]; + logging::output( + level, "%s-pair: key %s, value %s", prefix, + mdbx_dump_val(&key->value, dump_key, sizeof(dump_key)), + mdbx_dump_val(&value->value, dump_value, sizeof(dump_value))); + } +} + } /* namespace keygen */ diff --git a/test/keygen.h b/test/keygen.h index c36cc1a2..96c6f2b7 100644 --- a/test/keygen.h +++ b/test/keygen.h @@ -127,10 +127,19 @@ public: serial_t value_age, const bool keylen_changeable); void setup(const config::actor_params_pod &actor, unsigned actor_id, unsigned thread_number); - void make_ordered(); + void make_linear(); bool is_unordered() const; bool increment(serial_t &serial, int delta) const; + bool increment_key_part(serial_t &serial, int delta, + bool reset_value_part = true) const { + if (reset_value_part) + 
serial &= ~((serial_t(1) << mapping.split) - 1); + return increment(serial, delta << mapping.split); + } }; +void log_pair(logging::loglevel level, const char *prefix, const buffer &key, + buffer &value); + } /* namespace keygen */ diff --git a/test/main.cc b/test/main.cc index 5d50b043..28d8d261 100644 --- a/test/main.cc +++ b/test/main.cc @@ -270,6 +270,27 @@ int main(int argc, char *const argv[]) { if ((params.table_flags & MDBX_DUPSORT) == 0) params.table_flags &= ~(MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP); + const unsigned keylen_max = params.mdbx_keylen_max(); + if (params.keylen_min > keylen_max) + params.keylen_min = keylen_max; + if (params.keylen_max > keylen_max) + params.keylen_max = keylen_max; + const unsigned keylen_min = params.mdbx_keylen_min(); + if (params.keylen_min < keylen_min) + params.keylen_min = keylen_min; + if (params.keylen_max < keylen_min) + params.keylen_max = keylen_min; + + const unsigned datalen_max = params.mdbx_datalen_max(); + if (params.datalen_min > datalen_max) + params.datalen_min = datalen_max; + if (params.datalen_max > datalen_max) + params.datalen_max = datalen_max; + const unsigned datalen_min = params.mdbx_datalen_min(); + if (params.datalen_min < datalen_min) + params.datalen_min = datalen_min; + if (params.datalen_max < datalen_min) + params.datalen_max = datalen_min; continue; } @@ -371,7 +392,7 @@ int main(int argc, char *const argv[]) { params.datalen_min, config::no_scale, params.mdbx_datalen_min(), params.mdbx_datalen_max())) { - if ((params.table_flags & MDBX_DUPFIXED) || + if ((params.table_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) || params.datalen_max < params.datalen_min) params.datalen_max = params.datalen_min; continue; @@ -380,7 +401,7 @@ int main(int argc, char *const argv[]) { params.datalen_max, config::no_scale, params.mdbx_datalen_min(), params.mdbx_datalen_max())) { - if ((params.table_flags & MDBX_DUPFIXED) || + if ((params.table_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) || 
params.datalen_min > params.datalen_max) params.datalen_min = params.datalen_max; continue; diff --git a/test/nested.cc b/test/nested.cc index 2456886e..d26382bf 100644 --- a/test/nested.cc +++ b/test/nested.cc @@ -68,8 +68,11 @@ bool testcase_nested::teardown() { log_notice("nested: bailout-clean due '%s'", mdbx_strerror(err)); ok = false; } - } else + } else { + if (txn_guard) + txn_end(false); db_table_close(dbi); + } dbi = 0; } return inherited::teardown() && ok; diff --git a/test/test.cc b/test/test.cc index c676cff3..6a91a35a 100644 --- a/test/test.cc +++ b/test/test.cc @@ -87,7 +87,7 @@ int testcase::hsr_callback(const MDBX_env *env, const MDBX_txn *txn, if (retry == 0) log_notice("hsr_callback: waitfor pid %lu, thread %" PRIuPTR - ", txn #%" PRIu64 ", gap %d, scape %zu", + ", txn #%" PRIu64 ", gap %d, space %zu", (long)pid, (size_t)tid, laggard, gap, space); MDBX_envinfo info; @@ -275,22 +275,31 @@ void testcase::cursor_close() { log_trace("<< cursor_close()"); } +void testcase::cursor_renew() { + log_trace(">> cursor_renew()"); + assert(cursor_guard); + int err = mdbx_cursor_renew(txn_guard.get(), cursor_guard.get()); + if (unlikely(err != MDBX_SUCCESS)) + failure_perror("mdbx_cursor_renew()", err); + log_trace("<< cursor_renew()"); +} + int testcase::breakable_restart() { int rc = MDBX_SUCCESS; if (txn_guard) rc = breakable_commit(); - if (cursor_guard) - cursor_close(); txn_begin(false, MDBX_TXN_READWRITE); + if (cursor_guard) + cursor_renew(); return rc; } void testcase::txn_restart(bool abort, bool readonly, MDBX_txn_flags_t flags) { if (txn_guard) txn_end(abort); - if (cursor_guard) - cursor_close(); txn_begin(readonly, flags); + if (cursor_guard) + cursor_renew(); } void testcase::txn_inject_writefault(void) { @@ -623,6 +632,7 @@ bool test_execute(const actor_config &config_const) { else log_verbose("test successfully (iteration %zi)", iter); config.params.keygen.seed += INT32_C(0xA4F4D37B); + log_verbose("turn keygen to %u", 
config.params.keygen.seed); } } while (config.params.nrepeat == 0 || iter < config.params.nrepeat); diff --git a/test/test.h b/test/test.h index 21420d99..1a765143 100644 --- a/test/test.h +++ b/test/test.h @@ -42,10 +42,10 @@ bool test_execute(const actor_config &config); std::string thunk_param(const actor_config &config); -void testcase_setup(const char *casename, actor_params &params, +void testcase_setup(const char *casename, const actor_params &params, unsigned &last_space_id); void configure_actor(unsigned &last_space_id, const actor_testcase testcase, - const char *space_id_cstr, const actor_params &params); + const char *space_id_cstr, actor_params params); void keycase_setup(const char *casename, actor_params &params); namespace global { @@ -187,6 +187,7 @@ protected: MDBX_txn_flags_t flags = MDBX_TXN_READWRITE); void cursor_open(MDBX_dbi handle); void cursor_close(); + void cursor_renew(); void txn_inject_writefault(void); void txn_inject_writefault(MDBX_txn *txn); void fetch_canary(); diff --git a/test/utils.cc b/test/utils.cc index 8311b115..c5392245 100644 --- a/test/utils.cc +++ b/test/utils.cc @@ -326,8 +326,8 @@ double double_from_upper(uint64_t salt) { r.ieee.negative = 0; r.ieee.exponent = IEEE754_DOUBLE_BIAS; salt >>= 64 - DBL_MANT_DIG; - r.ieee.mantissa0 = (unsigned)(salt >> 32); - r.ieee.mantissa1 = (unsigned)salt; + r.ieee.mantissa0 = unsigned(salt >> 32); + r.ieee.mantissa1 = unsigned(salt); return r.d; #else const uint64_t top = (UINT64_C(1) << DBL_MANT_DIG) - 1; @@ -340,6 +340,9 @@ bool flipcoin() { return bleach32((uint32_t)entropy_ticks()) & 1; } bool flipcoin_x2() { return (bleach32((uint32_t)entropy_ticks()) & 3) == 0; } bool flipcoin_x3() { return (bleach32((uint32_t)entropy_ticks()) & 7) == 0; } bool flipcoin_x4() { return (bleach32((uint32_t)entropy_ticks()) & 15) == 0; } +bool flipcoin_n(unsigned n) { + return (bleach64(entropy_ticks()) & ((UINT64_C(1) << n) - 1)) == 0; +} bool jitter(unsigned probability_percent) { const uint32_t top = UINT32_MAX 
- UINT32_MAX % 100; diff --git a/test/utils.h b/test/utils.h index 9e6d4627..f00f34d1 100644 --- a/test/utils.h +++ b/test/utils.h @@ -358,5 +358,6 @@ bool flipcoin(); bool flipcoin_x2(); bool flipcoin_x3(); bool flipcoin_x4(); +bool flipcoin_n(unsigned n); bool jitter(unsigned probability_percent); void jitter_delay(bool extra = false);