From 6f2c1e52ad3b09ee9dfb16fe6304e205c9c0d06c Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Sat, 11 Dec 2021 02:56:19 +0300 Subject: [PATCH] mdbx: add `mdbx_cursor_get_batch()`. Resolve https://github.com/erthink/libmdbx/issues/236 --- mdbx.h | 37 +++++++++++++++ src/core.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++-- test/hill.cc | 5 +++ test/test.cc | 66 +++++++++++++++++++++++++++ test/test.h | 1 + 5 files changed, 230 insertions(+), 3 deletions(-) diff --git a/mdbx.h b/mdbx.h index 4ec12c1c..b0f62320 100644 --- a/mdbx.h +++ b/mdbx.h @@ -4333,6 +4333,43 @@ LIBMDBX_API int mdbx_cursor_copy(const MDBX_cursor *src, MDBX_cursor *dest); LIBMDBX_API int mdbx_cursor_get(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op); +/** \brief Retrieve multiple non-dupsort key/value pairs by cursor. + * \ingroup c_crud + * + * This function retrieves multiple key/data pairs from a database without + * the \ref MDBX_DUPSORT option. For `MDBX_DUPSORT` databases please + * use \ref MDBX_GET_MULTIPLE and \ref MDBX_NEXT_MULTIPLE. + * + * The number of key and value items is returned in the `size_t` to which + * `count` refers. The addresses and lengths of the keys and values are + * returned in the array to which `pairs` refers. + * \see mdbx_cursor_get() + * + * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). + * \param [out] count The number of key and value items returned; on success + * it is always even because whole key-value + * pairs are returned. + * \param [in,out] pairs A pointer to the array receiving key-value pairs. + * \param [in] limit The size of the pairs buffer as the number of items + * (not pairs); must be at least 4. + * \param [in] op A cursor operation \ref MDBX_cursor_op (only + * \ref MDBX_FIRST, \ref MDBX_NEXT, \ref MDBX_GET_CURRENT + * are supported). + * + * \returns A non-zero error value on failure and 0 on success, + * some possible errors are: + * \retval MDBX_THREAD_MISMATCH Given transaction is not owned + * by current thread. 
+ * \retval MDBX_NOTFOUND No more key-value pairs are available. + * \retval MDBX_ENODATA The cursor is already at the end of data. + * \retval MDBX_RESULT_TRUE The specified limit is less than the number of + * key-value pairs available on the current + * page/position that the cursor points to. + * \retval MDBX_EINVAL An invalid parameter was specified. */ +LIBMDBX_API int mdbx_cursor_get_batch(MDBX_cursor *cursor, size_t *count, + MDBX_val *pairs, size_t limit, + MDBX_cursor_op op); + /** \brief Store by cursor. * \ingroup c_crud * diff --git a/src/core.c b/src/core.c index d5bcff15..17ec6be2 100644 --- a/src/core.c +++ b/src/core.c @@ -3722,7 +3722,8 @@ static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft); -static int __must_check_result mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf, +static int __must_check_result mdbx_node_read(MDBX_cursor *mc, + const MDBX_node *leaf, MDBX_val *data, const txnid_t front); static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); @@ -13572,8 +13573,9 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, * [out] data Updated to point to the node's data. * * Returns 0 on success, non-zero on failure. 
*/ -static __always_inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *node, - MDBX_val *data, const txnid_t front) { +static __always_inline int mdbx_node_read(MDBX_cursor *mc, + const MDBX_node *node, MDBX_val *data, + const txnid_t front) { data->iov_len = node_ds(node); data->iov_base = node_data(node); if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { @@ -14551,6 +14553,122 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } +static int cursor_first_batch(MDBX_cursor *mc) { + if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { + int err = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + + mc->mc_flags |= C_INITIALIZED; + mc->mc_flags &= ~C_EOF; + mc->mc_ki[mc->mc_top] = 0; + return MDBX_SUCCESS; +} + +static int cursor_next_batch(MDBX_cursor *mc) { + if (unlikely(!(mc->mc_flags & C_INITIALIZED))) + return cursor_first_batch(mc); + + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (unlikely(mc->mc_flags & C_EOF)) { + if ((unsigned)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp)) + return MDBX_NOTFOUND; + mc->mc_flags ^= C_EOF; + } + + int ki = mc->mc_ki[mc->mc_top]; + mc->mc_ki[mc->mc_top] = (indx_t)++ki; + const int numkeys = page_numkeys(mp); + if (likely(ki >= numkeys)) { + mdbx_debug("%s", "=====> move to next sibling page"); + mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); + int err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + if (unlikely(err != MDBX_SUCCESS)) { + mc->mc_flags |= C_EOF; + return err; + } + mp = mc->mc_pg[mc->mc_top]; + mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + } + return MDBX_SUCCESS; +} + +int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, + size_t limit, MDBX_cursor_op op) { + if (unlikely(mc == NULL || count == NULL || pairs == NULL || limit < 4)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return 
(mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) + return MDBX_INCOMPATIBLE /* must be a non-dupsort subDB */; + + switch (op) { + case MDBX_FIRST: + rc = cursor_first_batch(mc); + break; + case MDBX_NEXT: + rc = cursor_next_batch(mc); + break; + case MDBX_GET_CURRENT: + rc = likely(mc->mc_flags & C_INITIALIZED) ? MDBX_SUCCESS : MDBX_ENODATA; + break; + default: + mdbx_debug("unhandled/unimplemented cursor operation %u", op); + rc = MDBX_EINVAL /* same code as the argument check at entry */; + break; + } + + if (unlikely(rc != MDBX_SUCCESS)) { + *count = 0; + return rc; + } + + const MDBX_page *const page = mc->mc_pg[mc->mc_top]; + const unsigned nkeys = page_numkeys(page); + unsigned i = mc->mc_ki[mc->mc_top], n = 0; + if (unlikely(i >= nkeys)) { + mdbx_cassert(mc, op == MDBX_GET_CURRENT); + mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + *count = 0; + if (mc->mc_flags & C_EOF) { + mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + return MDBX_ENODATA; + } + if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE) + return MDBX_EINVAL /* again MDBX_GET_CURRENT after MDBX_GET_CURRENT */; + mc->mc_flags |= C_EOF; + return MDBX_NOTFOUND; + } + + const txnid_t pp_txnid = pp_txnid4chk(page, mc->mc_txn); + do { + if (unlikely(n + 2 > limit)) { + rc = MDBX_RESULT_TRUE; /* buffer full, page not exhausted */ + break; + } + const MDBX_node *leaf = page_node(page, i); + get_key(leaf, &pairs[n]); + rc = mdbx_node_read(mc, leaf, &pairs[n + 1], pp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + break; + n += 2; /* one key item and one value item stored */ + } while (++i < nkeys); + + mc->mc_ki[mc->mc_top] = (indx_t)i; /* one past the last returned */ + *count = n; + return rc; +} + static int mdbx_touch_dbi(MDBX_cursor *mc) { mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); *mc->mc_dbistate |= DBI_DIRTY; diff --git a/test/hill.cc b/test/hill.cc index edd1fc3e..656d17c4 100644 --- a/test/hill.cc +++ b/test/hill.cc @@ -285,6 +285,11 @@ 
bool testcase_hill::run() { log_notice("hill: reached %d tree depth & %s sub-tree depth(s)", stat.ms_depth, str.c_str()); } + + if ((config.params.table_flags & MDBX_DUPSORT) == 0) { + if (!check_batch_get()) + failure("batch-get verification failed"); + } } while (serial_count > 1) { diff --git a/test/test.cc b/test/test.cc index 380742a9..dc2e42cc 100644 --- a/test/test.cc +++ b/test/test.cc @@ -1192,3 +1192,69 @@ bool testcase::speculum_verify() { mdbx_cursor_close(cursor); return rc; } + +bool testcase::check_batch_get() { + char dump_key[128], dump_value[128]; + char dump_key_batch[128], dump_value_batch[128]; + + MDBX_cursor *cursor; + int err = mdbx_cursor_open(txn_guard.get(), dbi, &cursor); + if (err != MDBX_SUCCESS) + failure_perror("mdbx_cursor_open()", err); + + MDBX_cursor *batch_cursor; + err = mdbx_cursor_open(txn_guard.get(), dbi, &batch_cursor); + if (err != MDBX_SUCCESS) + failure_perror("mdbx_cursor_open()", err); + + MDBX_val pairs[42]; /* room for 21 key-value pairs */ + size_t count = 0xDeadBeef; + err = mdbx_cursor_get_batch(batch_cursor, &count, pairs, ARRAY_LENGTH(pairs), + MDBX_FIRST); + bool rc = true; + size_t i, n = 0; + while (err == MDBX_SUCCESS) { + for (i = 0; i < count; i += 2) { + mdbx::slice key, value; + int err2 = mdbx_cursor_get(cursor, &key, &value, MDBX_NEXT); + if (err2 != MDBX_SUCCESS) + failure_perror("mdbx_cursor_get()", err2); + if (key != pairs[i] || value != pairs[i + 1]) { + log_error( + "batch-get pair mismatch %zu/%zu: sequential{%s, %s} != " + "batch{%s, %s}", + n + i / 2, i, mdbx_dump_val(&key, dump_key, sizeof(dump_key)), + mdbx_dump_val(&value, dump_value, sizeof(dump_value)), + mdbx_dump_val(&pairs[i], dump_key_batch, sizeof(dump_key_batch)), + mdbx_dump_val(&pairs[i + 1], dump_value_batch, + sizeof(dump_value_batch))); + rc = false; + } + } + n += i / 2; /* pairs compared so far */ + err = mdbx_cursor_get_batch(batch_cursor, &count, pairs, + ARRAY_LENGTH(pairs), MDBX_NEXT); + } + if (err != MDBX_NOTFOUND) + failure_perror("mdbx_cursor_get_batch()", err); + + err = 
mdbx_cursor_eof(batch_cursor); + if (err != MDBX_RESULT_TRUE) { + log_error("batch-get %s cursor not-eof %d", "batch", err); + rc = false; + } + err = mdbx_cursor_on_last(batch_cursor); + if (err != MDBX_RESULT_TRUE) { + log_error("batch-get %s cursor not-on-last %d", "batch", err); + rc = false; + } + + err = mdbx_cursor_on_last(cursor); + if (err != MDBX_RESULT_TRUE) { + log_error("batch-get %s cursor not-on-last %d", "checked", err); + rc = false; + } + mdbx_cursor_close(cursor); + mdbx_cursor_close(batch_cursor); + return rc; +} diff --git a/test/test.h b/test/test.h index 29a00536..a4d28699 100644 --- a/test/test.h +++ b/test/test.h @@ -227,6 +227,7 @@ protected: const MDBX_val &v) const; bool speculum_verify(); + bool check_batch_get(); int insert(const keygen::buffer &akey, const keygen::buffer &adata, MDBX_put_flags_t flags); int replace(const keygen::buffer &akey, const keygen::buffer &new_value,