diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1252afb9..7d45d931 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -67,6 +67,7 @@ if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/.git"
    AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-dbi.c"
    AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-env.c"
    AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-extra.c"
+   AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-get-cached.c"
    AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-key-transform.c"
    AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-misc.c"
    AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/src/api-opts.c"
@@ -780,6 +781,7 @@ else()
     "${MDBX_SOURCE_DIR}/api-dbi.c"
     "${MDBX_SOURCE_DIR}/api-env.c"
     "${MDBX_SOURCE_DIR}/api-extra.c"
+    "${MDBX_SOURCE_DIR}/api-get-cached.c"
     "${MDBX_SOURCE_DIR}/api-key-transform.c"
     "${MDBX_SOURCE_DIR}/api-misc.c"
     "${MDBX_SOURCE_DIR}/api-opts.c"
diff --git a/mdbx.h b/mdbx.h
index 96d892ec..c8d528f7 100644
--- a/mdbx.h
+++ b/mdbx.h
@@ -4923,6 +4923,123 @@ LIBMDBX_API int mdbx_get_ex(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MD
  * \retval MDBX_EINVAL An invalid parameter was specified. */
 LIBMDBX_API int mdbx_get_equal_or_great(const MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data);
 
+/** \brief Lightweight transparent cache entry structure used by \ref mdbx_cache_get_SingleThreaded().
+ * \ingroup c_crud
+ * Must be initialized by \ref mdbx_cache_init() before first use. */
+typedef struct MDBX_cache_entry {
+  MDBX_val data; /**< The cached data value for a corresponding key.
+                  * The NULL value means \ref MDBX_NOTFOUND. */
+  uint64_t trunk_txnid; /**< The transaction/MVCC-snapshot ID of a page or other internal DB structure that
+                         * holds the cached data or reflects its state. */
+  uint64_t last_confirmed_txnid; /**< The recent transaction/MVCC-snapshot ID wherein the cache entry
+                                  * was checked and confirmed. */
+} MDBX_cache_entry;
+
+/** \brief Initializes the cache entry before the first use.
+ * \ingroup c_crud
+ * \see MDBX_cache_entry
+ * \see mdbx_cache_get()
+ */
+LIBMDBX_INLINE_API(void, mdbx_cache_init, (MDBX_cache_entry * entry)) {
+  entry->data.iov_base = NULL;
+  entry->data.iov_len = 0;
+  entry->trunk_txnid = 0;
+  entry->last_confirmed_txnid = 0;
+}
+
+/** \brief Cache entry status returned by \ref mdbx_cache_get().
+ * \ingroup c_crud
+ * \see MDBX_cache_entry
+ * \see mdbx_cache_init()
+ */
+typedef enum MDBX_cache_status {
+  /** \brief An error other than \ref MDBX_NOTFOUND has occurred.
+   * \details There is no correct result since an error has occurred that is not related
+   * to the absence of the desired key-value pair.
+   * The given cache entry has not been changed. */
+  MDBX_CACHE_ERROR = -2,
+
+  /** \brief The result was obtained by bypassing the cache, because
+   * the transaction is too old to use the cache entry.
+   * \details The cache entry reflects a newer version of the data that is unavailable within
+   * an MVCC-snapshot used by the current transaction.
+   * The given cache entry has not been changed.
+   * The result of getting a value is correct until the transaction end. */
+  MDBX_CACHE_BEHIND = -1,
+
+  /** \brief The result was obtained by bypassing the cache, because
+   * the given cache entry is being updated by another thread.
+   * \details When accessing the cache entry, a race condition was detected with its update by another thread.
+   * Therefore, the result was obtained without using the cache entry and without affecting an operation of other
+   * threads using it, including the ones performing an update. For a read transaction, the result is correct until
+   * the transaction end. For a write transaction, the result is correct until the value is explicitly changed or
+   * the transaction is completed. */
+  MDBX_CACHE_RACE = 0,
+
+  /** \brief The result of getting a value is correct, but it cannot be cached since
+   * the changes have not been committed.
+   * \details The requested value of a pair is in a dirty state itself or on a dirty page with other updated items.
+   * This cache entry has not been changed because the corresponding data changes have not yet been committed
+   * and could be aborted.
+   * The result of the get operation and data value are valid within the current write transaction
+   * until any next modification. */
+  MDBX_CACHE_DIRTY = 1,
+
+  /** \brief The result of getting a value is correct and was retrieved from the cache entry which is untouched.
+   * \details There were no changes in the cached data after the last check.
+   * The given cache entry was not altered as it is completely up-to-date.
+   * For a read transaction, the result is correct until the transaction end.
+   * For a write transaction, the result is correct until the value is explicitly changed
+   * or the transaction is completed. */
+  MDBX_CACHE_HIT = 2,
+
+  /** \brief The result of getting a value is correct and has been retrieved from the cache, which has been
+   * altered to reflect recently committed transactions.
+   * \details There were no changes in the cached data after the last check.
+   * The given cache entry has been slightly updated to reflect the relevance of the data for recently committed
+   * transaction(s). For a read transaction, the result is correct until the transaction end. For a write transaction,
+   * the result is correct until the value is explicitly changed or the transaction is completed. */
+  MDBX_CACHE_CONFIRMED = 3,
+
+  /** \brief The result of getting a value is correct and corresponds to the fresh data read from the database,
+   * which was also put into the cache entry.
+   * \details After the last check, either the value of the requested pair itself changed,
+   * or it was moved to a new page due to the updating of neighboring items.
+   * The given cache entry has been completely updated to reflect the actual data.
+   * For a read transaction, the result is correct until the transaction end.
+   * For a write transaction, the result is correct until the value is explicitly changed
+   * or the transaction is completed. */
+  MDBX_CACHE_REFRESHED = 4
+} MDBX_cache_status_t;
+
+/** \brief The result of a cached get operation: an error code paired with a cache entry status.
+ * \ingroup c_crud
+ * \see mdbx_cache_get()
+ * \see mdbx_cache_get_SingleThreaded()
+ */
+typedef struct MDBX_cache_result {
+  MDBX_error_t errcode;       /**< The error code, e.g. \ref MDBX_SUCCESS or \ref MDBX_NOTFOUND. */
+  MDBX_cache_status_t status; /**< How the cache entry was used, see \ref MDBX_cache_status. */
+} MDBX_cache_result_t;
+
+/** \brief Gets the value for a key using a cache entry that may be concurrently used and updated by other threads.
+ * \ingroup c_crud
+ * \see mdbx_cache_get_SingleThreaded()
+ * \see MDBX_cache_entry
+ * \see mdbx_cache_init()
+ */
+LIBMDBX_API MDBX_cache_result_t mdbx_cache_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
+                                               volatile MDBX_cache_entry *entry);
+
+/** \brief Gets the value for a key using a cache entry without any synchronization with other threads.
+ * \ingroup c_crud
+ * \see mdbx_cache_get()
+ * \see MDBX_cache_entry
+ * \see mdbx_cache_init()
+ */
+LIBMDBX_API MDBX_cache_result_t mdbx_cache_get_SingleThreaded(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                                                              MDBX_val *data, MDBX_cache_entry *entry);
+
 /** \brief Store items into a table.
  * \ingroup c_crud
  *
diff --git a/src/alloy.c b/src/alloy.c
index 9c3cab5e..f2d73b47 100644
--- a/src/alloy.c
+++ b/src/alloy.c
@@ -10,6 +10,7 @@
 #include "api-dbi.c"
 #include "api-env.c"
 #include "api-extra.c"
+#include "api-get-cached.c"
 #include "api-key-transform.c"
 #include "api-misc.c"
 #include "api-opts.c"
diff --git a/src/api-get-cached.c b/src/api-get-cached.c
new file mode 100644
index 00000000..cf9335a7
--- /dev/null
+++ b/src/api-get-cached.c
@@ -0,0 +1,315 @@
+/// \copyright SPDX-License-Identifier: Apache-2.0
+/// \author Леонид Юрьев aka Leonid Yuriev \date 2025
+
+#include "internals.h"
+
+static MDBX_cache_result_t cache_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
+                                     MDBX_cache_entry *entry);
+
+/* Returns true if `ptr` lies outside of the address range
+ * [dxb_mmap.base, dxb_mmap.base + pgno2bytes(first_unallocated)) of the given transaction. */
+static inline bool is_outside_dxb(const MDBX_txn *txn, const void *ptr) {
+  const MDBX_env *env = txn->env;
+  const ptrdiff_t offset = ptr_dist(ptr, env->dxb_mmap.base);
+  return offset < 0 || (size_t)offset >= pgno2bytes(env, txn->geo.first_unallocated);
+}
+
+/* Returns true if the page is stamped with a txnid not older than the basis snapshot of the given
+ * transaction; this file treats such pages as holding not yet committed data. */
+static inline bool is_not_commited(const MDBX_txn *txn, const page_t *mp) {
+  tASSERT(txn, mp >= (const page_t *)txn->env->dxb_mmap.base &&
+                   mp < (const page_t *)(ptr_disp(txn->env->dxb_mmap.base,
+                                                  pgno2bytes(txn->env, txn->geo.first_unallocated))));
+  return mp->txnid >= txn_basis_snapshot(txn);
+}
+
+/* Returns true if `ptr` points inside the used DB range and into a page with committed data. */
+MDBX_MAYBE_UNUSED static inline bool is_inside_dxb_and_commited(const MDBX_txn *txn, const void *ptr) {
+  return !is_outside_dxb(txn, ptr) && !is_not_commited(txn, ptr2page(txn->env, ptr));
+}
+
+/* Packs an error code and a cache status into the result structure. */
+static inline MDBX_cache_result_t cache_result(int err, MDBX_cache_status_t status) {
+  MDBX_cache_result_t result = {.errcode = err, .status = status};
+  return result;
+}
+
+/* Builds a failure result; `err` must be a real error code (neither MDBX_SUCCESS nor MDBX_RESULT_TRUE). */
+static inline MDBX_cache_result_t cache_error(int err) {
+  assert(err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE);
+  return cache_result(err, MDBX_CACHE_ERROR);
+}
+
+/* Performs the lookup bypassing the caller's cache entry (a zeroed throw-away entry is used instead).
+ * If the lookup ends with a status above MDBX_CACHE_DIRTY, it is replaced by the given `status`;
+ * MDBX_CACHE_ERROR and MDBX_CACHE_DIRTY outcomes are returned as is. */
+static inline MDBX_cache_result_t cache_fallback(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
+                                                 MDBX_cache_status_t status) {
+  MDBX_cache_entry stub = {.last_confirmed_txnid = 0, .trunk_txnid = 0};
+  MDBX_cache_result_t result = cache_get(txn, dbi, key, data, &stub);
+  if (result.status > MDBX_CACHE_DIRTY)
+    result.status = status;
+  return result;
+}
+
+/* The core of the cached get: returns the value for the key either from the cache entry or from the b-tree,
+ * updating the entry when the found data is committed.
+ *
+ * The caller must pass non-null key, data and entry. The steps are:
+ *  - an entry with trunk_txnid > last_confirmed_txnid is rejected with MDBX_INVALID;
+ *  - a transaction older than entry->trunk_txnid bypasses the cache (MDBX_CACHE_BEHIND);
+ *  - a transaction not newer than entry->last_confirmed_txnid takes the value from the entry (MDBX_CACHE_HIT);
+ *  - otherwise the table and then each page on the way from the root to the leaf are checked: as soon as
+ *    a txnid not newer than entry->last_confirmed_txnid is met, the entry is confirmed (MDBX_CACHE_CONFIRMED);
+ *  - else the value is read from the leaf and is stored into the entry (MDBX_CACHE_REFRESHED),
+ *    unless it belongs to uncommitted changes (MDBX_CACHE_DIRTY).
+ *
+ * A missing key or table is reported as MDBX_NOTFOUND with a NULL/zero-length data. */
+__hot static MDBX_cache_result_t cache_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
+                                           MDBX_cache_entry *entry) {
+  DKBUF_DEBUG;
+  DEBUG("===> cached-get dbi %u, key [%s], entry %p (trunk %" PRIaTXN ", last_confirmed %" PRIaTXN ", data %p/%zu)",
+        dbi, DKEY_DEBUG(key), entry, entry->trunk_txnid, entry->last_confirmed_txnid, entry->data.iov_base,
+        entry->data.iov_len);
+
+  if (unlikely(entry->trunk_txnid > entry->last_confirmed_txnid))
+    return cache_error(LOG_IFERR(MDBX_INVALID));
+
+  int err = check_txn(txn, MDBX_TXN_BLOCKED);
+  if (unlikely(err != MDBX_SUCCESS))
+    return cache_error(LOG_IFERR(err));
+
+  tASSERT(txn, entry->data.iov_base || entry->data.iov_len == 0);
+  tASSERT(txn, entry->last_confirmed_txnid <= MAX_TXNID);
+  if (unlikely(txn->txnid < entry->trunk_txnid))
+    /* the used/read MVCC-snapshot is behind the cached MVCC-range */
+    return cache_fallback(txn, dbi, key, data, MDBX_CACHE_BEHIND);
+
+  if (likely(txn->txnid <= entry->last_confirmed_txnid)) {
+    /* cache hit fast-path */
+    tASSERT(txn,
+            (!entry->data.iov_base && !entry->data.iov_len) || is_inside_dxb_and_commited(txn, entry->data.iov_base));
+    *data = entry->data;
+    return cache_result(data->iov_base ? MDBX_SUCCESS : MDBX_NOTFOUND, MDBX_CACHE_HIT);
+  }
+
+  err = dbi_check(txn, dbi);
+  if (unlikely(err != MDBX_SUCCESS))
+    return cache_error(LOG_IFERR(err));
+
+  const uint64_t committed_snapshot_txnid = txn_basis_snapshot(txn);
+  txnid_t trunk_txnid = txn->front_txnid;
+  if (unlikely(txn->dbi_state[dbi] & DBI_STALE)) {
+    err = tbl_fetch((MDBX_txn *)txn, dbi);
+    if (unlikely(err != MDBX_SUCCESS)) {
+      if (err == MDBX_NOTFOUND) {
+        /* the corresponding table has been deleted */
+      not_found:
+        data->iov_base = nullptr;
+        data->iov_len = 0;
+        MDBX_cache_status_t status = MDBX_CACHE_DIRTY;
+        if (trunk_txnid <= committed_snapshot_txnid) {
+          /* NOTE(review): when the entry already holds "not found", its trunk_txnid is left as is
+           * (zero for a fresh entry) while last_confirmed_txnid is advanced - confirm this is intended. */
+          status = MDBX_CACHE_CONFIRMED;
+          if (entry->data.iov_base) {
+            status = MDBX_CACHE_REFRESHED;
+            tASSERT(txn, trunk_txnid > entry->trunk_txnid);
+            entry->data = *data;
+            entry->trunk_txnid = trunk_txnid;
+          }
+          entry->last_confirmed_txnid = committed_snapshot_txnid;
+        }
+        /* This point is also reached via `goto` while err == MDBX_SUCCESS,
+         * so the code is given explicitly instead of returning `err`. */
+        return cache_result(MDBX_NOTFOUND, status);
+      }
+      return cache_error(LOG_IFERR(err));
+    }
+  }
+
+  if (txn->dbs[dbi].mod_txnid /* tree->mod_txnid maybe zero in a legacy DB */)
+    trunk_txnid = txn->dbs[dbi].mod_txnid;
+  if ((txn->flags & MDBX_TXN_RDONLY) == 0) {
+    const MDBX_txn *scan = txn;
+    do
+      if ((scan->flags & MDBX_TXN_DIRTY) && (dbi == MAIN_DBI || (scan->dbi_state[dbi] & DBI_DIRTY))) {
+        /* After committing nested transactions, mod_txnid may be greater than front_txnid */
+        trunk_txnid = scan->front_txnid;
+        break;
+      }
+    while (unlikely((scan = scan->parent) != nullptr));
+  }
+
+  if (trunk_txnid <= entry->last_confirmed_txnid) {
+    tASSERT(txn, (txn->dbi_state[dbi] & DBI_DIRTY) == 0);
+  cache_confirmed:
+    tASSERT(txn, trunk_txnid < committed_snapshot_txnid && trunk_txnid <= entry->last_confirmed_txnid);
+    tASSERT(txn, trunk_txnid == entry->trunk_txnid);
+    tASSERT(txn,
+            (!entry->data.iov_base && !entry->data.iov_len) || is_inside_dxb_and_commited(txn, entry->data.iov_base));
+    *data = entry->data;
+    entry->last_confirmed_txnid = committed_snapshot_txnid;
+    return cache_result(data->iov_base ? MDBX_SUCCESS : MDBX_NOTFOUND, MDBX_CACHE_CONFIRMED);
+  }
+
+  if (unlikely(txn->dbs[dbi].root == P_INVALID)) {
+    /* the corresponding table is empty now */
+    goto not_found;
+  }
+
+  cursor_couple_t cx;
+  err = cursor_init(&cx.outer, txn, dbi);
+  if (unlikely(err != MDBX_SUCCESS))
+    return cache_error(LOG_IFERR(err));
+
+  alignkey_t aligned;
+  err = check_key(&cx.outer, key, &aligned);
+  if (unlikely(err != MDBX_SUCCESS))
+    return cache_error(LOG_IFERR(err));
+
+  cx.outer.top = 0;
+  cx.outer.ki[0] = 0;
+  err = page_get(&cx.outer, txn->dbs[dbi].root, &cx.outer.pg[0], trunk_txnid);
+  if (unlikely(err != MDBX_SUCCESS))
+    return cache_error(LOG_IFERR(err));
+
+  page_t *mp = cx.outer.pg[0];
+  if ((trunk_txnid = mp->txnid) <= entry->last_confirmed_txnid)
+    goto cache_confirmed;
+
+  intptr_t ki = page_numkeys(mp) - 1;
+  while (is_branch(mp)) {
+    /* search by the key prepared by check_key(), the same one as used for the leaf search below */
+    const struct node_search_result nsr = node_search(&cx.outer, &aligned.key);
+    if (likely(nsr.node))
+      ki = cx.outer.ki[cx.outer.top] + (intptr_t)nsr.exact - 1;
+    err = page_get(&cx.outer, node_pgno(page_node(mp, ki)), &mp, trunk_txnid);
+    if (unlikely(err != MDBX_SUCCESS))
+      return cache_error(LOG_IFERR(err));
+
+    if ((trunk_txnid = mp->txnid) <= entry->last_confirmed_txnid)
+      goto cache_confirmed;
+
+    ki = page_numkeys(mp) - 1;
+    err = cursor_push(&cx.outer, mp, ki);
+    if (unlikely(err != MDBX_SUCCESS))
+      return cache_error(LOG_IFERR(err));
+  }
+
+  if (!MDBX_DISABLE_VALIDATION && unlikely(!check_leaf_type(&cx.outer, mp))) {
+    ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", mp->pgno, mp->flags);
+    err = MDBX_CORRUPTED;
+    return cache_error(LOG_IFERR(err));
+  }
+
+  struct node_search_result nsr = node_search(&cx.outer, &aligned.key);
+  if (!nsr.exact)
+    goto not_found;
+
+  if (unlikely(node_flags(nsr.node) & N_DUP)) {
+    /* TODO: It is possible to implement support, but need to think through the usage scenarios */
+    err = MDBX_EMULTIVAL;
+    return cache_error(LOG_IFERR(err));
+  }
+
+  err = node_read(&cx.outer, nsr.node, data, mp);
+  if (unlikely(err != MDBX_SUCCESS))
+    return cache_error(LOG_IFERR(err));
+
+  if (trunk_txnid > committed_snapshot_txnid) {
+    tASSERT(txn, trunk_txnid > entry->last_confirmed_txnid && trunk_txnid > entry->trunk_txnid);
+    return cache_result(MDBX_SUCCESS, MDBX_CACHE_DIRTY);
+  }
+
+  tASSERT(txn, is_inside_dxb_and_commited(txn, data->iov_base));
+  tASSERT(txn, trunk_txnid <= committed_snapshot_txnid && trunk_txnid > entry->last_confirmed_txnid &&
+                   trunk_txnid > entry->trunk_txnid);
+  entry->data = *data;
+  entry->trunk_txnid = trunk_txnid;
+  entry->last_confirmed_txnid = committed_snapshot_txnid;
+  return cache_result(MDBX_SUCCESS, MDBX_CACHE_REFRESHED);
+}
+
+/*----------------------------------------------------------------------------*/
+
+/* Cached get for an entry that may be shared between threads.
+ * Takes a stable snapshot of the entry (re-reading until two consecutive reads match), runs cache_get()
+ * on that local copy, and then publishes the updated copy unless another thread has already published
+ * an equal or newer last_confirmed_txnid, in which case the status becomes MDBX_CACHE_RACE. */
+__hot MDBX_cache_result_t mdbx_cache_get_MultiThreadedAtomics(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                                                              MDBX_val *data, volatile MDBX_cache_entry *entry) {
+
+  if (unlikely(!key || !data || !entry || data == &entry->data))
+    return cache_error(LOG_IFERR(MDBX_EINVAL));
+
+  MDBX_cache_entry local = *entry;
+  while (true) {
+    MDBX_cache_entry again;
+    again.last_confirmed_txnid = safe64_read((mdbx_atomic_uint64_t *)&entry->last_confirmed_txnid);
+    /* presumably a value above MAX_TXNID marks an update in progress by another thread: retry twice, then bypass */
+    if (unlikely(again.last_confirmed_txnid > MAX_TXNID)) {
+      atomic_yield();
+      again.last_confirmed_txnid = safe64_read((mdbx_atomic_uint64_t *)&entry->last_confirmed_txnid);
+      if (unlikely(again.last_confirmed_txnid > MAX_TXNID)) {
+        atomic_yield();
+        again.last_confirmed_txnid = safe64_read((mdbx_atomic_uint64_t *)&entry->last_confirmed_txnid);
+        if (unlikely(again.last_confirmed_txnid > MAX_TXNID))
+          return cache_fallback(txn, dbi, key, data, MDBX_CACHE_RACE);
+      }
+    }
+
+    again.trunk_txnid = entry->trunk_txnid;
+    again.data = entry->data;
+    if (local.last_confirmed_txnid == again.last_confirmed_txnid && local.trunk_txnid == again.trunk_txnid &&
+        local.data.iov_base == again.data.iov_base && local.data.iov_len == again.data.iov_len)
+      break;
+
+    local = again;
+    atomic_yield();
+  }
+
+  MDBX_cache_result_t result = cache_get(txn, dbi, key, data, &local);
+  if (result.status > MDBX_CACHE_HIT) {
+    tASSERT(txn, local.last_confirmed_txnid < MAX_TXNID && local.trunk_txnid <= local.last_confirmed_txnid &&
+                     local.trunk_txnid > 0);
+    while (true) {
+      const txnid_t snap = safe64_read((mdbx_atomic_uint64_t *)&entry->last_confirmed_txnid);
+      if (snap >= local.last_confirmed_txnid) {
+        result.status = MDBX_CACHE_RACE;
+        break;
+      }
+
+      if (likely(safe64_reset_compare((mdbx_atomic_uint64_t *)&entry->last_confirmed_txnid, snap))) {
+        entry->trunk_txnid = 0;
+        osal_compiler_barrier();
+        entry->data = local.data;
+        entry->trunk_txnid = local.trunk_txnid;
+        safe64_write((mdbx_atomic_uint64_t *)&entry->last_confirmed_txnid, local.last_confirmed_txnid);
+        break;
+      }
+
+      atomic_yield();
+    }
+  }
+  return result;
+}
+
+/* The public entry point declared in mdbx.h; forwards to the multi-threaded implementation above. */
+__hot MDBX_cache_result_t mdbx_cache_get(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data,
+                                         volatile MDBX_cache_entry *entry) {
+  return mdbx_cache_get_MultiThreadedAtomics(txn, dbi, key, data, entry);
+}
+
+/* Cached get without any synchronization on the entry: validates the arguments
+ * (non-null, and `data` must not alias entry->data) and runs cache_get() directly on the entry. */
+__hot MDBX_cache_result_t mdbx_cache_get_SingleThreaded(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key,
+                                                        MDBX_val *data, MDBX_cache_entry *entry) {
+  if (unlikely(!key || !data || !entry || data == &entry->data))
+    return cache_error(LOG_IFERR(MDBX_EINVAL));
+
+  return cache_get(txn, dbi, key, data, entry);
+}
+
+/* Exported out-of-line definition that forwards to the inline mdbx_cache_init() body. */
+LIBMDBX_API void mdbx_cache_init(MDBX_cache_entry *entry) { __inline_mdbx_cache_init(entry); }