mdbx: add shared cache for oldest reader's txnid.

Change-Id: I48cbc778b873445dffa8ecef1fc3633e0193131f
This commit is contained in:
Leo Yuriev 2017-06-14 23:33:13 +03:00
parent c01aeb5c68
commit 81661ff952
2 changed files with 78 additions and 64 deletions

View File

@ -365,18 +365,35 @@ typedef struct MDBX_lockinfo {
/* Flags which environment was opened. */ /* Flags which environment was opened. */
volatile uint32_t mti_envmode; volatile uint32_t mti_envmode;
union {
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
MDBX_OSAL_LOCK mti_wmutex; MDBX_OSAL_LOCK mti_wmutex;
#endif #endif
uint64_t align_wmutex;
};
union {
/* The number of slots that have been used in the reader table. /* The number of slots that have been used in the reader table.
* This always records the maximum count, it is not decremented * This always records the maximum count, it is not decremented
* when readers release their slots. */ * when readers release their slots. */
volatile unsigned __cache_aligned mti_numreaders; volatile unsigned __cache_aligned mti_numreaders;
uint64_t align_numreaders;
};
union {
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
/* Mutex protecting access to this table. */ /* Mutex protecting access to this table. */
MDBX_OSAL_LOCK mti_rmutex; MDBX_OSAL_LOCK mti_rmutex;
#endif #endif
uint64_t align_rmutex;
};
union {
volatile txnid_t mti_oldest;
uint64_t align_oldest;
};
uint8_t pad_align[MDBX_CACHELINE_SIZE - sizeof(uint64_t) * 6];
MDBX_reader __cache_aligned mti_readers[1]; MDBX_reader __cache_aligned mti_readers[1];
} MDBX_lockinfo; } MDBX_lockinfo;
@ -650,7 +667,7 @@ struct MDBX_env {
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */
mdbx_thread_key_t me_txkey; /* thread-key for readers */ mdbx_thread_key_t me_txkey; /* thread-key for readers */
txnid_t me_pgoldest; /* ID of oldest reader last time we looked */ volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
MDBX_pgstate me_pgstate; /* state of old pages from freeDB */ MDBX_pgstate me_pgstate; /* state of old pages from freeDB */
#define me_pglast me_pgstate.mf_pglast #define me_pglast me_pgstate.mf_pglast
#define me_pghead me_pgstate.mf_pghead #define me_pghead me_pgstate.mf_pghead
@ -666,13 +683,14 @@ struct MDBX_env {
unsigned me_maxkey_limit; /* max size of a key */ unsigned me_maxkey_limit; /* max size of a key */
mdbx_pid_t me_live_reader; /* have liveness lock in reader table */ mdbx_pid_t me_live_reader; /* have liveness lock in reader table */
void *me_userctx; /* User-settable context */ void *me_userctx; /* User-settable context */
#if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */
#endif
size_t me_sync_pending; /* Total dirty/non-sync'ed bytes size_t me_sync_pending; /* Total dirty/non-sync'ed bytes
* since the last mdbx_env_sync() */ * since the last mdbx_env_sync() */
size_t me_sync_threshold; /* Threshold of above to force synchronous flush */ size_t me_sync_threshold; /* Threshold of above to force synchronous flush */
MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */ MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
txnid_t me_oldest_stub;
#if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */
#endif
#ifdef USE_VALGRIND #ifdef USE_VALGRIND
int me_valgrind_handle; int me_valgrind_handle;
#endif #endif

View File

@ -1483,12 +1483,10 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) {
/* Find oldest txnid still referenced. */ /* Find oldest txnid still referenced. */
static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) { static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) {
MDBX_env *env = txn->mt_env; MDBX_env *env = txn->mt_env;
const MDBX_meta *const head = mdbx_meta_mostrecent( mdbx_assert(env, (txn->mt_flags & MDBX_RDONLY) == 0);
env, F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) ? false : true);
txnid_t oldest =
meta_txnid(env, head, (txn->mt_flags & MDBX_RDONLY) ? true : false);
int i, reader; int i, reader;
txnid_t oldest = txn->mt_txnid - 1;
const MDBX_reader *const r = env->me_lck->mti_readers; const MDBX_reader *const r = env->me_lck->mti_readers;
for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) { for (reader = -1, i = env->me_lck->mti_numreaders; --i >= 0;) {
if (r[i].mr_pid) { if (r[i].mr_pid) {
@ -1503,7 +1501,9 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn, int *laggard) {
if (laggard) if (laggard)
*laggard = reader; *laggard = reader;
return env->me_pgoldest = oldest;
*env->me_oldest = oldest;
return oldest;
} }
/* Add a page to the txn's dirty list */ /* Add a page to the txn's dirty list */
@ -1614,7 +1614,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
break; break;
oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn, NULL) oldest = (flags & MDBX_LIFORECLAIM) ? mdbx_find_oldest(txn, NULL)
: env->me_pgoldest; : env->me_oldest[0];
mdbx_cursor_init(&m2, txn, FREE_DBI, NULL); mdbx_cursor_init(&m2, txn, FREE_DBI, NULL);
if (flags & MDBX_LIFORECLAIM) { if (flags & MDBX_LIFORECLAIM) {
/* Begin from oldest reader if any */ /* Begin from oldest reader if any */
@ -1645,7 +1645,7 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
if (op == MDBX_SET_RANGE) if (op == MDBX_SET_RANGE)
continue; continue;
if (oldest < mdbx_find_oldest(txn, NULL)) { if (oldest < mdbx_find_oldest(txn, NULL)) {
oldest = env->me_pgoldest; oldest = *env->me_oldest;
last = oldest - 1; last = oldest - 1;
key.iov_base = &last; key.iov_base = &last;
key.iov_len = sizeof(last); key.iov_len = sizeof(last);
@ -2283,7 +2283,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
} }
while (1) { while (1) {
MDBX_meta *const meta = mdbx_meta_head(txn->mt_env); MDBX_meta *const meta = mdbx_meta_head(env);
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
@ -2293,8 +2293,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
mdbx_assert(env, r->mr_pid == mdbx_getpid()); mdbx_assert(env, r->mr_pid == mdbx_getpid());
mdbx_assert(env, r->mr_tid == mdbx_thread_self()); mdbx_assert(env, r->mr_tid == mdbx_thread_self());
mdbx_assert(env, r->mr_txnid == snap); mdbx_assert(env, r->mr_txnid == snap);
}
mdbx_coherent_barrier(); mdbx_coherent_barrier();
}
mdbx_jitter4testing(true); mdbx_jitter4testing(true);
/* Snap the state from current meta-head */ /* Snap the state from current meta-head */
@ -2304,14 +2304,15 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
txn->mt_canary = meta->mm_canary; txn->mt_canary = meta->mm_canary;
/* LY: Retry on a race, ITS#7970. */ /* LY: Retry on a race, ITS#7970. */
if (likely(meta == mdbx_meta_head(txn->mt_env) && mdbx_compiler_barrier();
if (likely(meta == mdbx_meta_head(env) &&
snap == mdbx_meta_txnid_fluid(env, meta))) { snap == mdbx_meta_txnid_fluid(env, meta))) {
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
break; break;
} }
} }
mdbx_assert(env, txn->mt_txnid >= mdbx_find_oldest(txn, nullptr)); mdbx_assert(env, txn->mt_txnid >= *env->me_oldest);
txn->mt_ro_reader = r; txn->mt_ro_reader = r;
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
} else { } else {
@ -4314,7 +4315,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) {
err = mdbx_mmap(&addr, (size_t)size, true, env->me_lfd); err = mdbx_mmap(&addr, (size_t)size, true, env->me_lfd);
if (unlikely(err != MDBX_SUCCESS)) if (unlikely(err != MDBX_SUCCESS))
return err; return err;
assert(addr != nullptr); mdbx_assert(env, addr != nullptr);
env->me_lck = addr; env->me_lck = addr;
#ifdef MADV_DODUMP #ifdef MADV_DODUMP
@ -4359,6 +4360,8 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) {
} }
} }
mdbx_assert(env, !MDBX_IS_ERROR(rc));
env->me_oldest = &env->me_lck->mti_oldest;
return rc; return rc;
} }
@ -4451,6 +4454,7 @@ int __cold mdbx_env_open_ex(MDBX_env *env, const char *path, unsigned flags,
goto bailout; goto bailout;
} }
env->me_oldest = &env->me_oldest_stub;
const int dxb_rc = mdbx_setup_dxb(env, lck_rc); const int dxb_rc = mdbx_setup_dxb(env, lck_rc);
if (MDBX_IS_ERROR(dxb_rc)) { if (MDBX_IS_ERROR(dxb_rc)) {
rc = dxb_rc; rc = dxb_rc;
@ -4594,8 +4598,9 @@ static void __cold mdbx_env_close0(MDBX_env *env) {
mdbx_munmap((void *)env->me_lck, mdbx_munmap((void *)env->me_lck,
(env->me_maxreaders - 1) * sizeof(MDBX_reader) + (env->me_maxreaders - 1) * sizeof(MDBX_reader) +
sizeof(MDBX_lockinfo)); sizeof(MDBX_lockinfo));
env->me_lck = NULL; env->me_lck = nullptr;
env->me_pid = 0; env->me_pid = 0;
env->me_oldest = nullptr;
mdbx_lck_destroy(env); mdbx_lck_destroy(env);
if (env->me_lfd != INVALID_HANDLE_VALUE) { if (env->me_lfd != INVALID_HANDLE_VALUE) {
@ -5087,8 +5092,7 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) {
return MDBX_BAD_TXN; return MDBX_BAD_TXN;
} }
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
/* Make sure we're using an up-to-date root */ /* Make sure we're using an up-to-date root */
if (unlikely(*mc->mc_dbflag & DB_STALE)) { if (unlikely(*mc->mc_dbflag & DB_STALE)) {
MDBX_cursor mc2; MDBX_cursor mc2;
@ -5128,8 +5132,7 @@ static int mdbx_page_search(MDBX_cursor *mc, MDBX_val *key, int flags) {
return MDBX_NOTFOUND; return MDBX_NOTFOUND;
} }
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
mdbx_cassert(mc, root >= NUM_METAS); mdbx_cassert(mc, root >= NUM_METAS);
if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root)
if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0))
@ -5238,8 +5241,7 @@ static __inline int mdbx_node_read(MDBX_cursor *mc, MDBX_node *leaf,
pgno_t pgno; pgno_t pgno;
int rc; int rc;
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) { if (!F_ISSET(leaf->mn_flags, F_BIGDATA)) {
data->iov_len = NODEDSZ(leaf); data->iov_len = NODEDSZ(leaf);
data->iov_base = NODEDATA(leaf); data->iov_base = NODEDATA(leaf);
@ -5299,8 +5301,7 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int move_right) {
MDBX_node *indx; MDBX_node *indx;
MDBX_page *mp; MDBX_page *mp;
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
if (unlikely(mc->mc_snum < 2)) { if (unlikely(mc->mc_snum < 2)) {
return MDBX_NOTFOUND; /* root has no siblings */ return MDBX_NOTFOUND; /* root has no siblings */
} }
@ -5530,8 +5531,7 @@ static int mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
MDBX_node *leaf = NULL; MDBX_node *leaf = NULL;
DKBUF; DKBUF;
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) && if ((mc->mc_db->md_flags & MDBX_INTEGERKEY) &&
unlikely(key->iov_len != sizeof(uint32_t) && unlikely(key->iov_len != sizeof(uint32_t) &&
key->iov_len != sizeof(uint64_t))) { key->iov_len != sizeof(uint64_t))) {
@ -5818,8 +5818,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED))
return MDBX_BAD_TXN; return MDBX_BAD_TXN;
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
switch (op) { switch (op) {
case MDBX_GET_CURRENT: { case MDBX_GET_CURRENT: {
if (unlikely(!(mc->mc_flags & C_INITIALIZED))) if (unlikely(!(mc->mc_flags & C_INITIALIZED)))
@ -7072,8 +7071,7 @@ static void mdbx_xcursor_init0(MDBX_cursor *mc) {
static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) { static void mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node) {
MDBX_xcursor *mx = mc->mc_xcursor; MDBX_xcursor *mx = mc->mc_xcursor;
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
if (node->mn_flags & F_SUBDATA) { if (node->mn_flags & F_SUBDATA) {
memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDBX_db)); memcpy(&mx->mx_db, NODEDATA(node), sizeof(MDBX_db));
mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_pg[0] = 0;
@ -7123,8 +7121,7 @@ static void mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx,
int new_dupdata) { int new_dupdata) {
MDBX_xcursor *mx = mc->mc_xcursor; MDBX_xcursor *mx = mc->mc_xcursor;
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
if (new_dupdata) { if (new_dupdata) {
mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_snum = 1;
mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_top = 0;
@ -7164,8 +7161,7 @@ static void mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi,
mc->mc_xcursor = mx; mc->mc_xcursor = mx;
mdbx_xcursor_init0(mc); mdbx_xcursor_init0(mc);
} }
mdbx_cassert(mc, mdbx_cassert(mc, mc->mc_txn->mt_txnid >= mc->mc_txn->mt_env->me_oldest[0]);
mc->mc_txn->mt_txnid >= mdbx_find_oldest(mc->mc_txn, nullptr));
if (unlikely(*mc->mc_dbflag & DB_STALE)) { if (unlikely(*mc->mc_dbflag & DB_STALE)) {
mdbx_page_search(mc, NULL, MDBX_PS_ROOTONLY); mdbx_page_search(mc, NULL, MDBX_PS_ROOTONLY);
} }
@ -7784,8 +7780,8 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) {
static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { static void mdbx_cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) {
unsigned i; unsigned i;
mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= mdbx_cassert(csrc,
mdbx_find_oldest(csrc->mc_txn, nullptr)); csrc->mc_txn->mt_txnid >= csrc->mc_txn->mt_env->me_oldest[0]);
cdst->mc_txn = csrc->mc_txn; cdst->mc_txn = csrc->mc_txn;
cdst->mc_dbi = csrc->mc_dbi; cdst->mc_dbi = csrc->mc_dbi;
cdst->mc_db = csrc->mc_db; cdst->mc_db = csrc->mc_db;