mdbx: kill/remove mti_txnid.

This commit is contained in:
Leo Yuriev 2017-04-21 18:57:34 +03:00
parent f3e31a74ee
commit 585496339a
3 changed files with 24 additions and 96 deletions

View File

@ -361,10 +361,6 @@ typedef struct MDBX_lockinfo {
/* Format of this lock file. Must be set to MDB_LOCK_FORMAT. */ /* Format of this lock file. Must be set to MDB_LOCK_FORMAT. */
uint64_t mti_format; uint64_t mti_format;
/* The ID of the last transaction committed to the database.
* This is recorded here only for convenience; the value can always
* be determined by reading the main database meta pages. */
volatile txnid_t mti_txnid;
#ifdef MDBX_OSAL_LOCK #ifdef MDBX_OSAL_LOCK
MDBX_OSAL_LOCK mti_wmutex; MDBX_OSAL_LOCK mti_wmutex;
#endif #endif
@ -758,21 +754,11 @@ int mdbx_reader_check0(MDB_env *env, int rlocked, int *dead);
#define METAPAGE_2(env) \ #define METAPAGE_2(env) \
(&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) (&((MDB_metabuf *)((env)->me_map + env->me_psize))->mb_metabuf.mm_meta)
static __inline MDB_meta *mdbx_meta_head_w(MDB_env *env) { static __inline MDB_meta *mdbx_meta_head(MDB_env *env) {
MDB_meta *a = METAPAGE_1(env); MDB_meta *a = METAPAGE_1(env);
MDB_meta *b = METAPAGE_2(env); MDB_meta *b = METAPAGE_2(env);
txnid_t head_txnid = env->me_txns->mti_txnid;
mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); return (a->mm_txnid > b->mm_txnid) ? a : b;
if (a->mm_txnid == head_txnid)
return a;
if (likely(b->mm_txnid == head_txnid))
return b;
mdbx_debug("me_txns->mti_txnid not match meta-pages");
mdbx_assert(env, head_txnid == a->mm_txnid || head_txnid == b->mm_txnid);
env->me_flags |= MDB_FATAL_ERROR;
return a;
} }
void mdbx_rthc_dtor(void *rthc); void mdbx_rthc_dtor(void *rthc);

View File

@ -214,15 +214,6 @@ static int __cold mdbx_mutex_failed(MDB_env *env, mdbx_mutex_t *mutex, int rc) {
rc = MDBX_RESULT_TRUE; rc = MDBX_RESULT_TRUE;
rlocked = (mutex == &env->me_txns->mti_rmutex); rlocked = (mutex == &env->me_txns->mti_rmutex);
if (!rlocked) { if (!rlocked) {
/* Keep mtb.mti_txnid updated, otherwise next writer can
* overwrite data which latest meta page refers to.
*
* LY: Hm, how this can happen, if the mtb.mti_txnid
* is updating only at the finish of a successful commit ?
*/
MDB_meta *meta = mdbx_meta_head_w(env);
assert(env->me_txns->mti_txnid == meta->mm_txnid);
(void)meta;
/* env is hosed if the dead thread was ours */ /* env is hosed if the dead thread was ours */
if (env->me_txn) { if (env->me_txn) {
env->me_flags |= MDB_FATAL_ERROR; env->me_flags |= MDB_FATAL_ERROR;

View File

@ -1387,52 +1387,12 @@ static __inline uint64_t mdbx_meta_sign(MDB_meta *meta) {
return (sign > MDB_DATASIGN_WEAK) ? sign : ~sign; return (sign > MDB_DATASIGN_WEAK) ? sign : ~sign;
} }
static MDB_meta *mdbx_meta_head_r(MDB_env *env) {
MDB_meta *a = METAPAGE_1(env);
MDB_meta *b = METAPAGE_2(env), *h;
#ifdef __SANITIZE_THREAD__
mdbx_mutex_lock(&tsan_mutex);
#endif
txnid_t head_txnid = env->me_txns->mti_txnid;
mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0);
if (likely(a->mm_txnid == head_txnid)) {
h = a;
} else if (likely(b->mm_txnid == head_txnid)) {
h = b;
} else {
/* LY: seems got a collision with mdbx_env_sync0() */
mdbx_coherent_barrier();
head_txnid = env->me_txns->mti_txnid;
mdbx_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0);
if (likely(a->mm_txnid == head_txnid)) {
h = a;
} else if (likely(b->mm_txnid == head_txnid)) {
h = b;
} else {
/* LY: got a race again, or DB is corrupted */
int rc = mdbx_txn_lock(env);
h = mdbx_meta_head_w(env);
if (rc == MDB_SUCCESS)
mdbx_txn_unlock(env);
}
}
#ifdef __SANITIZE_THREAD__
mdbx_mutex_unlock(&tsan_mutex);
#endif
return h;
}
static __inline MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env, static __inline MDB_meta *mdbx_env_meta_flipflop(const MDB_env *env,
MDB_meta *meta) { MDB_meta *meta) {
return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env); return (meta == METAPAGE_1(env)) ? METAPAGE_2(env) : METAPAGE_1(env);
} }
static __inline int mdbx_meta_lt(MDB_meta *a, MDB_meta *b) { static __inline int mdbx_meta_lt(const MDB_meta *a, const MDB_meta *b) {
return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? a->mm_txnid < b->mm_txnid return (META_IS_STEADY(a) == META_IS_STEADY(b)) ? a->mm_txnid < b->mm_txnid
: META_IS_STEADY(b); : META_IS_STEADY(b);
} }
@ -1442,17 +1402,12 @@ static txnid_t mdbx_find_oldest(MDB_env *env, int *laggard) {
#ifdef __SANITIZE_THREAD__ #ifdef __SANITIZE_THREAD__
mdbx_mutex_lock(&tsan_mutex); mdbx_mutex_lock(&tsan_mutex);
#endif #endif
const MDB_meta *const a = METAPAGE_1(env);
const MDB_meta *const b = METAPAGE_2(env);
txnid_t oldest = mdbx_meta_lt(a, b) ? b->mm_txnid : a->mm_txnid;
int i, reader; int i, reader;
MDB_reader *r = env->me_txns->mti_readers; const MDB_reader *const r = env->me_txns->mti_readers;
txnid_t oldest = env->me_txns->mti_txnid;
MDB_meta *a = METAPAGE_1(env);
MDB_meta *b = METAPAGE_2(env);
if (META_IS_WEAK(a) && oldest > b->mm_txnid)
oldest = b->mm_txnid;
if (META_IS_WEAK(b) && oldest > a->mm_txnid)
oldest = a->mm_txnid;
for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0;) { for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0;) {
if (r[i].mr_pid) { if (r[i].mr_pid) {
txnid_t snap = r[i].mr_txnid; txnid_t snap = r[i].mr_txnid;
@ -1738,7 +1693,7 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) {
if ((flags & MDBX_ALLOC_GC) && if ((flags & MDBX_ALLOC_GC) &&
((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) { ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) {
MDB_meta *head = mdbx_meta_head_w(env); MDB_meta *head = mdbx_meta_head(env);
MDB_meta *tail = mdbx_env_meta_flipflop(env, head); MDB_meta *tail = mdbx_env_meta_flipflop(env, head);
if (oldest == tail->mm_txnid && META_IS_WEAK(head) && if (oldest == tail->mm_txnid && META_IS_WEAK(head) &&
@ -1754,10 +1709,9 @@ static int mdbx_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) {
* don't make a steady-sync, but only a legacy-mode checkpoint, * don't make a steady-sync, but only a legacy-mode checkpoint,
* just for resume reclaiming only, not for data consistency. */ * just for resume reclaiming only, not for data consistency. */
mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu", mdbx_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu",
head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N', head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N',
tail->mm_txnid, META_IS_WEAK(tail) ? 'W' : 'N', oldest, tail->mm_txnid, META_IS_WEAK(tail) ? 'W' : 'N', oldest);
env->me_txns->mti_txnid);
int me_flags = env->me_flags & MDB_WRITEMAP; int me_flags = env->me_flags & MDB_WRITEMAP;
if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC) if ((env->me_flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC)
@ -2025,7 +1979,7 @@ int mdbx_env_sync(MDB_env *env, int force) {
if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR))) if (unlikely(flags & (MDB_RDONLY | MDB_FATAL_ERROR)))
return EACCES; return EACCES;
head = mdbx_meta_head_r(env); head = mdbx_meta_head(env);
if (!META_IS_WEAK(head) && env->me_sync_pending == 0 && if (!META_IS_WEAK(head) && env->me_sync_pending == 0 &&
env->me_mapsize == head->mm_mapsize) env->me_mapsize == head->mm_mapsize)
/* LY: nothing to do */ /* LY: nothing to do */
@ -2054,7 +2008,7 @@ int mdbx_env_sync(MDB_env *env, int force) {
return rc; return rc;
/* LY: head may be changed while the mutex has been acquired. */ /* LY: head may be changed while the mutex has been acquired. */
head = mdbx_meta_head_w(env); head = mdbx_meta_head(env);
rc = MDB_SUCCESS; rc = MDB_SUCCESS;
if (META_IS_WEAK(head) || env->me_sync_pending != 0 || if (META_IS_WEAK(head) || env->me_sync_pending != 0 ||
env->me_mapsize != head->mm_mapsize) { env->me_mapsize != head->mm_mapsize) {
@ -2236,12 +2190,12 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) {
} }
while ((env->me_flags & MDB_FATAL_ERROR) == 0) { while ((env->me_flags & MDB_FATAL_ERROR) == 0) {
MDB_meta *meta = mdbx_meta_head_r(txn->mt_env); MDB_meta *const meta = mdbx_meta_head(txn->mt_env);
txnid_t lead = meta->mm_txnid; txnid_t lead = meta->mm_txnid;
r->mr_txnid = lead; r->mr_txnid = lead;
mdbx_coherent_barrier(); mdbx_coherent_barrier();
txnid_t snap = txn->mt_env->me_txns->mti_txnid; txnid_t snap = mdbx_meta_head(txn->mt_env)->mm_txnid;
/* LY: Retry on a race, ITS#7970. */ /* LY: Retry on a race, ITS#7970. */
if (likely(lead == snap)) { if (likely(lead == snap)) {
txn->mt_txnid = lead; txn->mt_txnid = lead;
@ -2264,7 +2218,7 @@ static int mdbx_txn_renew0(MDB_txn *txn, unsigned flags) {
#ifdef __SANITIZE_THREAD__ #ifdef __SANITIZE_THREAD__
mdbx_mutex_lock(&tsan_mutex); mdbx_mutex_lock(&tsan_mutex);
#endif #endif
MDB_meta *meta = mdbx_meta_head_w(env); MDB_meta *meta = mdbx_meta_head(env);
txn->mt_canary = meta->mm_canary; txn->mt_canary = meta->mm_canary;
txn->mt_txnid = meta->mm_txnid + 1; txn->mt_txnid = meta->mm_txnid + 1;
txn->mt_flags = flags; txn->mt_flags = flags;
@ -3429,7 +3383,7 @@ static int __cold mdbx_env_init_meta(MDB_env *env, MDB_meta *meta) {
static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) { static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) {
int rc; int rc;
MDB_meta *head = mdbx_meta_head_w(env); MDB_meta *head = mdbx_meta_head(env);
size_t prev_mapsize = head->mm_mapsize; size_t prev_mapsize = head->mm_mapsize;
size_t used_size = env->me_psize * (pending->mm_last_pg + 1); size_t used_size = env->me_psize * (pending->mm_last_pg + 1);
@ -3557,7 +3511,6 @@ static int mdbx_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) {
* readers will get consistent data regardless of how fresh or * readers will get consistent data regardless of how fresh or
* how stale their view of these values is. * how stale their view of these values is.
*/ */
env->me_txns->mti_txnid = pending->mm_txnid;
#ifdef __SANITIZE_THREAD__ #ifdef __SANITIZE_THREAD__
mdbx_mutex_unlock(&tsan_mutex); mdbx_mutex_unlock(&tsan_mutex);
#endif #endif
@ -3742,7 +3695,7 @@ int __cold mdbx_env_set_mapsize(MDB_env *env, size_t size) {
return EINVAL; return EINVAL;
/* FIXME: lock/unlock */ /* FIXME: lock/unlock */
meta = mdbx_meta_head_w(env); meta = mdbx_meta_head(env);
if (!size) if (!size)
size = meta->mm_mapsize; size = meta->mm_mapsize;
/* Silently round up to minimum if the size is too small */ /* Silently round up to minimum if the size is too small */
@ -3953,7 +3906,6 @@ static int __cold mdbx_env_setup_locks(MDB_env *env, char *lpath, int mode,
env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_magic = MDB_MAGIC;
env->me_txns->mti_format = MDB_LOCK_FORMAT; env->me_txns->mti_format = MDB_LOCK_FORMAT;
env->me_txns->mti_txnid = ~(txnid_t)0;
} else { } else {
if (env->me_txns->mti_magic != MDB_MAGIC) { if (env->me_txns->mti_magic != MDB_MAGIC) {
mdbx_debug("lock region has invalid magic"); mdbx_debug("lock region has invalid magic");
@ -4079,7 +4031,6 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags,
if (rc == MDB_SUCCESS) { if (rc == MDB_SUCCESS) {
mdbx_debug("opened dbenv %p", (void *)env); mdbx_debug("opened dbenv %p", (void *)env);
if (excl > 0) { if (excl > 0) {
env->me_txns->mti_txnid = meta.mm_txnid;
if (exclusive == NULL || *exclusive < 2) { if (exclusive == NULL || *exclusive < 2) {
/* LY: downgrade lock only if exclusive access not requested. /* LY: downgrade lock only if exclusive access not requested.
* in case exclusive==1, just leave value as is. */ * in case exclusive==1, just leave value as is. */
@ -4116,7 +4067,7 @@ int __cold mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags,
#if MDB_DEBUG #if MDB_DEBUG
if (rc == MDB_SUCCESS) { if (rc == MDB_SUCCESS) {
MDB_meta *meta = mdbx_meta_head_r(env); MDB_meta *meta = mdbx_meta_head(env);
MDB_db *db = &meta->mm_dbs[MAIN_DBI]; MDB_db *db = &meta->mm_dbs[MAIN_DBI];
int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 0 : 1; int toggle = ((char *)meta == PAGEDATA(env->me_map)) ? 0 : 1;
@ -8673,7 +8624,7 @@ int __cold mdbx_env_stat(MDB_env *env, MDBX_stat *arg, size_t bytes) {
if (unlikely(bytes != sizeof(MDBX_stat))) if (unlikely(bytes != sizeof(MDBX_stat)))
return EINVAL; return EINVAL;
meta = mdbx_meta_head_r(env); meta = mdbx_meta_head(env);
return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg); return mdbx_stat0(env, &meta->mm_dbs[MAIN_DBI], arg);
} }
@ -8694,14 +8645,14 @@ int __cold mdbx_env_info(MDB_env *env, MDBX_envinfo *arg, size_t bytes) {
m2 = METAPAGE_2(env); m2 = METAPAGE_2(env);
do { do {
meta = mdbx_meta_head_r(env); meta = mdbx_meta_head(env);
arg->me_last_txnid = meta->mm_txnid; arg->me_last_txnid = meta->mm_txnid;
arg->me_last_pgno = meta->mm_last_pg; arg->me_last_pgno = meta->mm_last_pg;
arg->me_meta1_txnid = m1->mm_txnid; arg->me_meta1_txnid = m1->mm_txnid;
arg->me_meta1_sign = m1->mm_datasync_sign; arg->me_meta1_sign = m1->mm_datasync_sign;
arg->me_meta2_txnid = m2->mm_txnid; arg->me_meta2_txnid = m2->mm_txnid;
arg->me_meta2_sign = m2->mm_datasync_sign; arg->me_meta2_sign = m2->mm_datasync_sign;
} while (unlikely(arg->me_last_txnid != env->me_txns->mti_txnid || } while (unlikely(arg->me_last_txnid != mdbx_meta_head(env)->mm_txnid ||
arg->me_meta1_sign != m1->mm_datasync_sign || arg->me_meta1_sign != m1->mm_datasync_sign ||
arg->me_meta2_sign != m2->mm_datasync_sign)); arg->me_meta2_sign != m2->mm_datasync_sign));
@ -9606,7 +9557,7 @@ static txnid_t __cold mdbx_oomkick(MDB_env *env, txnid_t oldest) {
continue; continue;
rc = env->me_oom_func(env, pid, tid, oldest, rc = env->me_oom_func(env, pid, tid, oldest,
mdbx_meta_head_w(env)->mm_txnid - oldest, retry); mdbx_meta_head(env)->mm_txnid - oldest, retry);
if (rc < 0) if (rc < 0)
break; break;
@ -9669,7 +9620,7 @@ int mdbx_txn_straggler(MDB_txn *txn, int *percent)
return -1; return -1;
env = txn->mt_env; env = txn->mt_env;
meta = mdbx_meta_head_r(env); meta = mdbx_meta_head(env);
if (percent) { if (percent) {
size_t maxpg = env->me_maxpg; size_t maxpg = env->me_maxpg;
size_t last = meta->mm_last_pg + 1; size_t last = meta->mm_last_pg + 1;