mdbx: refine mdb_env_sync0().

Change-Id: If27986d02b2ca19ba547fe8fa5c8f4a8b9019900
This commit is contained in:
Leo Yuriev 2016-04-20 19:42:53 +03:00
parent 78f1fbd038
commit bfd82b5830

54
mdb.c
View File

@ -1902,7 +1902,7 @@ mdb_meta_head_r(MDB_env *env) {
} else if (likely(b->mm_txnid == head_txnid)) { } else if (likely(b->mm_txnid == head_txnid)) {
h = b; h = b;
} else { } else {
/* LY: seems got a race with mdb_env_sync0() */ /* LY: seems got a collision with mdb_env_sync0() */
mdb_coherent_barrier(); mdb_coherent_barrier();
head_txnid = env->me_txns->mti_txnid; head_txnid = env->me_txns->mti_txnid;
mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0); mdb_assert(env, a->mm_txnid != b->mm_txnid || head_txnid == 0);
@ -2321,6 +2321,12 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags)
* utterly no-sync write mode was requested. In such case * utterly no-sync write mode was requested. In such case
* don't make a steady-sync, but only a legacy-mode checkpoint, * don't make a steady-sync, but only a legacy-mode checkpoint,
* just for resume reclaiming only, not for data consistency. */ * just for resume reclaiming only, not for data consistency. */
mdb_debug("kick-gc: head %zu/%c, tail %zu/%c, oldest %zu, txnid %zu",
head->mm_txnid, META_IS_WEAK(head) ? 'W' : 'N',
tail->mm_txnid, META_IS_WEAK(tail) ? 'W' : 'N',
oldest, env->me_txns->mt1.mtb.mtb_txnid );
int flags = env->me_flags & MDB_WRITEMAP; int flags = env->me_flags & MDB_WRITEMAP;
if ((env->me_flags & MDB_UTTERLY_NOSYNC) == MDB_UTTERLY_NOSYNC) if ((env->me_flags & MDB_UTTERLY_NOSYNC) == MDB_UTTERLY_NOSYNC)
flags |= MDB_UTTERLY_NOSYNC; flags |= MDB_UTTERLY_NOSYNC;
@ -4119,17 +4125,17 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
int rc; int rc;
MDB_meta* head = mdb_meta_head_w(env); MDB_meta* head = mdb_meta_head_w(env);
size_t prev_mapsize = head->mm_mapsize; size_t prev_mapsize = head->mm_mapsize;
MDB_meta* tail = META_IS_WEAK(head) ? head : mdb_env_meta_flipflop(env, head); volatile MDB_meta* target = META_IS_WEAK(head) ? head : mdb_env_meta_flipflop(env, head);
off_t offset = (char*) tail - env->me_map; off_t offset = (char*) target - env->me_map;
size_t used_size = env->me_psize * (pending->mm_last_pg + 1); size_t used_size = env->me_psize * (pending->mm_last_pg + 1);
mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0);
mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0
|| env->me_mapsize != prev_mapsize); || env->me_mapsize != prev_mapsize);
mdb_assert(env, pending->mm_txnid > head->mm_txnid || META_IS_WEAK(head)); mdb_assert(env, pending->mm_txnid > head->mm_txnid || META_IS_WEAK(head));
mdb_assert(env, pending->mm_txnid > tail->mm_txnid || META_IS_WEAK(tail)); mdb_assert(env, pending->mm_txnid > target->mm_txnid || META_IS_WEAK(target));
MDB_meta* stay = mdb_env_meta_flipflop(env, tail); MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target);
mdb_assert(env, pending->mm_txnid > stay->mm_txnid); mdb_assert(env, pending->mm_txnid > stay->mm_txnid);
pending->mm_mapsize = env->me_mapsize; pending->mm_mapsize = env->me_mapsize;
@ -4157,7 +4163,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
if ((flags & MDB_MAPASYNC) == 0) if ((flags & MDB_MAPASYNC) == 0)
env->me_sync_pending = 0; env->me_sync_pending = 0;
} else { } else {
int (*sync_fd)(int fd) = fdatasync; int (*flush)(int fd) = fdatasync;
if (unlikely(prev_mapsize != pending->mm_mapsize)) { if (unlikely(prev_mapsize != pending->mm_mapsize)) {
/* LY: It is no reason to use fdatasync() here, even in case /* LY: It is no reason to use fdatasync() here, even in case
* no such bug in a kernel. Because "no-bug" mean that a kernel * no such bug in a kernel. Because "no-bug" mean that a kernel
@ -4169,9 +4175,9 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
* *
* For more info about of a corresponding fdatasync() bug * For more info about of a corresponding fdatasync() bug
* see http://www.spinics.net/lists/linux-ext4/msg33714.html */ * see http://www.spinics.net/lists/linux-ext4/msg33714.html */
sync_fd = fsync; flush = fsync;
} }
while(unlikely(sync_fd(env->me_fd) < 0)) { while(unlikely(flush(env->me_fd) < 0)) {
rc = errno; rc = errno;
if (rc != EINTR) if (rc != EINTR)
goto undo; goto undo;
@ -4188,23 +4194,27 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
(flags & MDB_UTTERLY_NOSYNC) == MDB_UTTERLY_NOSYNC (flags & MDB_UTTERLY_NOSYNC) == MDB_UTTERLY_NOSYNC
? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK; ? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK;
} }
mdb_debug("writing meta page %d for root page %zu", mdb_debug("writing meta %d, root %zu, txn_id %zu, %s",
offset >= env->me_psize, pending->mm_dbs[MAIN_DBI].md_root); offset >= env->me_psize, pending->mm_dbs[MAIN_DBI].md_root,
pending->mm_txnid,
META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" : "Legacy" );
if (env->me_flags & MDB_WRITEMAP) { if (env->me_flags & MDB_WRITEMAP) {
#ifdef __SANITIZE_THREAD__ #ifdef __SANITIZE_THREAD__
pthread_mutex_lock(&tsan_mutex); pthread_mutex_lock(&tsan_mutex);
#endif #endif
tail->mm_datasync_sign = MDB_DATASIGN_WEAK; /* LY: 'invalidate' the meta,
tail->mm_txnid = 0; * but mdb_meta_head_r() will be confused/retired in collision case. */
mdb_coherent_barrier(); target->mm_datasync_sign = MDB_DATASIGN_WEAK;
tail->mm_mapsize = pending->mm_mapsize; target->mm_txnid = 0;
tail->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI]; /* LY: update info */
tail->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_mapsize = pending->mm_mapsize;
tail->mm_last_pg = pending->mm_last_pg; target->mm_dbs[FREE_DBI] = pending->mm_dbs[FREE_DBI];
/* (LY) ITS#7969: issue a memory barrier, it is noop for x86. */ target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI];
mdb_coherent_barrier(); target->mm_last_pg = pending->mm_last_pg;
tail->mm_txnid = pending->mm_txnid; /* LY: 'commit' the meta */
tail->mm_datasync_sign = pending->mm_datasync_sign; target->mm_txnid = pending->mm_txnid;
target->mm_datasync_sign = pending->mm_datasync_sign;
} else { } else {
pending->mm_magic = MDB_MAGIC; pending->mm_magic = MDB_MAGIC;
pending->mm_version = MDB_DATA_VERSION; pending->mm_version = MDB_DATA_VERSION;
@ -4220,7 +4230,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
mdb_debug("write failed, disk error?"); mdb_debug("write failed, disk error?");
/* On a failure, the pagecache still contains the new data. /* On a failure, the pagecache still contains the new data.
* Write some old data back, to prevent it from being used. */ * Write some old data back, to prevent it from being used. */
if (pwrite(env->me_fd, tail, sizeof(MDB_meta), offset) == sizeof(MDB_meta)) { if (pwrite(env->me_fd, (void*) target, sizeof(MDB_meta), offset) == sizeof(MDB_meta)) {
/* LY: take a chance, if write succeeds at a magic ;) */ /* LY: take a chance, if write succeeds at a magic ;) */
goto retry; goto retry;
} }