From 3fd4f9cce0d881eaecbc01bbf1650f52cc521171 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 7 Jan 2015 19:09:14 +0300 Subject: [PATCH] lmdb: rework workaround for potential ext3/ext4 corruption issue. Reworked from branch 'mdb.master' origin OpenLDAP: 8b6c425 2015-01-12 More cleanup for fdatasync hack ea89e3d 2015-01-11 Tweak conditionals for fdatasync hack 462dc09 2015-01-08 fdatasync hack, again e86072a 2015-01-08 Revert "Note MDB_SAFE_FDATASYNC" 293d6bb 2015-01-08 Note MDB_SAFE_FDATASYNC 9585c01 2015-01-08 Simpler fdatasync hack 0ef1e0b 2015-01-08 Revert "Fix prev commit for env_sync0" Imported early while forking ReOpenLDAP: 985bbbb 2014-12-21 Fix prev commit for env_sync0 0018eeb 2014-12-18 Hack for potential ext3/ext4 corruption issue Change-Id: I187fd320620b9ced2e3773cac96f281ff65f97d4 --- Makefile | 1 + mdb.c | 146 +++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 95 insertions(+), 52 deletions(-) diff --git a/Makefile b/Makefile index bca5cd38..2d0983ef 100644 --- a/Makefile +++ b/Makefile @@ -11,6 +11,7 @@ # - MDB_USE_POSIX_SEM # - MDB_DSYNC # - MDB_FDATASYNC +# - MDB_FDATASYNC_WORKS # - MDB_USE_PWRITEV # # There may be other macros in mdb.c of interest. You should diff --git a/mdb.c b/mdb.c index 22a92f9d..72716058 100644 --- a/mdb.c +++ b/mdb.c @@ -79,6 +79,14 @@ extern int cacheflush(char *addr, int nbytes, int cache); #define CACHEFLUSH(addr, bytes, cache) #endif +#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS) +/** fdatasync is broken on ext3/ext4fs on older kernels, see + * description in #mdb_env_open2 comments. You can safely + * define MDB_FDATASYNC_WORKS if this code will only be run + * on kernels 3.6 and newer. + */ +# define FDATASYNC_MAYBE_BROKEN +#endif #include #include @@ -368,7 +376,6 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t *mutex, int rc); */ #ifndef MDB_FDATASYNC # define MDB_FDATASYNC fdatasync -# define HAVE_FDATASYNC 1 #endif #ifndef MDB_MSYNC @@ -1142,6 +1149,10 @@ struct MDB_env { #define MDB_ENV_ACTIVE 0x20000000U /** me_txkey is set */ #define MDB_ENV_TXKEY 0x10000000U +#ifdef FDATASYNC_MAYBE_BROKEN + /** fdatasync may be unreliable */ +# define MDB_BROKEN_DATASYNC 0x08000000U +#endif /* FDATASYNC_MAYBE_BROKEN */ uint32_t me_flags; /**< @ref mdb_env */ unsigned int me_psize; /**< DB page size, inited from me_os_psize */ unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */ @@ -1158,7 +1169,6 @@ struct MDB_env { MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn0; /**< prealloc'd write transaction */ size_t me_mapsize; /**< size of the data memory map */ - size_t me_size; /**< current file size */ pgno_t me_maxpg; /**< me_mapsize / me_psize */ MDB_dbx *me_dbxs; /**< array of static DB info */ uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ @@ -1193,7 +1203,9 @@ struct MDB_env { MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ uint64_t me_sync_pending; /**< Total dirty/commited bytes since the last mdb_env_sync() */ uint64_t me_sync_threshold; /**< Treshold of above to force synchronous flush */ - size_t me_sync_size; /**< Tracking me_size for FGREW/fsync() */ +#ifdef FDATASYNC_MAYBE_BROKEN + size_t me_sync_size; /**< Tracking file size at last sync to decide when fsync() is needed */ +#endif /* FDATASYNC_MAYBE_BROKEN */ MDB_oom_func *me_oom_func; /**< Callback for kicking laggard readers */ }; @@ -2514,34 +2526,18 @@ fail: return rc; } -/* internal env_sync flags: */ -#define FORCE 1 /* as before, force a flush */ -#define FGREW 0x8000 /* file has grown, do a full fsync instead of just - fdatasync. We shouldn't have to do this, according to the POSIX spec. - But common Linux FSs violate the spec and won't sync required metadata - correctly when the file grows. This only makes a difference if the - platform actually distinguishes fdatasync from fsync. - http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */ - static int -mdb_env_sync0(MDB_env *env, unsigned int *flags) +mdb_env_sync0(MDB_env *env, int *force) { - int rc = 0, force; + int rc = 0; if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold) - *flags |= FORCE; - force = *flags & FORCE; - if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { - if (env->me_sync_size != env->me_size) - *flags |= FGREW; + *force = 1; + if (*force || !F_ISSET(env->me_flags, MDB_NOSYNC)) { if (env->me_flags & MDB_WRITEMAP) { - int mode = ((env->me_flags & MDB_MAPASYNC) && !force) - ? MS_ASYNC : MS_SYNC; - - /* LY: skip meta-pages */ - size_t data_offset = env->me_os_psize; - while (data_offset < env->me_psize + env->me_psize) - data_offset += env->me_os_psize; + int mode = ((env->me_flags & MDB_MAPASYNC) && *force == 0) ? MS_ASYNC : MS_SYNC; + /* LY: skip meta-pages, sync ones explicit later */ + size_t data_offset = (env->me_psize * 2 + env->me_os_psize - 1) & ~(env->me_os_psize - 1); if (MDB_MSYNC(env->me_map + data_offset, env->me_mapsize - data_offset, mode)) rc = ErrCode(); #ifdef _WIN32 @@ -2549,20 +2545,22 @@ mdb_env_sync0(MDB_env *env, unsigned int *flags) rc = ErrCode(); #endif } else { -#ifdef HAVE_FDATASYNC - if (*flags & FGREW) { - if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */ + /* (LY) TODO: sync_file_range() for data and later fdatasync() for meta, + ALSO sync_file_range() needed before calling fsync(). + */ +#ifdef FDATASYNC_MAYBE_BROKEN + if (env->me_sync_size != env->me_mapsize && (env->me_flags & MDB_BROKEN_DATASYNC)) { + if (fsync(env->me_fd)) rc = ErrCode(); + else + env->me_sync_size = env->me_mapsize; } else -#endif +#endif /* FDATASYNC_MAYBE_BROKEN */ if (MDB_FDATASYNC(env->me_fd)) rc = ErrCode(); } - if (! rc) { + if (! rc) env->me_sync_pending = 0; - if (*flags & FGREW) - env->me_sync_size = env->me_size; - } } return rc; } @@ -2570,7 +2568,6 @@ mdb_env_sync0(MDB_env *env, unsigned int *flags) int mdb_env_sync(MDB_env *env, int force) { - unsigned int flags = force ? FORCE | FGREW : 0; MDB_meta *meta; txnid_t checkpoint; int rc, lockfree_countdown = 3; @@ -2590,7 +2587,7 @@ mdb_env_sync(MDB_env *env, int force) checkpoint = meta->mm_txnid; /* first sync data. */ - rc = mdb_env_sync0(env, &flags); + rc = mdb_env_sync0(env, &force); /* then sync meta-pages. */ if (rc == 0 && (env->me_flags & MDB_WRITEMAP)) { @@ -2598,7 +2595,7 @@ mdb_env_sync(MDB_env *env, int force) if (MDB_MSYNC(env->me_map, env->me_psize * 2, mode)) rc = ErrCode(); #ifdef _WIN32 - if (rc == 0 && mode == MS_SYNC && MDB_FDATASYNC(env->me_fd)) + else if (mode == MS_SYNC && MDB_FDATASYNC(env->me_fd)) rc = ErrCode(); #endif } @@ -3657,8 +3654,8 @@ done: int mdb_txn_commit(MDB_txn *txn) { - int rc; - unsigned int i; + int rc, force = 0; + unsigned i; MDB_env *env; if (txn == NULL || txn->mt_env == NULL) @@ -3862,16 +3859,9 @@ mdb_txn_commit(MDB_txn *txn) mdb_audit(txn); #endif - i = 0; -#ifdef HAVE_FDATASYNC - if (txn->mt_next_pgno * env->me_psize > env->me_size) { - i |= FGREW; - env->me_size = txn->mt_next_pgno * env->me_psize; - } -#endif if ((rc = mdb_page_flush(txn, 0)) || - (rc = mdb_env_sync0(env, &i)) || - (rc = mdb_env_write_meta(txn, i != 0))) + (rc = mdb_env_sync0(env, &force)) || + (rc = mdb_env_write_meta(txn, force))) goto fail; /* Free P_LOOSE pages left behind in dirty_list */ @@ -4372,6 +4362,11 @@ mdb_fsize(HANDLE fd, size_t *size) return MDB_SUCCESS; } +#ifdef FDATASYNC_MAYBE_BROKEN +# include +# include +#endif /* FDATASYNC_MAYBE_BROKEN */ + /** Further setup required for opening an LMDB environment */ static int ESECT @@ -4390,6 +4385,54 @@ mdb_env_open2(MDB_env *env) env->me_pidquery = PROCESS_QUERY_INFORMATION; #endif /* _WIN32 */ +#ifdef FDATASYNC_MAYBE_BROKEN + /* ext3/ext4 fdatasync is broken on some older Linux kernels. + * https://lkml.org/lkml/2012/9/3/83 + * Kernels after 3.6-rc6 are known good. + * https://lkml.org/lkml/2012/9/10/556 + * See if the DB is on ext3/ext4, then check for new enough kernel + * Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known + * to be patched. + */ + { + struct statfs st; + fstatfs(env->me_fd, &st); + while (st.f_type == 0xEF53) { + struct utsname uts; + int i; + uname(&uts); + if (uts.release[0] < '3') { + if (!strncmp(uts.release, "2.6.32.", 7)) { + i = atoi(uts.release+7); + if (i >= 60) + break; /* 2.6.32.60 and newer is OK */ + } else if (!strncmp(uts.release, "2.6.34.", 7)) { + i = atoi(uts.release+7); + if (i >= 15) + break; /* 2.6.34.15 and newer is OK */ + } + } else if (uts.release[0] == '3') { + i = atoi(uts.release+2); + if (i > 5) + break; /* 3.6 and newer is OK */ + if (i == 5) { + i = atoi(uts.release+4); + if (i >= 4) + break; /* 3.5.4 and newer is OK */ + } else if (i == 2) { + i = atoi(uts.release+4); + if (i >= 30) + break; /* 3.2.30 and newer is OK */ + } + } else { /* 4.x and newer is OK */ + break; + } + env->me_flags |= MDB_BROKEN_DATASYNC; + break; + } + } +#endif /* FDATASYNC_MAYBE_BROKEN */ + if ((i = mdb_env_read_header(env, &meta)) != 0) { if (i != ENOENT) return i; @@ -4433,10 +4476,6 @@ mdb_env_open2(MDB_env *env) newenv = 0; } - rc = mdb_fsize(env->me_fd, &env->me_size); - if (rc) - return rc; - rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL); if (rc) return rc; @@ -4457,6 +4496,9 @@ mdb_env_open2(MDB_env *env) env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db)); #endif env->me_maxpg = env->me_mapsize / env->me_psize; +#ifdef FDATASYNC_MAYBE_BROKEN + env->me_sync_size = env->me_mapsize; +#endif /* FDATASYNC_MAYBE_BROKEN */ #if MDB_DEBUG {