mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-31 10:58:20 +08:00
lmdb: rework workaround for potential ext3/ext4 corruption issue.
Reworked from branch 'mdb.master' of the OpenLDAP origin:
    8b6c425 2015-01-12 More cleanup for fdatasync hack
    ea89e3d 2015-01-11 Tweak conditionals for fdatasync hack
    462dc09 2015-01-08 fdatasync hack, again
    e86072a 2015-01-08 Revert "Note MDB_SAFE_FDATASYNC"
    293d6bb 2015-01-08 Note MDB_SAFE_FDATASYNC
    9585c01 2015-01-08 Simpler fdatasync hack
    0ef1e0b 2015-01-08 Revert "Fix prev commit for env_sync0"
Imported earlier, while forking ReOpenLDAP:
    985bbbb 2014-12-21 Fix prev commit for env_sync0
    0018eeb 2014-12-18 Hack for potential ext3/ext4 corruption issue
Change-Id: I187fd320620b9ced2e3773cac96f281ff65f97d4
This commit is contained in:
parent
f00d2cdef6
commit
3fd4f9cce0
1
Makefile
1
Makefile
@ -11,6 +11,7 @@
|
|||||||
# - MDB_USE_POSIX_SEM
|
# - MDB_USE_POSIX_SEM
|
||||||
# - MDB_DSYNC
|
# - MDB_DSYNC
|
||||||
# - MDB_FDATASYNC
|
# - MDB_FDATASYNC
|
||||||
|
# - MDB_FDATASYNC_WORKS
|
||||||
# - MDB_USE_PWRITEV
|
# - MDB_USE_PWRITEV
|
||||||
#
|
#
|
||||||
# There may be other macros in mdb.c of interest. You should
|
# There may be other macros in mdb.c of interest. You should
|
||||||
|
146
mdb.c
146
mdb.c
@ -79,6 +79,14 @@ extern int cacheflush(char *addr, int nbytes, int cache);
|
|||||||
#define CACHEFLUSH(addr, bytes, cache)
|
#define CACHEFLUSH(addr, bytes, cache)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux) && !defined(MDB_FDATASYNC_WORKS)
|
||||||
|
/** fdatasync is broken on ext3/ext4fs on older kernels, see
|
||||||
|
* description in #mdb_env_open2 comments. You can safely
|
||||||
|
* define MDB_FDATASYNC_WORKS if this code will only be run
|
||||||
|
* on kernels 3.6 and newer.
|
||||||
|
*/
|
||||||
|
# define FDATASYNC_MAYBE_BROKEN
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <errno.h>
|
#include <errno.h>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
@ -368,7 +376,6 @@ static int mdb_mutex_failed(MDB_env *env, mdb_mutex_t *mutex, int rc);
|
|||||||
*/
|
*/
|
||||||
#ifndef MDB_FDATASYNC
|
#ifndef MDB_FDATASYNC
|
||||||
# define MDB_FDATASYNC fdatasync
|
# define MDB_FDATASYNC fdatasync
|
||||||
# define HAVE_FDATASYNC 1
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef MDB_MSYNC
|
#ifndef MDB_MSYNC
|
||||||
@ -1142,6 +1149,10 @@ struct MDB_env {
|
|||||||
#define MDB_ENV_ACTIVE 0x20000000U
|
#define MDB_ENV_ACTIVE 0x20000000U
|
||||||
/** me_txkey is set */
|
/** me_txkey is set */
|
||||||
#define MDB_ENV_TXKEY 0x10000000U
|
#define MDB_ENV_TXKEY 0x10000000U
|
||||||
|
#ifdef FDATASYNC_MAYBE_BROKEN
|
||||||
|
/** fdatasync may be unreliable */
|
||||||
|
# define MDB_BROKEN_DATASYNC 0x08000000U
|
||||||
|
#endif /* FDATASYNC_MAYBE_BROKEN */
|
||||||
uint32_t me_flags; /**< @ref mdb_env */
|
uint32_t me_flags; /**< @ref mdb_env */
|
||||||
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
|
unsigned int me_psize; /**< DB page size, inited from me_os_psize */
|
||||||
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
|
unsigned int me_os_psize; /**< OS page size, from #GET_PAGESIZE */
|
||||||
@ -1158,7 +1169,6 @@ struct MDB_env {
|
|||||||
MDB_txn *me_txn; /**< current write transaction */
|
MDB_txn *me_txn; /**< current write transaction */
|
||||||
MDB_txn *me_txn0; /**< prealloc'd write transaction */
|
MDB_txn *me_txn0; /**< prealloc'd write transaction */
|
||||||
size_t me_mapsize; /**< size of the data memory map */
|
size_t me_mapsize; /**< size of the data memory map */
|
||||||
size_t me_size; /**< current file size */
|
|
||||||
pgno_t me_maxpg; /**< me_mapsize / me_psize */
|
pgno_t me_maxpg; /**< me_mapsize / me_psize */
|
||||||
MDB_dbx *me_dbxs; /**< array of static DB info */
|
MDB_dbx *me_dbxs; /**< array of static DB info */
|
||||||
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
|
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */
|
||||||
@ -1193,7 +1203,9 @@ struct MDB_env {
|
|||||||
MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
|
MDB_assert_func *me_assert_func; /**< Callback for assertion failures */
|
||||||
uint64_t me_sync_pending; /**< Total dirty/committed bytes since the last mdb_env_sync() */
|
uint64_t me_sync_pending; /**< Total dirty/committed bytes since the last mdb_env_sync() */
|
||||||
uint64_t me_sync_threshold; /**< Threshold of above to force synchronous flush */
|
uint64_t me_sync_threshold; /**< Threshold of above to force synchronous flush */
|
||||||
size_t me_sync_size; /**< Tracking me_size for FGREW/fsync() */
|
#ifdef FDATASYNC_MAYBE_BROKEN
|
||||||
|
size_t me_sync_size; /**< Tracking file size at last sync to decide when fsync() is needed */
|
||||||
|
#endif /* FDATASYNC_MAYBE_BROKEN */
|
||||||
MDB_oom_func *me_oom_func; /**< Callback for kicking laggard readers */
|
MDB_oom_func *me_oom_func; /**< Callback for kicking laggard readers */
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -2514,34 +2526,18 @@ fail:
|
|||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* internal env_sync flags: */
|
|
||||||
#define FORCE 1 /* as before, force a flush */
|
|
||||||
#define FGREW 0x8000 /* file has grown, do a full fsync instead of just
|
|
||||||
fdatasync. We shouldn't have to do this, according to the POSIX spec.
|
|
||||||
But common Linux FSs violate the spec and won't sync required metadata
|
|
||||||
correctly when the file grows. This only makes a difference if the
|
|
||||||
platform actually distinguishes fdatasync from fsync.
|
|
||||||
http://www.openldap.org/lists/openldap-devel/201411/msg00000.html */
|
|
||||||
|
|
||||||
static int
|
static int
|
||||||
mdb_env_sync0(MDB_env *env, unsigned int *flags)
|
mdb_env_sync0(MDB_env *env, int *force)
|
||||||
{
|
{
|
||||||
int rc = 0, force;
|
int rc = 0;
|
||||||
if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)
|
if (env->me_sync_threshold && env->me_sync_pending >= env->me_sync_threshold)
|
||||||
*flags |= FORCE;
|
*force = 1;
|
||||||
force = *flags & FORCE;
|
if (*force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
|
||||||
if (force || !F_ISSET(env->me_flags, MDB_NOSYNC)) {
|
|
||||||
if (env->me_sync_size != env->me_size)
|
|
||||||
*flags |= FGREW;
|
|
||||||
if (env->me_flags & MDB_WRITEMAP) {
|
if (env->me_flags & MDB_WRITEMAP) {
|
||||||
int mode = ((env->me_flags & MDB_MAPASYNC) && !force)
|
int mode = ((env->me_flags & MDB_MAPASYNC) && *force == 0) ? MS_ASYNC : MS_SYNC;
|
||||||
? MS_ASYNC : MS_SYNC;
|
|
||||||
|
|
||||||
/* LY: skip meta-pages */
|
|
||||||
size_t data_offset = env->me_os_psize;
|
|
||||||
while (data_offset < env->me_psize + env->me_psize)
|
|
||||||
data_offset += env->me_os_psize;
|
|
||||||
|
|
||||||
|
/* LY: skip meta-pages; they are synced explicitly later */
|
||||||
|
size_t data_offset = (env->me_psize * 2 + env->me_os_psize - 1) & ~(env->me_os_psize - 1);
|
||||||
if (MDB_MSYNC(env->me_map + data_offset, env->me_mapsize - data_offset, mode))
|
if (MDB_MSYNC(env->me_map + data_offset, env->me_mapsize - data_offset, mode))
|
||||||
rc = ErrCode();
|
rc = ErrCode();
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
@ -2549,20 +2545,22 @@ mdb_env_sync0(MDB_env *env, unsigned int *flags)
|
|||||||
rc = ErrCode();
|
rc = ErrCode();
|
||||||
#endif
|
#endif
|
||||||
} else {
|
} else {
|
||||||
#ifdef HAVE_FDATASYNC
|
/* (LY) TODO: sync_file_range() for data and later fdatasync() for meta,
|
||||||
if (*flags & FGREW) {
|
ALSO sync_file_range() needed before calling fsync().
|
||||||
if (fsync(env->me_fd)) /* Avoid ext-fs bugs, do full sync */
|
*/
|
||||||
|
#ifdef FDATASYNC_MAYBE_BROKEN
|
||||||
|
if (env->me_sync_size != env->me_mapsize && (env->me_flags & MDB_BROKEN_DATASYNC)) {
|
||||||
|
if (fsync(env->me_fd))
|
||||||
rc = ErrCode();
|
rc = ErrCode();
|
||||||
|
else
|
||||||
|
env->me_sync_size = env->me_mapsize;
|
||||||
} else
|
} else
|
||||||
#endif
|
#endif /* FDATASYNC_MAYBE_BROKEN */
|
||||||
if (MDB_FDATASYNC(env->me_fd))
|
if (MDB_FDATASYNC(env->me_fd))
|
||||||
rc = ErrCode();
|
rc = ErrCode();
|
||||||
}
|
}
|
||||||
if (! rc) {
|
if (! rc)
|
||||||
env->me_sync_pending = 0;
|
env->me_sync_pending = 0;
|
||||||
if (*flags & FGREW)
|
|
||||||
env->me_sync_size = env->me_size;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return rc;
|
return rc;
|
||||||
}
|
}
|
||||||
@ -2570,7 +2568,6 @@ mdb_env_sync0(MDB_env *env, unsigned int *flags)
|
|||||||
int
|
int
|
||||||
mdb_env_sync(MDB_env *env, int force)
|
mdb_env_sync(MDB_env *env, int force)
|
||||||
{
|
{
|
||||||
unsigned int flags = force ? FORCE | FGREW : 0;
|
|
||||||
MDB_meta *meta;
|
MDB_meta *meta;
|
||||||
txnid_t checkpoint;
|
txnid_t checkpoint;
|
||||||
int rc, lockfree_countdown = 3;
|
int rc, lockfree_countdown = 3;
|
||||||
@ -2590,7 +2587,7 @@ mdb_env_sync(MDB_env *env, int force)
|
|||||||
checkpoint = meta->mm_txnid;
|
checkpoint = meta->mm_txnid;
|
||||||
|
|
||||||
/* first sync data. */
|
/* first sync data. */
|
||||||
rc = mdb_env_sync0(env, &flags);
|
rc = mdb_env_sync0(env, &force);
|
||||||
|
|
||||||
/* then sync meta-pages. */
|
/* then sync meta-pages. */
|
||||||
if (rc == 0 && (env->me_flags & MDB_WRITEMAP)) {
|
if (rc == 0 && (env->me_flags & MDB_WRITEMAP)) {
|
||||||
@ -2598,7 +2595,7 @@ mdb_env_sync(MDB_env *env, int force)
|
|||||||
if (MDB_MSYNC(env->me_map, env->me_psize * 2, mode))
|
if (MDB_MSYNC(env->me_map, env->me_psize * 2, mode))
|
||||||
rc = ErrCode();
|
rc = ErrCode();
|
||||||
#ifdef _WIN32
|
#ifdef _WIN32
|
||||||
if (rc == 0 && mode == MS_SYNC && MDB_FDATASYNC(env->me_fd))
|
else if (mode == MS_SYNC && MDB_FDATASYNC(env->me_fd))
|
||||||
rc = ErrCode();
|
rc = ErrCode();
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
@ -3657,8 +3654,8 @@ done:
|
|||||||
int
|
int
|
||||||
mdb_txn_commit(MDB_txn *txn)
|
mdb_txn_commit(MDB_txn *txn)
|
||||||
{
|
{
|
||||||
int rc;
|
int rc, force = 0;
|
||||||
unsigned int i;
|
unsigned i;
|
||||||
MDB_env *env;
|
MDB_env *env;
|
||||||
|
|
||||||
if (txn == NULL || txn->mt_env == NULL)
|
if (txn == NULL || txn->mt_env == NULL)
|
||||||
@ -3862,16 +3859,9 @@ mdb_txn_commit(MDB_txn *txn)
|
|||||||
mdb_audit(txn);
|
mdb_audit(txn);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
i = 0;
|
|
||||||
#ifdef HAVE_FDATASYNC
|
|
||||||
if (txn->mt_next_pgno * env->me_psize > env->me_size) {
|
|
||||||
i |= FGREW;
|
|
||||||
env->me_size = txn->mt_next_pgno * env->me_psize;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
if ((rc = mdb_page_flush(txn, 0)) ||
|
if ((rc = mdb_page_flush(txn, 0)) ||
|
||||||
(rc = mdb_env_sync0(env, &i)) ||
|
(rc = mdb_env_sync0(env, &force)) ||
|
||||||
(rc = mdb_env_write_meta(txn, i != 0)))
|
(rc = mdb_env_write_meta(txn, force)))
|
||||||
goto fail;
|
goto fail;
|
||||||
|
|
||||||
/* Free P_LOOSE pages left behind in dirty_list */
|
/* Free P_LOOSE pages left behind in dirty_list */
|
||||||
@ -4372,6 +4362,11 @@ mdb_fsize(HANDLE fd, size_t *size)
|
|||||||
return MDB_SUCCESS;
|
return MDB_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef FDATASYNC_MAYBE_BROKEN
|
||||||
|
# include <sys/utsname.h>
|
||||||
|
# include <sys/vfs.h>
|
||||||
|
#endif /* FDATASYNC_MAYBE_BROKEN */
|
||||||
|
|
||||||
/** Further setup required for opening an LMDB environment
|
/** Further setup required for opening an LMDB environment
|
||||||
*/
|
*/
|
||||||
static int ESECT
|
static int ESECT
|
||||||
@ -4390,6 +4385,54 @@ mdb_env_open2(MDB_env *env)
|
|||||||
env->me_pidquery = PROCESS_QUERY_INFORMATION;
|
env->me_pidquery = PROCESS_QUERY_INFORMATION;
|
||||||
#endif /* _WIN32 */
|
#endif /* _WIN32 */
|
||||||
|
|
||||||
|
#ifdef FDATASYNC_MAYBE_BROKEN
|
||||||
|
/* ext3/ext4 fdatasync is broken on some older Linux kernels.
|
||||||
|
* https://lkml.org/lkml/2012/9/3/83
|
||||||
|
* Kernels after 3.6-rc6 are known good.
|
||||||
|
* https://lkml.org/lkml/2012/9/10/556
|
||||||
|
* See if the DB is on ext3/ext4, then check for a new enough kernel.
|
||||||
|
* Kernels 2.6.32.60, 2.6.34.15, 3.2.30, and 3.5.4 are also known
|
||||||
|
* to be patched.
|
||||||
|
*/
|
||||||
|
{
|
||||||
|
struct statfs st;
|
||||||
|
fstatfs(env->me_fd, &st);
|
||||||
|
while (st.f_type == 0xEF53) {
|
||||||
|
struct utsname uts;
|
||||||
|
int i;
|
||||||
|
uname(&uts);
|
||||||
|
if (uts.release[0] < '3') {
|
||||||
|
if (!strncmp(uts.release, "2.6.32.", 7)) {
|
||||||
|
i = atoi(uts.release+7);
|
||||||
|
if (i >= 60)
|
||||||
|
break; /* 2.6.32.60 and newer is OK */
|
||||||
|
} else if (!strncmp(uts.release, "2.6.34.", 7)) {
|
||||||
|
i = atoi(uts.release+7);
|
||||||
|
if (i >= 15)
|
||||||
|
break; /* 2.6.34.15 and newer is OK */
|
||||||
|
}
|
||||||
|
} else if (uts.release[0] == '3') {
|
||||||
|
i = atoi(uts.release+2);
|
||||||
|
if (i > 5)
|
||||||
|
break; /* 3.6 and newer is OK */
|
||||||
|
if (i == 5) {
|
||||||
|
i = atoi(uts.release+4);
|
||||||
|
if (i >= 4)
|
||||||
|
break; /* 3.5.4 and newer is OK */
|
||||||
|
} else if (i == 2) {
|
||||||
|
i = atoi(uts.release+4);
|
||||||
|
if (i >= 30)
|
||||||
|
break; /* 3.2.30 and newer is OK */
|
||||||
|
}
|
||||||
|
} else { /* 4.x and newer is OK */
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
env->me_flags |= MDB_BROKEN_DATASYNC;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif /* FDATASYNC_MAYBE_BROKEN */
|
||||||
|
|
||||||
if ((i = mdb_env_read_header(env, &meta)) != 0) {
|
if ((i = mdb_env_read_header(env, &meta)) != 0) {
|
||||||
if (i != ENOENT)
|
if (i != ENOENT)
|
||||||
return i;
|
return i;
|
||||||
@ -4433,10 +4476,6 @@ mdb_env_open2(MDB_env *env)
|
|||||||
newenv = 0;
|
newenv = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
rc = mdb_fsize(env->me_fd, &env->me_size);
|
|
||||||
if (rc)
|
|
||||||
return rc;
|
|
||||||
|
|
||||||
rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
|
rc = mdb_env_map(env, (flags & MDB_FIXEDMAP) ? meta.mm_address : NULL);
|
||||||
if (rc)
|
if (rc)
|
||||||
return rc;
|
return rc;
|
||||||
@ -4457,6 +4496,9 @@ mdb_env_open2(MDB_env *env)
|
|||||||
env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
|
env->me_maxkey = env->me_nodemax - (NODESIZE + sizeof(MDB_db));
|
||||||
#endif
|
#endif
|
||||||
env->me_maxpg = env->me_mapsize / env->me_psize;
|
env->me_maxpg = env->me_mapsize / env->me_psize;
|
||||||
|
#ifdef FDATASYNC_MAYBE_BROKEN
|
||||||
|
env->me_sync_size = env->me_mapsize;
|
||||||
|
#endif /* FDATASYNC_MAYBE_BROKEN */
|
||||||
|
|
||||||
#if MDB_DEBUG
|
#if MDB_DEBUG
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user