mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-02 00:34:13 +08:00
lmdb: weak/steady for meta-pages.
This is 5/9 for https://github.com/ReOpen/ReOpenLDAP/issues/1 and https://github.com/ReOpen/ReOpenLDAP/issues/2 Change-Id: Ica2dbe0bfd6ba58c00de161e2cd50594ee39c44d
This commit is contained in:
parent
a283d782f6
commit
312135169f
8
lmdb.h
8
lmdb.h
@ -725,8 +725,14 @@ int mdb_env_sync(MDB_env *env, int force);
|
||||
* use any such handles after calling this function will cause a SIGSEGV.
|
||||
* The environment handle will be freed and must not be used again after this call.
|
||||
* @param[in] env An environment handle returned by #mdb_env_create()
|
||||
* @param[in] dont_sync A dont'sync flag, if non-zero the last checkpoint
|
||||
* (meta-page update) will be kept "as is" and may be still "weak"
|
||||
* in NOSYNC/MAPASYNC modes. Such "weak" checkpoint will be ignored
|
||||
* on opening next time, and transactions since the last non-weak
|
||||
* checkpoint (meta-page update) will rolledback for consistency guarantee.
|
||||
*/
|
||||
void mdb_env_close(MDB_env *env);
|
||||
void mdb_env_close_ex(MDB_env *env, int dont_sync);
|
||||
#define mdb_env_close(env) mdb_env_close_ex(env, 0)
|
||||
|
||||
/** @brief Set environment flags.
|
||||
*
|
||||
|
78
mdb.c
78
mdb.c
@ -826,6 +826,11 @@ typedef struct MDB_meta {
|
||||
#define mm_flags mm_dbs[0].md_flags
|
||||
pgno_t mm_last_pg; /**< last used page in file */
|
||||
volatile txnid_t mm_txnid; /**< txnid that committed this page */
|
||||
#define MDB_DATASIGN_NONE 0
|
||||
#define MDB_DATASIGN_WEAK 1
|
||||
volatile uint64_t mm_datasync_sign;
|
||||
#define META_IS_WEAK(meta) ((meta)->mm_datasync_sign == MDB_DATASIGN_WEAK)
|
||||
#define META_IS_STEADY(meta) ((meta)->mm_datasync_sign > MDB_DATASIGN_WEAK)
|
||||
} MDB_meta;
|
||||
|
||||
/** Buffer for a stack-allocated meta page.
|
||||
@ -1873,6 +1878,19 @@ bailout:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static uint64_t mdb_meta_sign(MDB_meta *meta) {
|
||||
uint64_t sign = MDB_DATASIGN_NONE;
|
||||
#if 0 /* TODO */
|
||||
sign = hippeus_hash64(
|
||||
&target->mm_mapsize,
|
||||
sizeof(MDB_meta) - offsetof(MDB_meta, mm_mapsize),
|
||||
meta->mm_version | (uint64_t) MDB_MAGIC << 32
|
||||
);
|
||||
#endif
|
||||
/* LY: newer returns MDB_DATASIGN_NONE or MDB_DATASIGN_WEAK */
|
||||
return (sign > MDB_DATASIGN_WEAK) ? sign : ~sign;
|
||||
}
|
||||
|
||||
/** Check both meta pages to see which one is newer.
|
||||
* @param[in] env the environment handle
|
||||
* @return pointer to last meta-page.
|
||||
@ -1896,6 +1914,13 @@ mdb_find_oldest(MDB_env *env, int *laggard)
|
||||
MDB_reader *r = env->me_txns->mti_readers;
|
||||
txnid_t oldest = env->me_txns->mti_txnid;
|
||||
|
||||
MDB_meta* a = METAPAGE_1(env);
|
||||
MDB_meta* b = METAPAGE_2(env);
|
||||
if (META_IS_WEAK(a) && oldest > b->mm_txnid)
|
||||
oldest = b->mm_txnid;
|
||||
if (META_IS_WEAK(b) && oldest > a->mm_txnid)
|
||||
oldest = a->mm_txnid;
|
||||
|
||||
for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0; ) {
|
||||
if (r[i].mr_pid) {
|
||||
txnid_t snap = r[i].mr_txnid;
|
||||
@ -2488,7 +2513,8 @@ mdb_env_sync(MDB_env *env, int force)
|
||||
head = mdb_env_meta_head(env);
|
||||
}
|
||||
|
||||
if (env->me_sync_pending == 0 && env->me_mapsize == head->mm_mapsize)
|
||||
if (! META_IS_WEAK(head) && env->me_sync_pending == 0
|
||||
&& env->me_mapsize == head->mm_mapsize)
|
||||
/* LY: nothing to do */
|
||||
return MDB_SUCCESS;
|
||||
|
||||
@ -2500,7 +2526,8 @@ mdb_env_sync(MDB_env *env, int force)
|
||||
/* LY: head may be changed while the mutex has been acquired. */
|
||||
head = mdb_env_meta_head(env);
|
||||
rc = MDB_SUCCESS;
|
||||
if (env->me_sync_pending || env->me_mapsize != head->mm_mapsize) {
|
||||
if (META_IS_WEAK(head) || env->me_sync_pending != 0
|
||||
|| env->me_mapsize != head->mm_mapsize) {
|
||||
MDB_meta meta = *head;
|
||||
rc = mdb_env_sync0(env, flags, &meta);
|
||||
}
|
||||
@ -2694,7 +2721,10 @@ mdb_txn_renew0(MDB_txn *txn)
|
||||
meta = mdb_env_meta_head(env);
|
||||
r->mr_txnid = meta->mm_txnid;
|
||||
mdb_coherent_barrier();
|
||||
memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
|
||||
txn->mt_next_pgno = meta->mm_last_pg+1;
|
||||
} while(unlikely(r->mr_txnid != env->me_txns->mti_txnid));
|
||||
|
||||
txn->mt_txnid = r->mr_txnid;
|
||||
txn->mt_u.reader = r;
|
||||
} else {
|
||||
@ -2732,14 +2762,10 @@ mdb_txn_renew0(MDB_txn *txn)
|
||||
txn->mt_lifo_reclaimed[0] = 0;
|
||||
env->me_txn = txn;
|
||||
memcpy(txn->mt_dbiseqs, env->me_dbiseqs, env->me_maxdbs * sizeof(unsigned));
|
||||
memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
|
||||
txn->mt_next_pgno = meta->mm_last_pg+1;
|
||||
}
|
||||
|
||||
/* Copy the DB info and flags */
|
||||
memcpy(txn->mt_dbs, meta->mm_dbs, 2 * sizeof(MDB_db));
|
||||
|
||||
/* Moved to here to avoid a data race in read TXNs */
|
||||
txn->mt_next_pgno = meta->mm_last_pg+1;
|
||||
|
||||
for (i=2; i<txn->mt_numdbs; i++) {
|
||||
x = env->me_dbflags[i];
|
||||
txn->mt_dbs[i].md_flags = x & PERSISTENT_FLAGS;
|
||||
@ -3803,6 +3829,7 @@ mdb_env_init_meta0(MDB_env *env, MDB_meta *meta)
|
||||
meta->mm_flags |= MDB_INTEGERKEY;
|
||||
meta->mm_dbs[0].md_root = P_INVALID;
|
||||
meta->mm_dbs[1].md_root = P_INVALID;
|
||||
meta->mm_datasync_sign = mdb_meta_sign(meta);
|
||||
}
|
||||
|
||||
/** Write the environment parameters of a freshly created DB environment.
|
||||
@ -3851,17 +3878,23 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
|
||||
{
|
||||
int rc;
|
||||
MDB_meta* head = mdb_env_meta_head(env);
|
||||
MDB_meta* tail = mdb_env_meta_flipflop(env, head);
|
||||
size_t prev_mapsize = head->mm_mapsize;
|
||||
MDB_meta* tail = META_IS_WEAK(head) ? head : mdb_env_meta_flipflop(env, head);
|
||||
off_t offset = (char*) tail - env->me_map;
|
||||
|
||||
mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0);
|
||||
mdb_assert(env, env->me_sync_pending != 0 || env->me_mapsize != head->mm_mapsize);
|
||||
mdb_assert(env, pending->mm_txnid > head->mm_txnid);
|
||||
mdb_assert(env, pending->mm_txnid > tail->mm_txnid);
|
||||
mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0
|
||||
|| env->me_mapsize != prev_mapsize);
|
||||
mdb_assert(env, pending->mm_txnid > head->mm_txnid
|
||||
|| (pending->mm_txnid == head->mm_txnid && META_IS_WEAK(head)));
|
||||
mdb_assert(env, pending->mm_txnid > tail->mm_txnid || META_IS_WEAK(tail));
|
||||
|
||||
MDB_meta* stay = mdb_env_meta_flipflop(env, tail);
|
||||
mdb_assert(env, pending->mm_txnid > stay->mm_txnid);
|
||||
|
||||
pending->mm_mapsize = env->me_mapsize;
|
||||
if (unlikely(pending->mm_mapsize != head->mm_mapsize)) {
|
||||
if (pending->mm_mapsize < head->mm_mapsize) {
|
||||
if (unlikely(pending->mm_mapsize != prev_mapsize)) {
|
||||
if (pending->mm_mapsize < prev_mapsize) {
|
||||
/* LY: currently this can't happen, but force full-sync. */
|
||||
flags &= MDB_WRITEMAP;
|
||||
} else {
|
||||
@ -3884,7 +3917,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
|
||||
env->me_sync_pending = 0;
|
||||
} else {
|
||||
int (*sync_fd)(int fd) = fdatasync;
|
||||
if (unlikely(head->mm_mapsize != pending->mm_mapsize)) {
|
||||
if (unlikely(prev_mapsize != pending->mm_mapsize)) {
|
||||
/* LY: It is no reason to use fdatasync() here, even in case
|
||||
* no such bug in a kernel. Because "no-bug" mean that a kernel
|
||||
* internally do nearly the same.
|
||||
@ -3906,9 +3939,14 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
|
||||
}
|
||||
|
||||
/* LY: step#2 - update meta-page. */
|
||||
pending->mm_datasync_sign = env->me_sync_pending
|
||||
? MDB_DATASIGN_WEAK : mdb_meta_sign(pending);
|
||||
mdb_debug("writing meta page %d for root page %zu",
|
||||
offset >= env->me_psize, pending->mm_dbs[MAIN_DBI].md_root);
|
||||
if (env->me_flags & MDB_WRITEMAP) {
|
||||
tail->mm_datasync_sign = MDB_DATASIGN_WEAK;
|
||||
tail->mm_txnid = 0;
|
||||
mdb_coherent_barrier();
|
||||
tail->mm_mapsize = pending->mm_mapsize;
|
||||
tail->mm_dbs[0] = pending->mm_dbs[0];
|
||||
tail->mm_dbs[1] = pending->mm_dbs[1];
|
||||
@ -3916,6 +3954,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
|
||||
/* (LY) ITS#7969: issue a memory barrier, it is noop for x86. */
|
||||
mdb_coherent_barrier();
|
||||
tail->mm_txnid = pending->mm_txnid;
|
||||
tail->mm_datasync_sign = pending->mm_datasync_sign;
|
||||
} else {
|
||||
pending->mm_magic = MDB_MAGIC;
|
||||
pending->mm_version = MDB_DATA_VERSION;
|
||||
@ -3959,9 +3998,9 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
|
||||
}
|
||||
|
||||
/* LY: currently this can't happen, but... */
|
||||
if (unlikely(pending->mm_mapsize < head->mm_mapsize)) {
|
||||
if (unlikely(pending->mm_mapsize < prev_mapsize)) {
|
||||
mdb_assert(env, pending->mm_mapsize == env->me_mapsize);
|
||||
if (unlikely(mremap(env->me_map, head->mm_mapsize, pending->mm_mapsize,
|
||||
if (unlikely(mremap(env->me_map, prev_mapsize, pending->mm_mapsize,
|
||||
MREMAP_FIXED, pending->mm_address) == MAP_FAILED)) {
|
||||
rc = errno;
|
||||
goto fail;
|
||||
@ -4673,13 +4712,16 @@ mdb_env_close0(MDB_env *env)
|
||||
}
|
||||
|
||||
void ESECT
|
||||
mdb_env_close(MDB_env *env)
|
||||
mdb_env_close_ex(MDB_env *env, int dont_sync)
|
||||
{
|
||||
MDB_page *dp;
|
||||
|
||||
if (env == NULL)
|
||||
return;
|
||||
|
||||
if (! dont_sync)
|
||||
mdb_env_sync(env, 1);
|
||||
|
||||
VALGRIND_DESTROY_MEMPOOL(env);
|
||||
while ((dp = env->me_dpages) != NULL) {
|
||||
VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next));
|
||||
|
Loading…
x
Reference in New Issue
Block a user