lmdb: drop no-lock operation mode (MDB_NOLOCK is now UNSUPPORTED).

This is 3/9 for https://github.com/ReOpen/ReOpenLDAP/issues/1
and https://github.com/ReOpen/ReOpenLDAP/issues/2

Change-Id: I7cd5d90c41424d6635accbb10c5801adeb1087e9
This commit is contained in:
Leo Yuriev 2015-05-12 18:29:07 +03:00
parent eec1ccaa77
commit 1c37139b2a
2 changed files with 141 additions and 179 deletions

5
lmdb.h
View File

@ -277,8 +277,9 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel
#define MDB_MAPASYNC 0x100000 #define MDB_MAPASYNC 0x100000
/** tie reader locktable slots to #MDB_txn objects instead of to threads */ /** tie reader locktable slots to #MDB_txn objects instead of to threads */
#define MDB_NOTLS 0x200000 #define MDB_NOTLS 0x200000
/** don't do any locking, caller must manage their own locks */ /** don't do any locking, caller must manage their own locks
#define MDB_NOLOCK 0x400000 * WARNING: ReOpenLDAP doesn't support this mode. */
#define MDB_NOLOCK__UNSUPPORTED 0x400000
/** don't do readahead */ /** don't do readahead */
#define MDB_NORDAHEAD 0x800000 #define MDB_NORDAHEAD 0x800000
/** don't initialize malloc'd memory before writing to datafile */ /** don't initialize malloc'd memory before writing to datafile */

315
mdb.c
View File

@ -1070,13 +1070,13 @@ struct MDB_env {
unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */
unsigned me_maxreaders; /**< size of the reader table */ unsigned me_maxreaders; /**< size of the reader table */
/** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */
volatile int me_close_readers; int me_close_readers;
MDB_dbi me_numdbs; /**< number of DBs opened */ MDB_dbi me_numdbs; /**< number of DBs opened */
MDB_dbi me_maxdbs; /**< size of the DB table */ MDB_dbi me_maxdbs; /**< size of the DB table */
pid_t me_pid; /**< process ID of this env */ pid_t me_pid; /**< process ID of this env */
char *me_path; /**< path to the DB files */ char *me_path; /**< path to the DB files */
char *me_map; /**< the memory map of the data file */ char *me_map; /**< the memory map of the data file */
MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ MDB_txninfo *me_txns; /**< the memory map of the lock file */
MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ MDB_meta *me_metas[2]; /**< pointers to the two meta pages */
void *me_pbuf; /**< scratch area for DUPSORT put() */ void *me_pbuf; /**< scratch area for DUPSORT put() */
MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn; /**< current write transaction */
@ -1913,57 +1913,35 @@ done:
return rc; return rc;
} }
/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ /** Find oldest txnid still referenced. */
static txnid_t static txnid_t
mdb_find_oldest(MDB_txn *txn) mdb_find_oldest(MDB_env *env, int *laggard)
{ {
int i; int i, reader;
txnid_t mr, oldest = txn->mt_txnid - 1; MDB_reader *r = env->me_txns->mti_readers;
if (txn->mt_env->me_txns) { txnid_t oldest = env->me_txns->mti_txnid;
MDB_reader *r = txn->mt_env->me_txns->mti_readers;
for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0; ) {
if (r[i].mr_pid) { if (r[i].mr_pid) {
mr = r[i].mr_txnid; txnid_t snap = r[i].mr_txnid;
if (oldest > mr) if (oldest > snap) {
oldest = mr; oldest = snap;
reader = i;
} }
} }
} }
return oldest;
}
static txnid_t
mdb_laggard_reader(MDB_env *env, int *laggard)
{
txnid_t tail = 0;
if (laggard) if (laggard)
*laggard = -1; *laggard = reader;
if (env->me_txns->mti_txnid > 1) { return oldest;
int i;
MDB_reader *r = env->me_txns->mti_readers;
tail = env->me_txns->mti_txnid - 1;
for (i = env->me_txns->mti_numreaders; --i >= 0; ) {
if (r[i].mr_pid) {
txnid_t mr = r[i].mr_txnid;
if (tail > mr) {
tail = mr;
if (laggard)
*laggard = i;
}
}
}
}
return tail;
} }
static int static int
mdb_oomkick_laggard(MDB_env *env) mdb_oomkick_laggard(MDB_env *env)
{ {
int idx, retry; int reader, retry;
txnid_t snap, tail = mdb_laggard_reader(env, &idx); txnid_t snap, oldest = mdb_find_oldest(env, &reader);
if (idx < 0) if (reader < 0)
return 0; return 0;
for(retry = 0; ; ++retry) { for(retry = 0; ; ++retry) {
@ -1975,21 +1953,21 @@ mdb_oomkick_laggard(MDB_env *env)
if (mdb_reader_check(env, NULL)) if (mdb_reader_check(env, NULL))
break; break;
snap = mdb_laggard_reader(env, NULL); snap = mdb_find_oldest(env, NULL);
if (tail < snap) if (oldest < snap)
return 1; return 1;
if (!env->me_oom_func) if (!env->me_oom_func)
break; break;
r = &env->me_txns->mti_readers[ idx ]; r = &env->me_txns->mti_readers[ reader ];
pid = r->mr_pid; pid = r->mr_pid;
tid = r->mr_tid; tid = r->mr_tid;
if (r->mr_txnid != tail || pid <= 0) if (r->mr_txnid != oldest || pid <= 0)
continue; continue;
rc = env->me_oom_func(env, pid, (void*) tid, tail, rc = env->me_oom_func(env, pid, (void*) tid, oldest,
env->me_metas[ mdb_env_pick_meta(env) ]->mm_txnid - tail, retry); env->me_metas[ mdb_env_pick_meta(env) ]->mm_txnid - oldest, retry);
if (rc < 0) if (rc < 0)
break; break;
@ -2003,8 +1981,8 @@ mdb_oomkick_laggard(MDB_env *env)
} }
} }
snap = mdb_laggard_reader(env, NULL); snap = mdb_find_oldest(env, NULL);
return tail < snap; return oldest < snap;
} }
/** Add a page to the txn's dirty list */ /** Add a page to the txn's dirty list */
@ -2121,7 +2099,7 @@ oomkick_retry:;
last = env->me_pglast - 1; last = env->me_pglast - 1;
op = MDB_SET_RANGE; op = MDB_SET_RANGE;
} else { } else {
oldest = mdb_find_oldest(txn); oldest = mdb_find_oldest(env, NULL);
env->me_pgoldest = oldest; env->me_pgoldest = oldest;
found_old = 1; found_old = 1;
/* Begin from oldest reader if any */ /* Begin from oldest reader if any */
@ -2144,7 +2122,7 @@ oomkick_retry:;
/* Do not fetch more if the record will be too recent */ /* Do not fetch more if the record will be too recent */
if (op != MDB_FIRST && ++last >= oldest) { if (op != MDB_FIRST && ++last >= oldest) {
if (!found_old) { if (!found_old) {
oldest = mdb_find_oldest(txn); oldest = mdb_find_oldest(env, NULL);
env->me_pgoldest = oldest; env->me_pgoldest = oldest;
found_old = 1; found_old = 1;
} }
@ -2157,7 +2135,7 @@ oomkick_retry:;
if (rc == MDB_NOTFOUND && lifo) { if (rc == MDB_NOTFOUND && lifo) {
if (op == MDB_SET_RANGE) if (op == MDB_SET_RANGE)
continue; continue;
env->me_pgoldest = mdb_find_oldest(txn); env->me_pgoldest = mdb_find_oldest(env, NULL);
found_old = 1; found_old = 1;
if (oldest < env->me_pgoldest) { if (oldest < env->me_pgoldest) {
oldest = env->me_pgoldest; oldest = env->me_pgoldest;
@ -2177,7 +2155,7 @@ oomkick_retry:;
last = *(txnid_t*)key.mv_data; last = *(txnid_t*)key.mv_data;
if (oldest <= last) { if (oldest <= last) {
if (!found_old) { if (!found_old) {
oldest = mdb_find_oldest(txn); oldest = mdb_find_oldest(env, NULL);
env->me_pgoldest = oldest; env->me_pgoldest = oldest;
found_old = 1; found_old = 1;
} }
@ -2693,7 +2671,6 @@ static int
mdb_txn_renew0(MDB_txn *txn) mdb_txn_renew0(MDB_txn *txn)
{ {
MDB_env *env = txn->mt_env; MDB_env *env = txn->mt_env;
MDB_txninfo *ti = env->me_txns;
MDB_meta *meta; MDB_meta *meta;
unsigned i, nr; unsigned i, nr;
uint16_t x; uint16_t x;
@ -2705,84 +2682,78 @@ mdb_txn_renew0(MDB_txn *txn)
/* Setup db info */ /* Setup db info */
txn->mt_numdbs = env->me_numdbs; txn->mt_numdbs = env->me_numdbs;
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
if (!ti) {
meta = env->me_metas[ mdb_env_pick_meta(env) ]; MDB_reader *r = (env->me_flags & MDB_NOTLS)
txn->mt_txnid = meta->mm_txnid; ? txn->mt_u.reader : pthread_getspecific(env->me_txkey);
txn->mt_u.reader = NULL;
if (likely(r)) {
if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1))
return MDB_BAD_RSLOT;
} else { } else {
MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : pid_t pid = env->me_pid;
pthread_getspecific(env->me_txkey); pthread_t tid = pthread_self();
if (r) { pthread_mutex_t *rmutex = MDB_MUTEX(env, r);
if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)
return MDB_BAD_RSLOT;
} else {
pid_t pid = env->me_pid;
pthread_t tid = pthread_self();
pthread_mutex_t *rmutex = MDB_MUTEX(env, r);
if (!env->me_live_reader) { if (unlikely(!env->me_live_reader)) {
rc = mdb_reader_pid(env, F_SETLK, pid); rc = mdb_reader_pid(env, F_SETLK, pid);
if (rc) if (unlikely(rc != MDB_SUCCESS))
return rc;
env->me_live_reader = 1;
}
rc = mdb_mutex_lock(env, rmutex);
if (unlikely(rc))
return rc; return rc;
nr = ti->mti_numreaders; env->me_live_reader = 1;
for (i=0; i<nr; i++)
if (ti->mti_readers[i].mr_pid == 0)
break;
if (i == env->me_maxreaders) {
mdb_mutex_unlock(env, rmutex);
return MDB_READERS_FULL;
}
r = &ti->mti_readers[i];
/* Claim the reader slot, carefully since other code
* uses the reader table un-mutexed: First reset the
* slot, next publish it in mti_numreaders. After
* that, it is safe for mdb_env_close() to touch it.
* When it will be closed, we can finally claim it.
*/
r->mr_pid = 0;
r->mr_txnid = (txnid_t)-1;
r->mr_tid = tid;
mdb_coherent_barrier();
if (i == nr)
ti->mti_numreaders = ++nr;
env->me_close_readers = nr;
r->mr_pid = pid;
mdb_mutex_unlock(env, rmutex);
new_notls = (env->me_flags & MDB_NOTLS);
if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
r->mr_pid = 0;
mdb_coherent_barrier();
return rc;
}
} }
do { /* LY: Retry on a race, ITS#7970. */
r->mr_txnid = ti->mti_txnid; rc = mdb_mutex_lock(env, rmutex);
mdb_coherent_barrier(); if (unlikely(rc != MDB_SUCCESS))
} while(r->mr_txnid != ti->mti_txnid); return rc;
txn->mt_txnid = r->mr_txnid; nr = env->me_txns->mti_numreaders;
txn->mt_u.reader = r; for (i=0; i<nr; i++)
if (env->me_txns->mti_readers[i].mr_pid == 0)
break;
if (unlikely(i == env->me_maxreaders)) {
mdb_mutex_unlock(env, rmutex);
return MDB_READERS_FULL;
}
r = &env->me_txns->mti_readers[i];
/* Claim the reader slot, carefully since other code
* uses the reader table un-mutexed: First reset the
* slot, next publish it in mti_numreaders. After
* that, it is safe for mdb_env_close() to touch it.
* When it will be closed, we can finally claim it.
*/
r->mr_pid = 0;
r->mr_txnid = (txnid_t)-1;
r->mr_tid = tid;
mdb_coherent_barrier(); mdb_coherent_barrier();
meta = env->me_metas[txn->mt_txnid & 1]; if (i == nr)
env->me_txns->mti_numreaders = ++nr;
if (env->me_close_readers < nr)
env->me_close_readers = nr;
r->mr_pid = pid;
mdb_mutex_unlock(env, rmutex);
new_notls = (env->me_flags & MDB_NOTLS);
if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) {
r->mr_pid = 0;
mdb_coherent_barrier();
return rc;
}
} }
do { /* LY: Retry on a race, ITS#7970. */
meta = env->me_metas[ mdb_env_pick_meta(env) ];
r->mr_txnid = meta->mm_txnid;
mdb_coherent_barrier();
} while(unlikely(r->mr_txnid != env->me_txns->mti_txnid));
txn->mt_txnid = r->mr_txnid;
txn->mt_u.reader = r;
} else { } else {
/* Not yet touching txn == env->me_txn0, it may be active */ /* Not yet touching txn == env->me_txn0, it may be active */
if (ti) { rc = mdb_mutex_lock(env, MDB_MUTEX(env, w));
rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); if (unlikely(rc))
if (unlikely(rc)) return rc;
return rc;
txn->mt_txnid = ti->mti_txnid; meta = env->me_metas[ mdb_env_pick_meta(env) ];
meta = env->me_metas[txn->mt_txnid & 1]; txn->mt_txnid = meta->mm_txnid;
} else {
meta = env->me_metas[ mdb_env_pick_meta(env) ];
txn->mt_txnid = meta->mm_txnid;
}
/* Setup db info */ /* Setup db info */
txn->mt_numdbs = env->me_numdbs; txn->mt_numdbs = env->me_numdbs;
txn->mt_txnid++; txn->mt_txnid++;
@ -3089,8 +3060,7 @@ mdb_txn_reset0(MDB_txn *txn, const char *act_)
env->me_txn = NULL; env->me_txn = NULL;
/* The writer mutex was locked in mdb_txn_begin. */ /* The writer mutex was locked in mdb_txn_begin. */
if (env->me_txns) mdb_mutex_unlock(env, MDB_MUTEX(env, w));
mdb_mutex_unlock(env, MDB_MUTEX(env, w));
} else { } else {
txn->mt_parent->mt_child = NULL; txn->mt_parent->mt_child = NULL;
env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate;
@ -3790,8 +3760,7 @@ done:
env->me_txn = NULL; env->me_txn = NULL;
mdb_dbis_update(txn, 1); mdb_dbis_update(txn, 1);
if (env->me_txns) mdb_mutex_unlock(env, MDB_MUTEX(env, w));
mdb_mutex_unlock(env, MDB_MUTEX(env, w));
if (txn != env->me_txn0) if (txn != env->me_txn0)
free(txn); free(txn);
@ -4019,9 +3988,7 @@ done:
* readers will get consistent data regardless of how fresh or * readers will get consistent data regardless of how fresh or
* how stale their view of these values is. * how stale their view of these values is.
*/ */
if (env->me_txns) env->me_txns->mti_txnid = txn->mt_txnid;
env->me_txns->mti_txnid = txn->mt_txnid;
return MDB_SUCCESS; return MDB_SUCCESS;
} }
@ -4606,7 +4573,7 @@ fail:
#define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC| \ #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC| \
MDB_NOMEMINIT|MDB_COALESCE) MDB_NOMEMINIT|MDB_COALESCE)
#define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \
MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_LIFORECLAIM) MDB_WRITEMAP|MDB_NOTLS|MDB_NORDAHEAD|MDB_LIFORECLAIM)
#if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS)
# error "Persistent DB flags & env flags overlap, but both go in mm_flags" # error "Persistent DB flags & env flags overlap, but both go in mm_flags"
@ -4664,7 +4631,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode)
} }
/* For RDONLY, get lockfile after we know datafile exists */ /* For RDONLY, get lockfile after we know datafile exists */
if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { if (!(flags & MDB_RDONLY)) {
rc = mdb_env_setup_locks(env, lpath, mode, &excl); rc = mdb_env_setup_locks(env, lpath, mode, &excl);
if (rc) if (rc)
goto leave; goto leave;
@ -4681,7 +4648,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode)
goto leave; goto leave;
} }
if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { if (flags & MDB_RDONLY) {
rc = mdb_env_setup_locks(env, lpath, mode, &excl); rc = mdb_env_setup_locks(env, lpath, mode, &excl);
if (rc) if (rc)
goto leave; goto leave;
@ -4775,21 +4742,21 @@ mdb_env_close0(MDB_env *env)
(void) close(env->me_mfd); (void) close(env->me_mfd);
if (env->me_fd != INVALID_HANDLE_VALUE) if (env->me_fd != INVALID_HANDLE_VALUE)
(void) close(env->me_fd); (void) close(env->me_fd);
if (env->me_txns) {
pid_t pid = env->me_pid; pid_t pid = env->me_pid;
/* Clearing readers is done in this function because /* Clearing readers is done in this function because
* me_txkey with its destructor must be disabled first. * me_txkey with its destructor must be disabled first.
* *
* We skip the reader mutex, so we touch only * We skip the reader mutex, so we touch only
* data owned by this process (me_close_readers and * data owned by this process (me_close_readers and
* our readers), and clear each reader atomically. * our readers), and clear each reader atomically.
*/ */
for (i = env->me_close_readers; --i >= 0; ) for (i = env->me_close_readers; --i >= 0; )
if (env->me_txns->mti_readers[i].mr_pid == pid) if (env->me_txns->mti_readers[i].mr_pid == pid)
env->me_txns->mti_readers[i].mr_pid = 0; env->me_txns->mti_readers[i].mr_pid = 0;
mdb_coherent_barrier(); mdb_coherent_barrier();
munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
}
if (env->me_lfd != INVALID_HANDLE_VALUE) { if (env->me_lfd != INVALID_HANDLE_VALUE) {
(void) close(env->me_lfd); (void) close(env->me_lfd);
} }
@ -8808,21 +8775,19 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd)
if (rc) if (rc)
return rc; return rc;
if (env->me_txns) { /* We must start the actual read txn after blocking writers */
/* We must start the actual read txn after blocking writers */ mdb_txn_reset0(txn, "reset-stage1");
mdb_txn_reset0(txn, "reset-stage1");
/* Temporarily block writers until we snapshot the meta pages */ /* Temporarily block writers until we snapshot the meta pages */
wmutex = MDB_MUTEX(env, w); wmutex = MDB_MUTEX(env, w);
rc = mdb_mutex_lock(env, wmutex); rc = mdb_mutex_lock(env, wmutex);
if (unlikely(rc)) if (unlikely(rc))
goto leave; goto leave;
rc = mdb_txn_renew0(txn); rc = mdb_txn_renew0(txn);
if (rc) { if (rc) {
mdb_mutex_unlock(env, wmutex); mdb_mutex_unlock(env, wmutex);
goto leave; goto leave;
}
} }
wsize = env->me_psize * 2; wsize = env->me_psize * 2;
@ -9071,22 +9036,20 @@ mdb_env_info(MDB_env *env, MDB_envinfo *arg)
arg->me_mapaddr = env->me_metas[toggle]->mm_address; arg->me_mapaddr = env->me_metas[toggle]->mm_address;
arg->me_mapsize = env->me_mapsize; arg->me_mapsize = env->me_mapsize;
arg->me_maxreaders = env->me_maxreaders; arg->me_maxreaders = env->me_maxreaders;
arg->me_numreaders = env->me_txns ? env->me_txns->mti_numreaders : 0; arg->me_numreaders = env->me_txns->mti_numreaders;
arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg;
arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; arg->me_last_txnid = env->me_metas[toggle]->mm_txnid;
arg->me_tail_txnid = 0; arg->me_tail_txnid = 0;
if (env->me_txns) { MDB_reader *r = env->me_txns->mti_readers;
MDB_reader *r = env->me_txns->mti_readers; int i;
int i; arg->me_tail_txnid = arg->me_last_txnid;
arg->me_tail_txnid = arg->me_last_txnid; for (i = arg->me_numreaders; --i >= 0; ) {
for (i = arg->me_numreaders; --i >= 0; ) { if (r[i].mr_pid) {
if (r[i].mr_pid) { txnid_t mr = r[i].mr_txnid;
txnid_t mr = r[i].mr_txnid; if (arg->me_tail_txnid > mr)
if (arg->me_tail_txnid > mr) arg->me_tail_txnid = mr;
arg->me_tail_txnid = mr;
}
} }
} }
@ -9455,9 +9418,7 @@ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx)
if (!env || !func) if (!env || !func)
return -1; return -1;
if (!env->me_txns) {
return func("(no reader locks)\n", ctx);
}
rdrs = env->me_txns->mti_numreaders; rdrs = env->me_txns->mti_numreaders;
mr = env->me_txns->mti_readers; mr = env->me_txns->mti_readers;
for (i=0; i<rdrs; i++) { for (i=0; i<rdrs; i++) {
@ -9530,7 +9491,7 @@ mdb_reader_check(MDB_env *env, int *dead)
return EINVAL; return EINVAL;
if (dead) if (dead)
*dead = 0; *dead = 0;
return env->me_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; return mdb_reader_check0(env, 0, dead);
} }
/** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */ /** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>. */