From 1c37139b2ac3ce3db39ffb8f39ab787fbec0ca30 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Tue, 12 May 2015 18:29:07 +0300 Subject: [PATCH] lmdb: drops no-lock operation mode (now MDB_NOLOCK is UNSUPPORTED). This is 3/9 for https://github.com/ReOpen/ReOpenLDAP/issues/1 and https://github.com/ReOpen/ReOpenLDAP/issues/2 Change-Id: I7cd5d90c41424d6635accbb10c5801adeb1087e9 --- lmdb.h | 5 +- mdb.c | 315 +++++++++++++++++++++++++-------------------------------- 2 files changed, 141 insertions(+), 179 deletions(-) diff --git a/lmdb.h b/lmdb.h index 4ebaa247..d0e9d631 100644 --- a/lmdb.h +++ b/lmdb.h @@ -277,8 +277,9 @@ typedef void (MDB_rel_func)(MDB_val *item, void *oldptr, void *newptr, void *rel #define MDB_MAPASYNC 0x100000 /** tie reader locktable slots to #MDB_txn objects instead of to threads */ #define MDB_NOTLS 0x200000 - /** don't do any locking, caller must manage their own locks */ -#define MDB_NOLOCK 0x400000 + /** don't do any locking, caller must manage their own locks + * WARNING: ReOpenLDAP doesn't support this mode. 
*/ +#define MDB_NOLOCK__UNSUPPORTED 0x400000 /** don't do readahead */ #define MDB_NORDAHEAD 0x800000 /** don't initialize malloc'd memory before writing to datafile */ diff --git a/mdb.c b/mdb.c index 5bd69740..3c1a808a 100644 --- a/mdb.c +++ b/mdb.c @@ -1070,13 +1070,13 @@ struct MDB_env { unsigned me_os_psize; /**< OS page size, from #GET_PAGESIZE */ unsigned me_maxreaders; /**< size of the reader table */ /** Max #MDB_txninfo.%mti_numreaders of interest to #mdb_env_close() */ - volatile int me_close_readers; + int me_close_readers; MDB_dbi me_numdbs; /**< number of DBs opened */ MDB_dbi me_maxdbs; /**< size of the DB table */ pid_t me_pid; /**< process ID of this env */ char *me_path; /**< path to the DB files */ char *me_map; /**< the memory map of the data file */ - MDB_txninfo *me_txns; /**< the memory map of the lock file or NULL */ + MDB_txninfo *me_txns; /**< the memory map of the lock file */ MDB_meta *me_metas[2]; /**< pointers to the two meta pages */ void *me_pbuf; /**< scratch area for DUPSORT put() */ MDB_txn *me_txn; /**< current write transaction */ @@ -1913,57 +1913,35 @@ done: return rc; } -/** Find oldest txnid still referenced. Expects txn->mt_txnid > 0. */ +/** Find oldest txnid still referenced. 
*/ static txnid_t -mdb_find_oldest(MDB_txn *txn) +mdb_find_oldest(MDB_env *env, int *laggard) { - int i; - txnid_t mr, oldest = txn->mt_txnid - 1; - if (txn->mt_env->me_txns) { - MDB_reader *r = txn->mt_env->me_txns->mti_readers; - for (i = txn->mt_env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - mr = r[i].mr_txnid; - if (oldest > mr) - oldest = mr; + int i, reader; + MDB_reader *r = env->me_txns->mti_readers; + txnid_t oldest = env->me_txns->mti_txnid; + + for (reader = -1, i = env->me_txns->mti_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + txnid_t snap = r[i].mr_txnid; + if (oldest > snap) { + oldest = snap; + reader = i; } } } - return oldest; -} -static txnid_t -mdb_laggard_reader(MDB_env *env, int *laggard) -{ - txnid_t tail = 0; if (laggard) - *laggard = -1; - if (env->me_txns->mti_txnid > 1) { - int i; - MDB_reader *r = env->me_txns->mti_readers; - - tail = env->me_txns->mti_txnid - 1; - for (i = env->me_txns->mti_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - txnid_t mr = r[i].mr_txnid; - if (tail > mr) { - tail = mr; - if (laggard) - *laggard = i; - } - } - } - } - - return tail; + *laggard = reader; + return oldest; } static int mdb_oomkick_laggard(MDB_env *env) { - int idx, retry; - txnid_t snap, tail = mdb_laggard_reader(env, &idx); - if (idx < 0) + int reader, retry; + txnid_t snap, oldest = mdb_find_oldest(env, &reader); + if (reader < 0) return 0; for(retry = 0; ; ++retry) { @@ -1975,21 +1953,21 @@ mdb_oomkick_laggard(MDB_env *env) if (mdb_reader_check(env, NULL)) break; - snap = mdb_laggard_reader(env, NULL); - if (tail < snap) + snap = mdb_find_oldest(env, NULL); + if (oldest < snap) return 1; if (!env->me_oom_func) break; - r = &env->me_txns->mti_readers[ idx ]; + r = &env->me_txns->mti_readers[ reader ]; pid = r->mr_pid; tid = r->mr_tid; - if (r->mr_txnid != tail || pid <= 0) + if (r->mr_txnid != oldest || pid <= 0) continue; - rc = env->me_oom_func(env, pid, (void*) tid, tail, - env->me_metas[ mdb_env_pick_meta(env) 
]->mm_txnid - tail, retry); + rc = env->me_oom_func(env, pid, (void*) tid, oldest, + env->me_metas[ mdb_env_pick_meta(env) ]->mm_txnid - oldest, retry); if (rc < 0) break; @@ -2003,8 +1981,8 @@ mdb_oomkick_laggard(MDB_env *env) } } - snap = mdb_laggard_reader(env, NULL); - return tail < snap; + snap = mdb_find_oldest(env, NULL); + return oldest < snap; } /** Add a page to the txn's dirty list */ @@ -2121,7 +2099,7 @@ oomkick_retry:; last = env->me_pglast - 1; op = MDB_SET_RANGE; } else { - oldest = mdb_find_oldest(txn); + oldest = mdb_find_oldest(env, NULL); env->me_pgoldest = oldest; found_old = 1; /* Begin from oldest reader if any */ @@ -2144,7 +2122,7 @@ oomkick_retry:; /* Do not fetch more if the record will be too recent */ if (op != MDB_FIRST && ++last >= oldest) { if (!found_old) { - oldest = mdb_find_oldest(txn); + oldest = mdb_find_oldest(env, NULL); env->me_pgoldest = oldest; found_old = 1; } @@ -2157,7 +2135,7 @@ oomkick_retry:; if (rc == MDB_NOTFOUND && lifo) { if (op == MDB_SET_RANGE) continue; - env->me_pgoldest = mdb_find_oldest(txn); + env->me_pgoldest = mdb_find_oldest(env, NULL); found_old = 1; if (oldest < env->me_pgoldest) { oldest = env->me_pgoldest; @@ -2177,7 +2155,7 @@ oomkick_retry:; last = *(txnid_t*)key.mv_data; if (oldest <= last) { if (!found_old) { - oldest = mdb_find_oldest(txn); + oldest = mdb_find_oldest(env, NULL); env->me_pgoldest = oldest; found_old = 1; } @@ -2693,7 +2671,6 @@ static int mdb_txn_renew0(MDB_txn *txn) { MDB_env *env = txn->mt_env; - MDB_txninfo *ti = env->me_txns; MDB_meta *meta; unsigned i, nr; uint16_t x; @@ -2705,84 +2682,78 @@ mdb_txn_renew0(MDB_txn *txn) /* Setup db info */ txn->mt_numdbs = env->me_numdbs; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - if (!ti) { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - txn->mt_u.reader = NULL; + + MDB_reader *r = (env->me_flags & MDB_NOTLS) + ? 
txn->mt_u.reader : pthread_getspecific(env->me_txkey); + + if (likely(r)) { + if (unlikely(r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1)) + return MDB_BAD_RSLOT; } else { - MDB_reader *r = (env->me_flags & MDB_NOTLS) ? txn->mt_u.reader : - pthread_getspecific(env->me_txkey); - if (r) { - if (r->mr_pid != env->me_pid || r->mr_txnid != (txnid_t)-1) - return MDB_BAD_RSLOT; - } else { - pid_t pid = env->me_pid; - pthread_t tid = pthread_self(); - pthread_mutex_t *rmutex = MDB_MUTEX(env, r); + pid_t pid = env->me_pid; + pthread_t tid = pthread_self(); + pthread_mutex_t *rmutex = MDB_MUTEX(env, r); - if (!env->me_live_reader) { - rc = mdb_reader_pid(env, F_SETLK, pid); - if (rc) - return rc; - env->me_live_reader = 1; - } - - rc = mdb_mutex_lock(env, rmutex); - if (unlikely(rc)) + if (unlikely(!env->me_live_reader)) { + rc = mdb_reader_pid(env, F_SETLK, pid); + if (unlikely(rc != MDB_SUCCESS)) return rc; - nr = ti->mti_numreaders; - for (i=0; i<nr; i++) - if (ti->mti_readers[i].mr_pid == 0) - break; - if (i == env->me_maxreaders) { - mdb_mutex_unlock(env, rmutex); - return MDB_READERS_FULL; - } - r = &ti->mti_readers[i]; - /* Claim the reader slot, carefully since other code - * uses the reader table un-mutexed: First reset the - * slot, next publish it in mti_numreaders. After - * that, it is safe for mdb_env_close() to touch it. - * When it will be closed, we can finally claim it. - */ - r->mr_pid = 0; - r->mr_txnid = (txnid_t)-1; - r->mr_tid = tid; - mdb_coherent_barrier(); - if (i == nr) - ti->mti_numreaders = ++nr; - env->me_close_readers = nr; - r->mr_pid = pid; - mdb_mutex_unlock(env, rmutex); - - new_notls = (env->me_flags & MDB_NOTLS); - if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { - r->mr_pid = 0; - mdb_coherent_barrier(); - return rc; - } + env->me_live_reader = 1; } - do { /* LY: Retry on a race, ITS#7970. 
*/ - r->mr_txnid = ti->mti_txnid; - mdb_coherent_barrier(); - } while(r->mr_txnid != ti->mti_txnid); - txn->mt_txnid = r->mr_txnid; - txn->mt_u.reader = r; + + rc = mdb_mutex_lock(env, rmutex); + if (unlikely(rc != MDB_SUCCESS)) + return rc; + nr = env->me_txns->mti_numreaders; + for (i=0; i<nr; i++) + if (env->me_txns->mti_readers[i].mr_pid == 0) + break; + if (unlikely(i == env->me_maxreaders)) { + mdb_mutex_unlock(env, rmutex); + return MDB_READERS_FULL; + } + r = &env->me_txns->mti_readers[i]; + /* Claim the reader slot, carefully since other code + * uses the reader table un-mutexed: First reset the + * slot, next publish it in mti_numreaders. After + * that, it is safe for mdb_env_close() to touch it. + * When it will be closed, we can finally claim it. + */ + r->mr_pid = 0; + r->mr_txnid = (txnid_t)-1; + r->mr_tid = tid; mdb_coherent_barrier(); - meta = env->me_metas[txn->mt_txnid & 1]; + if (i == nr) + env->me_txns->mti_numreaders = ++nr; + if (env->me_close_readers < nr) + env->me_close_readers = nr; + r->mr_pid = pid; + mdb_mutex_unlock(env, rmutex); + + new_notls = (env->me_flags & MDB_NOTLS); + if (!new_notls && (rc=pthread_setspecific(env->me_txkey, r))) { + r->mr_pid = 0; + mdb_coherent_barrier(); + return rc; + } } + + do { /* LY: Retry on a race, ITS#7970. 
*/ + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + r->mr_txnid = meta->mm_txnid; + mdb_coherent_barrier(); + } while(unlikely(r->mr_txnid != env->me_txns->mti_txnid)); + txn->mt_txnid = r->mr_txnid; + txn->mt_u.reader = r; } else { /* Not yet touching txn == env->me_txn0, it may be active */ - if (ti) { - rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); - if (unlikely(rc)) - return rc; - txn->mt_txnid = ti->mti_txnid; - meta = env->me_metas[txn->mt_txnid & 1]; - } else { - meta = env->me_metas[ mdb_env_pick_meta(env) ]; - txn->mt_txnid = meta->mm_txnid; - } + rc = mdb_mutex_lock(env, MDB_MUTEX(env, w)); + if (unlikely(rc)) + return rc; + + meta = env->me_metas[ mdb_env_pick_meta(env) ]; + txn->mt_txnid = meta->mm_txnid; + /* Setup db info */ txn->mt_numdbs = env->me_numdbs; txn->mt_txnid++; @@ -3089,8 +3060,7 @@ mdb_txn_reset0(MDB_txn *txn, const char *act_) env->me_txn = NULL; /* The writer mutex was locked in mdb_txn_begin. */ - if (env->me_txns) - mdb_mutex_unlock(env, MDB_MUTEX(env, w)); + mdb_mutex_unlock(env, MDB_MUTEX(env, w)); } else { txn->mt_parent->mt_child = NULL; env->me_pgstate = ((MDB_ntxn *)txn)->mnt_pgstate; @@ -3790,8 +3760,7 @@ done: env->me_txn = NULL; mdb_dbis_update(txn, 1); - if (env->me_txns) - mdb_mutex_unlock(env, MDB_MUTEX(env, w)); + mdb_mutex_unlock(env, MDB_MUTEX(env, w)); if (txn != env->me_txn0) free(txn); @@ -4019,9 +3988,7 @@ done: * readers will get consistent data regardless of how fresh or * how stale their view of these values is. 
*/ - if (env->me_txns) - env->me_txns->mti_txnid = txn->mt_txnid; - + env->me_txns->mti_txnid = txn->mt_txnid; return MDB_SUCCESS; } @@ -4606,7 +4573,7 @@ fail: #define CHANGEABLE (MDB_NOSYNC|MDB_NOMETASYNC|MDB_MAPASYNC| \ MDB_NOMEMINIT|MDB_COALESCE) #define CHANGELESS (MDB_FIXEDMAP|MDB_NOSUBDIR|MDB_RDONLY| \ - MDB_WRITEMAP|MDB_NOTLS|MDB_NOLOCK|MDB_NORDAHEAD|MDB_LIFORECLAIM) + MDB_WRITEMAP|MDB_NOTLS|MDB_NORDAHEAD|MDB_LIFORECLAIM) #if VALID_FLAGS & PERSISTENT_FLAGS & (CHANGEABLE|CHANGELESS) # error "Persistent DB flags & env flags overlap, but both go in mm_flags" @@ -4664,7 +4631,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode) } /* For RDONLY, get lockfile after we know datafile exists */ - if (!(flags & (MDB_RDONLY|MDB_NOLOCK))) { + if (!(flags & MDB_RDONLY)) { rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; @@ -4681,7 +4648,7 @@ mdb_env_open(MDB_env *env, const char *path, unsigned flags, mode_t mode) goto leave; } - if ((flags & (MDB_RDONLY|MDB_NOLOCK)) == MDB_RDONLY) { + if (flags & MDB_RDONLY) { rc = mdb_env_setup_locks(env, lpath, mode, &excl); if (rc) goto leave; @@ -4775,21 +4742,21 @@ mdb_env_close0(MDB_env *env) (void) close(env->me_mfd); if (env->me_fd != INVALID_HANDLE_VALUE) (void) close(env->me_fd); - if (env->me_txns) { - pid_t pid = env->me_pid; - /* Clearing readers is done in this function because - * me_txkey with its destructor must be disabled first. - * - * We skip the the reader mutex, so we touch only - * data owned by this process (me_close_readers and - * our readers), and clear each reader atomically. 
- */ - for (i = env->me_close_readers; --i >= 0; ) - if (env->me_txns->mti_readers[i].mr_pid == pid) - env->me_txns->mti_readers[i].mr_pid = 0; - mdb_coherent_barrier(); - munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); - } + + pid_t pid = env->me_pid; + /* Clearing readers is done in this function because + * me_txkey with its destructor must be disabled first. + * + * We skip the reader mutex, so we touch only + * data owned by this process (me_close_readers and + * our readers), and clear each reader atomically. + */ + for (i = env->me_close_readers; --i >= 0; ) + if (env->me_txns->mti_readers[i].mr_pid == pid) + env->me_txns->mti_readers[i].mr_pid = 0; + mdb_coherent_barrier(); + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); + if (env->me_lfd != INVALID_HANDLE_VALUE) { (void) close(env->me_lfd); } @@ -8808,21 +8775,19 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) if (rc) return rc; - if (env->me_txns) { - /* We must start the actual read txn after blocking writers */ - mdb_txn_reset0(txn, "reset-stage1"); + /* We must start the actual read txn after blocking writers */ + mdb_txn_reset0(txn, "reset-stage1"); - /* Temporarily block writers until we snapshot the meta pages */ - wmutex = MDB_MUTEX(env, w); - rc = mdb_mutex_lock(env, wmutex); - if (unlikely(rc)) - goto leave; + /* Temporarily block writers until we snapshot the meta pages */ + wmutex = MDB_MUTEX(env, w); + rc = mdb_mutex_lock(env, wmutex); + if (unlikely(rc)) + goto leave; - rc = mdb_txn_renew0(txn); - if (rc) { - mdb_mutex_unlock(env, wmutex); - goto leave; - } + rc = mdb_txn_renew0(txn); + if (rc) { + mdb_mutex_unlock(env, wmutex); + goto leave; } wsize = env->me_psize * 2; @@ -9071,22 +9036,20 @@ mdb_env_info(MDB_env *env, MDB_envinfo *arg) arg->me_mapaddr = env->me_metas[toggle]->mm_address; arg->me_mapsize = env->me_mapsize; arg->me_maxreaders = env->me_maxreaders; - arg->me_numreaders = env->me_txns ? 
env->me_txns->mti_numreaders : 0; + arg->me_numreaders = env->me_txns->mti_numreaders; arg->me_last_pgno = env->me_metas[toggle]->mm_last_pg; arg->me_last_txnid = env->me_metas[toggle]->mm_txnid; arg->me_tail_txnid = 0; - if (env->me_txns) { - MDB_reader *r = env->me_txns->mti_readers; - int i; - arg->me_tail_txnid = arg->me_last_txnid; - for (i = arg->me_numreaders; --i >= 0; ) { - if (r[i].mr_pid) { - txnid_t mr = r[i].mr_txnid; - if (arg->me_tail_txnid > mr) - arg->me_tail_txnid = mr; - } + MDB_reader *r = env->me_txns->mti_readers; + int i; + arg->me_tail_txnid = arg->me_last_txnid; + for (i = arg->me_numreaders; --i >= 0; ) { + if (r[i].mr_pid) { + txnid_t mr = r[i].mr_txnid; + if (arg->me_tail_txnid > mr) + arg->me_tail_txnid = mr; } } @@ -9455,9 +9418,7 @@ mdb_reader_list(MDB_env *env, MDB_msg_func *func, void *ctx) if (!env || !func) return -1; - if (!env->me_txns) { - return func("(no reader locks)\n", ctx); - } + rdrs = env->me_txns->mti_numreaders; mr = env->me_txns->mti_readers; for (i=0; ime_txns ? mdb_reader_check0(env, 0, dead) : MDB_SUCCESS; + return mdb_reader_check0(env, 0, dead); } /** As #mdb_reader_check(). rlocked = . */