mdbx: Merge branch 'master' into nexenta.

This commit is contained in:
Leo Yuriev 2016-05-09 22:50:27 +03:00
commit f1acaf72ca
6 changed files with 83 additions and 88 deletions

View File

@ -51,7 +51,7 @@ databases should only be opened once, by the first transaction in
the process. After the first transaction completes, the database the process. After the first transaction completes, the database
handles can freely be used by all subsequent transactions. handles can freely be used by all subsequent transactions.
Within a transaction, #mdb_get() and #mdb_put() can store single Within a transaction, #mdb_get() can retrieve and #mdb_put() can store single
key/value pairs if that is all you need to do (but see \ref Cursors key/value pairs if that is all you need to do (but see \ref Cursors
below if you want to do more). below if you want to do more).

10
lmdb.h
View File

@ -196,7 +196,7 @@ typedef int mdb_filehandle_t;
/** Library minor version */ /** Library minor version */
#define MDB_VERSION_MINOR 9 #define MDB_VERSION_MINOR 9
/** Library patch version */ /** Library patch version */
#define MDB_VERSION_PATCH 42 #define MDB_VERSION_PATCH 19
/** Combine args a,b,c into a single integer for easy version comparisons */ /** Combine args a,b,c into a single integer for easy version comparisons */
#define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c)) #define MDB_VERINT(a,b,c) (((a) << 24) | ((b) << 16) | (c))
@ -206,10 +206,10 @@ typedef int mdb_filehandle_t;
MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH) MDB_VERINT(MDB_VERSION_MAJOR,MDB_VERSION_MINOR,MDB_VERSION_PATCH)
/** The release date of this library version */ /** The release date of this library version */
#define MDB_VERSION_DATE "February 5, 2016, https://github.com/ReOpen/libmdbx" #define MDB_VERSION_DATE "2016-04-06"
/** A stringifier for the version info */ /** A stringifier for the version info */
#define MDB_VERSTR(a,b,c,d) "LMDB " #a "." #b "." #c ": (" d ")" #define MDB_VERSTR(a,b,c,d) "MDBX " #a "." #b "." #c ": (" d ", https://github.com/ReOpen/libmdbx)"
/** A helper for the stringifier macro */ /** A helper for the stringifier macro */
#define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d) #define MDB_VERFOO(a,b,c,d) MDB_VERSTR(a,b,c,d)
@ -1671,12 +1671,14 @@ int mdb_reader_check(MDB_env *env, int *dead);
int mdbx_txn_straggler(MDB_txn *txn, int *percent); int mdbx_txn_straggler(MDB_txn *txn, int *percent);
/** @brief A callback function for killing a laggard readers, /** @brief A callback function for killing a laggard readers,
* called in case of MDB_MAP_FULL error. * but also could waiting ones. Called in case of MDB_MAP_FULL error.
* *
* @param[in] env An environment handle returned by #mdb_env_create(). * @param[in] env An environment handle returned by #mdb_env_create().
* @param[in] pid pid of the reader process. * @param[in] pid pid of the reader process.
* @param[in] thread_id thread_id of the reader thread. * @param[in] thread_id thread_id of the reader thread.
* @param[in] txn Transaction number on which stalled. * @param[in] txn Transaction number on which stalled.
* @param[in] gap a lag from the last committed txn.
* @param[in] retry a retry number, less than zero to notify the end of the OOM-loop.
* @return -1 on failure (reader is not killed), * @return -1 on failure (reader is not killed),
* 0 on a race condition (no such reader), * 0 on a race condition (no such reader),
* 1 on success (reader was killed), * 1 on success (reader was killed),

149
mdb.c
View File

@ -1977,8 +1977,11 @@ txnid_t mdb_find_oldest(MDB_env *env, int *laggard)
static txnid_t __cold static txnid_t __cold
mdbx_oomkick(MDB_env *env, txnid_t oldest) mdbx_oomkick(MDB_env *env, txnid_t oldest)
{ {
mdb_debug("DB size maxed out");
#if MDBX_MODE_ENABLED
int retry; int retry;
txnid_t snap; txnid_t snap;
mdb_debug("DB size maxed out");
for(retry = 0; ; ++retry) { for(retry = 0; ; ++retry) {
int reader; int reader;
@ -1987,47 +1990,51 @@ mdbx_oomkick(MDB_env *env, txnid_t oldest)
break; break;
snap = mdb_find_oldest(env, &reader); snap = mdb_find_oldest(env, &reader);
if (oldest < snap) if (oldest < snap || reader < 0) {
if (retry && env->me_oom_func) {
/* LY: notify end of oom-loop */
env->me_oom_func(env, 0, 0, oldest, snap - oldest, -retry);
}
return snap; return snap;
}
if (reader < 0) MDB_reader *r;
return 0; pthread_t tid;
pid_t pid;
int rc;
#if MDBX_MODE_ENABLED if (!env->me_oom_func)
{ break;
MDB_reader *r;
pthread_t tid;
pid_t pid;
int rc;
if (!env->me_oom_func) r = &env->me_txns->mti_readers[ reader ];
break; pid = r->mr_pid;
tid = r->mr_tid;
if (r->mr_txnid != oldest || pid <= 0)
continue;
r = &env->me_txns->mti_readers[ reader ]; rc = env->me_oom_func(env, pid, (void*) tid, oldest,
pid = r->mr_pid; mdb_meta_head_w(env)->mm_txnid - oldest, retry);
tid = r->mr_tid; if (rc < 0)
if (r->mr_txnid != oldest || pid <= 0) break;
continue;
rc = env->me_oom_func(env, pid, (void*) tid, oldest, if (rc) {
mdb_meta_head_w(env)->mm_txnid - oldest, retry); r->mr_txnid = ~(txnid_t)0;
if (rc < 0) if (rc > 1) {
break; r->mr_tid = 0;
r->mr_pid = 0;
if (rc) { mdbx_coherent_barrier();
r->mr_txnid = ~(txnid_t)0;
if (rc > 1) {
r->mr_tid = 0;
r->mr_pid = 0;
mdbx_coherent_barrier();
}
} }
} }
#else
break;
#endif /* MDBX_MODE_ENABLED */
} }
if (retry && env->me_oom_func) {
/* LY: notify end of oom-loop */
env->me_oom_func(env, 0, 0, oldest, 0, -retry);
}
#else
(void) oldest;
(void) mdb_reader_check(env, NULL);
#endif /* MDBX_MODE_ENABLED */
return mdb_find_oldest(env, NULL); return mdb_find_oldest(env, NULL);
} }
@ -2069,7 +2076,8 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp)
#define MDBX_ALLOC_CACHE 1 #define MDBX_ALLOC_CACHE 1
#define MDBX_ALLOC_GC 2 #define MDBX_ALLOC_GC 2
#define MDBX_ALLOC_NEW 4 #define MDBX_ALLOC_NEW 4
#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE|MDBX_ALLOC_GC|MDBX_ALLOC_NEW) #define MDBX_ALLOC_KICK 8
#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE|MDBX_ALLOC_GC|MDBX_ALLOC_NEW|MDBX_ALLOC_KICK)
static int static int
mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags)
@ -2090,7 +2098,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags)
if (unlikely(mc->mc_flags & C_RECLAIMING)) { if (unlikely(mc->mc_flags & C_RECLAIMING)) {
/* If mc is updating the freeDB, then the freelist cannot play /* If mc is updating the freeDB, then the freelist cannot play
* catch-up with itself by growing while trying to save it. */ * catch-up with itself by growing while trying to save it. */
flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE | MDBX_LIFORECLAIM); flags &= ~(MDBX_ALLOC_GC | MDBX_ALLOC_KICK | MDBX_COALESCE | MDBX_LIFORECLAIM);
} }
} }
@ -2141,18 +2149,14 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags)
oldest = env->me_pgoldest; oldest = env->me_pgoldest;
mdb_cursor_init(&m2, txn, FREE_DBI, NULL); mdb_cursor_init(&m2, txn, FREE_DBI, NULL);
if (flags & MDBX_LIFORECLAIM) { if (flags & MDBX_LIFORECLAIM) {
if (env->me_pglast > 1) { if (! found_oldest) {
/* Continue lookup from env->me_pglast to lower/first */
last = env->me_pglast - 1;
op = MDB_SET_RANGE;
} else {
oldest = mdb_find_oldest(env, NULL); oldest = mdb_find_oldest(env, NULL);
found_oldest = 1; found_oldest = 1;
/* Begin from oldest reader if any */ }
if (oldest > 2) { /* Begin from oldest reader if any */
last = oldest - 1; if (oldest > 2) {
op = MDB_SET_RANGE; last = oldest - 1;
} op = MDB_SET_RANGE;
} }
} else if (env->me_pglast) { } else if (env->me_pglast) {
/* Continue lookup from env->me_pglast to higher/last */ /* Continue lookup from env->me_pglast to higher/last */
@ -2288,18 +2292,18 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags)
} while (--i > n2); } while (--i > n2);
} }
/* Use new pages from the map when nothing suitable in the freeDB */
i = 0; i = 0;
rc = MDB_NOTFOUND; pgno = txn->mt_next_pgno;
if (likely(flags & MDBX_ALLOC_NEW)) { rc = MDB_MAP_FULL;
/* Use new pages from the map when nothing suitable in the freeDB */ if (likely(pgno + num <= env->me_maxpg)) {
pgno = txn->mt_next_pgno; rc = MDB_NOTFOUND;
if (likely(pgno + num <= env->me_maxpg)) if (likely(flags & MDBX_ALLOC_NEW))
goto done; goto done;
mdb_debug("DB size maxed out");
rc = MDB_MAP_FULL;
} }
if (flags & MDBX_ALLOC_GC) { if ((flags & MDBX_ALLOC_GC)
&& ((flags & MDBX_ALLOC_KICK) || rc == MDB_MAP_FULL)) {
MDB_meta* head = mdb_meta_head_w(env); MDB_meta* head = mdb_meta_head_w(env);
MDB_meta* tail = mdb_env_meta_flipflop(env, head); MDB_meta* tail = mdb_env_meta_flipflop(env, head);
@ -3454,8 +3458,8 @@ again:
if (lifo) { if (lifo) {
if (refill_idx > (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) { if (refill_idx > (txn->mt_lifo_reclaimed ? txn->mt_lifo_reclaimed[0] : 0)) {
/* LY: need more just a txn-id for save page list. */ /* LY: need just a txn-id for save page list. */
rc = mdb_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC); rc = mdb_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK);
if (likely(rc == 0)) if (likely(rc == 0))
/* LY: ok, reclaimed from freedb. */ /* LY: ok, reclaimed from freedb. */
continue; continue;
@ -4868,13 +4872,6 @@ mdbx_env_open_ex(MDB_env *env, const char *path, unsigned flags, mode_t mode, in
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDB_VERSION_MISMATCH; return MDB_VERSION_MISMATCH;
#if MDBX_LIFORECLAIM
/* LY: don't allow LIFO with just NOMETASYNC */
if ((flags & (MDB_NOMETASYNC | MDBX_LIFORECLAIM | MDB_NOSYNC))
== (MDB_NOMETASYNC | MDBX_LIFORECLAIM))
return EINVAL;
#endif /* MDBX_LIFORECLAIM */
if (env->me_fd != INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS))) if (env->me_fd != INVALID_HANDLE_VALUE || (flags & ~(CHANGEABLE|CHANGELESS)))
return EINVAL; return EINVAL;
@ -5022,6 +5019,7 @@ mdb_env_close0(MDB_env *env)
if (!(env->me_flags & MDB_ENV_ACTIVE)) if (!(env->me_flags & MDB_ENV_ACTIVE))
return; return;
env->me_flags &= ~MDB_ENV_ACTIVE;
/* Doing this here since me_dbxs may not exist during mdb_env_close */ /* Doing this here since me_dbxs may not exist during mdb_env_close */
if (env->me_dbxs) { if (env->me_dbxs) {
@ -5041,7 +5039,12 @@ mdb_env_close0(MDB_env *env)
mdb_midl_free(env->me_free_pgs); mdb_midl_free(env->me_free_pgs);
if (env->me_flags & MDB_ENV_TXKEY) { if (env->me_flags & MDB_ENV_TXKEY) {
struct MDB_rthc *rthc = pthread_getspecific(env->me_txkey);
if (rthc && pthread_setspecific(env->me_txkey, NULL) == 0) {
mdb_env_reader_destr(rthc);
}
pthread_key_delete(env->me_txkey); pthread_key_delete(env->me_txkey);
env->me_flags &= ~MDB_ENV_TXKEY;
} }
if (env->me_map) { if (env->me_map) {
@ -5086,8 +5089,6 @@ mdb_env_close0(MDB_env *env)
if (env->me_lfd != INVALID_HANDLE_VALUE) { if (env->me_lfd != INVALID_HANDLE_VALUE) {
(void) close(env->me_lfd); (void) close(env->me_lfd);
} }
env->me_flags &= ~(MDB_ENV_ACTIVE|MDB_ENV_TXKEY);
} }
#if ! MDBX_MODE_ENABLED #if ! MDBX_MODE_ENABLED
@ -7326,10 +7327,10 @@ mdb_node_add(MDB_cursor *mc, indx_t indx,
node_size += key->mv_size; node_size += key->mv_size;
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
mdb_cassert(mc, key && data); mdb_cassert(mc, key && data);
if (F_ISSET(flags, F_BIGDATA)) { if (unlikely(F_ISSET(flags, F_BIGDATA))) {
/* Data already on overflow page. */ /* Data already on overflow page. */
node_size += sizeof(pgno_t); node_size += sizeof(pgno_t);
} else if (node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax) { } else if (unlikely(node_size + data->mv_size > mc->mc_txn->mt_env->me_nodemax)) {
int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize); int ovpages = OVPAGES(data->mv_size, mc->mc_txn->mt_env->me_psize);
int rc; int rc;
/* Put data on overflow page. */ /* Put data on overflow page. */
@ -7377,19 +7378,19 @@ update:
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
ndata = NODEDATA(node); ndata = NODEDATA(node);
if (ofp == NULL) { if (unlikely(ofp == NULL)) {
if (F_ISSET(flags, F_BIGDATA)) if (unlikely(F_ISSET(flags, F_BIGDATA)))
memcpy(ndata, data->mv_data, sizeof(pgno_t)); memcpy(ndata, data->mv_data, sizeof(pgno_t));
else if (F_ISSET(flags, MDB_RESERVE)) else if (F_ISSET(flags, MDB_RESERVE))
data->mv_data = ndata; data->mv_data = ndata;
else else if (likely(ndata != data->mv_data))
memcpy(ndata, data->mv_data, data->mv_size); memcpy(ndata, data->mv_data, data->mv_size);
} else { } else {
memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t)); memcpy(ndata, &ofp->mp_pgno, sizeof(pgno_t));
ndata = PAGEDATA(ofp); ndata = PAGEDATA(ofp);
if (F_ISSET(flags, MDB_RESERVE)) if (F_ISSET(flags, MDB_RESERVE))
data->mv_data = ndata; data->mv_data = ndata;
else else if (likely(ndata != data->mv_data))
memcpy(ndata, data->mv_data, data->mv_size); memcpy(ndata, data->mv_data, data->mv_size);
} }
} }
@ -9608,17 +9609,9 @@ mdb_env_set_flags(MDB_env *env, unsigned flags, int onoff)
return rc; return rc;
if (onoff) if (onoff)
flags = env->me_flags | flags; env->me_flags |= flags;
else else
flags = env->me_flags & ~flags; env->me_flags &= ~flags;
#if MDBX_LIFORECLAIM
/* LY: don't allow LIFO with just NOMETASYNC */
if ((flags & (MDB_NOMETASYNC | MDBX_LIFORECLAIM | MDB_NOSYNC))
== (MDB_NOMETASYNC | MDBX_LIFORECLAIM))
return EINVAL;
#endif /* MDBX_LIFORECLAIM */
env->me_flags = flags;
mdb_mutex_unlock(env, mutex); mdb_mutex_unlock(env, mutex);
return MDB_SUCCESS; return MDB_SUCCESS;

View File

@ -221,7 +221,7 @@ int main(int argc, char *argv[])
printf(" Used now: %zu %.1f%%\n", value, value / percent); printf(" Used now: %zu %.1f%%\n", value, value / percent);
value = pages; value = pages;
printf(" Free pages: %zu %.1f%%\n", value, value / percent); printf(" Unallocated: %zu %.1f%%\n", value, value / percent);
value = pages - reclaimable; value = pages - reclaimable;
printf(" Detained: %zu %.1f%%\n", value, value / percent); printf(" Detained: %zu %.1f%%\n", value, value / percent);

View File

@ -57,8 +57,8 @@ static void db_connect() {
LMDB_CHECK(mdb_env_create(&env)); LMDB_CHECK(mdb_env_create(&env));
LMDB_CHECK(mdb_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L)); LMDB_CHECK(mdb_env_set_mapsize(env, 3L * 1024L * 1024L * 1024L));
LMDB_CHECK(mdb_env_set_maxdbs(env, 30)); LMDB_CHECK(mdb_env_set_maxdbs(env, 30));
#if defined(MDB_LIFORECLAIM) #if defined(MDBX_LIFORECLAIM)
LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDB_LIFORECLAIM, 0664)); LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664));
#else #else
LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664));
#endif #endif

View File

@ -64,8 +64,8 @@ static void db_connect() {
LMDB_CHECK(mdb_env_create(&env)); LMDB_CHECK(mdb_env_create(&env));
LMDB_CHECK(mdb_env_set_mapsize(env, 300000L * 4096L)); LMDB_CHECK(mdb_env_set_mapsize(env, 300000L * 4096L));
LMDB_CHECK(mdb_env_set_maxdbs(env, 30)); LMDB_CHECK(mdb_env_set_maxdbs(env, 30));
#if defined(MDB_LIFORECLAIM) #if defined(MDBX_LIFORECLAIM)
LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDB_LIFORECLAIM, 0664)); LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP | MDBX_LIFORECLAIM, 0664));
#else #else
LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664)); LMDB_CHECK(mdb_env_open(env, opt_db_path, MDB_CREATE | MDB_NOSYNC | MDB_WRITEMAP, 0664));
#endif #endif