From cee258fe86191242a5bfbeac435f6cfbf8790151 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 16 Nov 2015 12:20:47 +0300 Subject: [PATCH 01/16] mdbx: fix uninit fields in page_malloc(). Change-Id: I35a162d6b391d33eda4d508e9c1af7238b33665d --- mdb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mdb.c b/mdb.c index f83d36e9..b4c813ce 100644 --- a/mdb.c +++ b/mdb.c @@ -1521,6 +1521,8 @@ mdb_page_malloc(MDB_txn *txn, unsigned num) } #endif VALGRIND_MAKE_MEM_UNDEFINED(np, size); + np->mp_flags = 0; + np->mp_pages = num; return np; } @@ -2317,6 +2319,8 @@ done: np->mp_pgno = pgno; np->mp_ksize = 0; + np->mp_flags = 0; + np->mp_pages = num; mdb_page_dirty(txn, np); *mp = np; From 4bdeed9bd39ca6cdcd56022e095baf180ebc6e0d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 16 Nov 2015 19:43:15 +0300 Subject: [PATCH 02/16] mdbx: fix thread-local-storage memleak. This fix a TLS-memleak for thread from which mdbx_env_close_ex() has been called. Bug was added by while fixing the https://github.com/ReOpen/ReOpenLDAP/issues/48 In general we should explicitly free(), because pthread_key_delete() don't calls a destructor. Change-Id: Ic55a2348caf3be34b4331d5ad101ea33dbbdfa97 --- mdb.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/mdb.c b/mdb.c index b4c813ce..0e17e203 100644 --- a/mdb.c +++ b/mdb.c @@ -2701,7 +2701,7 @@ mdb_txn_renew0(MDB_txn *txn) int rc, new_notls = 0; if ((flags &= MDB_TXN_RDONLY) != 0) { - struct MDB_rthc* rthc = NULL; + struct MDB_rthc *rthc = NULL; MDB_reader *r = NULL; if (likely(env->me_flags & MDB_ENV_TXKEY)) { mdb_assert(env, !(env->me_flags & MDB_NOTLS)); @@ -4426,15 +4426,16 @@ static pthread_mutex_t mdb_rthc_lock = PTHREAD_MUTEX_INITIALIZER; /* LY: TODO: Yet another problem is here - segfault in case if a DSO will * be unloaded before a thread would been finished. 
*/ static void -mdb_env_reader_dest(void *ptr) +mdb_env_reader_destr(void *ptr) { struct MDB_rthc* rthc = ptr; MDB_reader *reader; + if (! rthc) + /* LY: paranoia */ + return; + mdb_ensure(NULL, pthread_mutex_lock(&mdb_rthc_lock) == 0); - /* LY: Here may be a race with mdb_env_close(), - * see https://github.com/ReOpen/ReOpenLDAP/issues/48 - */ reader = rthc->rc_reader; if (reader) { mdb_ensure(NULL, reader->mr_rthc == rthc); @@ -4617,7 +4618,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) fcntl(env->me_lfd, F_SETFD, fdflags); if (!(env->me_flags & MDB_NOTLS)) { - rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest); + rc = pthread_key_create(&env->me_txkey, mdb_env_reader_destr); if (rc) goto fail; env->me_flags |= MDB_ENV_TXKEY; @@ -4910,12 +4911,14 @@ mdb_env_close0(MDB_env *env) mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0); for (i = env->me_close_readers; --i >= 0; ) { MDB_reader *reader = &env->me_txns->mti_readers[i]; - if (reader->mr_pid == pid) { - mdb_ensure(env, reader->mr_rthc->rc_reader == reader); - reader->mr_rthc->rc_reader = NULL; - reader->mr_rthc = NULL; - mdb_compiler_barrier(); + struct MDB_rthc *rthc = reader->mr_rthc; + if (rthc) { + mdb_ensure(env, rthc->rc_reader == reader); + rthc->rc_reader = NULL; + reader->mr_rthc = NULL; + free(rthc); + } reader->mr_pid = 0; } } From 02da85169e93552388974678ddc2db216d0cb8a2 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 18 Nov 2015 00:33:25 +0000 Subject: [PATCH 03/16] mdbx: backport - ITS#8311 fix page_split from update_key. Check for top of stack. Usually the cursor only has height 1 when calling page_split, but not always. 
Change-Id: Iad221be30edac0f82b650f787e5dbe721cc978e0 --- CHANGES | 1 + mdb.c | 25 +++++++++++++++++-------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/CHANGES b/CHANGES index 15ea76d9..81bee840 100644 --- a/CHANGES +++ b/CHANGES @@ -16,6 +16,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#7789 ensure mapsize >= pages in use Fix ITS#7971 mdb_txn_renew0() new reader slots Fix ITS#7969 use __sync_synchronize on non-x86 + Fix ITS#8311 page_split from update_key Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index 0e17e203..1e435187 100644 --- a/mdb.c +++ b/mdb.c @@ -6509,16 +6509,18 @@ fix_parent: * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned short top = mc->mc_top; + unsigned short dtop = 1; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ - while (mc->mc_top && !mc->mc_ki[mc->mc_top]) + while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { mc->mc_top--; + dtop++; + } if (mc->mc_ki[mc->mc_top]) rc2 = mdb_update_key(mc, key); else rc2 = MDB_SUCCESS; - mc->mc_top = top; + mc->mc_top += dtop; if (rc2) return rc2; } @@ -8398,12 +8400,19 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_ksize = mp->mp_ksize; mdb_debug("new right sibling: page %zu", rp->mp_pgno); - if (mc->mc_snum < 2) { + /* Usually when splitting the root page, the cursor + * height is 1. But when called from mdb_update_key, + * the cursor height may be greater because it walks + * up the stack while finding the branch slot to update. 
+ */ + if (mc->mc_top < 1) { if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp))) goto done; /* shift current top to make room for new parent */ - mc->mc_pg[1] = mc->mc_pg[0]; - mc->mc_ki[1] = mc->mc_ki[0]; + for (i=mc->mc_snum; i>0; i--) { + mc->mc_pg[i] = mc->mc_pg[i-1]; + mc->mc_ki[i] = mc->mc_ki[i-1]; + } mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; @@ -8419,8 +8428,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mc->mc_db->md_depth--; goto done; } - mc->mc_snum = 2; - mc->mc_top = 1; + mc->mc_snum++; + mc->mc_top++; ptop = 0; } else { ptop = mc->mc_top-1; From 0230e4fda217b5e72edd95e109a29955d45efc1c Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 19 Nov 2015 13:56:54 +0300 Subject: [PATCH 04/16] mdbx: clarify fsync/fdatasync comment. Change-Id: I2209b5e65ca8c04ff2de9b1b789f1993f24ab454 --- mdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mdb.c b/mdb.c index 1e435187..31cc7c48 100644 --- a/mdb.c +++ b/mdb.c @@ -4054,7 +4054,8 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) if (unlikely(prev_mapsize != pending->mm_mapsize)) { /* LY: It is no reason to use fdatasync() here, even in case * no such bug in a kernel. Because "no-bug" mean that a kernel - * internally do nearly the same. + * internally do nearly the same, e.g. fdatasync() == fsync() + * when no-kernel-bug and file size was changed. * * So, this code is always safe and without appreciable * performance degradation. From f3043badc58189c2641ccb53311078ae372ca765 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 19 Nov 2015 14:01:31 +0300 Subject: [PATCH 05/16] mdbx: tools/tests - fix minor memleaks (for CI with Valgrind). 
Change-Id: I0e2217d568259f26cd7c4da800a13577962742f7 --- mdb_chk.c | 32 ++++++++++++++++++++++++-------- mtest6.c | 1 + 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/mdb_chk.c b/mdb_chk.c index b0a0a214..94eab10d 100644 --- a/mdb_chk.c +++ b/mdb_chk.c @@ -122,27 +122,42 @@ error(const char* msg, ...) { } } +static void pagemap_cleanup(void) { + int i; + + for( i = 1; i < MAX_DBI; ++i ) { + if (walk.dbi_names[i]) { + free((void *) walk.dbi_names[i]); + walk.dbi_names[i] = NULL; + } + } + + free(walk.pagemap); + walk.pagemap = NULL; +} + static int pagemap_lookup_dbi(const char* dbi) { static int last; + int i; if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0) return last; - for(last = 1; walk.dbi_names[last] && last < MAX_DBI; ++last) - if (strcmp(walk.dbi_names[last], dbi) == 0) - return last; + for(i = 1; walk.dbi_names[i] && last < MAX_DBI; ++i) + if (strcmp(walk.dbi_names[i], dbi) == 0) + return last = i; - if (last == MAX_DBI) - return last = -1; + if (i == MAX_DBI) + return -1; - walk.dbi_names[last] = strdup(dbi); + walk.dbi_names[i] = strdup(dbi); if (verbose > 1) { print(" - found '%s' area\n", dbi); fflush(NULL); } - return last; + return last = i; } static void problem_add(const char* object, size_t entry_number, const char* msg, const char *extra, ...) 
{ @@ -581,6 +596,8 @@ int main(int argc, char *argv[]) struct timespec timestamp_start, timestamp_finish; double elapsed; + atexit(pagemap_cleanup); + if (clock_gettime(CLOCK_MONOTONIC, &timestamp_start)) { rc = errno; error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -897,7 +914,6 @@ bailout: mdbx_txn_abort(locktxn); if (env) mdbx_env_close(env); - free(walk.pagemap); fflush(NULL); if (rc) { if (rc < 0) diff --git a/mtest6.c b/mtest6.c index ccd745ed..752c4c5e 100644 --- a/mtest6.c +++ b/mtest6.c @@ -159,6 +159,7 @@ int main(int argc,char * argv[]) mdb_dbi_close(env, dbi); #endif mdb_env_close(env); + free(sval); return 0; } From 86abc397e89cb59b96b3573a403b39ea60a05c68 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 18 Nov 2015 16:30:24 +0100 Subject: [PATCH 06/16] mdbx: backport - ITS#8312 Fix loose pages in commit(nested txn). Change-Id: Ie335dcff0c87cfc13abf9b937f5d058d3ea9d841 --- CHANGES | 1 + mdb.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index 81bee840..aba1a092 100644 --- a/CHANGES +++ b/CHANGES @@ -17,6 +17,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#7971 mdb_txn_renew0() new reader slots Fix ITS#7969 use __sync_synchronize on non-x86 Fix ITS#8311 page_split from update_key + Fix ITS#8312 loose pages in nested txn Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index 31cc7c48..f906ab72 100644 --- a/mdb.c +++ b/mdb.c @@ -3786,7 +3786,7 @@ mdb_txn_commit(MDB_txn *txn) } /* Append our loose page list to parent's */ - for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp)) + for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp)) ; *lp = txn->mt_loose_pgs; parent->mt_loose_count += txn->mt_loose_count; From 12cd2361e7408b305a2d33a98b7c999934b1e3db Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 18 Nov 2015 21:33:51 +0000 Subject: [PATCH 07/16] mdbx: backport - ITS#8313 more for ITS#8062.
dummy flags must be init'd due to 3d46d550 Change-Id: I3d543bd3e059da9b007f1e752f20171acc183679 --- CHANGES | 1 + mdb.c | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGES b/CHANGES index aba1a092..134149d3 100644 --- a/CHANGES +++ b/CHANGES @@ -18,6 +18,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#7969 use __sync_synchronize on non-x86 Fix ITS#8311 page_split from update_key Fix ITS#8312 loose pages in nested txn + Fix ITS#8313 mdb_rebalance dummy cursor Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index f906ab72..0c5e6f35 100644 --- a/mdb.c +++ b/mdb.c @@ -8199,6 +8199,7 @@ mdb_rebalance(MDB_cursor *mc) mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; /* We want mdb_rebalance to find mn when doing fixups */ if (mc->mc_flags & C_SUB) { + dummy.mc_flags = C_INITIALIZED; dummy.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; mc->mc_txn->mt_cursors[mc->mc_dbi] = &dummy; dummy.mc_xcursor = (MDB_xcursor *)&mn; From 7e2000ef7a71eeb2471d7c6e09d5d71f607ba78d Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Wed, 18 Nov 2015 23:38:34 +0000 Subject: [PATCH 08/16] mdbx: backport - ITS#8315 fix ovpage_free. 
Keep dirty_room sync'd with dirty_list Change-Id: I9e52a72df95ffb504740e8daecf65b62970e9f25 --- CHANGES | 1 + mdb.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index 134149d3..b36aa5d8 100644 --- a/CHANGES +++ b/CHANGES @@ -19,6 +19,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#8311 page_split from update_key Fix ITS#8312 loose pages in nested txn Fix ITS#8313 mdb_rebalance dummy cursor + Fix ITS#8315 dirty_room in nested txn Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index 0c5e6f35..741c7cb3 100644 --- a/mdb.c +++ b/mdb.c @@ -5541,6 +5541,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp) return MDB_CORRUPTED; } } + txn->mt_dirty_room++; if (!(env->me_flags & MDB_WRITEMAP)) mdb_dpage_free(env, mp); release: @@ -6697,6 +6698,7 @@ current: return ENOMEM; id2.mid = pg; id2.mptr = np; + /* Note - this page is already counted in parent's dirty_room */ rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2); mdb_cassert(mc, rc2 == 0); if (!(flags & MDB_RESERVE)) { From bf81e86d939d164ef23f1e80f5153f84b42e133b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 22 Nov 2015 13:23:43 +0300 Subject: [PATCH 09/16] mdbx: prevents use the env after a fork(). 
Change-Id: I9c86500ac008a7be7eb16aeff2610fa5cfa22b84 --- mdb.c | 128 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/mdb.c b/mdb.c index 741c7cb3..179c550b 100644 --- a/mdb.c +++ b/mdb.c @@ -1033,7 +1033,7 @@ enum { #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static void mdb_txn_end(MDB_txn *txn, unsigned mode); +static int mdb_txn_end(MDB_txn *txn, unsigned mode); static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_search_root(MDB_cursor *mc, @@ -2700,6 +2700,11 @@ mdb_txn_renew0(MDB_txn *txn) uint16_t x; int rc, new_notls = 0; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + if ((flags &= MDB_TXN_RDONLY) != 0) { struct MDB_rthc *rthc = NULL; MDB_reader *r = NULL; @@ -2889,6 +2894,11 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **ret) if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDB_VERSION_MISMATCH; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + flags &= MDB_TXN_BEGIN_FLAGS; flags |= env->me_flags & MDB_WRITEMAP; @@ -3064,12 +3074,17 @@ mdbx_txn_straggler(MDB_txn *txn, int *percent) * @param[in] txn the transaction handle to end * @param[in] mode why and how to end the transaction */ -static void +static int mdb_txn_end(MDB_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; static const char *const names[] = MDB_END_NAMES; + if (unlikely(txn->mt_env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + /* Export or close DBI handles opened in this txn */ mdb_dbis_update(txn, mode & MDB_END_UPDATE); @@ -3139,6 +3154,8 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) txn->mt_signature = 0; 
free(txn); } + + return MDB_SUCCESS; } int @@ -3154,8 +3171,7 @@ mdb_txn_reset(MDB_txn *txn) if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) return EINVAL; - mdb_txn_end(txn, MDB_END_RESET); - return MDB_SUCCESS; + return mdb_txn_end(txn, MDB_END_RESET); } int @@ -3170,8 +3186,7 @@ mdb_txn_abort(MDB_txn *txn) if (txn->mt_child) mdb_txn_abort(txn->mt_child); - mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); - return MDB_SUCCESS; + return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); } static int @@ -3649,6 +3664,11 @@ mdb_txn_commit(MDB_txn *txn) if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_env->me_pid != getpid())) { + txn->mt_env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + /* mdb_txn_end() mode for a commit which writes nothing */ end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; @@ -3864,8 +3884,7 @@ mdb_txn_commit(MDB_txn *txn) end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; done: - mdb_txn_end(txn, end_mode); - return MDB_SUCCESS; + return mdb_txn_end(txn, end_mode); fail: mdb_txn_abort(txn); @@ -4205,14 +4224,16 @@ mdb_env_map(MDB_env *env, void *addr) return errno; } - if (flags & MDB_NORDAHEAD) { + unsigned madvise_flags = MADV_DONTFORK; + if (flags & MDB_NORDAHEAD) /* Turn off readahead. It's harmful when the DB is larger than RAM. */ -#ifdef MADV_RANDOM - madvise(env->me_map, env->me_mapsize, MADV_RANDOM); -#elif defined(POSIX_MADV_RANDOM) - posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); -#endif /* MADV_RANDOM & POSIX_MADV_RANDOM */ - } + madvise_flags |= MADV_RANDOM; + if (madvise(env->me_map, env->me_mapsize, madvise_flags)) + return errno; + +#ifdef MADV_DONTDUMP + madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); +#endif /* Can happen because the address argument to mmap() is just a * hint. mmap() can pick another, e.g. if the range is in use. 
@@ -4438,7 +4459,7 @@ mdb_env_reader_destr(void *ptr) mdb_ensure(NULL, pthread_mutex_lock(&mdb_rthc_lock) == 0); reader = rthc->rc_reader; - if (reader) { + if (reader && reader->mr_pid == getpid()) { mdb_ensure(NULL, reader->mr_rthc == rthc); rthc->rc_reader = NULL; reader->mr_rthc = NULL; @@ -4611,7 +4632,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } - goto fail_errno; + return rc; } /* Lose record locks when exec*() */ @@ -4621,20 +4642,20 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) if (!(env->me_flags & MDB_NOTLS)) { rc = pthread_key_create(&env->me_txkey, mdb_env_reader_destr); if (rc) - goto fail; + return rc; env->me_flags |= MDB_ENV_TXKEY; } /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - if ((rc = mdb_env_excl_lock(env, excl))) goto fail; + if ((rc = mdb_env_excl_lock(env, excl))) return rc; size = lseek(env->me_lfd, 0, SEEK_END); - if (size == -1) goto fail_errno; + if (size == -1) return errno; rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); if (size < rsize && *excl > 0) { - if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; + if (ftruncate(env->me_lfd, rsize) != 0) return errno; } else { rsize = size; size = rsize - sizeof(MDB_txninfo); @@ -4643,9 +4664,16 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0); if (m == MAP_FAILED) - goto fail_errno; + return errno; env->me_txns = m; + if (madvise(env->me_txns, rsize, MADV_DONTFORK | MADV_WILLNEED)) + return errno; + +#ifdef MADV_DODUMP + madvise(env->me_txns, rsize, MADV_DODUMP); +#endif + if (*excl > 0) { pthread_mutexattr_t mattr; @@ -4656,7 +4684,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) #endif /* MDB_USE_ROBUST */ || (rc = 
pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr)) || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) - goto fail; + return rc; pthread_mutexattr_destroy(&mattr); env->me_txns->mti_magic = MDB_MAGIC; @@ -4666,27 +4694,19 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) } else { if (env->me_txns->mti_magic != MDB_MAGIC) { mdb_debug("lock region has invalid magic"); - rc = MDB_INVALID; - goto fail; + return MDB_INVALID; } if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { mdb_debug("lock region has format+version 0x%x, expected 0x%x", env->me_txns->mti_format, MDB_LOCK_FORMAT); - rc = MDB_VERSION_MISMATCH; - goto fail; + return MDB_VERSION_MISMATCH; } rc = errno; - if (rc && rc != EACCES && rc != EAGAIN) { - goto fail; - } + if (rc && rc != EACCES && rc != EAGAIN) + return rc; } return MDB_SUCCESS; - -fail_errno: - rc = errno; -fail: - return rc; } /** The name of the lock file in the DB environment */ @@ -4909,22 +4929,25 @@ mdb_env_close0(MDB_env *env) * data owned by this process (me_close_readers and * our readers), and clear each reader atomically. 
*/ - mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0); - for (i = env->me_close_readers; --i >= 0; ) { - MDB_reader *reader = &env->me_txns->mti_readers[i]; - if (reader->mr_pid == pid) { - struct MDB_rthc *rthc = reader->mr_rthc; - if (rthc) { - mdb_ensure(env, rthc->rc_reader == reader); - rthc->rc_reader = NULL; - reader->mr_rthc = NULL; - free(rthc); + if (pid == getpid()) { + mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0); + for (i = env->me_close_readers; --i >= 0; ) { + MDB_reader *reader = &env->me_txns->mti_readers[i]; + if (reader->mr_pid == pid) { + struct MDB_rthc *rthc = reader->mr_rthc; + if (rthc) { + mdb_ensure(env, rthc->rc_reader == reader); + rthc->rc_reader = NULL; + reader->mr_rthc = NULL; + free(rthc); + } + reader->mr_pid = 0; } - reader->mr_pid = 0; } + mdb_coherent_barrier(); + mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0); } - mdb_coherent_barrier(); - mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0); + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); env->me_txns = NULL; @@ -9171,7 +9194,9 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) return rc; /* We must start the actual read txn after blocking writers */ - mdb_txn_end(txn, MDB_END_RESET_TMP); + rc = mdb_txn_end(txn, MDB_END_RESET_TMP); + if (rc) + return rc; /* Temporarily block writers until we snapshot the meta pages */ wmutex = MDB_MUTEX(env, w); @@ -10016,6 +10041,11 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead) pid_t *pids, pid; int rc = MDB_SUCCESS, count = 0; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + rdrs = env->me_txns->mti_numreaders; pids = malloc((rdrs+1) * sizeof(pid_t)); if (!pids) From 6ba0b8b467ce1f1e95d61e6cf9af7ef6fb9b41aa Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Thu, 19 Nov 2015 20:04:16 +0000 Subject: [PATCH 10/16] mdbx: backport - ITS#8316 cursor fixup in page_merge. 
Deleting the merged page requires bumping down other ki's of the page's parent. Change-Id: Ifc94e26b8755ede2400521556556a6824bc2c62b --- CHANGES | 1 + mdb.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGES b/CHANGES index b36aa5d8..e9f98c0b 100644 --- a/CHANGES +++ b/CHANGES @@ -20,6 +20,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#8312 loose pages in nested txn Fix ITS#8313 mdb_rebalance dummy cursor Fix ITS#8315 dirty_room in nested txn + Fix ITS#8316 page_merge cursor tracking Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index 179c550b..5b66c8ed 100644 --- a/mdb.c +++ b/mdb.c @@ -8009,6 +8009,9 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_pg[top] = pdst; m3->mc_ki[top] += nkeys; m3->mc_ki[top-1] = cdst->mc_ki[top-1]; + } else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] && + m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { + m3->mc_ki[top-1]--; } } } From 90fdef7a6a85b4b3c240c6f15cbfd7857dacb9fe Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 20 Nov 2015 09:20:16 +0000 Subject: [PATCH 11/16] mdbx: backport - Refix root split check from 5da67968afb599697d7557c13b65fb961ec408dd. Change-Id: If012ddcf223a3312bbe13c4b24d776b488ed3772 --- mdb.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mdb.c b/mdb.c index 5b66c8ed..4998eb4e 100644 --- a/mdb.c +++ b/mdb.c @@ -8611,6 +8611,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* Copy separator key to the parent. */ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { + int snum = mc->mc_snum; mn.mc_snum--; mn.mc_top--; did_split = 1; @@ -8619,13 +8620,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno goto done; /* root split? 
*/ - if (mn.mc_snum == mc->mc_snum) { - mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top]; - mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top]; - mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop]; - mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop]; - mc->mc_snum++; - mc->mc_top++; + if (mc->mc_snum > snum) { ptop++; } /* Right page might now have changed parent. @@ -8756,8 +8751,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno continue; if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) continue; - if (m3->mc_flags & C_SPLITTING) - continue; if (new_root) { int k; /* root split */ @@ -8774,6 +8767,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_snum++; m3->mc_top++; } + if (m3->mc_flags & C_SPLITTING) + continue; if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; From 2154b585c6648700c53dd0e2104af84207a2193f Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 20 Nov 2015 09:47:56 +0000 Subject: [PATCH 12/16] mdbx: backport - Silence some valgrind uninit warnings. 
Change-Id: Ie26d7bd08a3a4d4ceb833093a9117f1ebaca6cb0 --- mdb.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mdb.c b/mdb.c index 4998eb4e..f254f2dc 100644 --- a/mdb.c +++ b/mdb.c @@ -7790,6 +7790,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; if (m3 != cdst && m3->mc_pg[csrc->mc_top] == mpd && m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) { @@ -7812,6 +7814,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) else m3 = m2; if (m3 == csrc) continue; + if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top) + continue; if (m3->mc_pg[csrc->mc_top] == mps) { if (!m3->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top]; @@ -8112,7 +8116,8 @@ mdb_rebalance(MDB_cursor *mc) m3 = &m2->mc_xcursor->mx_cursor; else m3 = m2; - if (m3->mc_snum < mc->mc_snum) continue; + if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum)) + continue; if (m3->mc_pg[0] == mp) { m3->mc_snum = 0; m3->mc_top = 0; @@ -8148,6 +8153,8 @@ mdb_rebalance(MDB_cursor *mc) else m3 = m2; if (m3 == mc) continue; + if (!(m3->mc_flags & C_INITIALIZED)) + continue; if (m3->mc_pg[0] == mp) { for (i=0; i<mc->mc_db->md_depth; i++) { m3->mc_pg[i] = m3->mc_pg[i+1]; From 0a97fbcbabff241dd4285924a5a466625818c506 Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Fri, 20 Nov 2015 13:34:11 +0000 Subject: [PATCH 13/16] mdbx: backport - ITS#8300 more for prev commit.
Just tell explicitly which direction we moved/merged from Change-Id: Ib1868003d30f3afe71f105c2750253bbc6059610 --- mdb.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/mdb.c b/mdb.c index f254f2dc..21174ac3 100644 --- a/mdb.c +++ b/mdb.c @@ -1059,7 +1059,7 @@ static int mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); static void mdb_node_del(MDB_cursor *mc, int ksize); static void mdb_node_shrink(MDB_page *mp, indx_t indx); -static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst); +static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdb_branch_size(MDB_env *env, MDB_val *key); @@ -7681,7 +7681,7 @@ mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); /** Move a node from csrc to cdst. */ static int -mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) +mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) { MDB_node *srcnode; MDB_val key, data; @@ -7783,7 +7783,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst) mps = csrc->mc_pg[csrc->mc_top]; /* If we're adding on the left, bump others up */ - if (!cdst->mc_ki[csrc->mc_top]) { + if (fromleft) { mpd = cdst->mc_pg[csrc->mc_top]; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (csrc->mc_flags & C_SUB) @@ -8065,7 +8065,7 @@ static int mdb_rebalance(MDB_cursor *mc) { MDB_node *node; - int rc; + int rc, fromleft; unsigned ptop, minkeys, thresh; MDB_cursor mn; indx_t oldki; @@ -8198,6 +8198,7 @@ mdb_rebalance(MDB_cursor *mc) return rc; mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]); + fromleft = 0; } else { /* There is at least one neighbor to the left. 
*/ @@ -8209,6 +8210,7 @@ mdb_rebalance(MDB_cursor *mc) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; mc->mc_ki[mc->mc_top] = 0; + fromleft = 1; } mdb_debug("found neighbor page %zu (%u keys, %.1f%% full)", @@ -8220,13 +8222,13 @@ mdb_rebalance(MDB_cursor *mc) * (A branch page must never have less than 2 keys.) */ if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) { - rc = mdb_node_move(&mn, mc); - if (!mc->mc_ki[mc->mc_top]) { + rc = mdb_node_move(&mn, mc, fromleft); + if (fromleft) { /* if we inserted on left, bump position up */ oldki++; } } else { - if (mc->mc_ki[ptop] == 0) { + if (!fromleft) { rc = mdb_page_merge(&mn, mc); } else { MDB_cursor dummy; From aeea7ebb083d6856bfb59657736eed5bd75d2a41 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 23 Nov 2015 09:29:01 +0300 Subject: [PATCH 14/16] mdbx: backport - ITS#8321 a lot for cursor tracking. Multiple bugs were fixed in the cursor fixups which adjust other open cursors in response to various write ops. Includes: - ITS#8321 Fix del/dupsort. When deleting a dupsort key, if other cursors pointed at that key, set them to uninit'd, not EOF. They no longer have anything to point at. - ITS#8321 don't skip fixups on splitting cursors. Adjustments can't be skipped, in recursive calls each level must fixup their own level. - ITS#8321 fix mdb_cursor_chk(). It was reporting spurious errors due to uninit'd cursors - ITS#8321 fix mdb_cursor_shadow(). Set a valid txn so that cursor fixup code works on the shadows - ITS#8321 fix mdb_cursor_put. Ignore sub-cursors that shouldn't be fixed up - ITS#8321 track temporary cursors. In rebalance/split operations, temporary cursors need to be visible to propagate fixups - ITS#8321 simplify page_split fixups. - ITS#8321 reorganize page_split fixups. DUPFIXED fixups needed to occur after separator update. MDB_RESERVE handling moved after split fixup. 
Change-Id: I0c04acf54ebf6e84f32996b5723ec6fafb983ad9 --- CHANGES | 1 + mdb.c | 123 +++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 79 insertions(+), 45 deletions(-) diff --git a/CHANGES b/CHANGES index e9f98c0b..3f9084f0 100644 --- a/CHANGES +++ b/CHANGES @@ -21,6 +21,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#8313 mdb_rebalance dummy cursor Fix ITS#8315 dirty_room in nested txn Fix ITS#8316 page_merge cursor tracking + Fix ITS#8321 cursor tracking Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index 21174ac3..0bc9a48a 100644 --- a/mdb.c +++ b/mdb.c @@ -889,7 +889,6 @@ struct MDB_cursor { #define C_EOF 0x02 /**< No more data */ #define C_SUB 0x04 /**< Cursor is a sub-cursor */ #define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_SPLITTING 0x20 /**< Cursor is in page_split */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ #define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ /** @} */ @@ -1401,7 +1400,7 @@ mdb_cursor_chk(MDB_cursor *mc) MDB_node *node; MDB_page *mp; - if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; for (i=0; i<mc->mc_top; i++) { mp = mc->mc_pg[i]; node = NODEPTR(mp, mc->mc_ki[i]); @@ -2604,14 +2603,15 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) *bk = *mc; mc->mc_backup = bk; mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src - and dst to reduce abuse: The - * user may not use mc until dst ends. Otherwise we'd... + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working.
*/ - mc->mc_txn = NULL; /* ...set this to dst */ - mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; if ((mx = mc->mc_xcursor) != NULL) { *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ + mx->mx_cursor.mc_txn = dst; } mc->mc_next = dst->mt_cursors[i]; dst->mt_cursors[i] = mc; @@ -6835,6 +6835,7 @@ put_sub: MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; @@ -6842,9 +6843,9 @@ put_sub: if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key) { + } else if (!insert_key && m2->mc_ki[i] < nkeys) { MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); - if (!(n2->mn_flags & F_SUBDATA)) + if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); } } @@ -7678,6 +7679,22 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) static void mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); +/** Track a temporary cursor */ +#define CURSOR_TMP_TRACK(mc, mn, dummy, tracked) \ + if (mc->mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&mn; \ + tracked = &dummy; \ + } else { \ + tracked = &mn; \ + } \ + tracked->mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; \ + mc->mc_txn->mt_cursors[mc->mc_dbi] = tracked + +/** Stop tracking a temporary cursor */ +#define CURSOR_TMP_UNTRACK(mc, tracked) \ + mc->mc_txn->mt_cursors[mc->mc_dbi] = tracked->mc_next + /** Move a node from csrc to cdst. 
*/ static int @@ -7833,6 +7850,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) */ if (csrc->mc_ki[csrc->mc_top] == 0) { if (csrc->mc_ki[csrc->mc_top-1] != 0) { + MDB_cursor dummy, *tracked; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); } else { @@ -7845,7 +7863,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) mdb_cursor_copy(csrc, &mn); mn.mc_snum--; mn.mc_top--; - if (unlikely((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)) + /* We want mdb_rebalance to find mn when doing fixups */ + CURSOR_TMP_TRACK(csrc, mn, dummy, tracked); + rc = mdb_update_key(&mn, &key); + CURSOR_TMP_UNTRACK(csrc, tracked); + if (unlikely(rc != MDB_SUCCESS)) return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { @@ -7861,6 +7883,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) if (cdst->mc_ki[cdst->mc_top] == 0) { if (cdst->mc_ki[cdst->mc_top-1] != 0) { + MDB_cursor dummy, *tracked; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); } else { @@ -7873,7 +7896,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) mdb_cursor_copy(cdst, &mn); mn.mc_snum--; mn.mc_top--; - if (unlikely((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)) + /* We want mdb_rebalance to find mn when doing fixups */ + CURSOR_TMP_TRACK(cdst, mn, dummy, tracked); + rc = mdb_update_key(&mn, &key); + CURSOR_TMP_UNTRACK(cdst, tracked); + if (unlikely(rc != MDB_SUCCESS)) return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { @@ -8231,24 +8258,13 @@ mdb_rebalance(MDB_cursor *mc) if (!fromleft) { rc = mdb_page_merge(&mn, mc); } else { - MDB_cursor dummy; + MDB_cursor dummy, *tracked; oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; /* We want mdb_rebalance to find mn when doing fixups */ - if (mc->mc_flags & C_SUB) { - dummy.mc_flags = C_INITIALIZED; - dummy.mc_next = 
mc->mc_txn->mt_cursors[mc->mc_dbi]; - mc->mc_txn->mt_cursors[mc->mc_dbi] = &dummy; - dummy.mc_xcursor = (MDB_xcursor *)&mn; - } else { - mn.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; - mc->mc_txn->mt_cursors[mc->mc_dbi] = &mn; - } + CURSOR_TMP_TRACK(mc, mn, dummy, tracked); rc = mdb_page_merge(mc, &mn); - if (mc->mc_flags & C_SUB) - mc->mc_txn->mt_cursors[mc->mc_dbi] = dummy.mc_next; - else - mc->mc_txn->mt_cursors[mc->mc_dbi] = mn.mc_next; + CURSOR_TMP_UNTRACK(mc, tracked); mdb_cursor_copy(&mn, mc); } mc->mc_flags &= ~C_EOF; @@ -8286,7 +8302,7 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_ki[mc->mc_top] > ki) m3->mc_ki[mc->mc_top]--; else if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF; + m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } } } @@ -8475,7 +8491,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mdb_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); } - mc->mc_flags |= C_SPLITTING; mdb_cursor_copy(mc, &mn); mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; @@ -8526,8 +8541,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_lower += sizeof(indx_t); rp->mp_upper -= ksize - sizeof(indx_t); mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; } } else { int psize, nsize, k; @@ -8621,11 +8634,15 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno */ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { int snum = mc->mc_snum; + MDB_cursor dummy, *tracked; mn.mc_snum--; mn.mc_top--; did_split = 1; + /* We want other splits to find mn when doing fixups */ + CURSOR_TMP_TRACK(mc, mn, dummy, tracked); rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); - if (unlikely(rc)) + CURSOR_TMP_UNTRACK(mc, tracked); + if (unlikely(rc != MDB_SUCCESS)) goto done; /* root split? 
*/ @@ -8655,10 +8672,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } - mc->mc_flags ^= C_SPLITTING; - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDB_SUCCESS)) goto done; - } if (nflags & MDB_APPEND) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[mc->mc_top] = 0; @@ -8725,11 +8740,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* reset back to original page */ if (newindx < split_indx) { mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } } else { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[ptop]++; @@ -8743,13 +8753,32 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno } } } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. 
+ */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } } { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; - int fixup = NUMKEYS(mp); + nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -8762,12 +8791,15 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno continue; if (new_root) { int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; /* root split */ for (k=new_root; k>=0; k--) { m3->mc_ki[k+1] = m3->mc_ki[k]; m3->mc_pg[k+1] = m3->mc_pg[k]; } - if (m3->mc_ki[0] >= split_indx) { + if (m3->mc_ki[0] > nkeys) { m3->mc_ki[0] = 1; } else { m3->mc_ki[0] = 0; @@ -8776,15 +8808,16 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_snum++; m3->mc_top++; } - if (m3->mc_flags & C_SPLITTING) - continue; if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= fixup) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= fixup; - m3->mc_ki[ptop] = mn.mc_ki[ptop]; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i=0; i<mc->mc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } } } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { From a8e2288537a8aac94de3f9eac437d6b58bb97267 Mon Sep 17 00:00:00 2001 From: Orivej Desh Date: Sun, 22 Nov 2015 00:59:55 +0000 Subject: [PATCH 15/16] mdbx: backport - ITS#8319 mdb_load: explain readline and mdb_cursor_put errors.
Change-Id: I99e8ff220bb37109da83bb6088be7597b52c6f00 --- CHANGES | 1 + mdb_load.c | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CHANGES b/CHANGES index 3f9084f0..df82a13c 100644 --- a/CHANGES +++ b/CHANGES @@ -21,6 +21,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#8313 mdb_rebalance dummy cursor Fix ITS#8315 dirty_room in nested txn Fix ITS#8316 page_merge cursor tracking + Fix ITS#8319 mdb_load error messages Fix ITS#8321 cursor tracking Added mdb_txn_id() (ITS#7994) Added robust mutex support diff --git a/mdb_load.c b/mdb_load.c index 97e81e02..c81a0fa8 100644 --- a/mdb_load.c +++ b/mdb_load.c @@ -399,20 +399,22 @@ int main(int argc, char *argv[]) while(1) { rc = readline(&key, &kbuf); - if (rc == EOF) + if (rc) /* rc == EOF */ break; - if (rc) - goto txn_abort; rc = readline(&data, &dbuf); - if (rc) + if (rc) { + fprintf(stderr, "%s: line %" Z "d: failed to read key value\n", prog, lineno); goto txn_abort; + } rc = mdb_cursor_put(mc, &key, &data, putflags); if (rc == MDB_KEYEXIST && putflags) continue; - if (rc) + if (rc) { + fprintf(stderr, "mdb_cursor_put failed, error %d %s\n", rc, mdb_strerror(rc)); goto txn_abort; + } batch++; if (batch == 100) { rc = mdb_txn_commit(txn); From 5d4a23138cad024bc8eec06dadacd8208905a614 Mon Sep 17 00:00:00 2001 From: Orivej Desh Date: Sun, 22 Nov 2015 01:15:14 +0000 Subject: [PATCH 16/16] mdbx: backport - ITS#8320 mdb_load: fix loading data from simple text files. mdb_load -T was supposed to read escaped text, but 21b51cb7 "Add mdb_load" made it read hex. 
Change-Id: If12a01ee897af0570d95744626815c535cc81f9f --- CHANGES | 1 + mdb_load.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGES b/CHANGES index df82a13c..03db4c3f 100644 --- a/CHANGES +++ b/CHANGES @@ -22,6 +22,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#8315 dirty_room in nested txn Fix ITS#8316 page_merge cursor tracking Fix ITS#8319 mdb_load error messages + Fix ITS#8320 mdb_load plaintext input Fix ITS#8321 cursor tracking Added mdb_txn_id() (ITS#7994) Added robust mutex support diff --git a/mdb_load.c b/mdb_load.c index c81a0fa8..8ec257f9 100644 --- a/mdb_load.c +++ b/mdb_load.c @@ -332,7 +332,7 @@ int main(int argc, char *argv[]) putflags = MDB_NOOVERWRITE|MDB_NODUPDATA; break; case 'T': - mode |= NOHDR; + mode |= NOHDR | PRINT; break; default: usage();