mdbx: Merge branch 'devel' into master.

This commit is contained in:
Leo Yuriev 2015-11-23 10:19:33 +03:00
commit 6e1ba1cf28
5 changed files with 254 additions and 137 deletions

View File

@ -16,6 +16,14 @@ LMDB 0.9.17 Release Engineering
Fix ITS#7789 ensure mapsize >= pages in use
Fix ITS#7971 mdb_txn_renew0() new reader slots
Fix ITS#7969 use __sync_synchronize on non-x86
Fix ITS#8311 page_split from update_key
Fix ITS#8312 loose pages in nested txn
Fix ITS#8313 mdb_rebalance dummy cursor
Fix ITS#8315 dirty_room in nested txn
Fix ITS#8316 page_merge cursor tracking
Fix ITS#8319 mdb_load error messages
Fix ITS#8320 mdb_load plaintext input
Fix ITS#8321 cursor tracking
Added mdb_txn_id() (ITS#7994)
Added robust mutex support
Miscellaneous cleanup/simplification

336
mdb.c
View File

@ -889,7 +889,6 @@ struct MDB_cursor {
#define C_EOF 0x02 /**< No more data */
#define C_SUB 0x04 /**< Cursor is a sub-cursor */
#define C_DEL 0x08 /**< last op was a cursor_del */
#define C_SPLITTING 0x20 /**< Cursor is in page_split */
#define C_UNTRACK 0x40 /**< Un-track cursor when closing */
#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */
/** @} */
@ -1033,7 +1032,7 @@ enum {
#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */
#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */
#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */
static void mdb_txn_end(MDB_txn *txn, unsigned mode);
static int mdb_txn_end(MDB_txn *txn, unsigned mode);
static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl);
static int mdb_page_search_root(MDB_cursor *mc,
@ -1059,7 +1058,7 @@ static int mdb_node_add(MDB_cursor *mc, indx_t indx,
MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags);
static void mdb_node_del(MDB_cursor *mc, int ksize);
static void mdb_node_shrink(MDB_page *mp, indx_t indx);
static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst);
static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft);
static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data);
static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data);
static size_t mdb_branch_size(MDB_env *env, MDB_val *key);
@ -1401,7 +1400,7 @@ mdb_cursor_chk(MDB_cursor *mc)
MDB_node *node;
MDB_page *mp;
if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return;
if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return;
for (i=0; i<mc->mc_top; i++) {
mp = mc->mc_pg[i];
node = NODEPTR(mp, mc->mc_ki[i]);
@ -1521,6 +1520,8 @@ mdb_page_malloc(MDB_txn *txn, unsigned num)
}
#endif
VALGRIND_MAKE_MEM_UNDEFINED(np, size);
np->mp_flags = 0;
np->mp_pages = num;
return np;
}
@ -2317,6 +2318,8 @@ done:
np->mp_pgno = pgno;
np->mp_ksize = 0;
np->mp_flags = 0;
np->mp_pages = num;
mdb_page_dirty(txn, np);
*mp = np;
@ -2600,14 +2603,15 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst)
*bk = *mc;
mc->mc_backup = bk;
mc->mc_db = &dst->mt_dbs[i];
/* Kill pointers into src - and dst to reduce abuse: The
* user may not use mc until dst ends. Otherwise we'd...
/* Kill pointers into src to reduce abuse: The
* user may not use mc until dst ends. But we need a valid
* txn pointer here for cursor fixups to keep working.
*/
mc->mc_txn = NULL; /* ...set this to dst */
mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */
mc->mc_txn = dst;
mc->mc_dbflag = &dst->mt_dbflags[i];
if ((mx = mc->mc_xcursor) != NULL) {
*(MDB_xcursor *)(bk+1) = *mx;
mx->mx_cursor.mc_txn = NULL; /* ...and dst. */
mx->mx_cursor.mc_txn = dst;
}
mc->mc_next = dst->mt_cursors[i];
dst->mt_cursors[i] = mc;
@ -2696,8 +2700,13 @@ mdb_txn_renew0(MDB_txn *txn)
uint16_t x;
int rc, new_notls = 0;
if (unlikely(env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
if ((flags &= MDB_TXN_RDONLY) != 0) {
struct MDB_rthc* rthc = NULL;
struct MDB_rthc *rthc = NULL;
MDB_reader *r = NULL;
if (likely(env->me_flags & MDB_ENV_TXKEY)) {
mdb_assert(env, !(env->me_flags & MDB_NOTLS));
@ -2885,6 +2894,11 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **ret)
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDB_VERSION_MISMATCH;
if (unlikely(env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
flags &= MDB_TXN_BEGIN_FLAGS;
flags |= env->me_flags & MDB_WRITEMAP;
@ -3060,12 +3074,17 @@ mdbx_txn_straggler(MDB_txn *txn, int *percent)
* @param[in] txn the transaction handle to end
* @param[in] mode why and how to end the transaction
*/
static void
static int
mdb_txn_end(MDB_txn *txn, unsigned mode)
{
MDB_env *env = txn->mt_env;
static const char *const names[] = MDB_END_NAMES;
if (unlikely(txn->mt_env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
/* Export or close DBI handles opened in this txn */
mdb_dbis_update(txn, mode & MDB_END_UPDATE);
@ -3135,6 +3154,8 @@ mdb_txn_end(MDB_txn *txn, unsigned mode)
txn->mt_signature = 0;
free(txn);
}
return MDB_SUCCESS;
}
int
@ -3150,8 +3171,7 @@ mdb_txn_reset(MDB_txn *txn)
if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY)))
return EINVAL;
mdb_txn_end(txn, MDB_END_RESET);
return MDB_SUCCESS;
return mdb_txn_end(txn, MDB_END_RESET);
}
int
@ -3166,8 +3186,7 @@ mdb_txn_abort(MDB_txn *txn)
if (txn->mt_child)
mdb_txn_abort(txn->mt_child);
mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE);
return MDB_SUCCESS;
return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE);
}
static int
@ -3645,6 +3664,11 @@ mdb_txn_commit(MDB_txn *txn)
if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
return MDB_VERSION_MISMATCH;
if (unlikely(txn->mt_env->me_pid != getpid())) {
txn->mt_env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
/* mdb_txn_end() mode for a commit which writes nothing */
end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE;
@ -3782,7 +3806,7 @@ mdb_txn_commit(MDB_txn *txn)
}
/* Append our loose page list to parent's */
for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(lp))
for (lp = &parent->mt_loose_pgs; *lp; lp = &NEXT_LOOSE_PAGE(*lp))
;
*lp = txn->mt_loose_pgs;
parent->mt_loose_count += txn->mt_loose_count;
@ -3860,8 +3884,7 @@ mdb_txn_commit(MDB_txn *txn)
end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;
done:
mdb_txn_end(txn, end_mode);
return MDB_SUCCESS;
return mdb_txn_end(txn, end_mode);
fail:
mdb_txn_abort(txn);
@ -4050,7 +4073,8 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
if (unlikely(prev_mapsize != pending->mm_mapsize)) {
/* LY: It is no reason to use fdatasync() here, even in case
* no such bug in a kernel. Because "no-bug" mean that a kernel
* internally do nearly the same.
* internally do nearly the same, e.g. fdatasync() == fsync()
* when no-kernel-bug and file size was changed.
*
* So, this code is always safe and without appreciable
* performance degradation.
@ -4200,14 +4224,16 @@ mdb_env_map(MDB_env *env, void *addr)
return errno;
}
if (flags & MDB_NORDAHEAD) {
unsigned madvise_flags = MADV_DONTFORK;
if (flags & MDB_NORDAHEAD)
/* Turn off readahead. It's harmful when the DB is larger than RAM. */
#ifdef MADV_RANDOM
madvise(env->me_map, env->me_mapsize, MADV_RANDOM);
#elif defined(POSIX_MADV_RANDOM)
posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* MADV_RANDOM & POSIX_MADV_RANDOM */
}
madvise_flags |= MADV_RANDOM;
if (madvise(env->me_map, env->me_mapsize, madvise_flags))
return errno;
#ifdef MADV_DONTDUMP
madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP);
#endif
/* Can happen because the address argument to mmap() is just a
* hint. mmap() can pick another, e.g. if the range is in use.
@ -4422,17 +4448,18 @@ static pthread_mutex_t mdb_rthc_lock = PTHREAD_MUTEX_INITIALIZER;
/* LY: TODO: Yet another problem is here - segfault in case if a DSO will
* be unloaded before a thread would been finished. */
static void
mdb_env_reader_dest(void *ptr)
mdb_env_reader_destr(void *ptr)
{
struct MDB_rthc* rthc = ptr;
MDB_reader *reader;
if (! rthc)
/* LY: paranoia */
return;
mdb_ensure(NULL, pthread_mutex_lock(&mdb_rthc_lock) == 0);
/* LY: Here may be a race with mdb_env_close(),
* see https://github.com/ReOpen/ReOpenLDAP/issues/48
*/
reader = rthc->rc_reader;
if (reader) {
if (reader && reader->mr_pid == getpid()) {
mdb_ensure(NULL, reader->mr_rthc == rthc);
rthc->rc_reader = NULL;
reader->mr_rthc = NULL;
@ -4605,7 +4632,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
return MDB_SUCCESS;
}
goto fail_errno;
return rc;
}
/* Lose record locks when exec*() */
@ -4613,22 +4640,22 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
fcntl(env->me_lfd, F_SETFD, fdflags);
if (!(env->me_flags & MDB_NOTLS)) {
rc = pthread_key_create(&env->me_txkey, mdb_env_reader_dest);
rc = pthread_key_create(&env->me_txkey, mdb_env_reader_destr);
if (rc)
goto fail;
return rc;
env->me_flags |= MDB_ENV_TXKEY;
}
/* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it.
*/
if ((rc = mdb_env_excl_lock(env, excl))) goto fail;
if ((rc = mdb_env_excl_lock(env, excl))) return rc;
size = lseek(env->me_lfd, 0, SEEK_END);
if (size == -1) goto fail_errno;
if (size == -1) return errno;
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
if (size < rsize && *excl > 0) {
if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno;
if (ftruncate(env->me_lfd, rsize) != 0) return errno;
} else {
rsize = size;
size = rsize - sizeof(MDB_txninfo);
@ -4637,9 +4664,16 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0);
if (m == MAP_FAILED)
goto fail_errno;
return errno;
env->me_txns = m;
if (madvise(env->me_txns, rsize, MADV_DONTFORK | MADV_WILLNEED))
return errno;
#ifdef MADV_DODUMP
madvise(env->me_txns, rsize, MADV_DODUMP);
#endif
if (*excl > 0) {
pthread_mutexattr_t mattr;
@ -4650,7 +4684,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
#endif /* MDB_USE_ROBUST */
|| (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr))
|| (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
goto fail;
return rc;
pthread_mutexattr_destroy(&mattr);
env->me_txns->mti_magic = MDB_MAGIC;
@ -4660,27 +4694,19 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
} else {
if (env->me_txns->mti_magic != MDB_MAGIC) {
mdb_debug("lock region has invalid magic");
rc = MDB_INVALID;
goto fail;
return MDB_INVALID;
}
if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
mdb_debug("lock region has format+version 0x%x, expected 0x%x",
env->me_txns->mti_format, MDB_LOCK_FORMAT);
rc = MDB_VERSION_MISMATCH;
goto fail;
return MDB_VERSION_MISMATCH;
}
rc = errno;
if (rc && rc != EACCES && rc != EAGAIN) {
goto fail;
}
if (rc && rc != EACCES && rc != EAGAIN)
return rc;
}
return MDB_SUCCESS;
fail_errno:
rc = errno;
fail:
return rc;
}
/** The name of the lock file in the DB environment */
@ -4903,20 +4929,25 @@ mdb_env_close0(MDB_env *env)
* data owned by this process (me_close_readers and
* our readers), and clear each reader atomically.
*/
mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0);
for (i = env->me_close_readers; --i >= 0; ) {
MDB_reader *reader = &env->me_txns->mti_readers[i];
if (reader->mr_pid == pid) {
mdb_ensure(env, reader->mr_rthc->rc_reader == reader);
reader->mr_rthc->rc_reader = NULL;
reader->mr_rthc = NULL;
mdb_compiler_barrier();
reader->mr_pid = 0;
if (pid == getpid()) {
mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0);
for (i = env->me_close_readers; --i >= 0; ) {
MDB_reader *reader = &env->me_txns->mti_readers[i];
if (reader->mr_pid == pid) {
struct MDB_rthc *rthc = reader->mr_rthc;
if (rthc) {
mdb_ensure(env, rthc->rc_reader == reader);
rthc->rc_reader = NULL;
reader->mr_rthc = NULL;
free(rthc);
}
reader->mr_pid = 0;
}
}
mdb_coherent_barrier();
mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0);
}
mdb_coherent_barrier();
mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0);
munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
env->me_txns = NULL;
@ -5533,6 +5564,7 @@ mdb_ovpage_free(MDB_cursor *mc, MDB_page *mp)
return MDB_CORRUPTED;
}
}
txn->mt_dirty_room++;
if (!(env->me_flags & MDB_WRITEMAP))
mdb_dpage_free(env, mp);
release:
@ -6502,16 +6534,18 @@ fix_parent:
* update branch key if there is a parent page
*/
if (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
unsigned short top = mc->mc_top;
unsigned short dtop = 1;
mc->mc_top--;
/* slot 0 is always an empty key, find real slot */
while (mc->mc_top && !mc->mc_ki[mc->mc_top])
while (mc->mc_top && !mc->mc_ki[mc->mc_top]) {
mc->mc_top--;
dtop++;
}
if (mc->mc_ki[mc->mc_top])
rc2 = mdb_update_key(mc, key);
else
rc2 = MDB_SUCCESS;
mc->mc_top = top;
mc->mc_top += dtop;
if (rc2)
return rc2;
}
@ -6687,6 +6721,7 @@ current:
return ENOMEM;
id2.mid = pg;
id2.mptr = np;
/* Note - this page is already counted in parent's dirty_room */
rc2 = mdb_mid2l_insert(mc->mc_txn->mt_u.dirty_list, &id2);
mdb_cassert(mc, rc2 == 0);
if (!(flags & MDB_RESERVE)) {
@ -6800,6 +6835,7 @@ put_sub:
MDB_xcursor *mx = mc->mc_xcursor;
unsigned i = mc->mc_top;
MDB_page *mp = mc->mc_pg[i];
int nkeys = NUMKEYS(mp);
for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) {
if (m2 == mc || m2->mc_snum < mc->mc_snum) continue;
@ -6807,9 +6843,9 @@ put_sub:
if (m2->mc_pg[i] == mp) {
if (m2->mc_ki[i] == mc->mc_ki[i]) {
mdb_xcursor_init2(m2, mx, new_dupdata);
} else if (!insert_key) {
} else if (!insert_key && m2->mc_ki[i] < nkeys) {
MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]);
if (!(n2->mn_flags & F_SUBDATA))
if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA)
m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2);
}
}
@ -7643,10 +7679,26 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key)
static void
mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst);
/** Track a temporary cursor */
#define CURSOR_TMP_TRACK(mc, mn, dummy, tracked) \
if (mc->mc_flags & C_SUB) { \
dummy.mc_flags = C_INITIALIZED; \
dummy.mc_xcursor = (MDB_xcursor *)&mn; \
tracked = &dummy; \
} else { \
tracked = &mn; \
} \
tracked->mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; \
mc->mc_txn->mt_cursors[mc->mc_dbi] = tracked
/** Stop tracking a temporary cursor */
#define CURSOR_TMP_UNTRACK(mc, tracked) \
mc->mc_txn->mt_cursors[mc->mc_dbi] = tracked->mc_next
/** Move a node from csrc to cdst.
*/
static int
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft)
{
MDB_node *srcnode;
MDB_val key, data;
@ -7748,13 +7800,15 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
mps = csrc->mc_pg[csrc->mc_top];
/* If we're adding on the left, bump others up */
if (!cdst->mc_ki[csrc->mc_top]) {
if (fromleft) {
mpd = cdst->mc_pg[csrc->mc_top];
for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
if (csrc->mc_flags & C_SUB)
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
continue;
if (m3 != cdst &&
m3->mc_pg[csrc->mc_top] == mpd &&
m3->mc_ki[csrc->mc_top] >= cdst->mc_ki[csrc->mc_top]) {
@ -7777,6 +7831,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
else
m3 = m2;
if (m3 == csrc) continue;
if (!(m3->mc_flags & C_INITIALIZED) || m3->mc_top < csrc->mc_top)
continue;
if (m3->mc_pg[csrc->mc_top] == mps) {
if (!m3->mc_ki[csrc->mc_top]) {
m3->mc_pg[csrc->mc_top] = cdst->mc_pg[cdst->mc_top];
@ -7794,6 +7850,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
*/
if (csrc->mc_ki[csrc->mc_top] == 0) {
if (csrc->mc_ki[csrc->mc_top-1] != 0) {
MDB_cursor dummy, *tracked;
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size);
} else {
@ -7806,7 +7863,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
mdb_cursor_copy(csrc, &mn);
mn.mc_snum--;
mn.mc_top--;
if (unlikely((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS))
/* We want mdb_rebalance to find mn when doing fixups */
CURSOR_TMP_TRACK(csrc, mn, dummy, tracked);
rc = mdb_update_key(&mn, &key);
CURSOR_TMP_UNTRACK(csrc, tracked);
if (unlikely(rc != MDB_SUCCESS))
return rc;
}
if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) {
@ -7822,6 +7883,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
if (cdst->mc_ki[cdst->mc_top] == 0) {
if (cdst->mc_ki[cdst->mc_top-1] != 0) {
MDB_cursor dummy, *tracked;
if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) {
key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size);
} else {
@ -7834,7 +7896,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst)
mdb_cursor_copy(cdst, &mn);
mn.mc_snum--;
mn.mc_top--;
if (unlikely((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS))
/* We want mdb_rebalance to find mn when doing fixups */
CURSOR_TMP_TRACK(cdst, mn, dummy, tracked);
rc = mdb_update_key(&mn, &key);
CURSOR_TMP_UNTRACK(cdst, tracked);
if (unlikely(rc != MDB_SUCCESS))
return rc;
}
if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) {
@ -7974,6 +8040,9 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst)
m3->mc_pg[top] = pdst;
m3->mc_ki[top] += nkeys;
m3->mc_ki[top-1] = cdst->mc_ki[top-1];
} else if (m3->mc_pg[top-1] == csrc->mc_pg[top-1] &&
m3->mc_ki[top-1] > csrc->mc_ki[top-1]) {
m3->mc_ki[top-1]--;
}
}
}
@ -8023,7 +8092,7 @@ static int
mdb_rebalance(MDB_cursor *mc)
{
MDB_node *node;
int rc;
int rc, fromleft;
unsigned ptop, minkeys, thresh;
MDB_cursor mn;
indx_t oldki;
@ -8074,7 +8143,8 @@ mdb_rebalance(MDB_cursor *mc)
m3 = &m2->mc_xcursor->mx_cursor;
else
m3 = m2;
if (m3->mc_snum < mc->mc_snum) continue;
if (!(m3->mc_flags & C_INITIALIZED) || (m3->mc_snum < mc->mc_snum))
continue;
if (m3->mc_pg[0] == mp) {
m3->mc_snum = 0;
m3->mc_top = 0;
@ -8110,6 +8180,8 @@ mdb_rebalance(MDB_cursor *mc)
else
m3 = m2;
if (m3 == mc) continue;
if (!(m3->mc_flags & C_INITIALIZED))
continue;
if (m3->mc_pg[0] == mp) {
for (i=0; i<mc->mc_db->md_depth; i++) {
m3->mc_pg[i] = m3->mc_pg[i+1];
@ -8153,6 +8225,7 @@ mdb_rebalance(MDB_cursor *mc)
return rc;
mn.mc_ki[mn.mc_top] = 0;
mc->mc_ki[mc->mc_top] = NUMKEYS(mc->mc_pg[mc->mc_top]);
fromleft = 0;
} else {
/* There is at least one neighbor to the left.
*/
@ -8164,6 +8237,7 @@ mdb_rebalance(MDB_cursor *mc)
return rc;
mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1;
mc->mc_ki[mc->mc_top] = 0;
fromleft = 1;
}
mdb_debug("found neighbor page %zu (%u keys, %.1f%% full)",
@ -8175,32 +8249,22 @@ mdb_rebalance(MDB_cursor *mc)
* (A branch page must never have less than 2 keys.)
*/
if (PAGEFILL(mc->mc_txn->mt_env, mn.mc_pg[mn.mc_top]) >= thresh && NUMKEYS(mn.mc_pg[mn.mc_top]) > minkeys) {
rc = mdb_node_move(&mn, mc);
if (!mc->mc_ki[mc->mc_top]) {
rc = mdb_node_move(&mn, mc, fromleft);
if (fromleft) {
/* if we inserted on left, bump position up */
oldki++;
}
} else {
if (mc->mc_ki[ptop] == 0) {
if (!fromleft) {
rc = mdb_page_merge(&mn, mc);
} else {
MDB_cursor dummy;
MDB_cursor dummy, *tracked;
oldki += NUMKEYS(mn.mc_pg[mn.mc_top]);
mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1;
/* We want mdb_rebalance to find mn when doing fixups */
if (mc->mc_flags & C_SUB) {
dummy.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi];
mc->mc_txn->mt_cursors[mc->mc_dbi] = &dummy;
dummy.mc_xcursor = (MDB_xcursor *)&mn;
} else {
mn.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi];
mc->mc_txn->mt_cursors[mc->mc_dbi] = &mn;
}
CURSOR_TMP_TRACK(mc, mn, dummy, tracked);
rc = mdb_page_merge(mc, &mn);
if (mc->mc_flags & C_SUB)
mc->mc_txn->mt_cursors[mc->mc_dbi] = dummy.mc_next;
else
mc->mc_txn->mt_cursors[mc->mc_dbi] = mn.mc_next;
CURSOR_TMP_UNTRACK(mc, tracked);
mdb_cursor_copy(&mn, mc);
}
mc->mc_flags &= ~C_EOF;
@ -8238,7 +8302,7 @@ mdb_cursor_del0(MDB_cursor *mc)
if (m3->mc_ki[mc->mc_top] > ki)
m3->mc_ki[mc->mc_top]--;
else if (mc->mc_db->md_flags & MDB_DUPSORT)
m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF;
m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED;
}
}
}
@ -8391,12 +8455,19 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
rp->mp_ksize = mp->mp_ksize;
mdb_debug("new right sibling: page %zu", rp->mp_pgno);
if (mc->mc_snum < 2) {
/* Usually when splitting the root page, the cursor
* height is 1. But when called from mdb_update_key,
* the cursor height may be greater because it walks
* up the stack while finding the branch slot to update.
*/
if (mc->mc_top < 1) {
if ((rc = mdb_page_new(mc, P_BRANCH, 1, &pp)))
goto done;
/* shift current top to make room for new parent */
mc->mc_pg[1] = mc->mc_pg[0];
mc->mc_ki[1] = mc->mc_ki[0];
for (i=mc->mc_snum; i>0; i--) {
mc->mc_pg[i] = mc->mc_pg[i-1];
mc->mc_ki[i] = mc->mc_ki[i-1];
}
mc->mc_pg[0] = pp;
mc->mc_ki[0] = 0;
mc->mc_db->md_root = pp->mp_pgno;
@ -8412,15 +8483,14 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
mc->mc_db->md_depth--;
goto done;
}
mc->mc_snum = 2;
mc->mc_top = 1;
mc->mc_snum++;
mc->mc_top++;
ptop = 0;
} else {
ptop = mc->mc_top-1;
mdb_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno);
}
mc->mc_flags |= C_SPLITTING;
mdb_cursor_copy(mc, &mn);
mn.mc_pg[mn.mc_top] = rp;
mn.mc_ki[ptop] = mc->mc_ki[ptop]+1;
@ -8471,8 +8541,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
rp->mp_lower += sizeof(indx_t);
rp->mp_upper -= ksize - sizeof(indx_t);
mc->mc_ki[mc->mc_top] = x;
mc->mc_pg[mc->mc_top] = rp;
mc->mc_ki[ptop]++;
}
} else {
int psize, nsize, k;
@ -8565,21 +8633,20 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
/* Copy separator key to the parent.
*/
if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) {
int snum = mc->mc_snum;
MDB_cursor dummy, *tracked;
mn.mc_snum--;
mn.mc_top--;
did_split = 1;
/* We want other splits to find mn when doing fixups */
CURSOR_TMP_TRACK(mc, mn, dummy, tracked);
rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0);
if (unlikely(rc))
CURSOR_TMP_UNTRACK(mc, tracked);
if (unlikely(rc != MDB_SUCCESS))
goto done;
/* root split? */
if (mn.mc_snum == mc->mc_snum) {
mc->mc_pg[mc->mc_snum] = mc->mc_pg[mc->mc_top];
mc->mc_ki[mc->mc_snum] = mc->mc_ki[mc->mc_top];
mc->mc_pg[mc->mc_top] = mc->mc_pg[ptop];
mc->mc_ki[mc->mc_top] = mc->mc_ki[ptop];
mc->mc_snum++;
mc->mc_top++;
if (mc->mc_snum > snum) {
ptop++;
}
/* Right page might now have changed parent.
@ -8605,10 +8672,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0);
mn.mc_top++;
}
mc->mc_flags ^= C_SPLITTING;
if (unlikely(rc != MDB_SUCCESS)) {
if (unlikely(rc != MDB_SUCCESS))
goto done;
}
if (nflags & MDB_APPEND) {
mc->mc_pg[mc->mc_top] = rp;
mc->mc_ki[mc->mc_top] = 0;
@ -8675,11 +8740,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
/* reset back to original page */
if (newindx < split_indx) {
mc->mc_pg[mc->mc_top] = mp;
if (nflags & MDB_RESERVE) {
node = NODEPTR(mp, mc->mc_ki[mc->mc_top]);
if (!(node->mn_flags & F_BIGDATA))
newdata->mv_data = NODEDATA(node);
}
} else {
mc->mc_pg[mc->mc_top] = rp;
mc->mc_ki[ptop]++;
@ -8693,13 +8753,32 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
}
}
}
if (nflags & MDB_RESERVE) {
node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]);
if (!(node->mn_flags & F_BIGDATA))
newdata->mv_data = NODEDATA(node);
}
} else {
if (newindx >= split_indx) {
mc->mc_pg[mc->mc_top] = rp;
mc->mc_ki[ptop]++;
/* Make sure mc_ki is still valid.
*/
if (mn.mc_pg[ptop] != mc->mc_pg[ptop] &&
mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) {
for (i=0; i<=ptop; i++) {
mc->mc_pg[i] = mn.mc_pg[i];
mc->mc_ki[i] = mn.mc_ki[i];
}
}
}
}
{
/* Adjust other cursors pointing to mp */
MDB_cursor *m2, *m3;
MDB_dbi dbi = mc->mc_dbi;
int fixup = NUMKEYS(mp);
nkeys = NUMKEYS(mp);
for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) {
if (mc->mc_flags & C_SUB)
@ -8710,16 +8789,17 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
continue;
if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED))
continue;
if (m3->mc_flags & C_SPLITTING)
continue;
if (new_root) {
int k;
/* sub cursors may be on different DB */
if (m3->mc_pg[0] != mp)
continue;
/* root split */
for (k=new_root; k>=0; k--) {
m3->mc_ki[k+1] = m3->mc_ki[k];
m3->mc_pg[k+1] = m3->mc_pg[k];
}
if (m3->mc_ki[0] >= split_indx) {
if (m3->mc_ki[0] > nkeys) {
m3->mc_ki[0] = 1;
} else {
m3->mc_ki[0] = 0;
@ -8731,10 +8811,13 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) {
if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE))
m3->mc_ki[mc->mc_top]++;
if (m3->mc_ki[mc->mc_top] >= fixup) {
if (m3->mc_ki[mc->mc_top] >= nkeys) {
m3->mc_pg[mc->mc_top] = rp;
m3->mc_ki[mc->mc_top] -= fixup;
m3->mc_ki[ptop] = mn.mc_ki[ptop];
m3->mc_ki[mc->mc_top] -= nkeys;
for (i=0; i<mc->mc_top; i++) {
m3->mc_ki[i] = mn.mc_ki[i];
m3->mc_pg[i] = mn.mc_pg[i];
}
}
} else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] &&
m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {
@ -9151,7 +9234,9 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd)
return rc;
/* We must start the actual read txn after blocking writers */
mdb_txn_end(txn, MDB_END_RESET_TMP);
rc = mdb_txn_end(txn, MDB_END_RESET_TMP);
if (rc)
return rc;
/* Temporarily block writers until we snapshot the meta pages */
wmutex = MDB_MUTEX(env, w);
@ -9996,6 +10081,11 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
pid_t *pids, pid;
int rc = MDB_SUCCESS, count = 0;
if (unlikely(env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
rdrs = env->me_txns->mti_numreaders;
pids = malloc((rdrs+1) * sizeof(pid_t));
if (!pids)

View File

@ -122,27 +122,42 @@ error(const char* msg, ...) {
}
}
static void pagemap_cleanup(void) {
int i;
for( i = 1; i < MAX_DBI; ++i ) {
if (walk.dbi_names[i]) {
free((void *) walk.dbi_names[i]);
walk.dbi_names[i] = NULL;
}
}
free(walk.pagemap);
walk.pagemap = NULL;
}
static int pagemap_lookup_dbi(const char* dbi) {
static int last;
int i;
if (last > 0 && strcmp(walk.dbi_names[last], dbi) == 0)
return last;
for(last = 1; walk.dbi_names[last] && last < MAX_DBI; ++last)
if (strcmp(walk.dbi_names[last], dbi) == 0)
return last;
for(i = 1; walk.dbi_names[i] && last < MAX_DBI; ++i)
if (strcmp(walk.dbi_names[i], dbi) == 0)
return last = i;
if (last == MAX_DBI)
return last = -1;
if (i == MAX_DBI)
return -1;
walk.dbi_names[last] = strdup(dbi);
walk.dbi_names[i] = strdup(dbi);
if (verbose > 1) {
print(" - found '%s' area\n", dbi);
fflush(NULL);
}
return last;
return last = i;
}
static void problem_add(const char* object, size_t entry_number, const char* msg, const char *extra, ...) {
@ -581,6 +596,8 @@ int main(int argc, char *argv[])
struct timespec timestamp_start, timestamp_finish;
double elapsed;
atexit(pagemap_cleanup);
if (clock_gettime(CLOCK_MONOTONIC, &timestamp_start)) {
rc = errno;
error("clock_gettime failed, error %d %s\n", rc, mdbx_strerror(rc));
@ -897,7 +914,6 @@ bailout:
mdbx_txn_abort(locktxn);
if (env)
mdbx_env_close(env);
free(walk.pagemap);
fflush(NULL);
if (rc) {
if (rc < 0)

View File

@ -332,7 +332,7 @@ int main(int argc, char *argv[])
putflags = MDB_NOOVERWRITE|MDB_NODUPDATA;
break;
case 'T':
mode |= NOHDR;
mode |= NOHDR | PRINT;
break;
default:
usage();
@ -399,20 +399,22 @@ int main(int argc, char *argv[])
while(1) {
rc = readline(&key, &kbuf);
if (rc == EOF)
if (rc) /* rc == EOF */
break;
if (rc)
goto txn_abort;
rc = readline(&data, &dbuf);
if (rc)
if (rc) {
fprintf(stderr, "%s: line %" Z "d: failed to read key value\n", prog, lineno);
goto txn_abort;
}
rc = mdb_cursor_put(mc, &key, &data, putflags);
if (rc == MDB_KEYEXIST && putflags)
continue;
if (rc)
if (rc) {
fprintf(stderr, "mdb_cursor_put failed, error %d %s\n", rc, mdb_strerror(rc));
goto txn_abort;
}
batch++;
if (batch == 100) {
rc = mdb_txn_commit(txn);

View File

@ -159,6 +159,7 @@ int main(int argc,char * argv[])
mdb_dbi_close(env, dbi);
#endif
mdb_env_close(env);
free(sval);
return 0;
}