mdbx: Merge branch 'devel'.

This commit is contained in:
Leo Yuriev 2016-08-18 17:10:32 +03:00
commit 8e0f412711
3 changed files with 69 additions and 39 deletions

View File

@ -198,13 +198,15 @@ bench: bench-lmdb.txt bench-mdbx.txt
endif
ci-rule = @( CC=$$(which $1); if [ -n "$$CC" ]; then \
ci-rule = ( CC=$$(which $1); if [ -n "$$CC" ]; then \
CC=$$(readlink -f $$CC); echo -n "probe by $2 ($$CC): " && \
$(MAKE) clean >$1.log 2>$1.err && $(MAKE) all check 1>$1.log 2>$1.err && echo "OK" \
$(MAKE) XCFLAGS="-UNDEBUG -DMDB_DEBUG=2" clean >$1.log 2>$1.err && $(MAKE) all check 1>$1.log 2>$1.err && echo "OK" \
|| ( echo "FAILED"; cat $1.err >&2; exit 1 ); \
else echo "no $2 ($1) for probe"; fi; )
ci:
$(call ci-rule,cc,default C compiler)
$(call ci-rule,gcc,GCC)
$(call ci-rule,clang,clang LLVM)
$(call ci-rule,icc,Intel C)
@if [ "$(CC)" != "gcc" ]; then \
$(call ci-rule,$(CC),default C compiler); \
fi
@$(call ci-rule,gcc,GCC)
@$(call ci-rule,clang,clang LLVM)
@$(call ci-rule,icc,Intel C)

92
mdb.c
View File

@ -532,9 +532,23 @@ typedef struct MDB_txninfo {
+ (1 /* MDB_PIDLOCK */ << 16)))
/** @} */
/** Common header for all page types.
* Overflow records occupy a number of contiguous pages with no
* headers on any page after the first.
/** Common header for all page types. The page type depends on #mp_flags.
*
* #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
* sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
* omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
*
* #P_OVERFLOW records occupy one or more contiguous pages where only the
* first has a page header. They hold the real data of #F_BIGDATA nodes.
*
* #P_SUBP sub-pages are small leaf "pages" with duplicate data.
* A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
* (Duplicate data can also go in sub-databases, which use normal pages.)
*
* #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
*
* Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
* in the snapshot: Either used by a database or listed in a freeDB record.
*/
typedef struct MDB_page {
#define mp_pgno mp_p.p_pgno
@ -543,7 +557,7 @@ typedef struct MDB_page {
pgno_t p_pgno; /**< page number */
struct MDB_page *p_next; /**< for in-memory list of freed pages */
} mp_p;
uint16_t mp_ksize;
uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */
/** @defgroup mdb_page Page Flags
* @ingroup internal
* Flags for the page headers.
@ -610,7 +624,9 @@ typedef struct MDB_page {
/** The number of overflow pages needed to store the given size. */
#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
/** Link in #MDB_txn.%mt_loose_pgs list */
/** Link in #MDB_txn.%mt_loose_pgs list.
* Kept outside the page header, which is needed when reusing the page.
*/
#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
/** Header for a single key/data pair within a page.
@ -737,9 +753,9 @@ typedef struct MDB_db {
pgno_t md_root; /**< the root page of this tree */
} MDB_db;
/** mdb_dbi_open flags */
#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
/** #mdb_dbi_open() flags */
#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
@ -770,7 +786,10 @@ typedef struct MDB_meta {
#define mm_psize mm_dbs[FREE_DBI].md_xsize
/** Any persistent environment flags. @ref mdb_env */
#define mm_flags mm_dbs[FREE_DBI].md_flags
pgno_t mm_last_pg; /**< last used page in file */
/** Last used page in the datafile.
* Actually the file may be shorter if the freeDB lists the final pages.
*/
pgno_t mm_last_pg;
volatile txnid_t mm_txnid; /**< txnid that committed this page */
#define MDB_DATASIGN_NONE 0
#define MDB_DATASIGN_WEAK 1
@ -829,7 +848,7 @@ struct MDB_txn {
* in this transaction, linked through #NEXT_LOOSE_PAGE(page).
*/
MDB_page *mt_loose_pgs;
/* #Number of loose pages (#mt_loose_pgs) */
/** Number of loose pages (#mt_loose_pgs) */
int mt_loose_count;
/** The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
@ -1232,7 +1251,11 @@ static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest);
(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA)))
#else
# define mdb_debug_enabled(type) (0)
# ifndef NDEBUG
# define mdb_debug_enabled(type) (1)
# else
# define mdb_debug_enabled(type) (0)
# endif
# define mdb_audit_enabled() (0)
# define mdb_assert_enabled() (0)
# define mdb_assert_fail(env, msg, func, line) \
@ -1386,7 +1409,7 @@ mdb_page_list(MDB_page *mp)
for (i=0; i<nkeys; i++) {
if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
key.mv_size = nsize = mp->mp_ksize;
key.mv_size = nsize = mp->mp_leaf2_ksize;
key.mv_data = LEAF2KEY(mp, i, nsize);
total += nsize;
mdb_print("key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
@ -2347,7 +2370,7 @@ done:
VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num);
np->mp_pgno = pgno;
np->mp_ksize = 0;
np->mp_leaf2_ksize = 0;
np->mp_flags = 0;
np->mp_pages = num;
mdb_page_dirty(txn, np);
@ -4084,18 +4107,12 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
int rc;
MDB_meta* head = mdb_meta_head_w(env);
size_t prev_mapsize = head->mm_mapsize;
volatile MDB_meta* target = META_IS_WEAK(head) ? head : mdb_env_meta_flipflop(env, head);
off_t offset = (char*) target - env->me_map;
size_t used_size = env->me_psize * (pending->mm_last_pg + 1);
mdb_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env));
mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0);
mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0
|| env->me_mapsize != prev_mapsize);
mdb_assert(env, pending->mm_txnid > head->mm_txnid || META_IS_WEAK(head));
mdb_assert(env, pending->mm_txnid > target->mm_txnid || META_IS_WEAK(target));
MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target);
mdb_assert(env, pending->mm_txnid > stay->mm_txnid);
pending->mm_mapsize = env->me_mapsize;
mdb_assert(env, pending->mm_mapsize >= used_size);
@ -4117,6 +4134,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
if (unlikely(msync(env->me_map, used_size, mode))) {
rc = errno;
/* LY: msync() should never return EINTR */
goto fail;
}
if ((flags & MDB_MAPASYNC) == 0)
@ -4139,7 +4157,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
while(unlikely(flush(env->me_fd) < 0)) {
rc = errno;
if (rc != EINTR)
goto undo;
goto fail;
}
env->me_sync_pending = 0;
}
@ -4150,12 +4168,22 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
pending->mm_datasync_sign = mdb_meta_sign(pending);
} else {
pending->mm_datasync_sign =
(flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC
? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK;
(flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC
? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK;
}
mdb_debug("writing meta %d, root %zu, txn_id %zu, %s",
offset >= env->me_psize, pending->mm_dbs[MAIN_DBI].md_root,
pending->mm_txnid,
volatile MDB_meta* target = (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head))
? head : mdb_env_meta_flipflop(env, head);
off_t offset = (char*) target - env->me_map;
MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target);
mdb_debug("writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, txn_id %zu, %s",
offset >= env->me_psize,
target == head ? "head" : "tail", target->mm_txnid,
META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy",
stay == head ? "head" : "tail", stay->mm_txnid,
META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy",
pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid,
META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" : "Legacy" );
if (env->me_flags & MDB_WRITEMAP) {
@ -6642,7 +6670,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
*/
fp_flags = P_LEAF|P_DIRTY;
fp = env->me_pbuf;
fp->mp_ksize = data->mv_size; /* used if MDB_DUPFIXED */
fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */
fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
olddata.mv_size = PAGEHDRSZ;
goto prep_subDB;
@ -6719,7 +6747,7 @@ more:
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
fp->mp_flags |= P_LEAF2;
fp->mp_ksize = data->mv_size;
fp->mp_leaf2_ksize = data->mv_size;
xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
} else {
xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
@ -6741,7 +6769,7 @@ more:
data->mv_size);
break;
}
offset = fp->mp_ksize;
offset = fp->mp_leaf2_ksize;
if (SIZELEFT(fp) < offset) {
offset *= 4; /* space for 4 more */
break;
@ -6764,7 +6792,7 @@ more:
prep_subDB:
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
fp_flags |= P_LEAF2;
dummy.md_xsize = fp->mp_ksize;
dummy.md_xsize = fp->mp_leaf2_ksize;
dummy.md_flags = MDB_DUPFIXED;
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
dummy.md_flags |= MDB_INTEGERKEY;
@ -6788,11 +6816,11 @@ prep_subDB:
}
if (mp != fp) {
mp->mp_flags = fp_flags | P_DIRTY;
mp->mp_ksize = fp->mp_ksize;
mp->mp_leaf2_ksize = fp->mp_leaf2_ksize;
mp->mp_lower = fp->mp_lower;
mp->mp_upper = fp->mp_upper + offset;
if (fp_flags & P_LEAF2) {
memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_ksize);
memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize);
} else {
memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
olddata.mv_size - fp->mp_upper - PAGEBASE);
@ -7537,7 +7565,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
mx->mx_cursor.mc_ki[0] = 0;
if (mc->mc_db->md_flags & MDB_DUPFIXED) {
mx->mx_db.md_flags = MDB_DUPFIXED;
mx->mx_db.md_xsize = fp->mp_ksize;
mx->mx_db.md_xsize = fp->mp_leaf2_ksize;
if (mc->mc_db->md_flags & MDB_INTEGERDUP)
mx->mx_db.md_flags |= MDB_INTEGERKEY;
}
@ -8621,7 +8649,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
/* Create a right sibling. */
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
return rc;
rp->mp_ksize = mp->mp_ksize;
rp->mp_leaf2_ksize = mp->mp_leaf2_ksize;
mdb_debug("new right sibling: page %zu", rp->mp_pgno);
/* Usually when splitting the root page, the cursor

2
mdbx.c
View File

@ -213,7 +213,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee
if (IS_LEAF2(mp)) {
/* LEAF2 pages have no mp_ptrs[] or node headers */
payload_size += mp->mp_ksize;
payload_size += mp->mp_leaf2_ksize;
continue;
}