mdbx: Merge branch 'devel'.

This commit is contained in:
Leo Yuriev 2016-08-18 17:10:32 +03:00
commit 8e0f412711
3 changed files with 69 additions and 39 deletions

View File

@ -198,13 +198,15 @@ bench: bench-lmdb.txt bench-mdbx.txt
endif endif
ci-rule = @( CC=$$(which $1); if [ -n "$$CC" ]; then \ ci-rule = ( CC=$$(which $1); if [ -n "$$CC" ]; then \
CC=$$(readlink -f $$CC); echo -n "probe by $2 ($$CC): " && \ CC=$$(readlink -f $$CC); echo -n "probe by $2 ($$CC): " && \
$(MAKE) clean >$1.log 2>$1.err && $(MAKE) all check 1>$1.log 2>$1.err && echo "OK" \ $(MAKE) XCFLAGS="-UNDEBUG -DMDB_DEBUG=2" clean >$1.log 2>$1.err && $(MAKE) all check 1>$1.log 2>$1.err && echo "OK" \
|| ( echo "FAILED"; cat $1.err >&2; exit 1 ); \ || ( echo "FAILED"; cat $1.err >&2; exit 1 ); \
else echo "no $2 ($1) for probe"; fi; ) else echo "no $2 ($1) for probe"; fi; )
ci: ci:
$(call ci-rule,cc,default C compiler) @if [ "$(CC)" != "gcc" ]; then \
$(call ci-rule,gcc,GCC) $(call ci-rule,$(CC),default C compiler); \
$(call ci-rule,clang,clang LLVM) fi
$(call ci-rule,icc,Intel C) @$(call ci-rule,gcc,GCC)
@$(call ci-rule,clang,clang LLVM)
@$(call ci-rule,icc,Intel C)

86
mdb.c
View File

@ -532,9 +532,23 @@ typedef struct MDB_txninfo {
+ (1 /* MDB_PIDLOCK */ << 16))) + (1 /* MDB_PIDLOCK */ << 16)))
/** @} */ /** @} */
/** Common header for all page types. /** Common header for all page types. The page type depends on #mp_flags.
* Overflow records occupy a number of contiguous pages with no *
* headers on any page after the first. * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with
* sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages
* omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header.
*
* #P_OVERFLOW records occupy one or more contiguous pages where only the
* first has a page header. They hold the real data of #F_BIGDATA nodes.
*
* #P_SUBP sub-pages are small leaf "pages" with duplicate data.
* A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page.
* (Duplicate data can also go in sub-databases, which use normal pages.)
*
* #P_META pages contain #MDB_meta, the start point of an LMDB snapshot.
*
* Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once
* in the snapshot: Either used by a database or listed in a freeDB record.
*/ */
typedef struct MDB_page { typedef struct MDB_page {
#define mp_pgno mp_p.p_pgno #define mp_pgno mp_p.p_pgno
@ -543,7 +557,7 @@ typedef struct MDB_page {
pgno_t p_pgno; /**< page number */ pgno_t p_pgno; /**< page number */
struct MDB_page *p_next; /**< for in-memory list of freed pages */ struct MDB_page *p_next; /**< for in-memory list of freed pages */
} mp_p; } mp_p;
uint16_t mp_ksize; uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */
/** @defgroup mdb_page Page Flags /** @defgroup mdb_page Page Flags
* @ingroup internal * @ingroup internal
* Flags for the page headers. * Flags for the page headers.
@ -610,7 +624,9 @@ typedef struct MDB_page {
/** The number of overflow pages needed to store the given size. */ /** The number of overflow pages needed to store the given size. */
#define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1)
/** Link in #MDB_txn.%mt_loose_pgs list */ /** Link in #MDB_txn.%mt_loose_pgs list.
* Kept outside the page header, which is needed when reusing the page.
*/
#define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2))
/** Header for a single key/data pair within a page. /** Header for a single key/data pair within a page.
@ -737,9 +753,9 @@ typedef struct MDB_db {
pgno_t md_root; /**< the root page of this tree */ pgno_t md_root; /**< the root page of this tree */
} MDB_db; } MDB_db;
/** mdb_dbi_open flags */
#define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */
#define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID))
/** #mdb_dbi_open() flags */
#define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\
MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE)
@ -770,7 +786,10 @@ typedef struct MDB_meta {
#define mm_psize mm_dbs[FREE_DBI].md_xsize #define mm_psize mm_dbs[FREE_DBI].md_xsize
/** Any persistent environment flags. @ref mdb_env */ /** Any persistent environment flags. @ref mdb_env */
#define mm_flags mm_dbs[FREE_DBI].md_flags #define mm_flags mm_dbs[FREE_DBI].md_flags
pgno_t mm_last_pg; /**< last used page in file */ /** Last used page in the datafile.
* Actually the file may be shorter if the freeDB lists the final pages.
*/
pgno_t mm_last_pg;
volatile txnid_t mm_txnid; /**< txnid that committed this page */ volatile txnid_t mm_txnid; /**< txnid that committed this page */
#define MDB_DATASIGN_NONE 0 #define MDB_DATASIGN_NONE 0
#define MDB_DATASIGN_WEAK 1 #define MDB_DATASIGN_WEAK 1
@ -829,7 +848,7 @@ struct MDB_txn {
* in this transaction, linked through #NEXT_LOOSE_PAGE(page). * in this transaction, linked through #NEXT_LOOSE_PAGE(page).
*/ */
MDB_page *mt_loose_pgs; MDB_page *mt_loose_pgs;
/* #Number of loose pages (#mt_loose_pgs) */ /** Number of loose pages (#mt_loose_pgs) */
int mt_loose_count; int mt_loose_count;
/** The sorted list of dirty pages we temporarily wrote to disk /** The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are * because the dirty list was full. page numbers in here are
@ -1231,8 +1250,12 @@ static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest);
unlikely(mdb_runtime_flags & \ unlikely(mdb_runtime_flags & \
(type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) (type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA)))
#else
# ifndef NDEBUG
# define mdb_debug_enabled(type) (1)
# else # else
# define mdb_debug_enabled(type) (0) # define mdb_debug_enabled(type) (0)
# endif
# define mdb_audit_enabled() (0) # define mdb_audit_enabled() (0)
# define mdb_assert_enabled() (0) # define mdb_assert_enabled() (0)
# define mdb_assert_fail(env, msg, func, line) \ # define mdb_assert_fail(env, msg, func, line) \
@ -1386,7 +1409,7 @@ mdb_page_list(MDB_page *mp)
for (i=0; i<nkeys; i++) { for (i=0; i<nkeys; i++) {
if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */
key.mv_size = nsize = mp->mp_ksize; key.mv_size = nsize = mp->mp_leaf2_ksize;
key.mv_data = LEAF2KEY(mp, i, nsize); key.mv_data = LEAF2KEY(mp, i, nsize);
total += nsize; total += nsize;
mdb_print("key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); mdb_print("key %d: nsize %d, %s\n", i, nsize, DKEY(&key));
@ -2347,7 +2370,7 @@ done:
VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num);
np->mp_pgno = pgno; np->mp_pgno = pgno;
np->mp_ksize = 0; np->mp_leaf2_ksize = 0;
np->mp_flags = 0; np->mp_flags = 0;
np->mp_pages = num; np->mp_pages = num;
mdb_page_dirty(txn, np); mdb_page_dirty(txn, np);
@ -4084,18 +4107,12 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
int rc; int rc;
MDB_meta* head = mdb_meta_head_w(env); MDB_meta* head = mdb_meta_head_w(env);
size_t prev_mapsize = head->mm_mapsize; size_t prev_mapsize = head->mm_mapsize;
volatile MDB_meta* target = META_IS_WEAK(head) ? head : mdb_env_meta_flipflop(env, head);
off_t offset = (char*) target - env->me_map;
size_t used_size = env->me_psize * (pending->mm_last_pg + 1); size_t used_size = env->me_psize * (pending->mm_last_pg + 1);
mdb_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env));
mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0);
mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0
|| env->me_mapsize != prev_mapsize); || env->me_mapsize != prev_mapsize);
mdb_assert(env, pending->mm_txnid > head->mm_txnid || META_IS_WEAK(head));
mdb_assert(env, pending->mm_txnid > target->mm_txnid || META_IS_WEAK(target));
MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target);
mdb_assert(env, pending->mm_txnid > stay->mm_txnid);
pending->mm_mapsize = env->me_mapsize; pending->mm_mapsize = env->me_mapsize;
mdb_assert(env, pending->mm_mapsize >= used_size); mdb_assert(env, pending->mm_mapsize >= used_size);
@ -4117,6 +4134,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC;
if (unlikely(msync(env->me_map, used_size, mode))) { if (unlikely(msync(env->me_map, used_size, mode))) {
rc = errno; rc = errno;
/* LY: msync() should never return EINTR */
goto fail; goto fail;
} }
if ((flags & MDB_MAPASYNC) == 0) if ((flags & MDB_MAPASYNC) == 0)
@ -4139,7 +4157,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
while(unlikely(flush(env->me_fd) < 0)) { while(unlikely(flush(env->me_fd) < 0)) {
rc = errno; rc = errno;
if (rc != EINTR) if (rc != EINTR)
goto undo; goto fail;
} }
env->me_sync_pending = 0; env->me_sync_pending = 0;
} }
@ -4153,9 +4171,19 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending)
(flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC
? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK; ? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK;
} }
mdb_debug("writing meta %d, root %zu, txn_id %zu, %s",
offset >= env->me_psize, pending->mm_dbs[MAIN_DBI].md_root, volatile MDB_meta* target = (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head))
pending->mm_txnid, ? head : mdb_env_meta_flipflop(env, head);
off_t offset = (char*) target - env->me_map;
MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target);
mdb_debug("writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, txn_id %zu, %s",
offset >= env->me_psize,
target == head ? "head" : "tail", target->mm_txnid,
META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy",
stay == head ? "head" : "tail", stay->mm_txnid,
META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy",
pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid,
META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" : "Legacy" ); META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" : "Legacy" );
if (env->me_flags & MDB_WRITEMAP) { if (env->me_flags & MDB_WRITEMAP) {
@ -6642,7 +6670,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data,
*/ */
fp_flags = P_LEAF|P_DIRTY; fp_flags = P_LEAF|P_DIRTY;
fp = env->me_pbuf; fp = env->me_pbuf;
fp->mp_ksize = data->mv_size; /* used if MDB_DUPFIXED */ fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */
fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE);
olddata.mv_size = PAGEHDRSZ; olddata.mv_size = PAGEHDRSZ;
goto prep_subDB; goto prep_subDB;
@ -6719,7 +6747,7 @@ more:
xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size;
if (mc->mc_db->md_flags & MDB_DUPFIXED) { if (mc->mc_db->md_flags & MDB_DUPFIXED) {
fp->mp_flags |= P_LEAF2; fp->mp_flags |= P_LEAF2;
fp->mp_ksize = data->mv_size; fp->mp_leaf2_ksize = data->mv_size;
xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */
} else { } else {
xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) +
@ -6741,7 +6769,7 @@ more:
data->mv_size); data->mv_size);
break; break;
} }
offset = fp->mp_ksize; offset = fp->mp_leaf2_ksize;
if (SIZELEFT(fp) < offset) { if (SIZELEFT(fp) < offset) {
offset *= 4; /* space for 4 more */ offset *= 4; /* space for 4 more */
break; break;
@ -6764,7 +6792,7 @@ more:
prep_subDB: prep_subDB:
if (mc->mc_db->md_flags & MDB_DUPFIXED) { if (mc->mc_db->md_flags & MDB_DUPFIXED) {
fp_flags |= P_LEAF2; fp_flags |= P_LEAF2;
dummy.md_xsize = fp->mp_ksize; dummy.md_xsize = fp->mp_leaf2_ksize;
dummy.md_flags = MDB_DUPFIXED; dummy.md_flags = MDB_DUPFIXED;
if (mc->mc_db->md_flags & MDB_INTEGERDUP) if (mc->mc_db->md_flags & MDB_INTEGERDUP)
dummy.md_flags |= MDB_INTEGERKEY; dummy.md_flags |= MDB_INTEGERKEY;
@ -6788,11 +6816,11 @@ prep_subDB:
} }
if (mp != fp) { if (mp != fp) {
mp->mp_flags = fp_flags | P_DIRTY; mp->mp_flags = fp_flags | P_DIRTY;
mp->mp_ksize = fp->mp_ksize; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize;
mp->mp_lower = fp->mp_lower; mp->mp_lower = fp->mp_lower;
mp->mp_upper = fp->mp_upper + offset; mp->mp_upper = fp->mp_upper + offset;
if (fp_flags & P_LEAF2) { if (fp_flags & P_LEAF2) {
memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_ksize); memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize);
} else { } else {
memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE,
olddata.mv_size - fp->mp_upper - PAGEBASE); olddata.mv_size - fp->mp_upper - PAGEBASE);
@ -7537,7 +7565,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node)
mx->mx_cursor.mc_ki[0] = 0; mx->mx_cursor.mc_ki[0] = 0;
if (mc->mc_db->md_flags & MDB_DUPFIXED) { if (mc->mc_db->md_flags & MDB_DUPFIXED) {
mx->mx_db.md_flags = MDB_DUPFIXED; mx->mx_db.md_flags = MDB_DUPFIXED;
mx->mx_db.md_xsize = fp->mp_ksize; mx->mx_db.md_xsize = fp->mp_leaf2_ksize;
if (mc->mc_db->md_flags & MDB_INTEGERDUP) if (mc->mc_db->md_flags & MDB_INTEGERDUP)
mx->mx_db.md_flags |= MDB_INTEGERKEY; mx->mx_db.md_flags |= MDB_INTEGERKEY;
} }
@ -8621,7 +8649,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno
/* Create a right sibling. */ /* Create a right sibling. */
if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp)))
return rc; return rc;
rp->mp_ksize = mp->mp_ksize; rp->mp_leaf2_ksize = mp->mp_leaf2_ksize;
mdb_debug("new right sibling: page %zu", rp->mp_pgno); mdb_debug("new right sibling: page %zu", rp->mp_pgno);
/* Usually when splitting the root page, the cursor /* Usually when splitting the root page, the cursor

2
mdbx.c
View File

@ -213,7 +213,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee
if (IS_LEAF2(mp)) { if (IS_LEAF2(mp)) {
/* LEAF2 pages have no mp_ptrs[] or node headers */ /* LEAF2 pages have no mp_ptrs[] or node headers */
payload_size += mp->mp_ksize; payload_size += mp->mp_leaf2_ksize;
continue; continue;
} }