diff --git a/Makefile b/Makefile index 870166c0..4bc4c810 100644 --- a/Makefile +++ b/Makefile @@ -198,13 +198,15 @@ bench: bench-lmdb.txt bench-mdbx.txt endif -ci-rule = @( CC=$$(which $1); if [ -n "$$CC" ]; then \ +ci-rule = ( CC=$$(which $1); if [ -n "$$CC" ]; then \ CC=$$(readlink -f $$CC); echo -n "probe by $2 ($$CC): " && \ - $(MAKE) clean >$1.log 2>$1.err && $(MAKE) all check 1>$1.log 2>$1.err && echo "OK" \ + $(MAKE) clean >$1.log 2>$1.err && $(MAKE) XCFLAGS="-UNDEBUG -DMDB_DEBUG=2" all check 1>$1.log 2>$1.err && echo "OK" \ || ( echo "FAILED"; cat $1.err >&2; exit 1 ); \ else echo "no $2 ($1) for probe"; fi; ) ci: - $(call ci-rule,cc,default C compiler) - $(call ci-rule,gcc,GCC) - $(call ci-rule,clang,clang LLVM) - $(call ci-rule,icc,Intel C) + @if [ "$(CC)" != "gcc" ]; then \ + $(call ci-rule,$(CC),default C compiler); \ + fi + @$(call ci-rule,gcc,GCC) + @$(call ci-rule,clang,clang LLVM) + @$(call ci-rule,icc,Intel C) diff --git a/mdb.c b/mdb.c index f1995f27..f37b2ba1 100644 --- a/mdb.c +++ b/mdb.c @@ -532,9 +532,23 @@ typedef struct MDB_txninfo { + (1 /* MDB_PIDLOCK */ << 16))) /** @} */ -/** Common header for all page types. - * Overflow records occupy a number of contiguous pages with no - * headers on any page after the first. +/** Common header for all page types. The page type depends on #mp_flags. + * + * #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with + * sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages + * omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. + * + * #P_OVERFLOW records occupy one or more contiguous pages where only the + * first has a page header. They hold the real data of #F_BIGDATA nodes. + * + * #P_SUBP sub-pages are small leaf "pages" with duplicate data. + * A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. + * (Duplicate data can also go in sub-databases, which use normal pages.) 
+ * + * #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. + * + * Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once + * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDB_page { #define mp_pgno mp_p.p_pgno @@ -543,7 +557,7 @@ typedef struct MDB_page { pgno_t p_pgno; /**< page number */ struct MDB_page *p_next; /**< for in-memory list of freed pages */ } mp_p; - uint16_t mp_ksize; + uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ /** @defgroup mdb_page Page Flags * @ingroup internal * Flags for the page headers. @@ -610,7 +624,9 @@ typedef struct MDB_page { /** The number of overflow pages needed to store the given size. */ #define OVPAGES(size, psize) ((PAGEHDRSZ-1 + (size)) / (psize) + 1) - /** Link in #MDB_txn.%mt_loose_pgs list */ + /** Link in #MDB_txn.%mt_loose_pgs list. + * Kept outside the page header, which is needed when reusing the page. + */ #define NEXT_LOOSE_PAGE(p) (*(MDB_page **)((p) + 2)) /** Header for a single key/data pair within a page. @@ -737,9 +753,9 @@ typedef struct MDB_db { pgno_t md_root; /**< the root page of this tree */ } MDB_db; - /** mdb_dbi_open flags */ #define MDB_VALID 0x8000 /**< DB handle is valid, for me_dbflags */ #define PERSISTENT_FLAGS (0xffff & ~(MDB_VALID)) + /** #mdb_dbi_open() flags */ #define VALID_FLAGS (MDB_REVERSEKEY|MDB_DUPSORT|MDB_INTEGERKEY|MDB_DUPFIXED|\ MDB_INTEGERDUP|MDB_REVERSEDUP|MDB_CREATE) @@ -770,7 +786,10 @@ typedef struct MDB_meta { #define mm_psize mm_dbs[FREE_DBI].md_xsize /** Any persistent environment flags. @ref mdb_env */ #define mm_flags mm_dbs[FREE_DBI].md_flags - pgno_t mm_last_pg; /**< last used page in file */ + /** Last used page in the datafile. + * Actually the file may be shorter if the freeDB lists the final pages. 
+ */ + pgno_t mm_last_pg; volatile txnid_t mm_txnid; /**< txnid that committed this page */ #define MDB_DATASIGN_NONE 0 #define MDB_DATASIGN_WEAK 1 @@ -829,7 +848,7 @@ struct MDB_txn { * in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ MDB_page *mt_loose_pgs; - /* #Number of loose pages (#mt_loose_pgs) */ + /** Number of loose pages (#mt_loose_pgs) */ int mt_loose_count; /** The sorted list of dirty pages we temporarily wrote to disk * because the dirty list was full. page numbers in here are @@ -1232,7 +1251,11 @@ static txnid_t mdbx_oomkick(MDB_env *env, txnid_t oldest); (type & (MDBX_DBG_TRACE | MDBX_DBG_EXTRA))) #else -# define mdb_debug_enabled(type) (0) +# ifndef NDEBUG +# define mdb_debug_enabled(type) (1) +# else +# define mdb_debug_enabled(type) (0) +# endif # define mdb_audit_enabled() (0) # define mdb_assert_enabled() (0) # define mdb_assert_fail(env, msg, func, line) \ @@ -1386,7 +1409,7 @@ mdb_page_list(MDB_page *mp) for (i=0; imp_ksize; + key.mv_size = nsize = mp->mp_leaf2_ksize; key.mv_data = LEAF2KEY(mp, i, nsize); total += nsize; mdb_print("key %d: nsize %d, %s\n", i, nsize, DKEY(&key)); @@ -2347,7 +2370,7 @@ done: VALGRIND_MAKE_MEM_UNDEFINED(np, env->me_psize * num); np->mp_pgno = pgno; - np->mp_ksize = 0; + np->mp_leaf2_ksize = 0; np->mp_flags = 0; np->mp_pages = num; mdb_page_dirty(txn, np); @@ -4084,18 +4107,12 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) int rc; MDB_meta* head = mdb_meta_head_w(env); size_t prev_mapsize = head->mm_mapsize; - volatile MDB_meta* target = META_IS_WEAK(head) ? 
head : mdb_env_meta_flipflop(env, head); - off_t offset = (char*) target - env->me_map; size_t used_size = env->me_psize * (pending->mm_last_pg + 1); + mdb_assert(env, pending != METAPAGE_1(env) && pending != METAPAGE_2(env)); mdb_assert(env, (env->me_flags & (MDB_RDONLY | MDB_FATAL_ERROR)) == 0); mdb_assert(env, META_IS_WEAK(head) || env->me_sync_pending != 0 || env->me_mapsize != prev_mapsize); - mdb_assert(env, pending->mm_txnid > head->mm_txnid || META_IS_WEAK(head)); - mdb_assert(env, pending->mm_txnid > target->mm_txnid || META_IS_WEAK(target)); - - MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target); - mdb_assert(env, pending->mm_txnid > stay->mm_txnid); pending->mm_mapsize = env->me_mapsize; mdb_assert(env, pending->mm_mapsize >= used_size); @@ -4117,6 +4134,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) int mode = (flags & MDB_MAPASYNC) ? MS_ASYNC : MS_SYNC; if (unlikely(msync(env->me_map, used_size, mode))) { rc = errno; + /* LY: msync() should never return EINTR */ goto fail; } if ((flags & MDB_MAPASYNC) == 0) @@ -4139,7 +4157,7 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) while(unlikely(flush(env->me_fd) < 0)) { rc = errno; if (rc != EINTR) - goto undo; + goto fail; } env->me_sync_pending = 0; } @@ -4150,12 +4168,22 @@ mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending) pending->mm_datasync_sign = mdb_meta_sign(pending); } else { pending->mm_datasync_sign = - (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC - ? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK; + (flags & MDBX_UTTERLY_NOSYNC) == MDBX_UTTERLY_NOSYNC + ? MDB_DATASIGN_NONE : MDB_DATASIGN_WEAK; } - mdb_debug("writing meta %d, root %zu, txn_id %zu, %s", - offset >= env->me_psize, pending->mm_dbs[MAIN_DBI].md_root, - pending->mm_txnid, + + volatile MDB_meta* target = (pending->mm_txnid == head->mm_txnid || META_IS_WEAK(head)) + ? 
head : mdb_env_meta_flipflop(env, head); + off_t offset = (char*) target - env->me_map; + + MDB_meta* stay = mdb_env_meta_flipflop(env, (MDB_meta*) target); + mdb_debug("writing meta %d (%s, was %zu/%s, stay %s %zu/%s), root %zu, txn_id %zu, %s", + offset >= env->me_psize, + target == head ? "head" : "tail", target->mm_txnid, + META_IS_WEAK(target) ? "Weak" : META_IS_STEADY(target) ? "Steady" : "Legacy", + stay == head ? "head" : "tail", stay->mm_txnid, + META_IS_WEAK(stay) ? "Weak" : META_IS_STEADY(stay) ? "Steady" : "Legacy", + pending->mm_dbs[MAIN_DBI].md_root, pending->mm_txnid, META_IS_WEAK(pending) ? "Weak" : META_IS_STEADY(pending) ? "Steady" : "Legacy" ); if (env->me_flags & MDB_WRITEMAP) { @@ -6642,7 +6670,7 @@ mdb_cursor_put(MDB_cursor *mc, MDB_val *key, MDB_val *data, */ fp_flags = P_LEAF|P_DIRTY; fp = env->me_pbuf; - fp->mp_ksize = data->mv_size; /* used if MDB_DUPFIXED */ + fp->mp_leaf2_ksize = data->mv_size; /* used if MDB_DUPFIXED */ fp->mp_lower = fp->mp_upper = (PAGEHDRSZ-PAGEBASE); olddata.mv_size = PAGEHDRSZ; goto prep_subDB; @@ -6719,7 +6747,7 @@ more: xdata.mv_size = PAGEHDRSZ + dkey.mv_size + data->mv_size; if (mc->mc_db->md_flags & MDB_DUPFIXED) { fp->mp_flags |= P_LEAF2; - fp->mp_ksize = data->mv_size; + fp->mp_leaf2_ksize = data->mv_size; xdata.mv_size += 2 * data->mv_size; /* leave space for 2 more */ } else { xdata.mv_size += 2 * (sizeof(indx_t) + NODESIZE) + @@ -6741,7 +6769,7 @@ more: data->mv_size); break; } - offset = fp->mp_ksize; + offset = fp->mp_leaf2_ksize; if (SIZELEFT(fp) < offset) { offset *= 4; /* space for 4 more */ break; @@ -6764,7 +6792,7 @@ more: prep_subDB: if (mc->mc_db->md_flags & MDB_DUPFIXED) { fp_flags |= P_LEAF2; - dummy.md_xsize = fp->mp_ksize; + dummy.md_xsize = fp->mp_leaf2_ksize; dummy.md_flags = MDB_DUPFIXED; if (mc->mc_db->md_flags & MDB_INTEGERDUP) dummy.md_flags |= MDB_INTEGERKEY; @@ -6788,11 +6816,11 @@ prep_subDB: } if (mp != fp) { mp->mp_flags = fp_flags | P_DIRTY; - mp->mp_ksize = fp->mp_ksize; + 
mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; mp->mp_upper = fp->mp_upper + offset; if (fp_flags & P_LEAF2) { - memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_ksize); + memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); } else { memcpy((char *)mp + mp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEBASE, olddata.mv_size - fp->mp_upper - PAGEBASE); @@ -7537,7 +7565,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) mx->mx_cursor.mc_ki[0] = 0; if (mc->mc_db->md_flags & MDB_DUPFIXED) { mx->mx_db.md_flags = MDB_DUPFIXED; - mx->mx_db.md_xsize = fp->mp_ksize; + mx->mx_db.md_xsize = fp->mp_leaf2_ksize; if (mc->mc_db->md_flags & MDB_INTEGERDUP) mx->mx_db.md_flags |= MDB_INTEGERKEY; } @@ -8621,7 +8649,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* Create a right sibling. */ if ((rc = mdb_page_new(mc, mp->mp_flags, 1, &rp))) return rc; - rp->mp_ksize = mp->mp_ksize; + rp->mp_leaf2_ksize = mp->mp_leaf2_ksize; mdb_debug("new right sibling: page %zu", rp->mp_pgno); /* Usually when splitting the root page, the cursor diff --git a/mdbx.c b/mdbx.c index 7b8d8d6e..7281147f 100644 --- a/mdbx.c +++ b/mdbx.c @@ -213,7 +213,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - payload_size += mp->mp_ksize; + payload_size += mp->mp_leaf2_ksize; continue; }