lmdb: major rework of b-tree traversal for mdb_chk.

Change-Id: I9d382516f76092f44fc1a12d7554039582b87656
This commit is contained in:
Leo Yuriev 2015-09-02 13:30:53 +03:00
parent 8ff2458003
commit 15e0600b6c
3 changed files with 169 additions and 153 deletions

8
lmdb.h
View File

@ -1658,10 +1658,10 @@ typedef void MDB_debug_func(int type, const char *function, int line,
int mdb_setup_debug(int flags, MDB_debug_func* logger, long edge_txn); int mdb_setup_debug(int flags, MDB_debug_func* logger, long edge_txn);
typedef int MDB_pgwalk_func(size_t pgno, unsigned pgnumber, void* ctx, typedef int MDB_pgvisitor_func(size_t pgno, unsigned pgnumber, void* ctx,
const char* dbi, char type, const char* dbi, const char *type,
int payload_bytes, int header_bytes); int payload_bytes, int header_bytes, int unused_bytes);
int mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* ctx); int mdb_env_pgwalk(MDB_txn *txn, MDB_pgvisitor_func* visitor, void* ctx);
char* mdb_dkey(MDB_val *key, char *buf); char* mdb_dkey(MDB_val *key, char *buf);

239
mdb.c
View File

@ -1472,8 +1472,7 @@ mdb_page_list(MDB_page *mp)
key.mv_data = node->mn_data; key.mv_data = node->mn_data;
nsize = NODESIZE + key.mv_size; nsize = NODESIZE + key.mv_size;
if (IS_BRANCH(mp)) { if (IS_BRANCH(mp)) {
mdb_print("key %d: page %zu, %s\n", i, NODEPGNO(node), mdb_print("key %d: page %zu, %s\n", i, NODEPGNO(node), DKEY(&key));
DKEY(&key));
total += nsize; total += nsize;
} else { } else {
if (F_ISSET(node->mn_flags, F_BIGDATA)) if (F_ISSET(node->mn_flags, F_BIGDATA))
@ -9794,154 +9793,141 @@ mdb_env_get_oomfunc(MDB_env *env)
struct mdb_walk_ctx { struct mdb_walk_ctx {
MDB_txn *mw_txn; MDB_txn *mw_txn;
void *mw_user; void *mw_user;
MDB_pgwalk_func *mw_visitor; MDB_pgvisitor_func *mw_visitor;
}; };
typedef struct mdb_walk_ctx mdb_walk_ctx_t; typedef struct mdb_walk_ctx mdb_walk_ctx_t;
/** Depth-first tree traversal. */ /** Depth-first tree traversal. */
static int ESECT static int ESECT
mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int deep) mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int deep)
{ {
MDB_cursor mc;
MDB_node *ni;
MDB_page *mp; MDB_page *mp;
int rc; int rc, i, nkeys;
unsigned i; unsigned header_size, unused_size, payload_size, align_bytes;
const char* type;
/* Empty DB, nothing to do */
if (pg == P_INVALID) if (pg == P_INVALID)
return MDB_SUCCESS; return MDB_CORRUPTED;
if (deep < 2) { rc = mdb_page_get(ctx->mw_txn, pg, &mp, NULL);
if ((rc = mdb_page_get(ctx->mw_txn, pg, &mp, NULL)) != 0)
return rc;
rc = ctx->mw_visitor(pg, 0, ctx->mw_user, dbi, 'R',
ctx->mw_txn->mt_env->me_psize - PAGEHDRSZ - SIZELEFT(mp), PAGEHDRSZ);
if (rc)
return rc;
}
mc.mc_snum = 1;
mc.mc_top = 0;
mc.mc_txn = ctx->mw_txn;
rc = mdb_page_get(ctx->mw_txn, pg, &mc.mc_pg[0], NULL);
if (rc) if (rc)
return rc; return rc;
if (pg != mp->mp_p.p_pgno)
return MDB_CORRUPTED;
for (mp = mc.mc_pg[mc.mc_top]; IS_BRANCH(mp); ) { nkeys = NUMKEYS(mp);
MDB_node *node; header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower;
unused_size = SIZELEFT(mp);
payload_size = 0;
rc = ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, 'B', /* LY: Don't use mask here, e.g. bitwise (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP).
ctx->mw_txn->mt_env->me_psize - PAGEHDRSZ - SIZELEFT(mp), PAGEHDRSZ); * Pages should not be marked dirty/loose or otherwise. */
if (rc) switch (mp->mp_flags) {
return rc; case P_BRANCH:
type = "branch";
if (NUMKEYS(mp) < 1) if (nkeys < 1)
return MDB_CORRUPTED; return MDB_CORRUPTED;
break;
mdb_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp)); case P_LEAF:
mdb_cassert(&mc, NUMKEYS(mp) > 1); type = "leaf";
mdb_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0))); break;
case P_LEAF|P_SUBP:
node = NODEPTR(mp, 0); type = "leaf-dupsort";
if ((rc = mdb_page_get(mc.mc_txn, NODEPGNO(node), &mp, NULL)) != 0) break;
return rc; case P_LEAF|P_LEAF2:
/* #MDB_DUPFIXED records */
mc.mc_ki[mc.mc_top] = 0; type = "leaf-dupfixed";
if ((rc = mdb_cursor_push(&mc, mp))) break;
return rc; case P_LEAF|P_LEAF2|P_SUBP:
} /* #MDB_DUPSORT sub-pages */
type = "leaf-dupfixed-dupsort";
if (!IS_LEAF(mp)) { break;
mdb_debug("internal error, index points to a %02X page!?", case P_META:
mp->mp_flags); case P_OVERFLOW:
mc.mc_txn->mt_flags |= MDB_TXN_ERROR; default:
return MDB_CORRUPTED; return MDB_CORRUPTED;
} }
mc.mc_flags |= C_INITIALIZED; for (align_bytes = i = 0; i < nkeys;
mc.mc_flags &= ~C_EOF; align_bytes += ((payload_size + align_bytes) & 1), i++) {
MDB_node *node;
rc = ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, 'L', if (IS_LEAF2(mp)) {
ctx->mw_txn->mt_env->me_psize - PAGEHDRSZ - SIZELEFT(mp), PAGEHDRSZ); /* LEAF2 pages have no mp_ptrs[] or node headers */
if (rc) payload_size += mp->mp_ksize;
return rc; continue;
while (mc.mc_snum > 0) {
unsigned n;
mp = mc.mc_pg[mc.mc_top];
n = NUMKEYS(mp);
if (IS_LEAF(mp)) {
if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
for (i = 0; i < n; i++) {
ni = NODEPTR(mp, i);
if (ni->mn_flags & F_BIGDATA) {
MDB_page *omp;
pgno_t *pg;
pg = NODEDATA(ni);
rc = mdb_page_get(ctx->mw_txn, *pg, &omp, NULL);
if (rc)
return rc;
rc = ctx->mw_visitor(*pg, omp->mp_pages, ctx->mw_user, dbi, 'L',
ctx->mw_txn->mt_env->me_psize - PAGEHDRSZ - SIZELEFT(mp), PAGEHDRSZ);
if (rc)
return rc;
} else if (ni->mn_flags & F_SUBDATA) {
MDB_db *db = NODEDATA(ni);
char* name = NULL;
if (! (ni->mn_flags & F_DUPDATA)) {
name = NODEKEY(ni);
int namelen = (char*) db - name;
name = memcpy(alloca(namelen + 1), name, namelen);
name[namelen] = 0;
}
rc = mdb_env_walk(ctx, (name && name[0]) ? name : dbi, db->md_root, ni->mn_flags & F_DUPDATA, deep + 1);
if (rc)
return rc;
}
}
}
} else {
mc.mc_ki[mc.mc_top]++;
if (mc.mc_ki[mc.mc_top] < n) {
pgno_t pg;
do {
ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
pg = NODEPGNO(ni);
rc = mdb_page_get(ctx->mw_txn, pg, &mp, NULL);
if (rc)
return rc;
rc = ctx->mw_visitor(pg, 1, ctx->mw_user, dbi, IS_BRANCH(mp) ? 'B' : 'L',
ctx->mw_txn->mt_env->me_psize - PAGEHDRSZ - SIZELEFT(mp), PAGEHDRSZ);
if (rc)
return rc;
mc.mc_top++;
mc.mc_snum++;
mc.mc_ki[mc.mc_top] = 0;
mc.mc_pg[mc.mc_top] = mp;
}
/* Whenever we advance to a sibling branch page,
* we must proceed all the way down to its first leaf.
*/
while (IS_BRANCH(mp));
continue;
}
} }
if (! mc.mc_top) node = NODEPTR(mp, i);
break; payload_size += NODESIZE + node->mn_ksize;
mdb_cursor_pop(&mc); if (IS_BRANCH(mp)) {
rc = mdb_env_walk(ctx, dbi, NODEPGNO(node), flags, deep);
if (rc)
return rc;
continue;
}
assert(IS_LEAF(mp));
if (node->mn_ksize < 1)
return MDB_CORRUPTED;
if (node->mn_flags & F_BIGDATA) {
MDB_page *omp;
pgno_t *opg;
size_t over_header, over_payload, over_unused;
payload_size += sizeof(pgno_t);
opg = NODEDATA(node);
rc = mdb_page_get(ctx->mw_txn, *opg, &omp, NULL);
if (rc)
return rc;
if (*opg != omp->mp_p.p_pgno)
return MDB_CORRUPTED;
/* LY: Don't use mask here, e.g. bitwise (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP).
* Pages should not be marked dirty/loose or otherwise. */
if (P_OVERFLOW != omp->mp_flags)
return MDB_CORRUPTED;
over_header = PAGEHDRSZ;
over_payload = NODEDSZ(node);
over_unused = omp->mp_pages * ctx->mw_txn->mt_env->me_psize
- over_payload - over_header;
rc = ctx->mw_visitor(*opg, omp->mp_pages, ctx->mw_user, dbi, "overflow-data",
over_payload, over_header, over_unused);
if (rc)
return rc;
continue;
}
payload_size += NODEDSZ(node);
if (node->mn_flags & F_SUBDATA) {
MDB_db *db = NODEDATA(node);
char* name = NULL;
if (NODEDSZ(node) < 1)
return MDB_CORRUPTED;
if (! (node->mn_flags & F_DUPDATA)) {
name = NODEKEY(node);
int namelen = (char*) db - name;
name = memcpy(alloca(namelen + 1), name, namelen);
name[namelen] = 0;
}
rc = mdb_env_walk(ctx, (name && name[0]) ? name : dbi,
db->md_root, node->mn_flags & F_DUPDATA, deep + 1);
if (rc)
return rc;
}
} }
return rc;
return ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi,
type, payload_size, header_size, unused_size + align_bytes);
} }
int ESECT int ESECT
mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* user) mdb_env_pgwalk(MDB_txn *txn, MDB_pgvisitor_func* visitor, void* user)
{ {
mdb_walk_ctx_t ctx; mdb_walk_ctx_t ctx;
int rc; int rc;
@ -9950,13 +9936,14 @@ mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* user)
ctx.mw_user = user; ctx.mw_user = user;
ctx.mw_visitor = visitor; ctx.mw_visitor = visitor;
rc = visitor(0, 2, user, "meta", 'M', sizeof(MDB_meta), PAGEHDRSZ); rc = visitor(0, 2, user, "lmdb", "meta", sizeof(MDB_meta)*2, PAGEHDRSZ*2,
if (! rc) (txn->mt_env->me_psize - sizeof(MDB_meta) - PAGEHDRSZ) *2);
if (! rc && txn->mt_dbs[FREE_DBI].md_root != P_INVALID)
rc = mdb_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0, 0); rc = mdb_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0, 0);
if (! rc) if (! rc && txn->mt_dbs[MAIN_DBI].md_root != P_INVALID)
rc = mdb_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0, 0); rc = mdb_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0, 0);
if (! rc) if (! rc)
rc = visitor(P_INVALID, 0, user, NULL, 0, -1, 0); rc = visitor(P_INVALID, 0, user, NULL, NULL, -1, 0, 0);
return rc; return rc;
} }

View File

@ -134,6 +134,10 @@ static int pagemap_lookup_dbi(const char* dbi) {
return last = -1; return last = -1;
walk.dbi_names[last] = strdup(dbi); walk.dbi_names[last] = strdup(dbi);
if (verbose > 2)
print(" - found '%s' area\n", dbi);
return last; return last;
} }
@ -201,39 +205,55 @@ static size_t problems_pop(struct problem* list) {
} }
static int pgvisitor(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, static int pgvisitor(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi,
char type, int payload_bytes, int header_bytes) const char* type, int payload_bytes, int header_bytes, int unused_bytes)
{ {
if (pgnumber) { if (type) {
size_t page_bytes = payload_bytes + header_bytes + unused_bytes;
size_t page_size = pgnumber * stat.ms_psize;
int index = pagemap_lookup_dbi(dbi); int index = pagemap_lookup_dbi(dbi);
if (index < 0) if (index < 0)
return ENOMEM; return ENOMEM;
if (verbose > 3) {
print((pgnumber < 2) ? " %s-page %zu" : " %s-span %zu..%zu (%u pages)",
type, pgno, pgno + pgnumber - 1, pgnumber);
print(" of %s: header %i, payload %i, unused %i\n",
dbi, header_bytes, payload_bytes, unused_bytes);
}
walk.pgcount += pgnumber; walk.pgcount += pgnumber;
if (unused_bytes < 0 || (size_t) unused_bytes > page_size)
problem_add(pgno, "illegal unused-bytes", "(%zu < %i < %zu)",
0, unused_bytes, stat.ms_psize);
if (header_bytes < sizeof(long) || header_bytes >= stat.ms_psize - sizeof(long)) if (header_bytes < sizeof(long) || header_bytes >= stat.ms_psize - sizeof(long))
problem_add(pgno, "wrong header-length", "(%zu < %i < %zu)", problem_add(pgno, "illegal header-length", "(%zu < %i < %zu)",
sizeof(long), header_bytes, header_bytes >= stat.ms_psize - sizeof(long)); sizeof(long), header_bytes, stat.ms_psize - sizeof(long));
else if (payload_bytes < 1) else if (payload_bytes < 1)
problem_add(pgno, "empty page", "(payload %zu bytes)", payload_bytes); problem_add(pgno, "empty page", "(payload %i bytes)", payload_bytes);
else if (payload_bytes + header_bytes > pgnumber * stat.ms_psize)
problem_add(pgno, "overflowed page", "(%zu + %zu > %zu)", if (page_bytes != page_size)
payload_bytes, header_bytes, pgnumber * stat.ms_psize); problem_add(pgno, "misused page", "(%zu != %zu (%ih + %ip + %iu))",
page_size, page_bytes, header_bytes, payload_bytes, unused_bytes);
else { else {
walk.dbi_payload_bytes[index] += payload_bytes + header_bytes; walk.dbi_payload_bytes[index] += payload_bytes + header_bytes;
walk.total_payload_bytes += payload_bytes + header_bytes; walk.total_payload_bytes += payload_bytes + header_bytes;
} }
do { if (pgnumber) {
if (pgno >= lastpgno) do {
problem_add(pgno, "wrong page-no", "(> %zi)", lastpgno); if (pgno >= lastpgno)
else if (walk.pagemap[pgno]) problem_add(pgno, "wrong page-no", "(> %zi)", lastpgno);
problem_add(pgno, "page already used", "(in %s)", walk.dbi_names[walk.pagemap[pgno]]); else if (walk.pagemap[pgno])
else { problem_add(pgno, "page already used", "(in %s)", walk.dbi_names[walk.pagemap[pgno]]);
walk.pagemap[pgno] = index; else {
walk.dbi_pages[index] += 1; walk.pagemap[pgno] = index;
} walk.dbi_pages[index] += 1;
++pgno; }
} while(--pgnumber); ++pgno;
} while(--pgnumber);
}
} }
return gotsignal ? EINTR : MDB_SUCCESS; return gotsignal ? EINTR : MDB_SUCCESS;
@ -285,7 +305,7 @@ static int handle_freedb(size_t record_number, MDB_val *key, MDB_val* data) {
for (; i >= span && iptr[i - span] == pg; span++, pg++) ; for (; i >= span && iptr[i - span] == pg; span++, pg++) ;
} }
if (verbose > 2) if (verbose > 2)
print(" - transaction %zu, %zd pages, maxspan %zd%s\n", print(" transaction %zu, %zd pages, maxspan %zd%s\n",
*(size_t *)key->mv_data, number, span, bad); *(size_t *)key->mv_data, number, span, bad);
if (verbose > 3) { if (verbose > 3) {
int j = number - 1; int j = number - 1;
@ -655,8 +675,11 @@ int main(int argc, char *argv[])
info.me_mapsize / k, sf[i]); info.me_mapsize / k, sf[i]);
if (info.me_mapaddr) if (info.me_mapaddr)
print(" - mapaddr %p\n", info.me_mapaddr); print(" - mapaddr %p\n", info.me_mapaddr);
print(" - pagesize %u, max keysize %zu, max readers %u\n", print(" - pagesize %u, max keysize %zu (%s), max readers %u\n",
stat.ms_psize, maxkeysize, info.me_maxreaders); stat.ms_psize, maxkeysize,
(maxkeysize == 511) ? "default" :
(maxkeysize == 0) ? "devel" : "custom",
info.me_maxreaders);
print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", info.me_last_txnid, print(" - transactions: last %zu, bottom %zu, lag reading %zi\n", info.me_last_txnid,
info.me_tail_txnid, info.me_last_txnid - info.me_tail_txnid); info.me_tail_txnid, info.me_last_txnid - info.me_tail_txnid);
@ -712,6 +735,9 @@ int main(int argc, char *argv[])
} }
if (!dont_traversal) { if (!dont_traversal) {
struct problem* saved_list;
size_t traversal_problems;
print("Traversal b-tree...\n"); print("Traversal b-tree...\n");
fflush(NULL); fflush(NULL);
walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap)); walk.pagemap = calloc(lastpgno, sizeof(*walk.pagemap));
@ -721,7 +747,10 @@ int main(int argc, char *argv[])
goto bailout; goto bailout;
} }
saved_list = problems_push();
rc = mdb_env_pgwalk(txn, pgvisitor, NULL); rc = mdb_env_pgwalk(txn, pgvisitor, NULL);
traversal_problems = problems_pop(saved_list);
if (rc) { if (rc) {
if (rc == EINTR && gotsignal) { if (rc == EINTR && gotsignal) {
print(" - interrupted by signal\n"); print(" - interrupted by signal\n");
@ -759,7 +788,7 @@ int main(int argc, char *argv[])
} }
} }
print(" - summary: average fill %.1f%%, %zu problems\n", print(" - summary: average fill %.1f%%, %zu problems\n",
walk.total_payload_bytes * 100.0 / total_page_bytes, total_problems); walk.total_payload_bytes * 100.0 / total_page_bytes, traversal_problems);
} }
} else if (verbose) { } else if (verbose) {
print("Skipping b-tree walk...\n"); print("Skipping b-tree walk...\n");