From f2703156f04f0862ed2ddd6cc8e2f4c48f7ad254 Mon Sep 17 00:00:00 2001
From: Leo Yuriev
Date: Fri, 8 May 2015 03:44:30 +0300
Subject: [PATCH] lmdb: mdb: b-tree walk, page-map check in mdb_chk.

Change-Id: I6678b4d891c8fbfbc49ed600212f4ade39e25282
---
 lmdb.h    |   3 +
 mdb.c     | 187 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 mdb_chk.c | 162 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 322 insertions(+), 30 deletions(-)

diff --git a/lmdb.h b/lmdb.h
index 1577cf05..e994c973 100644
--- a/lmdb.h
+++ b/lmdb.h
@@ -1636,6 +1636,9 @@ typedef void MDB_debug_func(int type, const char *function, int line,
 
 int mdb_setup_debug(int flags, MDB_debug_func* logger, long edge_txn);
 
+typedef int MDB_pgwalk_func(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, char type);
+int mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* ctx);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/mdb.c b/mdb.c
index 64babddd..ff75b6eb 100644
--- a/mdb.c
+++ b/mdb.c
@@ -205,6 +205,7 @@ static MDB_INLINE void mdb_invalidate_cache(void *addr, int nbytes) {
 #include
 #include
 #include
+#include
 
 #if defined(__sun) || defined(ANDROID)
 /* Most platforms have posix_memalign, older may only have memalign */
@@ -9865,4 +9866,190 @@ mdb_env_get_oomfunc(MDB_env *env)
 	return env ? env->me_oom_func : NULL;
 }
 
+/** State shared by all recursion levels of a page walk. */
+struct mdb_walk_ctx {
+	MDB_txn *mw_txn;		/**< transaction being walked */
+	void *mw_user;			/**< opaque pointer for the visitor */
+	MDB_pgwalk_func *mw_visitor;	/**< per-page callback */
+};
+
+typedef struct mdb_walk_ctx mdb_walk_ctx_t;
+
+/** Depth-first traversal of one b-tree, reporting its pages to the visitor.
+ * @param[in] ctx transaction, visitor and the visitor's user pointer.
+ * @param[in] dbi name handed to the visitor for pages of this tree.
+ * @param[in] pg root page number; P_INVALID means the tree is empty.
+ * @param[in] flags when F_DUPDATA is set, leaf nodes are not examined for
+ *	overflow values or nested trees.
+ * @param[in] deep nesting level; the 'R' callback is made only below 2.
+ * @return MDB_SUCCESS, MDB_CORRUPTED, or the first non-zero code from the
+ *	visitor, mdb_page_get() or mdb_cursor_push().
+ */
+static int ESECT
+mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int deep)
+{
+	MDB_cursor mc;
+	MDB_node *ni;
+	MDB_page *mp;
+	int rc;
+	unsigned int i;
+
+	if (deep < 2) {
+		rc = ctx->mw_visitor(pg, 0, ctx->mw_user, dbi, 'R');
+		if (rc)
+			return rc;
+	}
+
+	/* Empty DB, nothing to do */
+	if (pg == P_INVALID)
+		return MDB_SUCCESS;
+
+	mc.mc_snum = 1;
+	mc.mc_top = 0;
+	mc.mc_txn = ctx->mw_txn;
+
+	rc = mdb_page_get(ctx->mw_txn, pg, &mc.mc_pg[0], NULL);
+	if (rc)
+		return rc;
+
+	for (mp = mc.mc_pg[mc.mc_top]; IS_BRANCH(mp); ) {
+		MDB_node *node;
+
+		rc = ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, 'B');
+		if (rc)
+			return rc;
+
+		if (NUMKEYS(mp) < 1)
+			return MDB_CORRUPTED;
+
+		mdb_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp));
+		mdb_cassert(&mc, NUMKEYS(mp) > 1);
+		mdb_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0)));
+
+		node = NODEPTR(mp, 0);
+
+		if ((rc = mdb_page_get(mc.mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
+			return rc;
+
+		mc.mc_ki[mc.mc_top] = 0;
+		if ((rc = mdb_cursor_push(&mc, mp)))
+			return rc;
+	}
+
+	if (!IS_LEAF(mp)) {
+		mdb_debug("internal error, index points to a %02X page!?",
+			mp->mp_flags);
+		mc.mc_txn->mt_flags |= MDB_TXN_ERROR;
+		return MDB_CORRUPTED;
+	}
+
+	/* mc is uninitialized stack memory: assign, do not read-modify-write */
+	mc.mc_flags = C_INITIALIZED;
+
+	rc = ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, 'L');
+	if (rc)
+		return rc;
+
+	while (mc.mc_snum > 0) {
+		unsigned n;
+		mp = mc.mc_pg[mc.mc_top];
+		n = NUMKEYS(mp);
+
+		if (IS_LEAF(mp)) {
+			if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
+				for (i = 0; i < n; i++) {
+					ni = NODEPTR(mp, i);
+					if (ni->mn_flags & F_BIGDATA) {
+						MDB_page *omp;
+						pgno_t ovpg;	/* copied out: node data may be unaligned */
+
+						memcpy(&ovpg, NODEDATA(ni), sizeof(ovpg));
+						rc = mdb_page_get(ctx->mw_txn, ovpg, &omp, NULL);
+						if (rc)
+							return rc;
+						rc = ctx->mw_visitor(ovpg, omp->mp_pages, ctx->mw_user, dbi, 'L');
+						if (rc)
+							return rc;
+					} else if (ni->mn_flags & F_SUBDATA) {
+						MDB_db *db = NODEDATA(ni);
+						char* name = NULL;
+						if (! (ni->mn_flags & F_DUPDATA)) {
+							name = NODEKEY(ni);
+							int namelen = (char*) db - name;
+							name = memcpy(alloca(namelen + 1), name, namelen);
+							name[namelen] = 0;
+						}
+						rc = mdb_env_walk(ctx, (name && name[0]) ? name : dbi, db->md_root, ni->mn_flags & F_DUPDATA, deep + 1);
+						if (rc)
+							return rc;
+					}
+				}
+			}
+		} else {
+			mc.mc_ki[mc.mc_top]++;
+			if (mc.mc_ki[mc.mc_top] < n) {
+				pgno_t child;
+				do {
+					ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
+					child = NODEPGNO(ni);
+					rc = mdb_page_get(ctx->mw_txn, child, &mp, NULL);
+					if (rc)
+						return rc;
+					rc = ctx->mw_visitor(child, 1, ctx->mw_user, dbi, IS_BRANCH(mp) ? 'B' : 'L');
+					if (rc)
+						return rc;
+					/* Push via mdb_cursor_push(), as the first descent does,
+					 * rather than writing mc_pg[]/mc_ki[] with no depth check. */
+					rc = mdb_cursor_push(&mc, mp);
+					if (rc)
+						return rc;
+					mc.mc_ki[mc.mc_top] = 0;
+				}
+				/* Whenever we advance to a sibling branch page,
+				 * we must proceed all the way down to its first leaf.
+				 */
+				while (IS_BRANCH(mp));
+				continue;
+			}
+		}
+
+		if (! mc.mc_top)
+			break;
+
+		mdb_cursor_pop(&mc);
+	}
+	return rc;
+}
+
+/** Report every page reachable from the meta pages, FreeDB and MainDB.
+ * The visitor receives (pgno, pgnumber, ctx, dbi, type):
+ *	(0, 2, "meta", 'M') first, for the meta pages;
+ *	(root, 0, name, 'R') for a tree root (root may be P_INVALID);
+ *	(pgno, 1, name, 'B' or 'L') for each branch or leaf page;
+ *	(pgno, mp_pages, name, 'L') for the pages of an overflow value;
+ *	(P_INVALID, 0, NULL, 0) last, after a successful walk.
+ * @param[in] txn transaction whose trees are walked.
+ * @param[in] visitor callback; a non-zero result stops the walk.
+ * @param[in] user opaque pointer passed to the visitor as ctx.
+ * @return 0 on success, else the first non-zero code encountered.
+ */
+int mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* user)
+{
+	mdb_walk_ctx_t ctx;
+	int rc;
+
+	ctx.mw_txn = txn;
+	ctx.mw_user = user;
+	ctx.mw_visitor = visitor;
+
+	rc = visitor(0, 2, user, "meta", 'M');
+	if (! rc)
+		rc = mdb_env_walk(&ctx, "free", txn->mt_dbs[FREE_DBI].md_root, 0, 0);
+	if (! rc)
+		rc = mdb_env_walk(&ctx, "main", txn->mt_dbs[MAIN_DBI].md_root, 0, 0);
+	if (! rc)
+		rc = visitor(P_INVALID, 0, user, NULL, 0);
+	return rc;
+}
+
 /** @} */
diff --git a/mdb_chk.c b/mdb_chk.c
index fc747f91..0006cda4 100644
--- a/mdb_chk.c
+++ b/mdb_chk.c
@@ -26,6 +26,7 @@
 #include
 #include
 #include
+#include
 #include "lmdb.h"
 #include "midl.h"
 
@@ -52,13 +53,20 @@ static void signal_hanlder( int sig )
 	gotsignal = 1;
 }
 
+#define MAX_DBI 32768
+
+const char* dbi_names[MAX_DBI] = { "@gc" };
+size_t dbi_pages[MAX_DBI];
+short *pagemap;	/* owner of each page: index into dbi_names[], 0 = none */
+
 MDB_env *env;
 MDB_txn *txn;
 MDB_envinfo info;
 MDB_stat stat;
-size_t maxkeysize, reclaimable_pages, freedb_pages;
+size_t maxkeysize, reclaimable_pages, freedb_pages, lastpgno;
 unsigned userdb_count;
 unsigned verbose = 1, quiet;
+size_t pgcount;
 
 static void print(const char* msg, ...) {
 	if (! quiet) {
@@ -91,6 +99,30 @@ struct problem {
 struct problem* problems_list;
 size_t total_problems;
 
+/* Map a database name to its slot in dbi_names[], registering the name
+ * on first use. Slot 0 is reserved for "@gc" (pages claimed by no tree).
+ * Returns the slot index, or -1 if the table is full or strdup() fails. */
+static int pagemap_lookup_dbi(const char* dbi) {
+	static int last;
+	char *copy;
+
+	if (last > 0 && strcmp(dbi_names[last], dbi) == 0)
+		return last;
+
+	for(last = 1; last < MAX_DBI && dbi_names[last]; ++last)
+		if (strcmp(dbi_names[last], dbi) == 0)
+			return last;
+
+	if (last == MAX_DBI)
+		return last = -1;
+
+	copy = strdup(dbi);
+	if (! copy)
+		return last = -1;
+	dbi_names[last] = copy;
+	return last;
+}
+
 static void problem_add(size_t entry_number, const char* msg, const char *extra, ...)
 {
 	total_problems++;
@@ -149,6 +181,35 @@ static size_t problems_pop(struct problem* list) {
 	return total;
 }
 
+/* mdb_env_pgwalk() callback: account pgnumber consecutive pages starting
+ * at pgno to database dbi, reporting pages outside the allocated range or
+ * already claimed by another owner. Returns ENOMEM if dbi cannot be
+ * registered, MDB_SUCCESS otherwise. */
+static int pgvisitor(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, char type)
+{
+	if (pgnumber) {
+		pgcount += pgnumber;
+
+		int index = pagemap_lookup_dbi(dbi);
+		if (index < 0)
+			return ENOMEM;
+
+		do {
+			if (pgno >= lastpgno)
+				problem_add(pgno, "wrong page-no", "(>= %zu)", lastpgno);
+			else if (pagemap[pgno])
+				problem_add(pgno, "page already used", "(in %s)", dbi_names[pagemap[pgno]]);
+			else {
+				pagemap[pgno] = index;
+				dbi_pages[index] += 1;
+			}
+			++pgno;
+		} while(--pgnumber);
+	}
+
+	return MDB_SUCCESS;
+}
+
 typedef long (visitor)(size_t record_number, MDB_val *key, MDB_val* data);
 static long process_db(MDB_dbi dbi, char *name, visitor *handler, int silent);
 
@@ -362,7 +423,6 @@ static long process_db(MDB_dbi dbi, char *name, visitor *handler, int silent)
 	if (record_count != ms.ms_entries )
 		problem_add(record_count, "differentent number of entries", " (%zu != %zu)",
 				record_count, ms.ms_entries);
-
 bailout:
 	problems_count = problems_pop(saved_list);
 	if (! silent && verbose) {
@@ -388,6 +448,7 @@ int main(int argc, char *argv[])
 	char *envname;
 	int envflags = 0;
 	long problems_maindb = 0, problems_freedb = 0, problems_deep = 0;
+	size_t n;
 
 	if (argc < 2) {
 		usage(prog);
@@ -463,7 +524,16 @@ int main(int argc, char *argv[])
 		goto bailout;
 	}
 
-	if (! quiet && verbose) {
+	lastpgno = info.me_last_pgno + 1;
+	errno = 0;
+	pagemap = calloc(lastpgno, sizeof(*pagemap));
+	if (! pagemap) {
+		rc = errno ? errno : ENOMEM;
+		error("calloc failed, error %d %s\n", rc, mdb_strerror(rc));
+		goto bailout;
+	}
+
+	if (verbose) {
 		print(" - map size %zu (%.1fMb, %.1fGb)\n", info.me_mapsize,
 			(double) info.me_mapsize / (1024 * 1024),
 			(double) info.me_mapsize / (1024 * 1024 * 1024));
@@ -473,31 +543,6 @@ int main(int argc, char *argv[])
 			stat.ms_psize, maxkeysize, info.me_maxreaders);
 		print(" - last txn %zu, tail %zu (%zi)\n", info.me_last_txnid,
 			info.me_tail_txnid, info.me_tail_txnid - info.me_last_txnid);
-
-		size_t value = info.me_mapsize / stat.ms_psize;
-		double percent = value / 100.0;
-		print(" - pages: %zu total", value);
-
-		value = info.me_last_pgno + 1;
-		print(", allocated %zu (%.1f%%)", value, value / percent);
-
-		value = info.me_mapsize / stat.ms_psize - (info.me_last_pgno+1);
-		print(", remained %zu (%.1f%%)", value, value / percent);
-
-		value = info.me_last_pgno + 1 - freedb_pages;
-		print(", used now %zu (%.1f%%)", value, value / percent);
-
-		value = freedb_pages;
-		print(", free %zu (%.1f%%)", value, value / percent);
-
-		value = freedb_pages - reclaimable_pages;
-		print(", reading %zu (%.1f%%)", value, value / percent);
-
-		value = reclaimable_pages;
-		print(", reclaimable %zu (%.1f%%)", value, value / percent);
-
-		value = info.me_mapsize / stat.ms_psize - (info.me_last_pgno + 1) + reclaimable_pages;
-		print(", available %zu (%.1f%%)\n", value, value / percent);
 	}
 
 	rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
@@ -506,20 +551,77 @@ int main(int argc, char *argv[])
 		goto bailout;
 	}
 
-	problems_maindb = process_db(-1, /* MAINT_DBI */ NULL, NULL, 0);
+	print("Walking b-tree...\n");
+	rc = mdb_env_pgwalk(txn, pgvisitor, NULL);
+	if (rc) {
+		error("mdb_env_pgwalk failed, error %d %s\n", rc, mdb_strerror(rc));
+		mdb_txn_abort(txn);	/* bailout skips the normal abort below */
+		goto bailout;
+	}
+	for( n = 0; n < lastpgno; ++n)
+		if (! pagemap[n])
+			dbi_pages[0] += 1;
+	if (verbose) {
+		print(" - dbi pages: %zu total", pgcount);
+		if (verbose > 1)
+			for (i = 1; i < MAX_DBI && dbi_names[i]; ++i)
+				print(", %s %zu", dbi_names[i], dbi_pages[i]);
+		print(", %s %zu\n", dbi_names[0], dbi_pages[0]);
+	}
+
+	problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0);
 	problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0);
+
+	if (verbose) {
+		size_t value = info.me_mapsize / stat.ms_psize;
+		double percent = value / 100.0;
+		print(" - pages info: %zu total", value);
+		print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent);
+
+		if (verbose > 1) {
+			value = info.me_mapsize / stat.ms_psize - lastpgno;
+			print(", remained %zu (%.1f%%)", value, value / percent);
+
+			value = lastpgno - freedb_pages;
+			print(", used %zu (%.1f%%)", value, value / percent);
+
+			print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent);
+
+			value = freedb_pages - reclaimable_pages;
+			print(", reading %zu (%.1f%%)", value, value / percent);
+
+			print(", reclaimable %zu (%.1f%%)", reclaimable_pages, reclaimable_pages / percent);
+		}
+
+		value = info.me_mapsize / stat.ms_psize - lastpgno + reclaimable_pages;
+		print(", available %zu (%.1f%%)\n", value, value / percent);
+	}
+
+	if (pgcount != lastpgno - freedb_pages) {
+		error("used pages mismatch (%zu != %zu)\n", pgcount, lastpgno - freedb_pages);
+		mdb_txn_abort(txn);
+		goto bailout;
+	}
+	if (dbi_pages[0] != freedb_pages) {
+		error("gc pages mismatch (%zu != %zu)\n", dbi_pages[0], freedb_pages);
+		mdb_txn_abort(txn);
+		goto bailout;
+	}
+
 	if (problems_maindb == 0 && problems_freedb == 0)
 		problems_deep = process_db(-1, NULL, handle_maindb, 1);
+
 	mdb_txn_abort(txn);
 
 	if (! userdb_count && verbose)
 		print("%s: %s does not contain multiple databases\n", prog, envname);
 
-	if (rc && ! quiet)
+	if (rc)
 		error("%s: %s: %s\n", prog, envname, mdb_strerror(rc));
 
 bailout:
 	mdb_env_close(env);
+	free(pagemap);
 	if (rc)
 		return EXIT_FAILURE + 2;
 	if (problems_maindb || problems_freedb)