lmdb: mdb: b-tree walk, page-map check in mdb_chk.

Change-Id: I6678b4d891c8fbfbc49ed600212f4ade39e25282
This commit is contained in:
Leo Yuriev 2015-05-08 03:44:30 +03:00
parent 23720958b6
commit f2703156f0
3 changed files with 284 additions and 30 deletions

3
lmdb.h
View File

@ -1636,6 +1636,9 @@ typedef void MDB_debug_func(int type, const char *function, int line,
int mdb_setup_debug(int flags, MDB_debug_func* logger, long edge_txn);
/** Callback invoked by mdb_env_pgwalk() for the pages it encounters.
 *
 * pgno     - number of the first page being reported.
 * pgnumber - how many consecutive pages starting at pgno are reported;
 *            0 means the call is only a notification and covers no pages.
 * ctx      - the user pointer that was given to mdb_env_pgwalk().
 * dbi      - name of the database the pages were reached from ("meta",
 *            "free", "main" or a sub-database name); NULL on the final call.
 * type     - 'M' for the meta pages, 'R' for a tree-root notification,
 *            'B' for a branch page, 'L' for a leaf page or a run of
 *            large-data pages, 0 on the final call.
 *
 * Returning non-zero stops the walk; that value is handed back to the
 * caller of mdb_env_pgwalk().
 */
typedef int MDB_pgwalk_func(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, char type);
/** Walk the pages reachable through the given transaction, calling visitor
 * for each of them, and finish with one extra visitor call that has
 * pgnumber 0, dbi NULL and type 0.
 *
 * Returns 0 on success, otherwise the first non-zero value produced by the
 * visitor or by the walk itself.
 */
int mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* ctx);
#ifdef __cplusplus
}
#endif

163
mdb.c
View File

@ -205,6 +205,7 @@ static MDB_INLINE void mdb_invalidate_cache(void *addr, int nbytes) {
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <alloca.h>
#if defined(__sun) || defined(ANDROID)
/* Most platforms have posix_memalign, older may only have memalign */
@ -9865,4 +9866,166 @@ mdb_env_get_oomfunc(MDB_env *env)
return env ? env->me_oom_func : NULL;
}
/** State shared by every level of a page walk. */
typedef struct mdb_walk_ctx {
	MDB_txn *mw_txn;             /**< transaction the pages are read through */
	void *mw_user;               /**< opaque pointer forwarded to the visitor */
	MDB_pgwalk_func *mw_visitor; /**< callback invoked for the visited pages */
} mdb_walk_ctx_t;
/** Depth-first traversal of a single b-tree, reporting its pages to the visitor.
 *
 * @param[in] ctx   walk state: transaction, visitor callback and its user pointer.
 * @param[in] dbi   name handed to the visitor for the pages of this tree.
 * @param[in] pg    root page number, or P_INVALID for an empty tree.
 * @param[in] flags node flags of the record that referenced this tree; when
 *                  F_DUPDATA is set the records on leaf pages are not inspected.
 * @param[in] deep  nesting level (0 for the trees started by mdb_env_pgwalk());
 *                  the 'R' root notification is only sent while deep < 2.
 * @return MDB_SUCCESS, the first non-zero visitor result, an error from
 *         mdb_page_get() or mdb_cursor_push(), MDB_CORRUPTED, or ENOMEM.
 */
static int ESECT
mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int deep)
{
	MDB_cursor mc;
	MDB_node *ni;
	MDB_page *mp;
	int rc;
	unsigned int i;

	if (deep < 2) {
		rc = ctx->mw_visitor(pg, 0, ctx->mw_user, dbi, 'R');
		if (rc)
			return rc;
	}

	/* Empty DB, nothing to do */
	if (pg == P_INVALID)
		return MDB_SUCCESS;

	/* Zero the whole cursor first: mc_flags is read-modify-written below and
	 * the cursor is also handed to mdb_cassert() and mdb_cursor_push().
	 */
	memset(&mc, 0, sizeof(mc));
	mc.mc_snum = 1;
	mc.mc_top = 0;
	mc.mc_txn = ctx->mw_txn;

	rc = mdb_page_get(ctx->mw_txn, pg, &mc.mc_pg[0], NULL);
	if (rc)
		return rc;

	/* Descend along the leftmost path, reporting every branch page. */
	for (mp = mc.mc_pg[mc.mc_top]; IS_BRANCH(mp); ) {
		MDB_node *node;

		rc = ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, 'B');
		if (rc)
			return rc;

		if (NUMKEYS(mp) < 1)
			return MDB_CORRUPTED;

		mdb_debug("branch page %zu has %u keys", mp->mp_pgno, NUMKEYS(mp));
		mdb_cassert(&mc, NUMKEYS(mp) > 1);
		mdb_debug("found index 0 to page %zu", NODEPGNO(NODEPTR(mp, 0)));

		node = NODEPTR(mp, 0);
		if ((rc = mdb_page_get(mc.mc_txn, NODEPGNO(node), &mp, NULL)) != 0)
			return rc;

		mc.mc_ki[mc.mc_top] = 0;
		if ((rc = mdb_cursor_push(&mc, mp)))
			return rc;
	}

	if (!IS_LEAF(mp)) {
		mdb_debug("internal error, index points to a %02X page!?",
			mp->mp_flags);
		mc.mc_txn->mt_flags |= MDB_TXN_ERROR;
		return MDB_CORRUPTED;
	}

	mc.mc_flags |= C_INITIALIZED;
	mc.mc_flags &= ~C_EOF;

	rc = ctx->mw_visitor(mp->mp_p.p_pgno, 1, ctx->mw_user, dbi, 'L');
	if (rc)
		return rc;

	while (mc.mc_snum > 0) {
		unsigned n;

		mp = mc.mc_pg[mc.mc_top];
		n = NUMKEYS(mp);
		if (IS_LEAF(mp)) {
			if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) {
				for (i = 0; i < n; i++) {
					ni = NODEPTR(mp, i);
					if (ni->mn_flags & F_BIGDATA) {
						MDB_page *omp;
						pgno_t ovpg;

						/* Copy instead of dereferencing: the node data
						 * is not guaranteed to be aligned for pgno_t.
						 */
						memcpy(&ovpg, NODEDATA(ni), sizeof(ovpg));
						rc = mdb_page_get(ctx->mw_txn, ovpg, &omp, NULL);
						if (rc)
							return rc;
						rc = ctx->mw_visitor(ovpg, omp->mp_pages, ctx->mw_user, dbi, 'L');
						if (rc)
							return rc;
					} else if (ni->mn_flags & F_SUBDATA) {
						MDB_db db;
						char *name = NULL;

						/* Same alignment concern as above. */
						memcpy(&db, NODEDATA(ni), sizeof(db));
						if (! (ni->mn_flags & F_DUPDATA)) {
							/* The record key is the sub-database name; make a
							 * NUL-terminated copy. Heap memory is used so the
							 * copy can be released after each record, instead
							 * of piling up on the stack until this function
							 * returns.
							 */
							const char *key = NODEKEY(ni);
							size_t namelen = (size_t)((const char *) NODEDATA(ni) - key);

							name = malloc(namelen + 1);
							if (! name)
								return ENOMEM;
							memcpy(name, key, namelen);
							name[namelen] = 0;
						}
						rc = mdb_env_walk(ctx, (name && name[0]) ? name : dbi,
							db.md_root, ni->mn_flags & F_DUPDATA, deep + 1);
						free(name);
						if (rc)
							return rc;
					}
				}
			}
		} else {
			mc.mc_ki[mc.mc_top]++;
			if (mc.mc_ki[mc.mc_top] < n) {
				pgno_t child;

				/* Whenever we advance to a sibling branch page,
				 * we must proceed all the way down to its first leaf.
				 */
				do {
					ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]);
					child = NODEPGNO(ni);
					rc = mdb_page_get(ctx->mw_txn, child, &mp, NULL);
					if (rc)
						return rc;

					/* Same sanity checks as on the initial descent. */
					if (!IS_BRANCH(mp) && !IS_LEAF(mp)) {
						mdb_debug("internal error, index points to a %02X page!?",
							mp->mp_flags);
						mc.mc_txn->mt_flags |= MDB_TXN_ERROR;
						return MDB_CORRUPTED;
					}

					rc = ctx->mw_visitor(child, 1, ctx->mw_user, dbi, IS_BRANCH(mp) ? 'B' : 'L');
					if (rc)
						return rc;

					if (IS_BRANCH(mp) && NUMKEYS(mp) < 1)
						return MDB_CORRUPTED;

					/* Bounds-checked push; it also resets the index of
					 * the new top entry to 0.
					 */
					rc = mdb_cursor_push(&mc, mp);
					if (rc)
						return rc;
				} while (IS_BRANCH(mp));
				continue;
			}
		}

		if (! mc.mc_top)
			break;
		mdb_cursor_pop(&mc);
	}

	return MDB_SUCCESS;
}
/** Report the pages reachable through txn to the visitor.
 *
 * The visitor is first called for 2 pages starting at page 0 under the name
 * "meta", then the FREE_DBI tree is walked as "free" and the MAIN_DBI tree
 * as "main". A last call with P_INVALID, a zero page count, a NULL name and
 * type 0 closes the walk (presumably an end-of-walk marker for the visitor).
 *
 * @return 0 on success, otherwise the first non-zero result from the
 *         visitor or from mdb_env_walk().
 */
int mdb_env_pgwalk(MDB_txn *txn, MDB_pgwalk_func* visitor, void* user)
{
	mdb_walk_ctx_t walk;
	int err;

	walk.mw_txn = txn;
	walk.mw_user = user;
	walk.mw_visitor = visitor;

	err = visitor(0, 2, user, "meta", 'M');
	if (err)
		return err;

	err = mdb_env_walk(&walk, "free", txn->mt_dbs[FREE_DBI].md_root, 0, 0);
	if (err)
		return err;

	err = mdb_env_walk(&walk, "main", txn->mt_dbs[MAIN_DBI].md_root, 0, 0);
	if (err)
		return err;

	return visitor(P_INVALID, 0, user, NULL, 0);
}
/** @} */

148
mdb_chk.c
View File

@ -26,6 +26,7 @@
#include <unistd.h>
#include <signal.h>
#include <stdarg.h>
#include <malloc.h>
#include "lmdb.h"
#include "midl.h"
@ -52,13 +53,20 @@ static void signal_hanlder( int sig )
gotsignal = 1;
}
#define MAX_DBI 32768
const char* dbi_names[MAX_DBI] = { "@gc" };
size_t dbi_pages[MAX_DBI];
short *pagemap;
MDB_env *env;
MDB_txn *txn;
MDB_envinfo info;
MDB_stat stat;
size_t maxkeysize, reclaimable_pages, freedb_pages;
size_t maxkeysize, reclaimable_pages, freedb_pages, lastpgno;
unsigned userdb_count;
unsigned verbose = 1, quiet;
size_t pgcount;
static void print(const char* msg, ...) {
if (! quiet) {
@ -91,6 +99,23 @@ struct problem {
struct problem* problems_list;
size_t total_problems;
/** Find the slot of a database name in dbi_names[], adding it if it is new.
 *
 * Slot 0 is never returned; the search starts at slot 1. The most recently
 * returned slot is cached because consecutive lookups usually repeat the
 * same name.
 *
 * @param dbi  NUL-terminated database name; must not be NULL.
 * @return the slot index (>= 1), or -1 when the table is full or the name
 *         could not be duplicated.
 */
static int pagemap_lookup_dbi(const char* dbi) {
	static int last;
	char *copy;
	int i;

	if (last > 0 && strcmp(dbi_names[last], dbi) == 0)
		return last;

	/* Test the bound before touching the slot, so dbi_names[MAX_DBI]
	 * is never read.
	 */
	for (i = 1; i < MAX_DBI && dbi_names[i]; ++i)
		if (strcmp(dbi_names[i], dbi) == 0)
			return last = i;

	if (i == MAX_DBI)
		return -1;

	/* Only publish the slot once the copy exists; a NULL entry would be
	 * handed to strcmp() on a later lookup.
	 */
	copy = strdup(dbi);
	if (! copy)
		return -1;

	dbi_names[i] = copy;
	return last = i;
}
static void problem_add(size_t entry_number, const char* msg, const char *extra, ...) {
total_problems++;
@ -149,6 +174,31 @@ static size_t problems_pop(struct problem* list) {
return total;
}
/** Page-walk callback: record in pagemap[] which database each page belongs to.
 *
 * Calls with pgnumber == 0 are ignored. Otherwise pgnumber is added to
 * pgcount and each page in [pgno, pgno + pgnumber) is checked: a page number
 * not below lastpgno, or a page that already has an owner, is reported through
 * problem_add(); any other page is assigned to dbi and counted in dbi_pages[].
 *
 * @param pgno      first page of the reported run.
 * @param pgnumber  number of consecutive pages in the run.
 * @param ctx       unused.
 * @param dbi       name of the database the pages belong to.
 * @param type      unused.
 * @return MDB_SUCCESS, or ENOMEM when the name cannot be registered.
 */
static int pgvisitor(size_t pgno, unsigned pgnumber, void* ctx, const char* dbi, char type)
{
	int index;

	(void) ctx;
	(void) type;

	if (! pgnumber)
		return MDB_SUCCESS;

	pgcount += pgnumber;

	index = pagemap_lookup_dbi(dbi);
	if (index < 0)
		return ENOMEM;

	do {
		if (pgno >= lastpgno)
			/* lastpgno is a size_t, so it needs the unsigned conversion. */
			problem_add(pgno, "wrong page-no", "(> %zu)", lastpgno);
		else if (pagemap[pgno])
			problem_add(pgno, "page already used", "(in %s)", dbi_names[pagemap[pgno]]);
		else {
			pagemap[pgno] = index;
			dbi_pages[index] += 1;
		}
		++pgno;
	} while (--pgnumber);

	return MDB_SUCCESS;
}
typedef long (visitor)(size_t record_number, MDB_val *key, MDB_val* data);
static long process_db(MDB_dbi dbi, char *name, visitor *handler, int silent);
@ -362,7 +412,6 @@ static long process_db(MDB_dbi dbi, char *name, visitor *handler, int silent)
if (record_count != ms.ms_entries )
problem_add(record_count, "differentent number of entries",
" (%zu != %zu)", record_count, ms.ms_entries);
bailout:
problems_count = problems_pop(saved_list);
if (! silent && verbose) {
@ -388,6 +437,7 @@ int main(int argc, char *argv[])
char *envname;
int envflags = 0;
long problems_maindb = 0, problems_freedb = 0, problems_deep = 0;
size_t n;
if (argc < 2) {
usage(prog);
@ -463,7 +513,16 @@ int main(int argc, char *argv[])
goto bailout;
}
if (! quiet && verbose) {
lastpgno = info.me_last_pgno + 1;
errno = 0;
pagemap = calloc(lastpgno, sizeof(*pagemap));
if (! pagemap) {
rc = errno ? errno : ENOMEM;
error("calloc failed, error %d %s\n", rc, mdb_strerror(rc));
goto bailout;
}
if (verbose) {
print(" - map size %zu (%.1fMb, %.1fGb)\n", info.me_mapsize,
(double) info.me_mapsize / (1024 * 1024),
(double) info.me_mapsize / (1024 * 1024 * 1024));
@ -473,31 +532,6 @@ int main(int argc, char *argv[])
stat.ms_psize, maxkeysize, info.me_maxreaders);
print(" - last txn %zu, tail %zu (%zi)\n", info.me_last_txnid,
info.me_tail_txnid, info.me_tail_txnid - info.me_last_txnid);
size_t value = info.me_mapsize / stat.ms_psize;
double percent = value / 100.0;
print(" - pages: %zu total", value);
value = info.me_last_pgno + 1;
print(", allocated %zu (%.1f%%)", value, value / percent);
value = info.me_mapsize / stat.ms_psize - (info.me_last_pgno+1);
print(", remained %zu (%.1f%%)", value, value / percent);
value = info.me_last_pgno + 1 - freedb_pages;
print(", used now %zu (%.1f%%)", value, value / percent);
value = freedb_pages;
print(", free %zu (%.1f%%)", value, value / percent);
value = freedb_pages - reclaimable_pages;
print(", reading %zu (%.1f%%)", value, value / percent);
value = reclaimable_pages;
print(", reclaimable %zu (%.1f%%)", value, value / percent);
value = info.me_mapsize / stat.ms_psize - (info.me_last_pgno + 1) + reclaimable_pages;
print(", available %zu (%.1f%%)\n", value, value / percent);
}
rc = mdb_txn_begin(env, NULL, MDB_RDONLY, &txn);
@ -506,20 +540,74 @@ int main(int argc, char *argv[])
goto bailout;
}
problems_maindb = process_db(-1, /* MAINT_DBI */ NULL, NULL, 0);
print("Walking b-tree...\n");
rc = mdb_env_pgwalk(txn, pgvisitor, NULL);
if (rc) {
error("mdb_env_pgwalk failed, error %d %s\n", rc, mdb_strerror(rc));
goto bailout;
}
for( n = 0; n < lastpgno; ++n)
if (! pagemap[n])
dbi_pages[0] += 1;
if (verbose) {
print(" - dbi pages: %zu total", pgcount);
if (verbose > 1)
for (i = 1; i < MAX_DBI && dbi_names[i]; ++i)
print(", %s %zu", dbi_names[i], dbi_pages[i]);
print(", %s %zu\n", dbi_names[0], dbi_pages[0]);
}
problems_maindb = process_db(-1, /* MAIN_DBI */ NULL, NULL, 0);
problems_freedb = process_db(0 /* FREE_DBI */, "free", handle_freedb, 0);
if (verbose) {
size_t value = info.me_mapsize / stat.ms_psize;
double percent = value / 100.0;
print(" - pages info: %zu total", value);
print(", allocated %zu (%.1f%%)", lastpgno, lastpgno / percent);
if (verbose > 1) {
value = info.me_mapsize / stat.ms_psize - lastpgno;
print(", remained %zu (%.1f%%)", value, value / percent);
value = lastpgno - freedb_pages;
print(", used %zu (%.1f%%)", value, value / percent);
print(", gc %zu (%.1f%%)", freedb_pages, freedb_pages / percent);
value = freedb_pages - reclaimable_pages;
print(", reading %zu (%.1f%%)", value, value / percent);
print(", reclaimable %zu (%.1f%%)", reclaimable_pages, reclaimable_pages / percent);
}
value = info.me_mapsize / stat.ms_psize - lastpgno + reclaimable_pages;
print(", available %zu (%.1f%%)\n", value, value / percent);
}
if (pgcount != lastpgno - freedb_pages) {
error("used pages mismatch (%zu != %zu)\n", pgcount, lastpgno - freedb_pages);
goto bailout;
}
if (dbi_pages[0] != freedb_pages) {
error("gc pages mismatch (%zu != %zu)\n", dbi_pages[0], freedb_pages);
goto bailout;
}
if (problems_maindb == 0 && problems_freedb == 0)
problems_deep = process_db(-1, NULL, handle_maindb, 1);
mdb_txn_abort(txn);
if (! userdb_count && verbose)
print("%s: %s does not contain multiple databases\n", prog, envname);
if (rc && ! quiet)
if (rc)
error("%s: %s: %s\n", prog, envname, mdb_strerror(rc));
bailout:
mdb_env_close(env);
free(pagemap);
if (rc)
return EXIT_FAILURE + 2;
if (problems_maindb || problems_freedb)