/* mdbx_chk.c - memory-mapped database check tool */ /* * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted only as authorized by the OpenLDAP * Public License. * * A copy of this license is available in the file LICENSE in the * top-level directory of the distribution or, alternatively, at * . */ #ifdef _MSC_VER #if _MSC_VER > 1800 #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ #define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include typedef struct flagbit { int bit; const char *name; } flagbit; const flagbit dbflags[] = {{MDBX_DUPSORT, "dupsort"}, {MDBX_INTEGERKEY, "integerkey"}, {MDBX_REVERSEKEY, "reversekey"}, {MDBX_DUPFIXED, "dupfixed"}, {MDBX_REVERSEDUP, "reversedup"}, {MDBX_INTEGERDUP, "integerdup"}, {0, nullptr}}; #if defined(_WIN32) || defined(_WIN64) #include "wingetopt.h" static volatile BOOL user_break; static BOOL WINAPI ConsoleBreakHandlerRoutine(DWORD dwCtrlType) { (void)dwCtrlType; user_break = 1; return true; } static uint64_t GetMilliseconds(void) { LARGE_INTEGER Counter, Frequency; return (QueryPerformanceFrequency(&Frequency) && QueryPerformanceCounter(&Counter)) ? Counter.QuadPart * 1000ul / Frequency.QuadPart : 0; } #else /* WINDOWS */ static volatile sig_atomic_t user_break; static void signal_handler(int sig) { (void)sig; user_break = 1; } #endif /* !WINDOWS */ #define EXIT_INTERRUPTED (EXIT_FAILURE + 4) #define EXIT_FAILURE_SYS (EXIT_FAILURE + 3) #define EXIT_FAILURE_MDBX (EXIT_FAILURE + 2) #define EXIT_FAILURE_CHECK_MAJOR (EXIT_FAILURE + 1) #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE typedef struct { MDBX_val name; struct { uint64_t branch, large_count, large_volume, leaf; uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed; uint64_t total, empty, other; } pages; uint64_t payload_bytes; uint64_t lost_bytes; } walk_dbi_t; struct { short *pagemap; uint64_t total_payload_bytes; uint64_t pgcount; walk_dbi_t dbi[MDBX_MAX_DBI + CORE_DBS + /* account pseudo-entry for meta */ 1]; } walk; #define dbi_free walk.dbi[FREE_DBI] #define dbi_main walk.dbi[MAIN_DBI] #define dbi_meta walk.dbi[CORE_DBS] int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; MDBX_env *env; MDBX_txn *txn; MDBX_envinfo envinfo; size_t userdb_count, skipped_subdb; uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; unsigned verbose; bool ignore_wrong_order, quiet, dont_traversal; MDBX_val only_subdb; int stuck_meta = -1; struct problem { struct problem *pr_next; size_t count; const char *caption; }; struct problem *problems_list; unsigned total_problems, data_tree_problems, gc_tree_problems; static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) { if (!quiet) { va_list args; fflush(stderr); va_start(args, msg); vfprintf(stdout, msg, args); va_end(args); } } static MDBX_val printable_buf; static void free_printable_buf(void) { osal_free(printable_buf.iov_base); } static const char *sdb_name(const MDBX_val *val) { if (val == MDBX_PGWALK_MAIN) return "@MAIN"; if (val == MDBX_PGWALK_GC) return "@GC"; if (val == MDBX_PGWALK_META) return "@META"; const unsigned char *const data = val->iov_base; const size_t len = val->iov_len; if (data == MDBX_PGWALK_MAIN) return "@MAIN"; if (data == MDBX_PGWALK_GC) return "@GC"; if (data == MDBX_PGWALK_META) return "@META"; if (!len) return ""; if (!data) return ""; if (len > 65536) { static char buf[64]; /* NOTE: There is MSYS2 MinGW bug if you here got * the "unknown conversion type character ā€˜zā€™ in format [-Werror=format=]" * https://stackoverflow.com/questions/74504432/whats-the-proper-way-to-tell-mingw-based-gcc-to-use-ansi-stdio-output-on-windo */ snprintf(buf, sizeof(buf), "", len); return buf; } bool printable = true; bool quoting = false; size_t xchars = 0; for (size_t i = 0; i < val->iov_len && printable; ++i) { quoting |= data[i] != '_' && isalnum(data[i]) == 0; printable = isprint(data[i]) != 0 || (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); } size_t need = len + 1; if (quoting || !printable) need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; if (need > printable_buf.iov_len) { void *ptr = osal_realloc(printable_buf.iov_base, need); if (!ptr) return ""; if (!printable_buf.iov_base) atexit(free_printable_buf); printable_buf.iov_base = ptr; printable_buf.iov_len = need; } char *out = printable_buf.iov_base; if (!quoting) { memcpy(out, data, len); out += len; } else if (printable) { *out++ = '\''; for (size_t i = 0; i < len; ++i) { if (data[i] < ' ') { assert((char *)printable_buf.iov_base + printable_buf.iov_len > out + 4); static const char hex[] = "0123456789abcdef"; out[0] = '\\'; out[1] = 'x'; out[2] = hex[data[i] >> 4]; out[3] = hex[data[i] & 15]; out += 4; } else if (strchr("\"'`\\", data[i])) { assert((char *)printable_buf.iov_base + printable_buf.iov_len > out + 2); out[0] = '\\'; out[1] = data[i]; out += 2; } else { assert((char *)printable_buf.iov_base + printable_buf.iov_len > out + 1); *out++ = data[i]; } } *out++ = '\''; } assert((char *)printable_buf.iov_base + printable_buf.iov_len > out); *out = 0; return printable_buf.iov_base; } static void va_log(MDBX_log_level_t level, const char *function, int line, const char *msg, va_list args) { static const char *const prefixes[] = { "!!!fatal: ", " ! " /* error */, " ~ " /* warning */, " " /* notice */, " // " /* verbose */, " //// " /* debug */, " ////// " /* trace */ }; FILE *out = stdout; if (level <= MDBX_LOG_ERROR) { total_problems++; out = stderr; } if (!quiet && verbose + 1 >= (unsigned)level && (unsigned)level < ARRAY_LENGTH(prefixes)) { fflush(nullptr); fputs(prefixes[level], out); vfprintf(out, msg, args); const bool have_lf = msg[strlen(msg) - 1] == '\n'; if (level == MDBX_LOG_FATAL && function && line) fprintf(out, have_lf ? " %s(), %u\n" : " (%s:%u)\n", function + (strncmp(function, "mdbx_", 5) ? 5 : 0), line); else if (!have_lf) fputc('\n', out); fflush(nullptr); } if (level == MDBX_LOG_FATAL) { #if !MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS exit(EXIT_FAILURE_MDBX); #endif abort(); } } static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) { va_list args; va_start(args, msg); va_log(MDBX_LOG_ERROR, nullptr, 0, msg, args); va_end(args); } static void logger(MDBX_log_level_t level, const char *function, int line, const char *msg, va_list args) { (void)line; (void)function; if (level < MDBX_LOG_EXTRA) va_log(level, function, line, msg, args); } static int check_user_break(void) { switch (user_break) { case 0: return MDBX_SUCCESS; case 1: print(" - interrupted by signal\n"); fflush(nullptr); user_break = 2; } return MDBX_EINTR; } static void pagemap_cleanup(void) { osal_free(walk.pagemap); walk.pagemap = nullptr; } static bool eq(const MDBX_val a, const MDBX_val b) { return a.iov_len == b.iov_len && (a.iov_base == b.iov_base || a.iov_len == 0 || !memcmp(a.iov_base, b.iov_base, a.iov_len)); } static walk_dbi_t *pagemap_lookup_dbi(const MDBX_val *dbi_name, bool silent) { static walk_dbi_t *last; if (dbi_name == MDBX_PGWALK_MAIN) return &dbi_main; if (dbi_name == MDBX_PGWALK_GC) return &dbi_free; if (dbi_name == MDBX_PGWALK_META) return &dbi_meta; if (last && eq(last->name, *dbi_name)) return last; walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; for (; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { if (eq(dbi->name, *dbi_name)) return last = dbi; } if (verbose > 0 && !silent) { print(" - found %s area\n", sdb_name(dbi_name)); fflush(nullptr); } if (dbi == ARRAY_END(walk.dbi)) return nullptr; dbi->name = *dbi_name; return last = dbi; } static void MDBX_PRINTF_ARGS(4, 5) problem_add(const char *object, uint64_t entry_number, const char *msg, const char *extra, ...) { total_problems++; if (!quiet) { int need_fflush = 0; struct problem *p; for (p = problems_list; p; p = p->pr_next) if (p->caption == msg) break; if (!p) { p = osal_calloc(1, sizeof(*p)); if (unlikely(!p)) return; p->caption = msg; p->pr_next = problems_list; problems_list = p; need_fflush = 1; } p->count++; if (verbose > 1) { print(" %s #%" PRIu64 ": %s", object, entry_number, msg); if (extra) { va_list args; printf(" ("); va_start(args, extra); vfprintf(stdout, extra, args); va_end(args); printf(")"); } printf("\n"); if (need_fflush) fflush(nullptr); } } } static struct problem *problems_push(void) { struct problem *p = problems_list; problems_list = nullptr; return p; } static size_t problems_pop(struct problem *list) { size_t count = 0; if (problems_list) { int i; print(" - problems: "); for (i = 0; problems_list; ++i) { struct problem *p = problems_list->pr_next; count += problems_list->count; print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, problems_list->count); osal_free(problems_list); problems_list = p; } print("\n"); fflush(nullptr); } problems_list = list; return count; } static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, void *const ctx, const int deep, const MDBX_val *dbi_name, const size_t page_size, const MDBX_page_type_t pagetype, const MDBX_error_t err, const size_t nentries, const size_t payload_bytes, const size_t header_bytes, const size_t unused_bytes) { (void)ctx; const bool is_gc_tree = dbi_name == MDBX_PGWALK_GC; if (deep > 42) { problem_add("deep", deep, "too large", nullptr); data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; return MDBX_CORRUPTED /* avoid infinite loop/recursion */; } walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name, false); if (!dbi) { data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; return MDBX_ENOMEM; } const size_t page_bytes = payload_bytes + header_bytes + unused_bytes; walk.pgcount += pgnumber; const char *pagetype_caption; bool branch = false; switch (pagetype) { default: problem_add("page", pgno, "unknown page-type", "type %u, deep %i", (unsigned)pagetype, deep); pagetype_caption = "unknown"; dbi->pages.other += pgnumber; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; break; case MDBX_page_broken: pagetype_caption = "broken"; dbi->pages.other += pgnumber; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; break; case MDBX_subpage_broken: pagetype_caption = "broken-subpage"; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; break; case MDBX_page_meta: pagetype_caption = "meta"; dbi->pages.other += pgnumber; break; case MDBX_page_large: pagetype_caption = "large"; dbi->pages.large_volume += pgnumber; dbi->pages.large_count += 1; break; case MDBX_page_branch: pagetype_caption = "branch"; dbi->pages.branch += pgnumber; branch = true; break; case MDBX_page_leaf: pagetype_caption = "leaf"; dbi->pages.leaf += pgnumber; break; case MDBX_page_dupfixed_leaf: pagetype_caption = "leaf-dupfixed"; dbi->pages.leaf_dupfixed += pgnumber; break; case MDBX_subpage_leaf: pagetype_caption = "subleaf-dupsort"; dbi->pages.subleaf_dupsort += 1; break; case MDBX_subpage_dupfixed_leaf: pagetype_caption = "subleaf-dupfixed"; dbi->pages.subleaf_dupfixed += 1; break; } if (pgnumber) { if (verbose > 3 && (!only_subdb.iov_base || eq(only_subdb, dbi->name))) { if (pgnumber == 1) print(" %s-page %" PRIu64, pagetype_caption, pgno); else print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber); print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i\n", sdb_name(&dbi->name), header_bytes, (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries, payload_bytes, unused_bytes, deep); } bool already_used = false; for (unsigned n = 0; n < pgnumber; ++n) { uint64_t spanpgno = pgno + n; if (spanpgno >= alloc_pages) { problem_add("page", spanpgno, "wrong page-no", "%s-page: %" PRIu64 " > %" PRIu64 ", deep %i", pagetype_caption, spanpgno, alloc_pages, deep); data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else if (walk.pagemap[spanpgno]) { walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; problem_add("page", spanpgno, (branch && coll_dbi == dbi) ? "loop" : "already used", "%s-page: by %s, deep %i", pagetype_caption, sdb_name(&coll_dbi->name), deep); already_used = true; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { walk.pagemap[spanpgno] = (short)(dbi - walk.dbi + 1); dbi->pages.total += 1; } } if (already_used) return branch ? MDBX_RESULT_TRUE /* avoid infinite loop/recursion */ : MDBX_SUCCESS; } if (MDBX_IS_ERROR(err)) { problem_add("page", pgno, "invalid/corrupted", "%s-page", pagetype_caption); data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { if (unused_bytes > page_size) { problem_add("page", pgno, "illegal unused-bytes", "%s-page: %u < %" PRIuPTR " < %u", pagetype_caption, 0, unused_bytes, envinfo.mi_dxb_pagesize); data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } if (header_bytes < (int)sizeof(long) || (size_t)header_bytes >= envinfo.mi_dxb_pagesize - sizeof(long)) { problem_add("page", pgno, "illegal header-length", "%s-page: %" PRIuPTR " < %" PRIuPTR " < %" PRIuPTR, pagetype_caption, sizeof(long), header_bytes, envinfo.mi_dxb_pagesize - sizeof(long)); data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } if (payload_bytes < 1) { if (nentries > 1) { problem_add("page", pgno, "zero size-of-entry", "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR " entries", pagetype_caption, payload_bytes, nentries); /* if ((size_t)header_bytes + unused_bytes < page_size) { // LY: hush a misuse error page_bytes = page_size; } */ data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { problem_add("page", pgno, "empty", "%s-page: payload %" PRIuPTR " bytes, %" PRIuPTR " entries, deep %i", pagetype_caption, payload_bytes, nentries, deep); dbi->pages.empty += 1; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } } if (pgnumber) { if (page_bytes != page_size) { problem_add("page", pgno, "misused", "%s-page: %" PRIuPTR " != %" PRIuPTR " (%" PRIuPTR "h + %" PRIuPTR "p + %" PRIuPTR "u), deep %i", pagetype_caption, page_size, page_bytes, header_bytes, payload_bytes, unused_bytes, deep); if (page_size > page_bytes) dbi->lost_bytes += page_size - page_bytes; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { dbi->payload_bytes += payload_bytes + header_bytes; walk.total_payload_bytes += payload_bytes + header_bytes; } } } return check_user_break(); } typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, visitor *handler); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { (void)record_number; (void)key; (void)data; return check_user_break(); } static int handle_freedb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { char *bad = ""; pgno_t *iptr = data->iov_base; if (key->iov_len != sizeof(txnid_t)) problem_add("entry", record_number, "wrong txn-id size", "key-size %" PRIiPTR, key->iov_len); else { txnid_t txnid; memcpy(&txnid, key->iov_base, sizeof(txnid)); if (txnid < 1 || txnid > envinfo.mi_recent_txnid) problem_add("entry", record_number, "wrong txn-id", "%" PRIaTXN, txnid); else { if (data->iov_len < sizeof(pgno_t) || data->iov_len % sizeof(pgno_t)) problem_add("entry", txnid, "wrong idl size", "%" PRIuPTR, data->iov_len); size_t number = (data->iov_len >= sizeof(pgno_t)) ? *iptr++ : 0; if (number < 1 || number > MDBX_PGL_LIMIT) problem_add("entry", txnid, "wrong idl length", "%" PRIuPTR, number); else if ((number + 1) * sizeof(pgno_t) > data->iov_len) { problem_add("entry", txnid, "trimmed idl", "%" PRIuSIZE " > %" PRIuSIZE " (corruption)", (number + 1) * sizeof(pgno_t), data->iov_len); number = data->iov_len / sizeof(pgno_t) - 1; } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= /* LY: allow gap up to one page. it is ok * and better than shink-and-retry inside update_gc() */ envinfo.mi_dxb_pagesize) problem_add("entry", txnid, "extra idl space", "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", (number + 1) * sizeof(pgno_t), data->iov_len); gc_pages += number; if (envinfo.mi_latter_reader_txnid > txnid) reclaimable_pages += number; pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno; pgno_t span = 1; for (unsigned i = 0; i < number; ++i) { if (check_user_break()) return MDBX_EINTR; const pgno_t pgno = iptr[i]; if (pgno < NUM_METAS) problem_add("entry", txnid, "wrong idl entry", "pgno %" PRIaPGNO " < meta-pages %u", pgno, NUM_METAS); else if (pgno >= backed_pages) problem_add("entry", txnid, "wrong idl entry", "pgno %" PRIaPGNO " > backed-pages %" PRIu64, pgno, backed_pages); else if (pgno >= alloc_pages) problem_add("entry", txnid, "wrong idl entry", "pgno %" PRIaPGNO " > alloc-pages %" PRIu64, pgno, alloc_pages - 1); else { if (MDBX_PNL_DISORDERED(prev, pgno)) { bad = " [bad sequence]"; problem_add("entry", txnid, "bad sequence", "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, pgno); } if (walk.pagemap) { int idx = walk.pagemap[pgno]; if (idx == 0) walk.pagemap[pgno] = -1; else if (idx > 0) problem_add("page", pgno, "already used", "by %s", sdb_name(&walk.dbi[idx - 1].name)); else problem_add("page", pgno, "already listed in GC", nullptr); } } prev = pgno; while (i + span < number && iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) : pgno_sub(pgno, span))) ++span; } if (verbose > 3 && !only_subdb.iov_base) { print(" transaction %" PRIaTXN ", %" PRIuPTR " pages, maxspan %" PRIaPGNO "%s\n", txnid, number, span, bad); if (verbose > 4) { for (unsigned i = 0; i < number; i += span) { const pgno_t pgno = iptr[i]; for (span = 1; i + span < number && iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pgno, span) : pgno_sub(pgno, span)); ++span) ; if (span > 1) { print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pgno, span); } else print(" %9" PRIaPGNO "\n", pgno); } } } } } return check_user_break(); } static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { return eq(*a, *b) ? 0 : 1; } static int handle_maindb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { if (data->iov_len == sizeof(MDBX_db)) { int rc = process_db(~0u, key, handle_userdb); if (rc != MDBX_INCOMPATIBLE) { userdb_count++; return rc; } } return handle_userdb(record_number, key, data); } static const char *db_flags2keymode(unsigned flags) { flags &= (MDBX_REVERSEKEY | MDBX_INTEGERKEY); switch (flags) { case 0: return "usual"; case MDBX_REVERSEKEY: return "reserve"; case MDBX_INTEGERKEY: return "ordinal"; case MDBX_REVERSEKEY | MDBX_INTEGERKEY: return "msgpack"; default: assert(false); __unreachable(); } } static const char *db_flags2valuemode(unsigned flags) { flags &= (MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_DUPFIXED | MDBX_INTEGERDUP); switch (flags) { case 0: return "single"; case MDBX_DUPSORT: return "multi"; case MDBX_REVERSEDUP: case MDBX_DUPSORT | MDBX_REVERSEDUP: return "multi-reverse"; case MDBX_DUPFIXED: case MDBX_DUPSORT | MDBX_DUPFIXED: return "multi-samelength"; case MDBX_DUPFIXED | MDBX_REVERSEDUP: case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP: return "multi-reverse-samelength"; case MDBX_INTEGERDUP: case MDBX_DUPSORT | MDBX_INTEGERDUP: case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP: case MDBX_DUPFIXED | MDBX_INTEGERDUP: return "multi-ordinal"; case MDBX_INTEGERDUP | MDBX_REVERSEDUP: case MDBX_DUPSORT | MDBX_INTEGERDUP | MDBX_REVERSEDUP: return "multi-msgpack"; case MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: case MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP: return "reserved"; default: assert(false); __unreachable(); } } static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, visitor *handler) { MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; MDBX_val prev_key, prev_data; unsigned flags; int rc, i; struct problem *saved_list; uint64_t problems_count; const bool second_pass = dbi_handle == MAIN_DBI; uint64_t record_count = 0, dups = 0; uint64_t key_bytes = 0, data_bytes = 0; if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { print(" ! abort processing %s due to a previous error\n", sdb_name(dbi_name)); return MDBX_BAD_TXN; } if (dbi_handle == ~0u) { rc = mdbx_dbi_open_ex2( txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle, (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr, (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr); if (rc) { if (!dbi_name || rc != MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { error("mdbx_dbi_open(%s) failed, error %d %s\n", sdb_name(dbi_name), rc, mdbx_strerror(rc)); } return rc; } } if (dbi_handle >= CORE_DBS && dbi_name && only_subdb.iov_base && !eq(only_subdb, *dbi_name)) { if (verbose) { print("Skip processing %s...\n", sdb_name(dbi_name)); fflush(nullptr); } skipped_subdb++; return MDBX_SUCCESS; } if (!second_pass && verbose) print("Processing %s...\n", sdb_name(dbi_name)); fflush(nullptr); rc = mdbx_dbi_flags(txn, dbi_handle, &flags); if (rc) { error("mdbx_dbi_flags() failed, error %d %s\n", rc, mdbx_strerror(rc)); return rc; } rc = mdbx_dbi_stat(txn, dbi_handle, &ms, sizeof(ms)); if (rc) { error("mdbx_dbi_stat() failed, error %d %s\n", rc, mdbx_strerror(rc)); return rc; } if (!second_pass && verbose) { print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags), db_flags2valuemode(flags)); if (verbose > 1) { print(", flags:"); if (!flags) print(" none"); else { for (i = 0; dbflags[i].bit; i++) if (flags & dbflags[i].bit) print(" %s", dbflags[i].name); } if (verbose > 2) print(" (0x%02X), dbi-id %d", flags, dbi_handle); } print("\n"); if (ms.ms_mod_txnid) print(" - last modification txn#%" PRIu64 "\n", ms.ms_mod_txnid); if (verbose > 1) { print(" - page size %u, entries %" PRIu64 "\n", ms.ms_psize, ms.ms_entries); print(" - b-tree depth %u, pages: branch %" PRIu64 ", leaf %" PRIu64 ", overflow %" PRIu64 "\n", ms.ms_depth, ms.ms_branch_pages, ms.ms_leaf_pages, ms.ms_overflow_pages); } } walk_dbi_t *dbi = (dbi_handle < CORE_DBS) ? &walk.dbi[dbi_handle] : pagemap_lookup_dbi(dbi_name, true); if (!dbi) { error("too many DBIs or out of memory\n"); return MDBX_ENOMEM; } if (!dont_traversal) { const uint64_t subtotal_pages = ms.ms_branch_pages + ms.ms_leaf_pages + ms.ms_overflow_pages; if (subtotal_pages != dbi->pages.total) error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "subtotal", subtotal_pages, dbi->pages.total); if (ms.ms_branch_pages != dbi->pages.branch) error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "branch", ms.ms_branch_pages, dbi->pages.branch); const uint64_t allleaf_pages = dbi->pages.leaf + dbi->pages.leaf_dupfixed; if (ms.ms_leaf_pages != allleaf_pages) error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "all-leaf", ms.ms_leaf_pages, allleaf_pages); if (ms.ms_overflow_pages != dbi->pages.large_volume) error("%s pages mismatch (%" PRIu64 " != walked %" PRIu64 ")\n", "large/overlow", ms.ms_overflow_pages, dbi->pages.large_volume); } rc = mdbx_cursor_open(txn, dbi_handle, &mc); if (rc) { error("mdbx_cursor_open() failed, error %d %s\n", rc, mdbx_strerror(rc)); return rc; } if (ignore_wrong_order) { /* for debugging with enabled assertions */ mc->mc_checking |= CC_SKIPORD; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); saved_list = problems_push(); prev_key.iov_base = nullptr; prev_key.iov_len = 0; prev_data.iov_base = nullptr; prev_data.iov_len = 0; rc = mdbx_cursor_get(mc, &key, &data, MDBX_FIRST); while (rc == MDBX_SUCCESS) { rc = check_user_break(); if (rc) goto bailout; if (!second_pass) { bool bad_key = false; if (key.iov_len > maxkeysize) { problem_add("entry", record_count, "key length exceeds max-key-size", "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); bad_key = true; } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && key.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong key length", "%" PRIuPTR " != 4or8", key.iov_len); bad_key = true; } bool bad_data = false; if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && data.iov_len != sizeof(uint32_t)) { problem_add("entry", record_count, "wrong data length", "%" PRIuPTR " != 4or8", data.iov_len); bad_data = true; } if (prev_key.iov_base) { if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && prev_data.iov_len != data.iov_len) { problem_add("entry", record_count, "different data length", "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, data.iov_len); bad_data = true; } if (!bad_key) { int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); if (cmp == 0) { ++dups; if ((flags & MDBX_DUPSORT) == 0) { problem_add("entry", record_count, "duplicated entries", nullptr); if (prev_data.iov_base && data.iov_len == prev_data.iov_len && memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) { problem_add("entry", record_count, "complete duplicate", nullptr); } } else if (!bad_data && prev_data.iov_base) { cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); if (cmp == 0) { problem_add("entry", record_count, "complete duplicate", nullptr); } else if (cmp < 0 && !ignore_wrong_order) { problem_add("entry", record_count, "wrong order of multi-values", nullptr); } } } else if (cmp < 0 && !ignore_wrong_order) { problem_add("entry", record_count, "wrong order of entries", nullptr); } } } if (!bad_key) { if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); prev_key = key; } if (!bad_data) { if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && !prev_data.iov_base) print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); prev_data = data; } } if (handler) { rc = handler(record_count, &key, &data); if (MDBX_IS_ERROR(rc)) goto bailout; } record_count++; key_bytes += key.iov_len; data_bytes += data.iov_len; rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); } if (rc != MDBX_NOTFOUND) error("mdbx_cursor_get() failed, error %d %s\n", rc, mdbx_strerror(rc)); else rc = 0; if (record_count != ms.ms_entries) problem_add("entry", record_count, "different number of entries", "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries); bailout: problems_count = problems_pop(saved_list); if (!second_pass && verbose) { print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 " key's bytes, %" PRIu64 " data's " "bytes, %" PRIu64 " problems\n", record_count, dups, key_bytes, data_bytes, problems_count); fflush(nullptr); } mdbx_cursor_close(mc); return (rc || problems_count) ? MDBX_RESULT_TRUE : MDBX_SUCCESS; } static void usage(char *prog) { fprintf( stderr, "usage: %s " "[-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] [-u|U] dbpath\n" " -V\t\tprint version and exit\n" " -v\t\tmore verbose, could be used multiple times\n" " -q\t\tbe quiet\n" " -c\t\tforce cooperative mode (don't try exclusive)\n" " -w\t\twrite-mode checking\n" " -d\t\tdisable page-by-page traversal of B-tree\n" " -i\t\tignore wrong order errors (for custom comparators case)\n" " -s subdb\tprocess a specific subdatabase only\n" " -u\t\twarmup database before checking\n" " -U\t\twarmup and try lock database pages in memory before checking\n" " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n" " -t\t\tturn to a specified meta-page on successful check\n" " -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n", prog); exit(EXIT_INTERRUPTED); } static bool meta_ot(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, uint64_t sign_b, const bool wanna_steady) { if (txn_a == txn_b) return SIGN_IS_STEADY(sign_b); if (wanna_steady && SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) return SIGN_IS_STEADY(sign_b); return txn_a < txn_b; } static bool meta_eq(txnid_t txn_a, uint64_t sign_a, txnid_t txn_b, uint64_t sign_b) { if (!txn_a || txn_a != txn_b) return false; if (SIGN_IS_STEADY(sign_a) != SIGN_IS_STEADY(sign_b)) return false; return true; } static int meta_recent(const bool wanna_steady) { if (meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady)) return meta_ot(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, wanna_steady) ? 1 : 2; else return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, wanna_steady) ? 2 : 0; } static int meta_tail(int head) { switch (head) { case 0: return meta_ot(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) ? 1 : 2; case 1: return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, true) ? 0 : 2; case 2: return meta_ot(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, true) ? 0 : 1; default: assert(false); return -1; } } static int meta_head(void) { return meta_recent(false); } void verbose_meta(int num, txnid_t txnid, uint64_t sign, uint64_t bootid_x, uint64_t bootid_y) { const bool have_bootid = (bootid_x | bootid_y) != 0; const bool bootid_match = bootid_x == envinfo.mi_bootid.current.x && bootid_y == envinfo.mi_bootid.current.y; print(" - meta-%d: ", num); switch (sign) { case MDBX_DATASIGN_NONE: print("no-sync/legacy"); break; case MDBX_DATASIGN_WEAK: print("weak-%s", bootid_match ? (have_bootid ? "intact (same boot-id)" : "unknown (no boot-id") : "dead"); break; default: print("steady"); break; } print(" txn#%" PRIu64, txnid); const int head = meta_head(); if (num == head) print(", head"); else if (num == meta_tail(head)) print(", tail"); else print(", stay"); if (stuck_meta >= 0) { if (num == stuck_meta) print(", forced for checking"); } else if (txnid > envinfo.mi_recent_txnid && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == MDBX_EXCLUSIVE) print(", rolled-back %" PRIu64 " (%" PRIu64 " >>> %" PRIu64 ")", txnid - envinfo.mi_recent_txnid, txnid, envinfo.mi_recent_txnid); print("\n"); } static uint64_t get_meta_txnid(const unsigned meta_id) { switch (meta_id) { default: assert(false); error("unexpected meta_id %u\n", meta_id); return 0; case 0: return envinfo.mi_meta0_txnid; case 1: return envinfo.mi_meta1_txnid; case 2: return envinfo.mi_meta2_txnid; } } static void print_size(const char *prefix, const uint64_t value, const char *suffix) { const char sf[] = "KMGTPEZY"; /* LY: Kilo, Mega, Giga, Tera, Peta, Exa, Zetta, Yotta! */ double k = 1024.0; size_t i; for (i = 0; sf[i + 1] && value / k > 1000.0; ++i) k *= 1024; print("%s%" PRIu64 " (%.2f %cb)%s", prefix, value, value / k, sf[i], suffix); } int main(int argc, char *argv[]) { int rc; char *prog = argv[0]; char *envname; unsigned problems_maindb = 0, problems_freedb = 0, problems_meta = 0; bool write_locked = false; bool turn_meta = false; bool force_turn_meta = false; bool warmup = false; MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; double elapsed; #if defined(_WIN32) || defined(_WIN64) uint64_t timestamp_start, timestamp_finish; timestamp_start = GetMilliseconds(); #else struct timespec timestamp_start, timestamp_finish; if (clock_gettime(CLOCK_MONOTONIC, ×tamp_start)) { rc = errno; error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc)); return EXIT_FAILURE_SYS; } #endif dbi_meta.name.iov_base = MDBX_PGWALK_META; dbi_free.name.iov_base = MDBX_PGWALK_GC; dbi_main.name.iov_base = MDBX_PGWALK_MAIN; atexit(pagemap_cleanup); if (argc < 2) usage(prog); for (int i; (i = getopt(argc, argv, "uU" "0" "1" "2" "T" "V" "v" "q" "n" "w" "c" "t" "d" "i" "s:")) != EOF;) { switch (i) { case 'V': printf("mdbx_chk version %d.%d.%d.%d\n" " - source: %s %s, commit %s, tree %s\n" " - anchor: %s\n" " - build: %s for %s by %s\n" " - flags: %s\n" " - options: %s\n", mdbx_version.major, mdbx_version.minor, mdbx_version.release, mdbx_version.revision, mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_version.git.commit, mdbx_version.git.tree, mdbx_sourcery_anchor, mdbx_build.datetime, mdbx_build.target, mdbx_build.compiler, mdbx_build.flags, mdbx_build.options); return EXIT_SUCCESS; case 'v': verbose++; break; case '0': stuck_meta = 0; break; case '1': stuck_meta = 1; break; case '2': stuck_meta = 2; break; case 't': turn_meta = true; break; case 'T': turn_meta = force_turn_meta = true; quiet = false; if (verbose < 2) verbose = 2; break; case 'q': quiet = true; break; case 'n': break; case 'w': envflags &= ~MDBX_RDONLY; #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. * See https://libmdbx.dqdkfa.ru/dead-github/issues/67 */ envflags |= MDBX_WRITEMAP; #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ break; case 'c': envflags = (envflags & ~MDBX_EXCLUSIVE) | MDBX_ACCEDE; break; case 'd': dont_traversal = true; break; case 's': if (only_subdb.iov_base && strcmp(only_subdb.iov_base, optarg)) usage(prog); only_subdb.iov_base = optarg; only_subdb.iov_len = strlen(optarg); break; case 'i': ignore_wrong_order = true; break; case 'u': warmup = true; break; case 'U': warmup = true; warmup_flags = MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; break; default: usage(prog); } } if (optind != argc - 1) usage(prog); rc = MDBX_SUCCESS; if (stuck_meta >= 0 && (envflags & MDBX_EXCLUSIVE) == 0) { error("exclusive mode is required to using specific meta-page(%d) for " "checking.\n", stuck_meta); rc = EXIT_INTERRUPTED; } if (turn_meta) { if (stuck_meta < 0) { error("meta-page must be specified (by -0, -1 or -2 options) to turn to " "it.\n"); rc = EXIT_INTERRUPTED; } if (envflags & MDBX_RDONLY) { error("write-mode must be enabled to turn to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } if (only_subdb.iov_base || dont_traversal) { error( "whole database checking with b-tree traversal are required to turn " "to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } } if (rc) exit(rc); #if defined(_WIN32) || defined(_WIN64) SetConsoleCtrlHandler(ConsoleBreakHandlerRoutine, true); #else #ifdef SIGPIPE signal(SIGPIPE, signal_handler); #endif #ifdef SIGHUP signal(SIGHUP, signal_handler); #endif signal(SIGINT, signal_handler); signal(SIGTERM, signal_handler); #endif /* !WINDOWS */ envname = argv[optind]; print("mdbx_chk %s (%s, T-%s)\nRunning for %s in 'read-%s' mode...\n", mdbx_version.git.describe, mdbx_version.git.datetime, mdbx_version.git.tree, envname, (envflags & MDBX_RDONLY) ? "only" : "write"); fflush(nullptr); mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1) ? (MDBX_log_level_t)(verbose + 1) : MDBX_LOG_TRACE, MDBX_DBG_DUMP | MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, logger); rc = mdbx_env_create(&env); if (rc) { error("mdbx_env_create() failed, error %d %s\n", rc, mdbx_strerror(rc)); return rc < 0 ? EXIT_FAILURE_MDBX : EXIT_FAILURE_SYS; } rc = mdbx_env_set_maxdbs(env, MDBX_MAX_DBI); if (rc) { error("mdbx_env_set_maxdbs() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } if (stuck_meta >= 0) { rc = mdbx_env_open_for_recovery(env, envname, stuck_meta, (envflags & MDBX_RDONLY) ? false : true); } else { rc = mdbx_env_open(env, envname, envflags, 0); if ((envflags & MDBX_EXCLUSIVE) && (rc == MDBX_BUSY || #if defined(_WIN32) || defined(_WIN64) rc == ERROR_LOCK_VIOLATION || rc == ERROR_SHARING_VIOLATION #else rc == EBUSY || rc == EAGAIN #endif )) { envflags &= ~MDBX_EXCLUSIVE; rc = mdbx_env_open(env, envname, envflags | MDBX_ACCEDE, 0); } } if (rc) { error("mdbx_env_open() failed, error %d %s\n", rc, mdbx_strerror(rc)); if (rc == MDBX_WANNA_RECOVERY && (envflags & MDBX_RDONLY)) print("Please run %s in the read-write mode (with '-w' option).\n", prog); goto bailout; } if (verbose) print(" - %s mode\n", (envflags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative"); if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) { if (verbose) { print(" - taking write lock..."); fflush(nullptr); } rc = mdbx_txn_lock(env, false); if (rc != MDBX_SUCCESS) { error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } if (verbose) print(" done\n"); write_locked = true; } if (warmup) { if (verbose) { print(" - warming up..."); fflush(nullptr); } rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); if (MDBX_IS_ERROR(rc)) { error("mdbx_env_warmup(flags %u) failed, error %d %s\n", warmup_flags, rc, mdbx_strerror(rc)); goto bailout; } if (verbose) print(" %s\n", rc ? "timeout" : "done"); } rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (rc) { error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } rc = mdbx_env_info_ex(env, txn, &envinfo, sizeof(envinfo)); if (rc) { error("mdbx_env_info_ex() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } if (verbose) { print(" - current boot-id "); if (envinfo.mi_bootid.current.x | envinfo.mi_bootid.current.y) print("%016" PRIx64 "-%016" PRIx64 "\n", envinfo.mi_bootid.current.x, envinfo.mi_bootid.current.y); else print("unavailable\n"); } mdbx_filehandle_t dxb_fd; rc = mdbx_env_get_fd(env, &dxb_fd); if (rc) { error("mdbx_env_get_fd() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } uint64_t dxb_filesize = 0; #if defined(_WIN32) || defined(_WIN64) { BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(dxb_fd, &info)) rc = GetLastError(); else dxb_filesize = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32; } #else { struct stat st; STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); if (fstat(dxb_fd, &st)) rc = errno; else dxb_filesize = st.st_size; } #endif if (rc) { error("osal_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } errno = 0; const uint64_t dxbfile_pages = dxb_filesize / envinfo.mi_dxb_pagesize; alloc_pages = txn->mt_next_pgno; backed_pages = envinfo.mi_geo.current / envinfo.mi_dxb_pagesize; if (backed_pages > dxbfile_pages) { print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", backed_pages, dxbfile_pages); ++problems_meta; } if (dxbfile_pages < NUM_METAS) print(" ! file-pages %" PRIu64 " < %u\n", dxbfile_pages, NUM_METAS); if (backed_pages < NUM_METAS) print(" ! backed-pages %" PRIu64 " < %u\n", backed_pages, NUM_METAS); if (backed_pages < NUM_METAS || dxbfile_pages < NUM_METAS) goto bailout; if (backed_pages > MAX_PAGENO + 1) { print(" ! backed-pages %" PRIu64 " > max-pages %" PRIaPGNO "\n", backed_pages, MAX_PAGENO + 1); ++problems_meta; backed_pages = MAX_PAGENO + 1; } if ((envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { if (backed_pages > dxbfile_pages) { print(" ! backed-pages %" PRIu64 " > file-pages %" PRIu64 "\n", backed_pages, dxbfile_pages); ++problems_meta; backed_pages = dxbfile_pages; } if (alloc_pages > backed_pages) { print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", alloc_pages, backed_pages); ++problems_meta; alloc_pages = backed_pages; } } else { /* LY: DB may be shrunk by writer down to the allocated pages. */ if (alloc_pages > backed_pages) { print(" ! alloc-pages %" PRIu64 " > backed-pages %" PRIu64 "\n", alloc_pages, backed_pages); ++problems_meta; alloc_pages = backed_pages; } if (alloc_pages > dxbfile_pages) { print(" ! alloc-pages %" PRIu64 " > file-pages %" PRIu64 "\n", alloc_pages, dxbfile_pages); ++problems_meta; alloc_pages = dxbfile_pages; } if (backed_pages > dxbfile_pages) backed_pages = dxbfile_pages; } if (verbose) { print(" - pagesize %u (%u system), max keysize %d..%d" ", max readers %u\n", envinfo.mi_dxb_pagesize, envinfo.mi_sys_pagesize, mdbx_env_get_maxkeysize_ex(env, MDBX_DUPSORT), mdbx_env_get_maxkeysize_ex(env, 0), envinfo.mi_maxreaders); print_size(" - mapsize ", envinfo.mi_mapsize, "\n"); if (envinfo.mi_geo.lower == envinfo.mi_geo.upper) print_size(" - fixed datafile: ", envinfo.mi_geo.current, ""); else { print_size(" - dynamic datafile: ", envinfo.mi_geo.lower, ""); print_size(" .. ", envinfo.mi_geo.upper, ", "); print_size("+", envinfo.mi_geo.grow, ", "); print_size("-", envinfo.mi_geo.shrink, "\n"); print_size(" - current datafile: ", envinfo.mi_geo.current, ""); } printf(", %" PRIu64 " pages\n", envinfo.mi_geo.current / envinfo.mi_dxb_pagesize); #if defined(_WIN32) || defined(_WIN64) if (envinfo.mi_geo.shrink && envinfo.mi_geo.current != envinfo.mi_geo.upper) print( " WARNING: Due Windows system limitations a " "file couldn't\n be truncated while the database " "is opened. So, the size\n database file " "of may by large than the database itself,\n " "until it will be closed or reopened in read-write mode.\n"); #endif verbose_meta(0, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_bootid.meta0.x, envinfo.mi_bootid.meta0.y); verbose_meta(1, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, envinfo.mi_bootid.meta1.x, envinfo.mi_bootid.meta1.y); verbose_meta(2, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, envinfo.mi_bootid.meta2.x, envinfo.mi_bootid.meta2.y); } if (stuck_meta >= 0) { if (verbose) { print(" - skip checking meta-pages since the %u" " is selected for verification\n", stuck_meta); print(" - transactions: recent %" PRIu64 ", selected for verification %" PRIu64 ", lag %" PRIi64 "\n", envinfo.mi_recent_txnid, get_meta_txnid(stuck_meta), envinfo.mi_recent_txnid - get_meta_txnid(stuck_meta)); } } else { if (verbose > 1) print(" - performs check for meta-pages clashes\n"); if (meta_eq(envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign, envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign)) { print(" ! meta-%d and meta-%d are clashed\n", 0, 1); ++problems_meta; } if (meta_eq(envinfo.mi_meta1_txnid, envinfo.mi_meta1_sign, envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign)) { print(" ! meta-%d and meta-%d are clashed\n", 1, 2); ++problems_meta; } if (meta_eq(envinfo.mi_meta2_txnid, envinfo.mi_meta2_sign, envinfo.mi_meta0_txnid, envinfo.mi_meta0_sign)) { print(" ! meta-%d and meta-%d are clashed\n", 2, 0); ++problems_meta; } const unsigned steady_meta_id = meta_recent(true); const uint64_t steady_meta_txnid = get_meta_txnid(steady_meta_id); const unsigned weak_meta_id = meta_recent(false); const uint64_t weak_meta_txnid = get_meta_txnid(weak_meta_id); if (envflags & MDBX_EXCLUSIVE) { if (verbose > 1) print(" - performs full check recent-txn-id with meta-pages\n"); if (steady_meta_txnid != envinfo.mi_recent_txnid) { print(" ! steady meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 ")\n", steady_meta_id, steady_meta_txnid, envinfo.mi_recent_txnid); ++problems_meta; } } else if (write_locked) { if (verbose > 1) print(" - performs lite check recent-txn-id with meta-pages (not a " "monopolistic mode)\n"); if (weak_meta_txnid != envinfo.mi_recent_txnid) { print(" ! weak meta-%d txn-id mismatch recent-txn-id (%" PRIi64 " != %" PRIi64 ")\n", weak_meta_id, weak_meta_txnid, envinfo.mi_recent_txnid); ++problems_meta; } } else if (verbose) { print(" - skip check recent-txn-id with meta-pages (monopolistic or " "read-write mode only)\n"); } total_problems += problems_meta; if (verbose) print(" - transactions: recent %" PRIu64 ", latter reader %" PRIu64 ", lag %" PRIi64 "\n", envinfo.mi_recent_txnid, envinfo.mi_latter_reader_txnid, envinfo.mi_recent_txnid - envinfo.mi_latter_reader_txnid); } if (!dont_traversal) { struct problem *saved_list; size_t traversal_problems; uint64_t empty_pages, lost_bytes; print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(nullptr); walk.pagemap = osal_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); if (!walk.pagemap) { rc = errno ? errno : MDBX_ENOMEM; error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } saved_list = problems_push(); rc = mdbx_env_pgwalk(txn, pgvisitor, nullptr, true /* always skip key ordering checking to avoid MDBX_CORRUPTED when using custom comparators */); traversal_problems = problems_pop(saved_list); if (rc) { if (rc != MDBX_EINTR || !check_user_break()) error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } for (uint64_t n = 0; n < alloc_pages; ++n) if (!walk.pagemap[n]) unused_pages += 1; empty_pages = lost_bytes = 0; for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { empty_pages += dbi->pages.empty; lost_bytes += dbi->lost_bytes; } if (verbose) { uint64_t total_page_bytes = walk.pgcount * envinfo.mi_dxb_pagesize; print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n", walk.pgcount, unused_pages); if (verbose > 1) { for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { print(" %s: subtotal %" PRIu64, sdb_name(&dbi->name), dbi->pages.total); if (dbi->pages.other && dbi->pages.other != dbi->pages.total) print(", other %" PRIu64, dbi->pages.other); if (dbi->pages.branch) print(", branch %" PRIu64, dbi->pages.branch); if (dbi->pages.large_count) print(", large %" PRIu64, dbi->pages.large_count); uint64_t all_leaf = dbi->pages.leaf + dbi->pages.leaf_dupfixed; if (all_leaf) { print(", leaf %" PRIu64, all_leaf); if (verbose > 2 && (dbi->pages.subleaf_dupsort | dbi->pages.leaf_dupfixed | dbi->pages.subleaf_dupfixed)) print(" (usual %" PRIu64 ", sub-dupsort %" PRIu64 ", dupfixed %" PRIu64 ", sub-dupfixed %" PRIu64 ")", dbi->pages.leaf, dbi->pages.subleaf_dupsort, dbi->pages.leaf_dupfixed, dbi->pages.subleaf_dupfixed); } print("\n"); } } if (verbose > 1) print(" - usage: total %" PRIu64 " bytes, payload %" PRIu64 " (%.1f%%), unused " "%" PRIu64 " (%.1f%%)\n", total_page_bytes, walk.total_payload_bytes, walk.total_payload_bytes * 100.0 / total_page_bytes, total_page_bytes - walk.total_payload_bytes, (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); if (verbose > 2) { for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) if (dbi->pages.total) { uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize; print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", sdb_name(&dbi->name), dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes, dbi_bytes - dbi->payload_bytes, (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); if (dbi->pages.empty) print(", %" PRIu64 " empty pages", dbi->pages.empty); if (dbi->lost_bytes) print(", %" PRIu64 " bytes lost", dbi->lost_bytes); print("\n"); } else print(" %s: empty\n", sdb_name(&dbi->name)); } print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); if (empty_pages) print(", %" PRIu64 " empty pages", empty_pages); if (lost_bytes) print(", %" PRIu64 " bytes lost", lost_bytes); print(", %" PRIuPTR " problems\n", traversal_problems); } } else if (verbose) { print("Skipping b-tree walk...\n"); fflush(nullptr); } if (gc_tree_problems) { print("Skip processing %s since %s is corrupted (%u problems)\n", "@GC", "b-tree", gc_tree_problems); problems_freedb = gc_tree_problems; } else problems_freedb = process_db(FREE_DBI, MDBX_PGWALK_GC, handle_freedb); if (verbose) { uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; double percent = value / 100.0; print(" - space: %" PRIu64 " total pages", value); print(", backed %" PRIu64 " (%.1f%%)", backed_pages, backed_pages / percent); print(", allocated %" PRIu64 " (%.1f%%)", alloc_pages, alloc_pages / percent); if (verbose > 1) { value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages; print(", remained %" PRIu64 " (%.1f%%)", value, value / percent); value = dont_traversal ? alloc_pages - gc_pages : walk.pgcount; print(", used %" PRIu64 " (%.1f%%)", value, value / percent); print(", gc %" PRIu64 " (%.1f%%)", gc_pages, gc_pages / percent); value = gc_pages - reclaimable_pages; print(", detained %" PRIu64 " (%.1f%%)", value, value / percent); print(", reclaimable %" PRIu64 " (%.1f%%)", reclaimable_pages, reclaimable_pages / percent); } value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize - alloc_pages + reclaimable_pages; print(", available %" PRIu64 " (%.1f%%)\n", value, value / percent); } if ((problems_maindb = data_tree_problems) == 0 && problems_freedb == 0) { if (!dont_traversal && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { if (walk.pgcount != alloc_pages - gc_pages) { error("used pages mismatch (%" PRIu64 "(walked) != %" PRIu64 "(allocated - GC))\n", walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { error("GC pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", unused_pages, gc_pages); } } else if (verbose) { print(" - skip check used and GC pages (btree-traversal with " "monopolistic or read-write mode only)\n"); } problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr); if (problems_maindb == 0) { print("Scanning %s for %s...\n", "@MAIN", "sub-database(s)"); if (!process_db(MAIN_DBI, nullptr, handle_maindb)) { if (!userdb_count && verbose) print(" - does not contain multiple databases\n"); } } else { print("Skip processing %s since %s is corrupted (%u problems)\n", "sub-database(s)", "@MAIN", problems_maindb); } } else { print("Skip processing %s since %s is corrupted (%u problems)\n", "@MAIN", "b-tree", data_tree_problems); } if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && (envflags & MDBX_RDONLY) == 0 && !only_subdb.iov_base && stuck_meta < 0 && get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 "\n", envinfo.mi_recent_txnid); fflush(nullptr); if (write_locked) { mdbx_txn_unlock(env); write_locked = false; } rc = mdbx_env_sync_ex(env, true, false); if (rc != MDBX_SUCCESS) error("mdbx_env_pgwalk() failed, error %d %s\n", rc, mdbx_strerror(rc)); else { total_problems -= 1; problems_meta -= 1; } } if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb.iov_base && (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { const bool successful_check = (rc | total_problems | problems_meta) == 0; if (successful_check || force_turn_meta) { fflush(nullptr); print(" = Performing turn to the specified meta-page (%d) due to %s!\n", stuck_meta, successful_check ? "successful check" : "the -T option was given"); fflush(nullptr); rc = mdbx_env_turn_for_recovery(env, stuck_meta); if (rc != MDBX_SUCCESS) error("mdbx_env_turn_for_recovery() failed, error %d %s\n", rc, mdbx_strerror(rc)); } else { print(" = Skipping turn to the specified meta-page (%d) due to " "unsuccessful check!\n", stuck_meta); } } bailout: if (txn) mdbx_txn_abort(txn); if (write_locked) { mdbx_txn_unlock(env); write_locked = false; } if (env) { const bool dont_sync = rc != 0 || total_problems; mdbx_env_close_ex(env, dont_sync); } fflush(nullptr); if (rc) { if (rc < 0) return user_break ? EXIT_INTERRUPTED : EXIT_FAILURE_SYS; return EXIT_FAILURE_MDBX; } #if defined(_WIN32) || defined(_WIN64) timestamp_finish = GetMilliseconds(); elapsed = (timestamp_finish - timestamp_start) * 1e-3; #else if (clock_gettime(CLOCK_MONOTONIC, ×tamp_finish)) { rc = errno; error("clock_gettime() failed, error %d %s\n", rc, mdbx_strerror(rc)); return EXIT_FAILURE_SYS; } elapsed = timestamp_finish.tv_sec - timestamp_start.tv_sec + (timestamp_finish.tv_nsec - timestamp_start.tv_nsec) * 1e-9; #endif /* !WINDOWS */ if (total_problems) { print("Total %u error%s detected, elapsed %.3f seconds.\n", total_problems, (total_problems > 1) ? "s are" : " is", elapsed); if (problems_meta || problems_maindb || problems_freedb) return EXIT_FAILURE_CHECK_MAJOR; return EXIT_FAILURE_CHECK_MINOR; } print("No error is detected, elapsed %.3f seconds\n", elapsed); return EXIT_SUCCESS; }