mirror of
synced 2025-03-03 16:58:14 +08:00
mdbx: Merge branch 'devel'.
This commit is contained in:
@ -1,4 +1,4 @@
Extended LMDB, aka "Расширенная LMDB".
@ -595,3 +595,111 @@ mdbx_txn_abort() или mdbx_txn_reset(). Что позволяет избави
28. Три мета-страницы вместо двух, что позволяет гарантированно
консистентно обновлять слабые контрольные точки фиксации без риска
повредить крайнюю сильную точку фиксации.
29. В _libmdbx_ реализован автоматический возврат освобождающихся
страниц в область нераспределенного резерва в конце файла данных. При
этом уменьшается количество страниц, загруженных в память и участвующих в
цикле обновления данных и записи на диск. Фактически _libmdbx_ выполняет
постоянную компактификацию данных, но не затрачивая на это
дополнительных ресурсов, а только освобождая их. При освобождении места
в БД, в случае наличия поддержки со стороны операционной системы и
установки соответствующих параметров геометрии базы данных, также будет
уменьшаться размер файла на диске.
$ objdump -f -h -j .text libmdbx.so
libmdbx.so: file format elf64-x86-64
architecture: i386:x86-64, flags 0x00000150:
start address 0x000030e0
Idx Name Size VMA LMA File off Algn
11 .text 00014661 000030e0 000030e0 000030e0 2**4
$ objdump -C -T libmdbx.so | grep mdbx | sort
00004057 g DF .text 0000003f Base mdbx_strerror_r
00004096 g DF .text 00000031 Base mdbx_strerror
00004207 g DF .text 00000025 Base mdbx_env_get_maxkeysize
0000422c g DF .text 000000b8 Base mdbx_env_create
000042e4 g DF .text 0000001f Base mdbx_env_set_mapsize
00004f9f g DF .text 00000037 Base mdbx_env_set_maxdbs
00004fd6 g DF .text 00000036 Base mdbx_env_set_maxreaders
0000500c g DF .text 00000027 Base mdbx_env_get_maxreaders
00005033 g DF .text 0000066a Base mdbx_env_open_ex
0000569d g DF .text 00000008 Base mdbx_env_open
000056a5 g DF .text 00000096 Base mdbx_env_close_ex
0000573b g DF .text 00000007 Base mdbx_env_close
00005742 g DF .text 00000047 Base mdbx_env_set_flags
00005789 g DF .text 0000001d Base mdbx_env_get_flags
000057a6 g DF .text 00000014 Base mdbx_env_set_userctx
000057ba g DF .text 0000000f Base mdbx_env_get_userctx
000057c9 g DF .text 0000000d Base mdbx_env_set_assert
000057d6 g DF .text 0000001d Base mdbx_env_get_path
000057f3 g DF .text 00000018 Base mdbx_env_get_fd
0000580b g DF .text 00000056 Base mdbx_env_stat
00005861 g DF .text 00000276 Base mdbx_env_info
00005ad7 g DF .text 00000148 Base mdbx_reader_list
0000656a g DF .text 0000012a Base mdbx_dbi_stat
0000693a g DF .text 00000146 Base mdbx_env_copy2fd
00006a80 g DF .text 0000012e Base mdbx_env_copy
00006bae g DF .text 0000002a Base mdbx_reader_check
00006bd8 g DF .text 000000f9 Base mdbx_setup_debug
00006cd1 g DF .text 00000033 Base mdbx_env_set_syncbytes
00006d04 g DF .text 00000023 Base mdbx_env_set_oomfunc
00006d27 g DF .text 00000019 Base mdbx_env_get_oomfunc
00006d40 g DF .text 00000121 Base mdbx_env_pgwalk
0000ac60 g DF .text 00000163 Base mdbx_dkey
0000add0 g DF .text 00000016 Base mdbx_cmp
0000adf0 g DF .text 00000016 Base mdbx_dcmp
0000ae10 g DF .text 00000271 Base mdbx_env_sync
0000b090 g DF .text 0000001b Base mdbx_txn_env
0000b0b0 g DF .text 0000001c Base mdbx_txn_id
0000b0d0 g DF .text 00000077 Base mdbx_txn_reset
0000b150 g DF .text 00000077 Base mdbx_txn_abort
0000b1d0 g DF .text 00000057 Base mdbx_get_maxkeysize
0000b230 g DF .text 000006b7 Base mdbx_env_set_geometry
0000b8f0 g DF .text 000000ef Base mdbx_cursor_count
0000b9e0 g DF .text 000000ad Base mdbx_cursor_close
0000ba90 g DF .text 0000001b Base mdbx_cursor_txn
0000bab0 g DF .text 00000017 Base mdbx_cursor_dbi
0000bad0 g DF .text 0000007d Base mdbx_dbi_close
0000bb50 g DF .text 000000cc Base mdbx_dbi_flags_ex
0000bc20 g DF .text 00000038 Base mdbx_dbi_flags
0000c250 g DF .text 00000077 Base mdbx_txn_renew
0000c2d0 g DF .text 000004e5 Base mdbx_txn_begin
0000dcb0 g DF .text 00000128 Base mdbx_cursor_open
0000dde0 g DF .text 0000011d Base mdbx_cursor_renew
0000e970 g DF .text 000000fc Base mdbx_get
0000ef00 g DF .text 00000489 Base mdbx_cursor_get
000125e0 g DF .text 00000719 Base mdbx_cursor_del
00012e00 g DF .text 000000e4 Base mdbx_del
00012ef0 g DF .text 000002c3 Base mdbx_drop
000131c0 g DF .text 0000129e Base mdbx_cursor_put
000145d0 g DF .text 000000a7 Base mdbx_put
00014b60 g DF .text 000000bf Base mdbx_dbi_open_ex
00014c20 g DF .text 0000000b Base mdbx_dbi_open
00014c30 g DF .text 00001347 Base mdbx_txn_commit
00015f80 g DF .text 00000105 Base mdbx_txn_straggler
00016090 g DF .text 000000e7 Base mdbx_canary_put
00016180 g DF .text 00000078 Base mdbx_canary_get
00016200 g DF .text 0000006e Base mdbx_cursor_on_first
00016270 g DF .text 00000096 Base mdbx_cursor_on_last
00016310 g DF .text 00000066 Base mdbx_cursor_eof
00016380 g DF .text 00000504 Base mdbx_replace
00016890 g DF .text 0000017d Base mdbx_get_ex
00016a10 g DF .text 000000a4 Base mdbx_is_dirty
00016ac0 g DF .text 00000120 Base mdbx_dbi_sequence
00016be0 g DF .text 00000064 Base mdbx_cursor_get_attr
00016c50 g DF .text 00000064 Base mdbx_get_attr
00016cc0 g DF .text 000000c7 Base mdbx_put_attr
00016d90 g DF .text 000000c7 Base mdbx_cursor_put_attr
00016e60 g DF .text 00000244 Base mdbx_set_attr
@ -1,4 +1,4 @@
- [ ] Перевод mdbx-tools на С++ и сборка для Windows.
- [ ] Переход на CMake, замена заглушек mdbx_version и mdbx_build.
@ -33,7 +33,7 @@
- [ ] Отслеживание времени жизни DBI-хендлов.
- [ ] Отрефакторить mdbx_freelist_save().
- [ ] Хранить "свободный хвост", не связанный с freeDB, в META.
- [ ] Возврат выделенных страниц в unallocated tail-pool.
- [x] Возврат выделенных страниц в unallocated tail-pool.
- [ ] Валидатор страниц БД по номеру транзакции:
~0 при переработке и номер транзакции при выделении,
проверять, что этот номер больше головы реклайминга и не больше текущей транзакции.
@ -183,8 +183,8 @@ typedef struct mdbx_build_info {
const char *flags;
} mdbx_build_info;
extern LIBMDBX_API const struct mdbx_version_info mdbx_version;
extern LIBMDBX_API const struct mdbx_build_info mdbx_build;
extern LIBMDBX_API const mdbx_version_info mdbx_version;
extern LIBMDBX_API const mdbx_build_info mdbx_build;
/* The name of the lock file in the DB environment */
#define MDBX_LOCKNAME "/mdbx.lck"
@ -432,13 +432,22 @@ typedef struct MDBX_lockinfo {
/* Two kind lists of pages (aka IDL) */
/* Two kinds of page lists (aka PNL) */
/* An IDL is an ID List, a sorted array of IDs. The first
* element of the array is a counter for how many actual
* IDs are in the list. In the libmdbx IDLs are sorted in
* descending order. */
typedef pgno_t *MDBX_IDL;
/* A PNL is a Page Number List, a sorted array of page numbers. The first
* element of the array is a counter of how many actual page numbers are in
* the list. PNLs are sorted in descending order, which allows cutting off the
* page with the lowest pgno (at the tail) by just truncating the list. */
typedef pgno_t *MDBX_PNL;
#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
/* List of txnid, only for MDBX_env.mt_lifo_reclaimed */
typedef txnid_t *MDBX_TXL;
@ -455,23 +464,23 @@ typedef struct MDBX_ID2 {
* unused. The array is sorted in ascending order by mid. */
typedef MDBX_ID2 *MDBX_ID2L;
/* IDL sizes - likely should be even bigger
/* PNL sizes - likely should be even bigger
* limiting factors: sizeof(pgno_t), thread stack size */
#define MDBX_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */
#define MDBX_IDL_UM_SIZE (1 << (MDBX_IDL_LOGN + 1))
#define MDBX_PNL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */
#define MDBX_PNL_UM_SIZE (1 << (MDBX_PNL_LOGN + 1))
#define MDBX_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t))
#define MDBX_IDL_IS_ZERO(ids) ((ids)[0] == 0)
#define MDBX_IDL_CPY(dst, src) (memcpy(dst, src, MDBX_IDL_SIZEOF(src)))
#define MDBX_IDL_FIRST(ids) ((ids)[1])
#define MDBX_IDL_LAST(ids) ((ids)[(ids)[0]])
#define MDBX_PNL_SIZEOF(pl) (((pl)[0] + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_ZERO(pl) ((pl)[0] == 0)
#define MDBX_PNL_CPY(dst, src) (memcpy(dst, src, MDBX_PNL_SIZEOF(src)))
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[(pl)[0]])
/* Current max length of an mdbx_midl_alloc()ed IDL */
#define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1])
/* Current max length of an mdbx_pnl_alloc()ed PNL */
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
/* Internal structures */
@ -503,7 +512,7 @@ struct MDBX_txn {
/* The list of reclaimed txns from freeDB */
MDBX_TXL mt_lifo_reclaimed;
/* The list of pages that became unused during this transaction. */
MDBX_IDL mt_befree_pages;
MDBX_PNL mt_befree_pages;
/* The list of loose pages that became unused and may be reused
* in this transaction, linked through NEXT_LOOSE_PAGE(page). */
MDBX_page *mt_loose_pages;
@ -512,7 +521,7 @@ struct MDBX_txn {
/* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */
MDBX_IDL mt_spill_pages;
MDBX_PNL mt_spill_pages;
union {
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
MDBX_ID2L mt_rw_dirtylist;
@ -699,9 +708,9 @@ struct MDBX_env {
#define me_last_reclaimed me_pgstate.mf_last_reclaimed
#define me_reclaimed_pglist me_pgstate.mf_reclaimed_pglist
MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
/* IDL of pages that became unused in a write txn */
MDBX_IDL me_free_pgs;
/* ID2L of pages written during a write txn. Length MDBX_IDL_UM_SIZE. */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_free_pgs;
/* ID2L of pages written during a write txn. Length MDBX_PNL_UM_SIZE. */
MDBX_ID2L me_dirtylist;
/* Max number of freelist items that can fit in a single overflow page */
unsigned me_maxfree_1pg;
@ -1201,6 +1210,11 @@ static __inline pgno_t pgno_add(pgno_t base, pgno_t augend) {
return (augend < MAX_PAGENO - base) ? base + augend : MAX_PAGENO;
static __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) {
assert(base >= MIN_PAGENO);
return (subtrahend < base - MIN_PAGENO) ? base - subtrahend : MIN_PAGENO;
static __inline size_t pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) {
return mdbx_roundup2(pgno2bytes(env, pgno), env->me_os_psize);
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
/* mdbx_chk.c - memory-mapped database check tool */
/* mdbx_chk.c - memory-mapped database check tool */
* Copyright 2015-2017 Leonid Yuriev <leo@yuriev.ru>
@ -327,8 +327,6 @@ static int handle_userdb(const uint64_t record_number, const MDBX_val *key,
static int handle_freedb(const uint64_t record_number, const MDBX_val *key,
const MDBX_val *data) {
char *bad = "";
pgno_t pg, prev;
int i, number, span = 0;
pgno_t *iptr = data->iov_base;
txnid_t txnid = *(txnid_t *)key->iov_base;
@ -342,46 +340,55 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key,
problem_add("entry", record_number, "wrong idl size", "%" PRIuPTR "",
else {
number = *iptr++;
if (number >= MDBX_IDL_UM_MAX)
const pgno_t number = *iptr++;
if (number >= MDBX_PNL_UM_MAX)
problem_add("entry", record_number, "wrong idl length", "%" PRIiPTR "",
else if ((number + 1) * sizeof(pgno_t) != data->iov_len)
problem_add("entry", record_number, "mismatch idl length",
"%" PRIiPTR " != %" PRIuPTR "", (number + 1) * sizeof(pgno_t),
"%" PRIuSIZE " != %" PRIuSIZE "",
(number + 1) * sizeof(pgno_t), data->iov_len);
else {
freedb_pages += number;
if (envinfo.mi_latter_reader_txnid > txnid)
reclaimable_pages += number;
for (i = number, prev = NUM_METAS - 1; --i >= 0;) {
pg = iptr[i];
pgno_t prev =
MDBX_PNL_ASCENDING ? NUM_METAS - 1 : (pgno_t)envinfo.mi_last_pgno + 1;
pgno_t span = 1;
for (unsigned i = 0; i < number; ++i) {
const pgno_t pg = iptr[i];
if (pg < NUM_METAS || pg > envinfo.mi_last_pgno)
problem_add("entry", record_number, "wrong idl entry",
"%u < %" PRIiPTR " < %" PRIiPTR "", NUM_METAS, pg,
"%u < %" PRIaPGNO " < %" PRIu64 "", NUM_METAS, pg,
else if (pg <= prev) {
else if (MDBX_PNL_DISORDERED(prev, pg)) {
bad = " [bad sequence]";
problem_add("entry", record_number, "bad sequence",
"%" PRIiPTR " <= %" PRIiPTR "", pg, prev);
"%" PRIaPGNO " <> %" PRIaPGNO "", prev, pg);
prev = pg;
pg += span;
for (; i >= span && iptr[i - span] == pg; span++, pg++)
while (i + span < number &&
iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pg, span)
: pgno_sub(pg, span)))
if (verbose > 2 && !only_subdb) {
print(" transaction %" PRIaTXN ", %u pages, maxspan %i%s\n", txnid,
number, span, bad);
print(" transaction %" PRIaTXN ", %" PRIaPGNO
" pages, maxspan %" PRIaPGNO "%s\n",
txnid, number, span, bad);
if (verbose > 3) {
int j = number - 1;
while (j >= 0) {
pg = iptr[j];
for (span = 1; --j >= 0 && iptr[j] == pg + span; span++)
for (unsigned i = 0; i < number; i += span) {
const pgno_t pg = iptr[i];
for (span = 1;
i + span < number &&
iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pg, span)
: pgno_sub(pg, span));
if (span > 1)
print(" %9" PRIaPGNO "[%i]\n", pg, span);
if (span > 1) {
print(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pg, span);
} else
print(" %9" PRIaPGNO "\n", pg);
@ -234,33 +234,41 @@ int main(int argc, char *argv[]) {
iptr = data.iov_base;
pages += *iptr;
const pgno_t number = *iptr++;
pages += number;
if (envinfo && mei.mi_latter_reader_txnid > *(size_t *)key.iov_base)
reclaimable += *iptr;
reclaimable += number;
if (freinfo > 1) {
char *bad = "";
pgno_t pg, prev;
intptr_t i, j, span = 0;
j = *iptr++;
for (i = j, prev = NUM_METAS - 1; --i >= 0;) {
pg = iptr[i];
if (pg <= prev)
pgno_t prev =
MDBX_PNL_ASCENDING ? NUM_METAS - 1 : (pgno_t)mei.mi_last_pgno + 1;
pgno_t span = 1;
for (unsigned i = 0; i < number; ++i) {
pgno_t pg = iptr[i];
if (MDBX_PNL_DISORDERED(prev, pg))
bad = " [bad sequence]";
prev = pg;
pg += (unsigned)span;
for (; i >= span && iptr[i - span] == pg; span++, pg++)
while (i + span < number &&
iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pg, span)
: pgno_sub(pg, span)))
printf(" Transaction %" PRIaTXN ", %" PRIiPTR
" pages, maxspan %" PRIiPTR "%s\n",
*(txnid_t *)key.iov_base, j, span, bad);
printf(" Transaction %" PRIaTXN ", %" PRIaPGNO
" pages, maxspan %" PRIaPGNO "%s\n",
*(txnid_t *)key.iov_base, number, span, bad);
if (freinfo > 2) {
for (--j; j >= 0;) {
pg = iptr[j];
for (span = 1; --j >= 0 && iptr[j] == pg + span; span++)
for (unsigned i = 0; i < number; i += span) {
const pgno_t pg = iptr[i];
for (span = 1;
i + span < number &&
iptr[i + span] == (MDBX_PNL_ASCENDING ? pgno_add(pg, span)
: pgno_sub(pg, span));
if (span > 1)
printf(" %9" PRIaPGNO "[%" PRIiPTR "]\n", pg, span);
printf(" %9" PRIaPGNO "[%" PRIaPGNO "]\n", pg, span);
printf(" %9" PRIaPGNO "\n", pg);
@ -21,7 +21,7 @@
const struct mdbx_version_info mdbx_version = {
/*LIBMDBX_EXPORTS*/ const mdbx_version_info mdbx_version = {
@ -29,6 +29,6 @@ const struct mdbx_version_info mdbx_version = {
const struct mdbx_build_info mdbx_build = {
/*LIBMDBX_EXPORTS*/ const mdbx_build_info mdbx_build = {
Reference in New Issue
Block a user