mdbx: изменение формата LCK и семантики некоторых внутренних полей.

Изменение формата LCK-файла означает, что версии libmdbx, использующие
разный формат, не смогут работать с одной БД одновременно, а только
поочерёдно (LCK-файл переписывается при открытии БД первым открывающим
её процессом).

1. Поле mti_unsynced_pages теперь 64-битное (чтобы не контролировать
переполнение) и перемещено для соблюдения выравнивания.

2. Поле mti_sync_timestamp переименовано в mti_eoos_timestamp
одновременно со сменой семантики. Теперь время отсчитывается не от
момента сброса данных на диск, а от момента входа в «грязное» состояние.

Скорее всего, текущая версия формата LCK не окончательная
и изменится до релиза.
This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-10-11 13:11:12 +03:00
parent dd9780606b
commit e5fc056035
3 changed files with 42 additions and 36 deletions

4
mdbx.h
View File

@ -2515,7 +2515,9 @@ struct MDBX_envinfo {
uint64_t mi_unsync_volume; uint64_t mi_unsync_volume;
/** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */ /** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */
uint64_t mi_autosync_threshold; uint64_t mi_autosync_threshold;
/** Time since the last steady sync in 1/65536 of second */ /** Time since entering to a "dirty" out-of-sync state in units of 1/65536 of
* second. In other words, this is the time since the last non-steady commit
* or zero if it was steady. */
uint32_t mi_since_sync_seconds16dot16; uint32_t mi_since_sync_seconds16dot16;
/** Current auto-sync period in 1/65536 of second, /** Current auto-sync period in 1/65536 of second,
* see \ref mdbx_env_set_syncperiod(). */ * see \ref mdbx_env_set_syncperiod(). */

View File

@ -4506,6 +4506,8 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) {
#if MDBX_ENABLE_PGOP_STAT #if MDBX_ENABLE_PGOP_STAT
ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops;
#endif /* MDBX_ENABLE_PGOP_STAT */ #endif /* MDBX_ENABLE_PGOP_STAT */
if (!ctx->env->me_lck->mti_eoos_timestamp.weak)
ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
ctx->err = r.err; ctx->err = r.err;
if (unlikely(ctx->err != MDBX_SUCCESS)) if (unlikely(ctx->err != MDBX_SUCCESS))
ERROR("Write error: %s", mdbx_strerror(ctx->err)); ERROR("Write error: %s", mdbx_strerror(ctx->err));
@ -6787,6 +6789,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
const uint64_t autosync_period = const uint64_t autosync_period =
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
/* wipe the last steady-point if one of: /* wipe the last steady-point if one of:
* - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
* - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
@ -6806,13 +6809,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
meta_prefer_steady(env, &txn->tw.troika).ptr_c); meta_prefer_steady(env, &txn->tw.troika).ptr_c);
} else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || } else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 ||
(autosync_threshold && (autosync_threshold &&
atomic_load32(&env->me_lck->mti_unsynced_pages, atomic_load64(&env->me_lck->mti_unsynced_pages,
mo_Relaxed) >= autosync_threshold) || mo_Relaxed) >= autosync_threshold) ||
(autosync_period && (autosync_period &&
osal_monotime() - (eoos_timestamp = atomic_load64(
atomic_load64(&env->me_lck->mti_sync_timestamp, &env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
mo_Relaxed) >= osal_monotime() - eoos_timestamp >= autosync_period) ||
autosync_period) ||
next >= txn->mt_geo.upper || next >= txn->mt_geo.upper ||
(next >= txn->mt_end_pgno && (next >= txn->mt_end_pgno &&
(autosync_threshold | autosync_period) == 0)) { (autosync_threshold | autosync_period) == 0)) {
@ -7311,8 +7313,8 @@ retry:;
const meta_troika_t troika = meta_tap(env); const meta_troika_t troika = meta_tap(env);
head = meta_recent(env, &troika); head = meta_recent(env, &troika);
} }
const pgno_t unsynced_pages = const uint64_t unsynced_pages =
atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed);
if (unsynced_pages == 0) { if (unsynced_pages == 0) {
const uint32_t synched_meta_txnid_u32 = const uint32_t synched_meta_txnid_u32 =
atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed);
@ -7320,15 +7322,16 @@ retry:;
goto bailout; goto bailout;
} }
const pgno_t autosync_threshold = const size_t autosync_threshold =
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
const uint64_t autosync_period = const uint64_t autosync_period =
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
(autosync_period && (autosync_period &&
osal_monotime() - (eoos_timestamp =
atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
autosync_period)) osal_monotime() - eoos_timestamp >= autosync_period))
flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;
if (!inside_txn) { if (!inside_txn) {
@ -7396,7 +7399,7 @@ retry:;
eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0);
if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64,
data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c),
unsynced_pages); unsynced_pages);
MDBX_meta meta = *head.ptr_c; MDBX_meta meta = *head.ptr_c;
@ -11341,13 +11344,14 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
const uint64_t autosync_period = const uint64_t autosync_period =
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
if ((autosync_threshold && if ((autosync_threshold &&
atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
autosync_threshold) || autosync_threshold) ||
(autosync_period && (autosync_period &&
osal_monotime() - (eoos_timestamp =
atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
autosync_period)) osal_monotime() - eoos_timestamp >= autosync_period))
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
} }
@ -11459,7 +11463,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
/* LY: step#1 - sync previously written/updated data-pages */ /* LY: step#1 - sync previously written/updated data-pages */
rc = MDBX_RESULT_FALSE /* carry steady */; rc = MDBX_RESULT_FALSE /* carry steady */;
if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE;
unsigned sync_op = 0; unsigned sync_op = 0;
@ -11494,10 +11498,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
/* Steady or Weak */ /* Steady or Weak */
if (rc == MDBX_RESULT_FALSE /* carry steady */) { if (rc == MDBX_RESULT_FALSE /* carry steady */) {
atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(),
mo_Relaxed);
unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending));
atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); atomic_store64(&env->me_lck->mti_eoos_timestamp, 0, mo_Relaxed);
atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed);
} else { } else {
assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK);
@ -12652,8 +12655,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc,
"rollback NOT needed, steady-sync NEEDED%s", "rollback NOT needed, steady-sync NEEDED%s",
"opening after an unclean shutdown", bootid.x, bootid.y, ""); "opening after an unclean shutdown", bootid.x, bootid.y, "");
header = clone; header = clone;
atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next, env->me_lck->mti_unsynced_pages.weak = header.mm_geo.next;
mo_Relaxed); if (!env->me_lck->mti_eoos_timestamp.weak)
env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
break; break;
} }
if (unlikely(!prefer_steady.is_steady)) { if (unlikely(!prefer_steady.is_steady)) {
@ -20947,8 +20951,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper);
arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv));
arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv));
const pgno_t unsynced_pages = const uint64_t unsynced_pages =
atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) +
(atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
(uint32_t)arg->mi_recent_txnid); (uint32_t)arg->mi_recent_txnid);
@ -20963,9 +20967,9 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
arg->mi_sys_pagesize = env->me_os_psize; arg->mi_sys_pagesize = env->me_os_psize;
if (likely(bytes > size_before_bootid)) { if (likely(bytes > size_before_bootid)) {
arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages);
const uint64_t monotime_now = osal_monotime(); const uint64_t monotime_now = osal_monotime();
uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed);
arg->mi_since_sync_seconds16dot16 = arg->mi_since_sync_seconds16dot16 =
ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0;
ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed);

View File

@ -370,7 +370,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */ /* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3 #define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */ /* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4 #define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */ /* handle for the DB used to track free pages. */
#define FREE_DBI 0 #define FREE_DBI 0
@ -748,20 +748,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader; atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable /* Timestamp of entering an out-of-sync state. Value is represented in a
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or * suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* clock_gettime(CLOCK_MONOTONIC). */ * or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp; MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */ /* Number un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages; MDBX_atomic_uint64_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
/* Timestamp of the last readers check. */ /* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp; MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */ /* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor; pgno_t mti_readahead_anchor;