mdbx: изменение формата LCK и семантики некоторых внутренних полей.

Изменение формата LCK-файла означает, что версии libmdbx, использующие
разный формат, не смогут работать с одной БД одновременно, а только
поочередно (LCK-файл переписывается при открытии первым открывающим БД
процессом).

1. Поле mti_unsynced_pages теперь 64-битное (чтобы не контролировать
переполнение) и перемещено для соблюдения выравнивания.

2. Поле mti_sync_timestamp переименовано в mti_eoos_timestamp
одновременно со сменой семантики. Теперь время отсчитывается не от
момента сброса данных на диск, а от момента входа в «грязное» состояние.

Скорее всего, текущая версия формата LCK не окончательная
и изменится до релиза.
This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-10-11 13:11:12 +03:00
parent dd9780606b
commit e5fc056035
3 changed files with 42 additions and 36 deletions

4
mdbx.h
View File

@ -2515,7 +2515,9 @@ struct MDBX_envinfo {
uint64_t mi_unsync_volume;
/** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */
uint64_t mi_autosync_threshold;
/** Time since the last steady sync in 1/65536 of second */
/** Time since entering a "dirty" out-of-sync state, in units of 1/65536 of
* a second. In other words, this is the time elapsed since the environment
* first became out-of-sync after the last steady sync, or zero if it is
* currently steady. */
uint32_t mi_since_sync_seconds16dot16;
/** Current auto-sync period in 1/65536 of second,
* see \ref mdbx_env_set_syncperiod(). */

View File

@ -4506,6 +4506,8 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) {
#if MDBX_ENABLE_PGOP_STAT
ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops;
#endif /* MDBX_ENABLE_PGOP_STAT */
if (!ctx->env->me_lck->mti_eoos_timestamp.weak)
ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
ctx->err = r.err;
if (unlikely(ctx->err != MDBX_SUCCESS))
ERROR("Write error: %s", mdbx_strerror(ctx->err));
@ -6787,6 +6789,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
const uint64_t autosync_period =
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
/* wipe the last steady-point if one of:
* - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified
* - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted
@ -6806,13 +6809,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) {
meta_prefer_steady(env, &txn->tw.troika).ptr_c);
} else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 ||
(autosync_threshold &&
atomic_load32(&env->me_lck->mti_unsynced_pages,
atomic_load64(&env->me_lck->mti_unsynced_pages,
mo_Relaxed) >= autosync_threshold) ||
(autosync_period &&
osal_monotime() -
atomic_load64(&env->me_lck->mti_sync_timestamp,
mo_Relaxed) >=
autosync_period) ||
(eoos_timestamp = atomic_load64(
&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
osal_monotime() - eoos_timestamp >= autosync_period) ||
next >= txn->mt_geo.upper ||
(next >= txn->mt_end_pgno &&
(autosync_threshold | autosync_period) == 0)) {
@ -7311,8 +7313,8 @@ retry:;
const meta_troika_t troika = meta_tap(env);
head = meta_recent(env, &troika);
}
const pgno_t unsynced_pages =
atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed);
const uint64_t unsynced_pages =
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed);
if (unsynced_pages == 0) {
const uint32_t synched_meta_txnid_u32 =
atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed);
@ -7320,15 +7322,16 @@ retry:;
goto bailout;
}
const pgno_t autosync_threshold =
const size_t autosync_threshold =
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
const uint64_t autosync_period =
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) ||
(autosync_period &&
osal_monotime() -
atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >=
autosync_period))
(eoos_timestamp =
atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
osal_monotime() - eoos_timestamp >= autosync_period))
flags &= MDBX_WRITEMAP /* clear flags for full steady sync */;
if (!inside_txn) {
@ -7396,7 +7399,7 @@ retry:;
eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0);
if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) {
DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO,
DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64,
data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c),
unsynced_pages);
MDBX_meta meta = *head.ptr_c;
@ -11341,13 +11344,14 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed);
const uint64_t autosync_period =
atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed);
uint64_t eoos_timestamp;
if ((autosync_threshold &&
atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >=
autosync_threshold) ||
(autosync_period &&
osal_monotime() -
atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >=
autosync_period))
(eoos_timestamp =
atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) &&
osal_monotime() - eoos_timestamp >= autosync_period))
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
}
@ -11459,7 +11463,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
/* LY: step#1 - sync previously written/updated data-pages */
rc = MDBX_RESULT_FALSE /* carry steady */;
if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) {
eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE;
unsigned sync_op = 0;
@ -11494,10 +11498,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
/* Steady or Weak */
if (rc == MDBX_RESULT_FALSE /* carry steady */) {
atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(),
mo_Relaxed);
unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending));
atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed);
atomic_store64(&env->me_lck->mti_eoos_timestamp, 0, mo_Relaxed);
atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed);
} else {
assert(rc == MDBX_RESULT_TRUE /* carry non-steady */);
unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK);
@ -12652,8 +12655,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc,
"rollback NOT needed, steady-sync NEEDED%s",
"opening after an unclean shutdown", bootid.x, bootid.y, "");
header = clone;
atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next,
mo_Relaxed);
env->me_lck->mti_unsynced_pages.weak = header.mm_geo.next;
if (!env->me_lck->mti_eoos_timestamp.weak)
env->me_lck->mti_eoos_timestamp.weak = osal_monotime();
break;
}
if (unlikely(!prefer_steady.is_steady)) {
@ -20947,8 +20951,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper);
arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv));
arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv));
const pgno_t unsynced_pages =
atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) +
const uint64_t unsynced_pages =
atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) +
(atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) !=
(uint32_t)arg->mi_recent_txnid);
@ -20963,9 +20967,9 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
arg->mi_sys_pagesize = env->me_os_psize;
if (likely(bytes > size_before_bootid)) {
arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages);
arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages);
const uint64_t monotime_now = osal_monotime();
uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed);
uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed);
arg->mi_since_sync_seconds16dot16 =
ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0;
ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed);

View File

@ -370,7 +370,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32(
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 4
#define MDBX_LOCK_VERSION 5
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
@ -748,20 +748,20 @@ typedef struct MDBX_lockinfo {
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Timestamp of entering an out-of-sync state. Value is represented in a
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
* or clock_gettime(CLOCK_MONOTONIC). */
MDBX_atomic_uint64_t mti_eoos_timestamp;
/* Number of un-synced-with-disk pages for auto-sync feature. */
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
MDBX_atomic_uint64_t mti_unsynced_pages;
/* Timestamp of the last readers check. */
MDBX_atomic_uint64_t mti_reader_check_timestamp;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
atomic_pgno_t mti_discarded_tail;
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
pgno_t mti_readahead_anchor;