From e5fc056035f194d682acd1a7bc9ba3a37459aebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Tue, 11 Oct 2022 13:11:12 +0300 Subject: [PATCH] =?UTF-8?q?mdbx:=20=D0=B8=D0=B7=D0=BC=D0=B5=D0=BD=D0=B5?= =?UTF-8?q?=D0=BD=D0=B8=D0=B5=20=D1=84=D0=BE=D1=80=D0=BC=D0=B0=D1=82=D0=B0?= =?UTF-8?q?=20LCK=20=D0=B8=20=D1=81=D0=B5=D0=BC=D0=B0=D0=BD=D1=82=D0=B8?= =?UTF-8?q?=D0=BA=D0=B8=20=D0=BD=D0=B5=D0=BA=D0=BE=D1=82=D0=BE=D1=80=D1=8B?= =?UTF-8?q?=D1=85=20=D0=B2=D0=BD=D1=83=D1=82=D1=80=D0=B5=D0=BD=D0=BD=D0=B8?= =?UTF-8?q?=D1=85=20=D0=BF=D0=BE=D0=BB=D0=B5=D0=B9.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Изменение формата LCK-файла означает что версии libmdbx использующие разный формат не смогут работать с одной БД одновременно, а только поочередно (LCK-файл переписывается при открытии первым открывающим БД процессом). 1. Поле mti_unsynced_pages теперь 64-битное (чтобы не контролировать переполнение) и перемещено для соблюдения выравнивания. 2. Поле mti_sync_timestamp переименовано в mti_eoos_timestamp одновременно со сменой семантики. Теперь время отсчитывается не от момента сброса данных на диск, а с момента входа в «грязное» состояние. Скорее всего, текущая версия формата LCK не окончательная и изменится до релиза. --- mdbx.h | 4 +++- src/core.c | 56 ++++++++++++++++++++++++++----------------------- src/internals.h | 18 ++++++++-------- 3 files changed, 42 insertions(+), 36 deletions(-) diff --git a/mdbx.h b/mdbx.h index fa72b73d..17d1a17b 100644 --- a/mdbx.h +++ b/mdbx.h @@ -2515,7 +2515,9 @@ struct MDBX_envinfo { uint64_t mi_unsync_volume; /** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */ uint64_t mi_autosync_threshold; - /** Time since the last steady sync in 1/65536 of second */ + /** Time since entering to a "dirty" out-of-sync state in units of 1/65536 of + * second. In other words, this is the time since the last non-steady commit + * or zero if it was steady. */ uint32_t mi_since_sync_seconds16dot16; /** Current auto-sync period in 1/65536 of second, * see \ref mdbx_env_set_syncperiod(). */ diff --git a/src/core.c b/src/core.c index 772b1adb..a19dbb6e 100644 --- a/src/core.c +++ b/src/core.c @@ -4506,6 +4506,8 @@ __must_check_result static int iov_write(iov_ctx_t *ctx) { #if MDBX_ENABLE_PGOP_STAT ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; #endif /* MDBX_ENABLE_PGOP_STAT */ + if (!ctx->env->me_lck->mti_eoos_timestamp.weak) + ctx->env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); ctx->err = r.err; if (unlikely(ctx->err != MDBX_SUCCESS)) ERROR("Write error: %s", mdbx_strerror(ctx->err)); @@ -6787,6 +6789,7 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; /* wipe the last steady-point if one of: * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted @@ -6806,13 +6809,12 @@ static pgr_t page_alloc_slowpath(MDBX_cursor *mc, const pgno_t num, int flags) { meta_prefer_steady(env, &txn->tw.troika).ptr_c); } else if ((flags & (MDBX_ALLOC_BACKLOG | MDBX_ALLOC_NEW)) == 0 || (autosync_threshold && - atomic_load32(&env->me_lck->mti_unsynced_pages, + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, - mo_Relaxed) >= - autosync_period) || + (eoos_timestamp = atomic_load64( + &env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || next >= txn->mt_geo.upper || (next >= txn->mt_end_pgno && (autosync_threshold | autosync_period) == 0)) { @@ -7311,8 +7313,8 @@ retry:; const meta_troika_t troika = meta_tap(env); head = meta_recent(env, &troika); } - const pgno_t unsynced_pages = - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); + const uint64_t unsynced_pages = + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed); if (unsynced_pages == 0) { const uint32_t synched_meta_txnid_u32 = atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); @@ -7320,15 +7322,16 @@ retry:; goto bailout; } - const pgno_t autosync_threshold = + const size_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || (autosync_period && - osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= - autosync_period)) + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; if (!inside_txn) { @@ -7396,7 +7399,7 @@ retry:; eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { - DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, + DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), unsynced_pages); MDBX_meta meta = *head.ptr_c; @@ -11341,13 +11344,14 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; if ((autosync_threshold && - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= - autosync_period)) + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ } @@ -11459,7 +11463,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, /* LY: step#1 - sync previously written/updated data-pages */ rc = MDBX_RESULT_FALSE /* carry steady */; - if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { + if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; unsigned sync_op = 0; @@ -11494,10 +11498,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - atomic_store64(&env->me_lck->mti_sync_timestamp, osal_monotime(), - mo_Relaxed); unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); - atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); + atomic_store64(&env->me_lck->mti_eoos_timestamp, 0, mo_Relaxed); + atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); @@ -12652,8 +12655,9 @@ __cold static int setup_dxb(MDBX_env *env, const int lck_rc, "rollback NOT needed, steady-sync NEEDED%s", "opening after an unclean shutdown", bootid.x, bootid.y, ""); header = clone; - atomic_store32(&env->me_lck->mti_unsynced_pages, header.mm_geo.next, - mo_Relaxed); + env->me_lck->mti_unsynced_pages.weak = header.mm_geo.next; + if (!env->me_lck->mti_eoos_timestamp.weak) + env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); break; } if (unlikely(!prefer_steady.is_steady)) { @@ -20947,8 +20951,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); arg->mi_geo.grow = pgno2bytes(env, pv2pages(txn_meta->mm_geo.grow_pv)); - const pgno_t unsynced_pages = - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + + const uint64_t unsynced_pages = + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)arg->mi_recent_txnid); @@ -20963,9 +20967,9 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); + arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); const uint64_t monotime_now = osal_monotime(); - uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); + uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed); arg->mi_since_sync_seconds16dot16 = ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); diff --git a/src/internals.h b/src/internals.h index 44cd4347..ab748f3a 100644 --- a/src/internals.h +++ b/src/internals.h @@ -370,7 +370,7 @@ MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( /* FROZEN: The version number for a database's datafile format. */ #define MDBX_DATA_VERSION 3 /* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION 4 +#define MDBX_LOCK_VERSION 5 /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -748,20 +748,20 @@ typedef struct MDBX_lockinfo { atomic_txnid_t mti_oldest_reader; - /* Timestamp of the last steady sync. Value is represented in a suitable - * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or - * clock_gettime(CLOCK_MONOTONIC). */ - MDBX_atomic_uint64_t mti_sync_timestamp; + /* Timestamp of entering an out-of-sync state. Value is represented in a + * suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) + * or clock_gettime(CLOCK_MONOTONIC). */ + MDBX_atomic_uint64_t mti_eoos_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. */ - atomic_pgno_t mti_unsynced_pages; - - /* Number of page which was discarded last time by madvise(MADV_FREE). */ - atomic_pgno_t mti_discarded_tail; + MDBX_atomic_uint64_t mti_unsynced_pages; /* Timestamp of the last readers check. */ MDBX_atomic_uint64_t mti_reader_check_timestamp; + /* Number of page which was discarded last time by madvise(MADV_FREE). */ + atomic_pgno_t mti_discarded_tail; + /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor;