mdbx: предотвращение бесполезных page-faults в режиме MDBX_WRITEMAP (опция сборки MDBX_ENABLE_PREFAULT).

This commit is contained in:
Леонид Юрьев (Leonid Yuriev) 2022-12-04 20:04:13 +03:00
parent dc27d5d30a
commit be3ff92772
4 changed files with 82 additions and 13 deletions

21
mdbx.h
View File

@ -2583,16 +2583,17 @@ struct MDBX_envinfo {
* first process opened the database after everyone had previously closed it). * first process opened the database after everyone had previously closed it).
*/ */
struct { struct {
uint64_t newly; /**< Quantity of a new pages added */ uint64_t newly; /**< Quantity of a new pages added */
uint64_t cow; /**< Quantity of pages copied for update */ uint64_t cow; /**< Quantity of pages copied for update */
uint64_t clone; /**< Quantity of parent's dirty pages clones uint64_t clone; /**< Quantity of parent's dirty pages clones
for nested transactions */ for nested transactions */
uint64_t split; /**< Page splits */ uint64_t split; /**< Page splits */
uint64_t merge; /**< Page merges */ uint64_t merge; /**< Page merges */
uint64_t spill; /**< Quantity of spilled dirty pages */ uint64_t spill; /**< Quantity of spilled dirty pages */
uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ uint64_t unspill; /**< Quantity of unspilled/reloaded pages */
uint64_t wops; /**< Number of explicit write operations (not a pages) uint64_t wops; /**< Number of explicit write operations (not a pages)
to a disk */ to a disk */
uint64_t prefault; /**< Number of prefault write operations (not a pages) */
uint64_t uint64_t
msync; /**< Number of explicit msync-to-disk operations (not a pages) */ msync; /**< Number of explicit msync-to-disk operations (not a pages) */
uint64_t uint64_t

View File

@ -6773,18 +6773,68 @@ static __inline pgr_t page_alloc_finalize(MDBX_env *const env,
ret.page = pgno2page(env, pgno); ret.page = pgno2page(env, pgno);
MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num));
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
#if MDBX_ENABLE_PREFAULT
/* Содержимое выделенной страницы не нужно, но если страница отсутствует
* в ОЗУ (что весьма вероятно), то любое обращение к ней приведет
* к page-fault:
* - прерыванию по отсутствию страницы;
* - переключению контекста в режим ядра с засыпанием процесса;
* - чтению страницы с диска;
* - обновлению PTE и пробуждению процесса;
* - переключению контекста по доступности ЦПУ.
*
* Пытаемся минимизировать накладные расходы записывая страницу, что при
* наличии unified page cache приведет к появлению страницы в ОЗУ без чтения
* с диска. При этом запись на диск должна быть отложена адекватным ядром,
* так как страница отображена в память в режиме чтения-записи и следом в
* неё пишет ЦПУ. */
void *const pattern = ptr_disp(
env->me_pbuf,
(env->me_flags & MDBX_PAGEPERTURB) ? env->me_psize : env->me_psize * 2);
size_t file_offset = pgno2bytes(env, pgno);
/* TODO: добавить проверку через mincore() c кэшированием результатов. */
if (likely(num == 1)) {
osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset);
} else {
struct iovec iov[MDBX_AUXILARY_IOV_MAX];
iov[0].iov_len = env->me_psize;
iov[0].iov_base = pattern;
size_t n = 1, left = num - 1;
do {
iov[n].iov_len = env->me_psize;
iov[n].iov_base = pattern;
if (++n == MDBX_AUXILARY_IOV_MAX) {
osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX,
file_offset);
file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX);
#if MDBX_ENABLE_PGOP_STAT
env->me_lck->mti_pgop_stat.prefault.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
n = 0;
}
} while (--left);
osal_pwritev(env->me_lazy_fd, iov, n, file_offset);
}
#if MDBX_ENABLE_PGOP_STAT
env->me_lck->mti_pgop_stat.prefault.weak += 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
#else
if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
memset(ret.page, -1, pgno2bytes(env, num));
#endif /* MDBX_ENABLE_PREFAULT */
} else { } else {
ret.page = page_malloc(txn, num); ret.page = page_malloc(txn, num);
if (unlikely(!ret.page)) { if (unlikely(!ret.page)) {
ret.err = MDBX_ENOMEM; ret.err = MDBX_ENOMEM;
goto bailout; goto bailout;
} }
if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
memset(ret.page, -1, pgno2bytes(env, num));
} }
if (unlikely(env->me_flags & MDBX_PAGEPERTURB))
memset(ret.page, -1, pgno2bytes(env, num));
VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num));
ret.page->mp_pgno = pgno; ret.page->mp_pgno = pgno;
ret.page->mp_leaf2_ksize = 0; ret.page->mp_leaf2_ksize = 0;
ret.page->mp_flags = 0; ret.page->mp_flags = 0;
@ -14428,6 +14478,7 @@ __cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname,
rc = alloc_page_buf(env); rc = alloc_page_buf(env);
if (rc == MDBX_SUCCESS) { if (rc == MDBX_SUCCESS) {
memset(env->me_pbuf, -1, env->me_psize * 2); memset(env->me_pbuf, -1, env->me_psize * 2);
memset(ptr_disp(env->me_pbuf, env->me_psize * 2), 0, env->me_psize);
MDBX_txn *txn = osal_calloc(1, size); MDBX_txn *txn = osal_calloc(1, size);
if (txn) { if (txn) {
txn->mt_dbs = ptr_disp(txn, tsize); txn->mt_dbs = ptr_disp(txn, tsize);
@ -21586,6 +21637,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn,
atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed);
arg->mi_pgop_stat.wops = arg->mi_pgop_stat.wops =
atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed);
arg->mi_pgop_stat.prefault =
atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed);
arg->mi_pgop_stat.msync = arg->mi_pgop_stat.msync =
atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed);
arg->mi_pgop_stat.fsync = arg->mi_pgop_stat.fsync =
@ -24706,6 +24759,7 @@ __dll_export
" MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC)
" MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND)
" MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE)
" MDBX_ENABLE_PREFAULT=" MDBX_STRINGIFY(MDBX_ENABLE_PREFAULT)
" MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) " MDBX_ENABLE_PGOP_STAT=" MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT)
" MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC)
#if MDBX_DISABLE_VALIDATION #if MDBX_DISABLE_VALIDATION

View File

@ -619,6 +619,8 @@ typedef struct pgop_stat {
MDBX_atomic_uint64_t MDBX_atomic_uint64_t
fsync; /* Number of explicit fsync/flush-to-disk operations */ fsync; /* Number of explicit fsync/flush-to-disk operations */
MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */
/* Статистика для профилирования GC. /* Статистика для профилирования GC.
* Логически эти данные может быть стоит вынести в другую структуру, * Логически эти данные может быть стоит вынести в другую структуру,
* но разница будет сугубо косметическая. */ * но разница будет сугубо косметическая. */

View File

@ -87,6 +87,18 @@
#error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1
#endif /* MDBX_ENABLE_PGOP_STAT */ #endif /* MDBX_ENABLE_PGOP_STAT */
/** Build option: when set to 1, page-faults on reclaimed and allocated pages
 * in the MDBX_WRITEMAP mode are avoided by writing such pages through the
 * file handle before they are touched.
 *
 * Unless defined explicitly, the option is 0 when
 * MDBX_MMAP_INCOHERENT_FILE_WRITE is non-zero and 1 otherwise.
 * Only the values 0 and 1 are accepted. */
#if !defined(MDBX_ENABLE_PREFAULT)
#if !MDBX_MMAP_INCOHERENT_FILE_WRITE
#define MDBX_ENABLE_PREFAULT 1
#else
#define MDBX_ENABLE_PREFAULT 0
#endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */
#elif MDBX_ENABLE_PREFAULT != 0 && MDBX_ENABLE_PREFAULT != 1
#error MDBX_ENABLE_PREFAULT must be defined as 0 or 1
#endif /* MDBX_ENABLE_PREFAULT */
/** Enables chunking long list of retired pages during huge transactions commit /** Enables chunking long list of retired pages during huge transactions commit
* to avoid use sequences of pages. */ * to avoid use sequences of pages. */
#ifndef MDBX_ENABLE_BIGFOOT #ifndef MDBX_ENABLE_BIGFOOT