2024-05-19 22:07:58 +03:00
|
|
|
|
/// \copyright SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
/// \note Please refer to the COPYRIGHT file for explanations of the license
/// change, credits and acknowledgments.
|
|
|
|
|
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
|
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include "essentials.h"
|
|
|
|
|
|
|
|
|
|
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 6
|
2024-05-19 22:07:58 +03:00
|
|
|
|
|
|
|
|
|
/* Select the lock-file signature (MDBX_LCK_SIGN) and the type used for
 * inter-process locks (osal_ipclock_t) according to the MDBX_LOCKING mode.
 * Each mode has its own signature value, and the signature is folded into
 * MDBX_LOCK_FORMAT. */
#if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES

#define MDBX_LCK_SIGN UINT32_C(0xF10C)
typedef void osal_ipclock_t;
#elif MDBX_LOCKING == MDBX_LOCKING_SYSV

#define MDBX_LCK_SIGN UINT32_C(0xF18D)
typedef mdbx_pid_t osal_ipclock_t;

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 ||                                \
    MDBX_LOCKING == MDBX_LOCKING_POSIX2008

#define MDBX_LCK_SIGN UINT32_C(0x8017)
typedef pthread_mutex_t osal_ipclock_t;

#elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988

#define MDBX_LCK_SIGN UINT32_C(0xFC29)
typedef sem_t osal_ipclock_t;

#else
/* MDBX_LOCKING has a value that none of the branches above handles. */
#error "FIXME"
#endif /* MDBX_LOCKING */
|
|
|
|
|
|
|
|
|
|
/* GC profiling statistics. */
typedef struct gc_prof_stat {
  /* Monotonic "wall clock" time
   * spent on reading and searching inside the GC. */
  uint64_t rtime_monotonic;
  /* User-mode CPU time spent on preparing the pages
   * taken out of the GC, including paging in from disk. */
  uint64_t xtime_cpu;
  /* Number of read/search iterations inside the GC while allocating pages. */
  uint32_t rsteps;
  /* Number of requests to allocate page sequences,
   * i.e. when more than one page is requested at once. */
  uint32_t xpages;
  /* Slow path execution count. */
  uint32_t spe_counter;
  /* Page faults (hard page faults). */
  uint32_t majflt;
} gc_prof_stat_t;
|
|
|
|
|
|
|
|
|
|
/* Statistics of pages operations for all transactions,
|
|
|
|
|
* including incomplete and aborted. */
|
|
|
|
|
typedef struct pgops {
|
|
|
|
|
mdbx_atomic_uint64_t newly; /* Quantity of a new pages added */
|
|
|
|
|
mdbx_atomic_uint64_t cow; /* Quantity of pages copied for update */
|
|
|
|
|
mdbx_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones
|
|
|
|
|
for nested transactions */
|
|
|
|
|
mdbx_atomic_uint64_t split; /* Page splits */
|
|
|
|
|
mdbx_atomic_uint64_t merge; /* Page merges */
|
|
|
|
|
mdbx_atomic_uint64_t spill; /* Quantity of spilled dirty pages */
|
|
|
|
|
mdbx_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */
|
|
|
|
|
mdbx_atomic_uint64_t
|
|
|
|
|
wops; /* Number of explicit write operations (not a pages) to a disk */
|
|
|
|
|
mdbx_atomic_uint64_t
|
|
|
|
|
msync; /* Number of explicit msync/flush-to-disk operations */
|
|
|
|
|
mdbx_atomic_uint64_t
|
|
|
|
|
fsync; /* Number of explicit fsync/flush-to-disk operations */
|
|
|
|
|
|
|
|
|
|
mdbx_atomic_uint64_t prefault; /* Number of prefault write operations */
|
|
|
|
|
mdbx_atomic_uint64_t mincore; /* Number of mincore() calls */
|
|
|
|
|
|
|
|
|
|
mdbx_atomic_uint32_t
|
|
|
|
|
incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269
|
|
|
|
|
caught */
|
|
|
|
|
mdbx_atomic_uint32_t reserved;
|
|
|
|
|
|
|
|
|
|
/* Статистика для профилирования GC.
|
|
|
|
|
* Логически эти данные, возможно, стоит вынести в другую структуру,
|
|
|
|
|
* но разница будет сугубо косметическая. */
|
|
|
|
|
struct {
|
|
|
|
|
/* Затраты на поддержку данных пользователя */
|
|
|
|
|
gc_prof_stat_t work;
|
|
|
|
|
/* Затраты на поддержку и обновления самой GC */
|
|
|
|
|
gc_prof_stat_t self;
|
|
|
|
|
/* Итераций обновления GC,
|
|
|
|
|
* больше 1 если были повторы/перезапуски */
|
|
|
|
|
uint32_t wloops;
|
|
|
|
|
/* Итерации слияния записей GC */
|
|
|
|
|
uint32_t coalescences;
|
|
|
|
|
/* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */
|
|
|
|
|
uint32_t wipes;
|
|
|
|
|
/* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */
|
|
|
|
|
uint32_t flushes;
|
|
|
|
|
/* Попытки пнуть тормозящих читателей */
|
|
|
|
|
uint32_t kicks;
|
|
|
|
|
} gc_prof;
|
|
|
|
|
} pgop_stat_t;
|
|
|
|
|
|
|
|
|
|
/* Reader Lock Table
 *
 * Readers don't acquire any locks for their data access. Instead, they
 * simply record their transaction ID in the reader table. The reader
 * mutex is needed just to find an empty slot in the reader table. The
 * slot's address is saved in thread-specific data so that subsequent
 * read transactions started by the same thread need no further locking to
 * proceed.
 *
 * If MDBX_NOSTICKYTHREADS is set, the slot address is not saved in
 * thread-specific data. No reader table is used if the database is on a
 * read-only filesystem.
 *
 * Since the database uses multi-version concurrency control, readers don't
 * actually need any locking. This table is used to keep track of which
 * readers are using data from which old transactions, so that we'll know
 * when a particular old transaction is no longer in use. Old transactions
 * that have discarded any data pages can then have those pages reclaimed
 * for use by a later write transaction.
 *
 * The lock table is constructed such that reader slots are aligned with the
 * processor's cache line size. Any slot is only ever used by one thread.
 * This alignment guarantees that there will be no contention or cache
 * thrashing as threads update their own slot info, and also eliminates
 * any need for locking when accessing a slot.
 *
 * A writer thread will scan every slot in the table to determine the oldest
 * outstanding reader transaction. Any freed pages older than this will be
 * reclaimed by the writer. The writer doesn't use any locks when scanning
 * this table. This means that there's no guarantee that the writer will
 * see the most up-to-date reader info, but that's not required for correct
 * operation - all we need is to know the upper bound on the oldest reader,
 * we don't care at all about the newest reader. So the only consequence of
 * reading stale information here is that old pages might hang around a
 * while longer before being reclaimed. That's actually good anyway, because
 * the longer we delay reclaiming old pages, the more likely it is that a
 * string of contiguous pages can be found after coalescing old pages from
 * many old transactions together. */
|
|
|
|
|
|
|
|
|
|
/* The actual reader record, with cacheline padding. */
|
|
|
|
|
typedef struct reader_slot {
|
|
|
|
|
/* Current Transaction ID when this transaction began, or INVALID_TXNID.
|
|
|
|
|
* Multiple readers that start at the same time will probably have the
|
|
|
|
|
* same ID here. Again, it's not important to exclude them from
|
|
|
|
|
* anything; all we need to know is which version of the DB they
|
|
|
|
|
* started from so we can avoid overwriting any data used in that
|
|
|
|
|
* particular version. */
|
|
|
|
|
atomic_txnid_t txnid;
|
|
|
|
|
|
|
|
|
|
/* The information we store in a single slot of the reader table.
|
|
|
|
|
* In addition to a transaction ID, we also record the process and
|
|
|
|
|
* thread ID that owns a slot, so that we can detect stale information,
|
|
|
|
|
* e.g. threads or processes that went away without cleaning up.
|
|
|
|
|
*
|
|
|
|
|
* NOTE: We currently don't check for stale records.
|
|
|
|
|
* We simply re-init the table when we know that we're the only process
|
|
|
|
|
* opening the lock file. */
|
|
|
|
|
|
2024-07-09 16:04:01 +03:00
|
|
|
|
/* Псевдо thread_id для пометки вытесненных читающих транзакций. */
|
|
|
|
|
#define MDBX_TID_TXN_OUSTED (UINT64_MAX - 1)
|
|
|
|
|
|
|
|
|
|
/* Псевдо thread_id для пометки припаркованных читающих транзакций. */
|
|
|
|
|
#define MDBX_TID_TXN_PARKED UINT64_MAX
|
|
|
|
|
|
2024-05-19 22:07:58 +03:00
|
|
|
|
/* The thread ID of the thread owning this txn. */
|
|
|
|
|
mdbx_atomic_uint64_t tid;
|
|
|
|
|
|
|
|
|
|
/* The process ID of the process owning this reader txn. */
|
|
|
|
|
mdbx_atomic_uint32_t pid;
|
|
|
|
|
|
|
|
|
|
/* The number of pages used in the reader's MVCC snapshot,
|
|
|
|
|
* i.e. the value of meta->geometry.first_unallocated and
|
|
|
|
|
* txn->geo.first_unallocated */
|
|
|
|
|
atomic_pgno_t snapshot_pages_used;
|
|
|
|
|
/* Number of retired pages at the time this reader starts transaction. So,
|
|
|
|
|
* at any time the difference meta.pages_retired -
|
|
|
|
|
* reader.snapshot_pages_retired will give the number of pages which this
|
|
|
|
|
* reader restraining from reuse. */
|
|
|
|
|
mdbx_atomic_uint64_t snapshot_pages_retired;
|
|
|
|
|
} reader_slot_t;
|
|
|
|
|
|
|
|
|
|
/* The header for the reader table (a memory-mapped lock file). */
|
|
|
|
|
typedef struct shared_lck {
|
|
|
|
|
/* Stamp identifying this as an MDBX file.
|
|
|
|
|
* It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */
|
|
|
|
|
uint64_t magic_and_version;
|
|
|
|
|
|
|
|
|
|
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
|
|
|
|
|
uint32_t os_and_format;
|
|
|
|
|
|
|
|
|
|
/* Flags which environment was opened. */
|
|
|
|
|
mdbx_atomic_uint32_t envmode;
|
|
|
|
|
|
|
|
|
|
/* Threshold of un-synced-with-disk pages for auto-sync feature,
|
|
|
|
|
* zero means no-threshold, i.e. auto-sync is disabled. */
|
|
|
|
|
atomic_pgno_t autosync_threshold;
|
|
|
|
|
|
|
|
|
|
/* Low 32-bit of txnid with which meta-pages was synced,
|
|
|
|
|
* i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
|
|
|
|
|
#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3)
|
|
|
|
|
#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8)
|
|
|
|
|
#define MDBX_NOMETASYNC_LAZY_WRITEMAP \
|
|
|
|
|
(MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8)
|
|
|
|
|
mdbx_atomic_uint32_t meta_sync_txnid;
|
|
|
|
|
|
|
|
|
|
/* Period for timed auto-sync feature, i.e. at the every steady checkpoint
|
|
|
|
|
* the mti_unsynced_timeout sets to the current_time + autosync_period.
|
|
|
|
|
* The time value is represented in a suitable system-dependent form, for
|
|
|
|
|
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
|
|
|
|
|
* Zero means timed auto-sync is disabled. */
|
|
|
|
|
mdbx_atomic_uint64_t autosync_period;
|
|
|
|
|
|
|
|
|
|
/* Marker to distinguish uniqueness of DB/CLK. */
|
|
|
|
|
mdbx_atomic_uint64_t bait_uniqueness;
|
|
|
|
|
|
|
|
|
|
/* Paired counter of processes that have mlock()ed part of mmapped DB.
|
|
|
|
|
* The (mlcnt[0] - mlcnt[1]) > 0 means at least one process
|
|
|
|
|
* lock at least one page, so therefore madvise() could return EINVAL. */
|
|
|
|
|
mdbx_atomic_uint32_t mlcnt[2];
|
|
|
|
|
|
|
|
|
|
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
|
|
|
|
|
|
|
|
|
/* Statistics of costly ops of all (running, completed and aborted)
|
|
|
|
|
* transactions */
|
|
|
|
|
pgop_stat_t pgops;
|
|
|
|
|
|
|
|
|
|
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
|
|
|
|
|
|
|
|
|
#if MDBX_LOCKING > 0
|
|
|
|
|
/* Write transaction lock. */
|
|
|
|
|
osal_ipclock_t wrt_lock;
|
|
|
|
|
#endif /* MDBX_LOCKING > 0 */
|
|
|
|
|
|
|
|
|
|
atomic_txnid_t cached_oldest;
|
|
|
|
|
|
|
|
|
|
/* Timestamp of entering an out-of-sync state. Value is represented in a
|
|
|
|
|
* suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME)
|
|
|
|
|
* or clock_gettime(CLOCK_MONOTONIC). */
|
|
|
|
|
mdbx_atomic_uint64_t eoos_timestamp;
|
|
|
|
|
|
|
|
|
|
/* Number un-synced-with-disk pages for auto-sync feature. */
|
|
|
|
|
mdbx_atomic_uint64_t unsynced_pages;
|
|
|
|
|
|
|
|
|
|
/* Timestamp of the last readers check. */
|
|
|
|
|
mdbx_atomic_uint64_t readers_check_timestamp;
|
|
|
|
|
|
|
|
|
|
/* Number of page which was discarded last time by madvise(DONTNEED). */
|
|
|
|
|
atomic_pgno_t discarded_tail;
|
|
|
|
|
|
|
|
|
|
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
|
|
|
|
|
pgno_t readahead_anchor;
|
|
|
|
|
|
|
|
|
|
/* Shared cache for mincore() results */
|
|
|
|
|
struct {
|
|
|
|
|
pgno_t begin[4];
|
|
|
|
|
uint64_t mask[4];
|
|
|
|
|
} mincore_cache;
|
|
|
|
|
|
|
|
|
|
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
|
|
|
|
|
|
|
|
|
#if MDBX_LOCKING > 0
|
|
|
|
|
/* Readeaders table lock. */
|
|
|
|
|
osal_ipclock_t rdt_lock;
|
|
|
|
|
#endif /* MDBX_LOCKING > 0 */
|
|
|
|
|
|
|
|
|
|
/* The number of slots that have been used in the reader table.
|
|
|
|
|
* This always records the maximum count, it is not decremented
|
|
|
|
|
* when readers release their slots. */
|
|
|
|
|
mdbx_atomic_uint32_t rdt_length;
|
|
|
|
|
mdbx_atomic_uint32_t rdt_refresh_flag;
|
|
|
|
|
|
|
|
|
|
#if FLEXIBLE_ARRAY_MEMBERS
|
|
|
|
|
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
|
|
|
|
reader_slot_t rdt[] /* dynamic size */;
|
|
|
|
|
|
|
|
|
|
/* Lockfile format signature: version, features and field layout */
|
|
|
|
|
#define MDBX_LOCK_FORMAT \
|
|
|
|
|
(MDBX_LCK_SIGN * 27733 + (unsigned)sizeof(reader_slot_t) * 13 + \
|
|
|
|
|
(unsigned)offsetof(reader_slot_t, snapshot_pages_used) * 251 + \
|
|
|
|
|
(unsigned)offsetof(lck_t, cached_oldest) * 83 + \
|
|
|
|
|
(unsigned)offsetof(lck_t, rdt_length) * 37 + \
|
|
|
|
|
(unsigned)offsetof(lck_t, rdt) * 29)
|
|
|
|
|
#endif /* FLEXIBLE_ARRAY_MEMBERS */
|
|
|
|
|
} lck_t;
|
|
|
|
|
|
|
|
|
|
/* Lock-file stamp: MDBX_MAGIC shifted left by 8 bits with MDBX_LOCK_VERSION
 * added in the low byte. */
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)

/* Upper limit for the number of readers.
 * NOTE(review): presumably the maximum number of reader slots - confirm. */
#define MDBX_READERS_LIMIT 32767
|