mdbx: use C11 atomics if available instead of legacy memory barriers.

This is done to better support architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc).

Change-Id: Iee831c8dc564f1d027ff84b0d6daa559325d5a9b
This commit is contained in:
Leonid Yuriev 2021-01-30 02:28:12 +03:00
parent bc33875a9e
commit 9f0ff865e8
7 changed files with 643 additions and 442 deletions

View File

@ -205,6 +205,7 @@ cpflags
cplus
cplusplus
cpp
cppreference
cpuid
CROSSCOMPILING
crtdbg

View File

@ -48,6 +48,7 @@ New features:
- more effective refunding/compactification especially for the loosed page cache.
- Added `MDBX_ENABLE_REFUND` and `MDBX_PNL_ASCENDING` internal/advanced build options.
- Added `mdbx_default_pagesize()` function.
- Better support for architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc) by means of [C11 atomics](https://en.cppreference.com/w/c/atomic).
Fixes:

File diff suppressed because it is too large Load Diff

View File

@ -186,6 +186,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/*----------------------------------------------------------------------------*/
/* Basic constants and types */
/* 32-bit cell exposing overlapping views of the same storage:
 *  - `weak`: a plain volatile 32-bit word;
 *  - `c11a`: a C11 `_Atomic` view of that word, declared only when
 *    MDBX_HAVE_C11ATOMICS is defined.
 * NOTE(review): the overlay presumes `_Atomic uint32_t` has the same size
 * and representation as `uint32_t` on the target toolchain - confirm. */
typedef union {
volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} MDBX_atomic_uint32_t;
/* 64-bit cell exposing overlapping views of the same storage:
 *  - `weak`: a plain volatile 64-bit word, always present;
 *  - `c11a`: a C11 `_Atomic` 64-bit view, declared when MDBX_HAVE_C11ATOMICS
 *    is defined and at least one of MDBX_64BIT_CAS / MDBX_64BIT_ATOMIC is
 *    non-zero;
 *  - `low` / `high`: two MDBX_atomic_uint32_t halves, declared when C11
 *    atomics are unavailable or when either MDBX_64BIT_CAS or
 *    MDBX_64BIT_ATOMIC is zero.
 * The two conditions are not mutually exclusive: with C11 atomics and exactly
 * one of the two MDBX_64BIT_* options enabled, both `c11a` and the
 * `low`/`high` pair are declared. */
typedef union {
volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
/* Anonymous struct: `low` and `high` are reachable directly as members of
 * the union. */
__anonymous_struct_extension__ struct {
/* Member order follows the byte order so that `low` occupies the
 * least-significant half of `weak` and `high` the most-significant half. */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
MDBX_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
MDBX_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
};
#endif
} MDBX_atomic_uint64_t;
/* The minimum number of keys required in a database page.
* Setting this to a larger value will place a smaller bound on the
* maximum size of a data item. Data items larger than this size will
@ -224,6 +249,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
* MDBX uses 32 bit for page numbers. This limits database
* size up to 2^44 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
typedef MDBX_atomic_uint32_t atomic_pgno_t;
#define PRIaPGNO PRIu32
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS
@ -232,6 +258,7 @@ typedef uint32_t pgno_t;
/* A transaction ID. */
typedef uint64_t txnid_t;
typedef MDBX_atomic_uint64_t atomic_txnid_t;
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
@ -258,24 +285,6 @@ typedef uint16_t indx_t;
/* Core structures for database and shared memory (i.e. format definition) */
#pragma pack(push, 1)
typedef union mdbx_safe64 {
volatile uint64_t inconsistent;
#if MDBX_64BIT_ATOMIC
volatile uint64_t atomic;
#endif /* MDBX_64BIT_ATOMIC */
__anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
volatile uint32_t low;
volatile uint32_t high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
volatile uint32_t high;
volatile uint32_t low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
};
} mdbx_safe64_t;
/* Information about a single database in the environment. */
typedef struct MDBX_db {
uint16_t md_flags; /* see mdbx_dbi_open */
@ -478,7 +487,7 @@ typedef struct MDBX_reader {
* anything; all we need to know is which version of the DB they
* started from so we can avoid overwriting any data used in that
* particular version. */
mdbx_safe64_t /* txnid_t */ mr_txnid;
MDBX_atomic_uint64_t /* txnid_t */ mr_txnid;
/* The information we store in a single slot of the reader table.
* In addition to a transaction ID, we also record the process and
@ -490,23 +499,18 @@ typedef struct MDBX_reader {
* opening the lock file. */
/* The thread ID of the thread owning this txn. */
#if MDBX_WORDBITS >= 64
volatile uint64_t mr_tid;
#else
volatile uint32_t mr_tid;
volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch,
unused for now */
#endif
MDBX_atomic_uint64_t mr_tid;
/* The process ID of the process owning this reader txn. */
volatile uint32_t mr_pid;
MDBX_atomic_uint32_t mr_pid;
/* The number of pages used in the reader's MVCC snapshot,
* i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
volatile pgno_t mr_snapshot_pages_used;
atomic_pgno_t mr_snapshot_pages_used;
/* Number of retired pages at the time this reader starts transaction. So,
* at any time the difference mm_pages_retired - mr_snapshot_pages_retired
* will give the number of pages which this reader restraining from reuse. */
volatile uint64_t mr_snapshot_pages_retired;
MDBX_atomic_uint64_t mr_snapshot_pages_retired;
} MDBX_reader;
/* The header for the reader table (a memory-mapped lock file). */
@ -519,25 +523,25 @@ typedef struct MDBX_lockinfo {
uint32_t mti_os_and_format;
/* Flags which environment was opened. */
volatile uint32_t mti_envmode;
MDBX_atomic_uint32_t mti_envmode;
/* Threshold of un-synced-with-disk pages for auto-sync feature,
* zero means no-threshold, i.e. auto-sync is disabled. */
volatile pgno_t mti_autosync_threshold;
atomic_pgno_t mti_autosync_threshold;
/* Low 32-bit of txnid with which meta-pages was synced,
* i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
volatile uint32_t mti_meta_sync_txnid;
MDBX_atomic_uint32_t mti_meta_sync_txnid;
/* Period for timed auto-sync feature, i.e. at the every steady checkpoint
* the mti_unsynced_timeout sets to the current_time + mti_autosync_period.
* The time value is represented in a suitable system-dependent form, for
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
* Zero means timed auto-sync is disabled. */
volatile uint64_t mti_autosync_period;
MDBX_atomic_uint64_t mti_autosync_period;
/* Marker to distinguish uniqueness of DB/CLK.*/
volatile uint64_t mti_bait_uniqueness;
MDBX_atomic_uint64_t mti_bait_uniqueness;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
@ -546,21 +550,21 @@ typedef struct MDBX_lockinfo {
mdbx_ipclock_t mti_wlock;
#endif /* MDBX_LOCKING > 0 */
volatile txnid_t mti_oldest_reader;
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
volatile uint64_t mti_sync_timestamp;
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Number un-synced-with-disk pages for auto-sync feature. */
volatile pgno_t mti_unsynced_pages;
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
volatile pgno_t mti_discarded_tail;
atomic_pgno_t mti_discarded_tail;
/* Timestamp of the last readers check. */
volatile uint64_t mti_reader_check_timestamp;
MDBX_atomic_uint64_t mti_reader_check_timestamp;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
@ -572,8 +576,8 @@ typedef struct MDBX_lockinfo {
/* The number of slots that have been used in the reader table.
* This always records the maximum count, it is not decremented
* when readers release their slots. */
volatile unsigned mti_numreaders;
volatile unsigned mti_readers_refresh_flag;
MDBX_atomic_uint32_t mti_numreaders;
MDBX_atomic_uint32_t mti_readers_refresh_flag;
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
@ -912,7 +916,7 @@ typedef struct MDBX_cursor_couple {
/* The database environment. */
struct MDBX_env {
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
uint32_t me_signature;
MDBX_atomic_uint32_t me_signature;
/* Failed to update the meta page. Probably an I/O error. */
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
/* Some fields are initialized. */
@ -961,7 +965,7 @@ struct MDBX_env {
MDBX_dbx *me_dbxs; /* array of static DB info */
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */
MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
@ -970,12 +974,12 @@ struct MDBX_env {
unsigned me_branch_nodemax; /* max size of a branch-node */
uint32_t me_live_reader; /* have liveness lock in reader table */
void *me_userctx; /* User-settable context */
volatile uint64_t *me_sync_timestamp;
volatile uint64_t *me_autosync_period;
volatile pgno_t *me_unsynced_pages;
volatile pgno_t *me_autosync_threshold;
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_atomic_uint64_t *me_sync_timestamp;
MDBX_atomic_uint64_t *me_autosync_period;
atomic_pgno_t *me_unsynced_pages;
atomic_pgno_t *me_autosync_threshold;
atomic_pgno_t *me_discarded_tail;
MDBX_atomic_uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
unsigned me_dp_reserve_len;
struct {
@ -992,13 +996,13 @@ struct MDBX_env {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
#endif /* MDBX_LOCKING > 0 */
txnid_t oldest;
uint64_t sync_timestamp;
uint64_t autosync_period;
pgno_t autosync_pending;
pgno_t autosync_threshold;
pgno_t discarded_tail;
uint32_t meta_sync_txnid;
atomic_txnid_t oldest;
MDBX_atomic_uint64_t sync_timestamp;
MDBX_atomic_uint64_t autosync_period;
atomic_pgno_t autosync_pending;
atomic_pgno_t autosync_threshold;
atomic_pgno_t discarded_tail;
MDBX_atomic_uint32_t meta_sync_txnid;
} me_lckless_stub;
#if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */

View File

@ -496,7 +496,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
mdbx_assert(env, rc == 0);
if (rc == 0) {
const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages == 0;
const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0;
mdbx_munmap(&env->me_lck_mmap);
if (synced)
rc = ftruncate(env->me_lfd, 0) ? errno : 0;

View File

@ -265,23 +265,25 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
if (env->me_lck) {
/* Scan LCK for threads of the current process */
const MDBX_reader *const begin = env->me_lck->mti_readers;
const MDBX_reader *const end = begin + env->me_lck->mti_numreaders;
const MDBX_reader *const end =
begin + atomic_load32(&env->me_lck->mti_numreaders, mo_AcquireRelease);
const uintptr_t WriteTxnOwner = env->me_txn0 ? env->me_txn0->mt_owner : 0;
for (const MDBX_reader *reader = begin; reader < end; ++reader) {
if (reader->mr_pid != env->me_pid || !reader->mr_tid) {
if (reader->mr_pid.weak != env->me_pid || !reader->mr_tid.weak) {
skip_lck:
continue;
}
if (reader->mr_tid == CurrentTid || reader->mr_tid == WriteTxnOwner)
if (reader->mr_tid.weak == CurrentTid ||
reader->mr_tid.weak == WriteTxnOwner)
goto skip_lck;
if (env->me_flags & MDBX_NOTLS) {
/* Skip duplicates in no-tls mode */
for (const MDBX_reader *scan = reader; --scan >= begin;)
if (scan->mr_tid == reader->mr_tid)
if (scan->mr_tid.weak == reader->mr_tid.weak)
goto skip_lck;
}
rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid);
rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
if (rc != MDBX_SUCCESS) {
bailout_lck:
(void)mdbx_resume_threads_after_remap(*array);
@ -599,7 +601,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
if (env->me_map)
mdbx_munmap(&env->me_dxb_mmap);
if (env->me_lck) {
const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages == 0;
const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0;
mdbx_munmap(&env->me_lck_mmap);
if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE &&
mdbx_lck_upgrade(env) == MDBX_SUCCESS)

View File

@ -465,15 +465,17 @@ extern void mdbx_osal_jitter(bool tiny);
/*----------------------------------------------------------------------------*/
/* Atomics */
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && __has_include(<cstdatomic>)
#if defined(__cplusplus) && !defined(__STDC_NO_ATOMICS__) && (__has_include(<cstdatomic>) || __has_extension(cxx_atomic))
#include <cstdatomic>
#elif !defined(__cplusplus) && (__STDC_VERSION__ >= 201112L) && \
#define MDBX_HAVE_C11ATOMICS
#elif !defined(__cplusplus) && \
(__STDC_VERSION__ >= 201112L || __has_extension(c_atomic)) && \
!defined(__STDC_NO_ATOMICS__) && \
(__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \
!(defined(__GNUC__) || defined(__clang__)))
#include <stdatomic.h>
#define MDBX_HAVE_C11ATOMICS
#elif defined(__GNUC__) || defined(__clang__)
/* LY: nothing required */
#elif defined(_MSC_VER)
#pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */
#pragma warning(disable : 4133) /* 'function': incompatible types - from \
@ -509,14 +511,6 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) {
_ReadWriteBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
__memory_barrier();
if (type > MDBX_BARRIER_COMPILER)
#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64)
__mf();
#elif defined(__i386__) || defined(__x86_64__)
_mm_mfence();
#else
#error "Unknown target for Intel Compiler, please report to us."
#endif
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
__compiler_barrier();
#elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \
@ -531,21 +525,23 @@ static __maybe_unused __inline void mdbx_compiler_barrier(void) {
}
static __maybe_unused __inline void mdbx_memory_barrier(void) {
#if __has_extension(c_atomic) && !defined(__STDC_NO_ATOMICS__)
atomic_thread_fence(__ATOMIC_SEQ_CST);
#ifdef MDBX_HAVE_C11ATOMICS
atomic_thread_fence(memory_order_seq_cst);
#elif defined(__ATOMIC_SEQ_CST)
#ifdef __clang__
__c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
#else
__atomic_thread_fence(__ATOMIC_SEQ_CST);
#endif
#elif defined(__clang__) || defined(__GNUC__)
__sync_synchronize();
#elif defined(_MSC_VER)
#elif defined(_WIN32) || defined(_WIN64)
MemoryBarrier();
#elif defined(__INTEL_COMPILER) /* LY: Intel Compiler may mimic GCC and MSC */
#if defined(__ia64__) || defined(__ia64) || defined(_M_IA64)
__mf();
#elif defined(__i386__) || defined(__x86_64__)
#if defined(__ia32__)
_mm_mfence();
#else
#error "Unknown target for Intel Compiler, please report to us."
__mf();
#endif
#elif defined(__SUNPRO_C) || defined(__sun) || defined(sun)
__machine_rw_barrier();