mdbx: use C11 atomics if available instead of legacy memory barriers.

This is done to better support architectures with a weak/relaxed memory consistency model (ARM, AARCH64, PPC, MIPS, RISC-V, etc).

Change-Id: Iee831c8dc564f1d027ff84b0d6daa559325d5a9b
This commit is contained in:
Leonid Yuriev
2021-01-30 02:28:12 +03:00
parent bc33875a9e
commit 9f0ff865e8
7 changed files with 643 additions and 442 deletions

View File

@@ -186,6 +186,31 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
/*----------------------------------------------------------------------------*/
/* Basic constants and types */
/* A 32-bit cell whose storage can be viewed in more than one way.
 * All members of the union overlay the same 4 bytes. */
typedef union {
/* Plain volatile view of the value (no C11 atomic semantics). */
volatile uint32_t weak;
#ifdef MDBX_HAVE_C11ATOMICS
/* C11 `_Atomic` view of the same storage; present only when
 * MDBX_HAVE_C11ATOMICS is defined. */
volatile _Atomic uint32_t c11a;
#endif /* MDBX_HAVE_C11ATOMICS */
} MDBX_atomic_uint32_t;
/* A 64-bit cell whose storage can be viewed in more than one way.
 * All members of the union overlay the same 8 bytes; which views exist
 * depends on MDBX_HAVE_C11ATOMICS, MDBX_64BIT_CAS and MDBX_64BIT_ATOMIC. */
typedef union {
/* Plain volatile view of the whole 64-bit value. */
volatile uint64_t weak;
#if defined(MDBX_HAVE_C11ATOMICS) && (MDBX_64BIT_CAS || MDBX_64BIT_ATOMIC)
/* C11 `_Atomic` 64-bit view; present when C11 atomics are enabled and at
 * least one of MDBX_64BIT_CAS / MDBX_64BIT_ATOMIC is non-zero. */
volatile _Atomic uint64_t c11a;
#endif
#if !defined(MDBX_HAVE_C11ATOMICS) || !MDBX_64BIT_CAS || !MDBX_64BIT_ATOMIC
/* Two 32-bit halves overlaying the 64-bit value; present unless C11 atomics
 * are enabled and both MDBX_64BIT_CAS and MDBX_64BIT_ATOMIC are non-zero.
 * Member order follows the target byte order, so `low` overlays the
 * least-significant half of `weak` and `high` the most-significant half. */
__anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
MDBX_atomic_uint32_t low, high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
MDBX_atomic_uint32_t high, low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
};
#endif
} MDBX_atomic_uint64_t;
/* The minimum number of keys required in a database page.
* Setting this to a larger value will place a smaller bound on the
* maximum size of a data item. Data items larger than this size will
@@ -224,6 +249,7 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
* MDBX uses 32 bit for page numbers. This limits database
* size up to 2^44 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
typedef MDBX_atomic_uint32_t atomic_pgno_t;
#define PRIaPGNO PRIu32
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS
@@ -232,6 +258,7 @@ typedef uint32_t pgno_t;
/* A transaction ID. */
typedef uint64_t txnid_t;
typedef MDBX_atomic_uint64_t atomic_txnid_t;
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
@@ -258,24 +285,6 @@ typedef uint16_t indx_t;
/* Core structures for database and shared memory (i.e. format definition) */
#pragma pack(push, 1)
/* A 64-bit value that can be accessed as a whole or as two 32-bit halves.
 * All members overlay the same 8 bytes. */
typedef union mdbx_safe64 {
/* Whole-value view. NOTE(review): the name suggests a read through this
 * member may observe a torn value on some targets; confirm against the
 * accessor functions. */
volatile uint64_t inconsistent;
#if MDBX_64BIT_ATOMIC
/* Whole-value view that exists only when MDBX_64BIT_ATOMIC is non-zero. */
volatile uint64_t atomic;
#endif /* MDBX_64BIT_ATOMIC */
/* The two 32-bit halves; member order follows the target byte order, so
 * `low` overlays the least-significant half of the 64-bit value. */
__anonymous_struct_extension__ struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
volatile uint32_t low;
volatile uint32_t high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
volatile uint32_t high;
volatile uint32_t low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
};
} mdbx_safe64_t;
/* Information about a single database in the environment. */
typedef struct MDBX_db {
uint16_t md_flags; /* see mdbx_dbi_open */
@@ -478,7 +487,7 @@ typedef struct MDBX_reader {
* anything; all we need to know is which version of the DB they
* started from so we can avoid overwriting any data used in that
* particular version. */
mdbx_safe64_t /* txnid_t */ mr_txnid;
MDBX_atomic_uint64_t /* txnid_t */ mr_txnid;
/* The information we store in a single slot of the reader table.
* In addition to a transaction ID, we also record the process and
@@ -490,23 +499,18 @@ typedef struct MDBX_reader {
* opening the lock file. */
/* The thread ID of the thread owning this txn. */
#if MDBX_WORDBITS >= 64
volatile uint64_t mr_tid;
#else
volatile uint32_t mr_tid;
volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch,
unused for now */
#endif
MDBX_atomic_uint64_t mr_tid;
/* The process ID of the process owning this reader txn. */
volatile uint32_t mr_pid;
MDBX_atomic_uint32_t mr_pid;
/* The number of pages used in the reader's MVCC snapshot,
* i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
volatile pgno_t mr_snapshot_pages_used;
atomic_pgno_t mr_snapshot_pages_used;
/* Number of retired pages at the time this reader starts transaction. So,
* at any time the difference mm_pages_retired - mr_snapshot_pages_retired
* will give the number of pages which this reader restraining from reuse. */
volatile uint64_t mr_snapshot_pages_retired;
MDBX_atomic_uint64_t mr_snapshot_pages_retired;
} MDBX_reader;
/* The header for the reader table (a memory-mapped lock file). */
@@ -519,25 +523,25 @@ typedef struct MDBX_lockinfo {
uint32_t mti_os_and_format;
/* Flags which environment was opened. */
volatile uint32_t mti_envmode;
MDBX_atomic_uint32_t mti_envmode;
/* Threshold of un-synced-with-disk pages for auto-sync feature,
* zero means no-threshold, i.e. auto-sync is disabled. */
volatile pgno_t mti_autosync_threshold;
atomic_pgno_t mti_autosync_threshold;
/* Low 32-bit of txnid with which meta-pages was synced,
* i.e. for sync-polling in the MDBX_NOMETASYNC mode. */
volatile uint32_t mti_meta_sync_txnid;
MDBX_atomic_uint32_t mti_meta_sync_txnid;
/* Period for timed auto-sync feature, i.e. at the every steady checkpoint
* the mti_unsynced_timeout sets to the current_time + mti_autosync_period.
* The time value is represented in a suitable system-dependent form, for
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
* Zero means timed auto-sync is disabled. */
volatile uint64_t mti_autosync_period;
MDBX_atomic_uint64_t mti_autosync_period;
/* Marker to distinguish uniqueness of DB/CLK.*/
volatile uint64_t mti_bait_uniqueness;
MDBX_atomic_uint64_t mti_bait_uniqueness;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
@@ -546,21 +550,21 @@ typedef struct MDBX_lockinfo {
mdbx_ipclock_t mti_wlock;
#endif /* MDBX_LOCKING > 0 */
volatile txnid_t mti_oldest_reader;
atomic_txnid_t mti_oldest_reader;
/* Timestamp of the last steady sync. Value is represented in a suitable
* system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or
* clock_gettime(CLOCK_MONOTONIC). */
volatile uint64_t mti_sync_timestamp;
MDBX_atomic_uint64_t mti_sync_timestamp;
/* Number of un-synced-with-disk pages for auto-sync feature. */
volatile pgno_t mti_unsynced_pages;
atomic_pgno_t mti_unsynced_pages;
/* Number of page which was discarded last time by madvise(MADV_FREE). */
volatile pgno_t mti_discarded_tail;
atomic_pgno_t mti_discarded_tail;
/* Timestamp of the last readers check. */
volatile uint64_t mti_reader_check_timestamp;
MDBX_atomic_uint64_t mti_reader_check_timestamp;
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
@@ -572,8 +576,8 @@ typedef struct MDBX_lockinfo {
/* The number of slots that have been used in the reader table.
* This always records the maximum count, it is not decremented
* when readers release their slots. */
volatile unsigned mti_numreaders;
volatile unsigned mti_readers_refresh_flag;
MDBX_atomic_uint32_t mti_numreaders;
MDBX_atomic_uint32_t mti_readers_refresh_flag;
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
(!defined(__cplusplus) && defined(_MSC_VER))
@@ -912,7 +916,7 @@ typedef struct MDBX_cursor_couple {
/* The database environment. */
struct MDBX_env {
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
uint32_t me_signature;
MDBX_atomic_uint32_t me_signature;
/* Failed to update the meta page. Probably an I/O error. */
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
/* Some fields are initialized. */
@@ -958,11 +962,11 @@ struct MDBX_env {
mdbx_ipclock_t *me_wlock;
#endif /* MDBX_LOCKING > 0 */
MDBX_dbx *me_dbxs; /* array of static DB info */
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
MDBX_dbx *me_dbxs; /* array of static DB info */
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
atomic_txnid_t *me_oldest; /* ID of oldest reader last time we looked */
MDBX_page *me_dp_reserve; /* list of malloc'd blocks for re-use */
/* PNL of pages that became unused in a write txn */
MDBX_PNL me_retired_pages;
/* Number of freelist items that can fit in a single overflow page */
@@ -970,12 +974,12 @@ struct MDBX_env {
unsigned me_branch_nodemax; /* max size of a branch-node */
uint32_t me_live_reader; /* have liveness lock in reader table */
void *me_userctx; /* User-settable context */
volatile uint64_t *me_sync_timestamp;
volatile uint64_t *me_autosync_period;
volatile pgno_t *me_unsynced_pages;
volatile pgno_t *me_autosync_threshold;
volatile pgno_t *me_discarded_tail;
volatile uint32_t *me_meta_sync_txnid;
MDBX_atomic_uint64_t *me_sync_timestamp;
MDBX_atomic_uint64_t *me_autosync_period;
atomic_pgno_t *me_unsynced_pages;
atomic_pgno_t *me_autosync_threshold;
atomic_pgno_t *me_discarded_tail;
MDBX_atomic_uint32_t *me_meta_sync_txnid;
MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */
unsigned me_dp_reserve_len;
struct {
@@ -992,13 +996,13 @@ struct MDBX_env {
#if MDBX_LOCKING > 0
mdbx_ipclock_t wlock;
#endif /* MDBX_LOCKING > 0 */
txnid_t oldest;
uint64_t sync_timestamp;
uint64_t autosync_period;
pgno_t autosync_pending;
pgno_t autosync_threshold;
pgno_t discarded_tail;
uint32_t meta_sync_txnid;
atomic_txnid_t oldest;
MDBX_atomic_uint64_t sync_timestamp;
MDBX_atomic_uint64_t autosync_period;
atomic_pgno_t autosync_pending;
atomic_pgno_t autosync_threshold;
atomic_pgno_t discarded_tail;
MDBX_atomic_uint32_t meta_sync_txnid;
} me_lckless_stub;
#if MDBX_DEBUG
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */