mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-04 18:14:12 +08:00
mdbx: LCK-format version 3.
The LCK file is re-created whenever the process opening the database is the only one working with it. Therefore, changing the LCK file format causes only minimal compatibility problems. Applications using different versions of _libmdbx_ will be able to work with the same database alternately, but not simultaneously. This appears to be an acceptable inconvenience in exchange for new features. Change-Id: I9414b3fffd53d5519c8172c57345b1eaf6e51c77
This commit is contained in:
parent
de88707946
commit
6f8238e1e9
@ -1,4 +1,4 @@
|
|||||||
version: 0.2.0.{build}
|
version: 0.3.2.{build}
|
||||||
|
|
||||||
environment:
|
environment:
|
||||||
matrix:
|
matrix:
|
||||||
|
247
src/bits.h
247
src/bits.h
@ -1,4 +1,4 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
|
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
|
||||||
* and other libmdbx authors: please see AUTHORS file.
|
* and other libmdbx authors: please see AUTHORS file.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -157,7 +157,7 @@
|
|||||||
/* The version number for a database's datafile format. */
|
/* The version number for a database's datafile format. */
|
||||||
#define MDBX_DATA_VERSION 2
|
#define MDBX_DATA_VERSION 2
|
||||||
/* The version number for a database's lockfile format. */
|
/* The version number for a database's lockfile format. */
|
||||||
#define MDBX_LOCK_VERSION 2
|
#define MDBX_LOCK_VERSION 3
|
||||||
|
|
||||||
/* handle for the DB used to track free pages. */
|
/* handle for the DB used to track free pages. */
|
||||||
#define FREE_DBI 0
|
#define FREE_DBI 0
|
||||||
@ -199,77 +199,6 @@ typedef uint16_t indx_t;
|
|||||||
/* Core structures for database and shared memory (i.e. format definition) */
|
/* Core structures for database and shared memory (i.e. format definition) */
|
||||||
#pragma pack(push, 1)
|
#pragma pack(push, 1)
|
||||||
|
|
||||||
/* Reader Lock Table
|
|
||||||
*
|
|
||||||
* Readers don't acquire any locks for their data access. Instead, they
|
|
||||||
* simply record their transaction ID in the reader table. The reader
|
|
||||||
* mutex is needed just to find an empty slot in the reader table. The
|
|
||||||
* slot's address is saved in thread-specific data so that subsequent
|
|
||||||
* read transactions started by the same thread need no further locking to
|
|
||||||
* proceed.
|
|
||||||
*
|
|
||||||
* If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
|
|
||||||
* No reader table is used if the database is on a read-only filesystem.
|
|
||||||
*
|
|
||||||
* Since the database uses multi-version concurrency control, readers don't
|
|
||||||
* actually need any locking. This table is used to keep track of which
|
|
||||||
* readers are using data from which old transactions, so that we'll know
|
|
||||||
* when a particular old transaction is no longer in use. Old transactions
|
|
||||||
* that have discarded any data pages can then have those pages reclaimed
|
|
||||||
* for use by a later write transaction.
|
|
||||||
*
|
|
||||||
* The lock table is constructed such that reader slots are aligned with the
|
|
||||||
* processor's cache line size. Any slot is only ever used by one thread.
|
|
||||||
* This alignment guarantees that there will be no contention or cache
|
|
||||||
* thrashing as threads update their own slot info, and also eliminates
|
|
||||||
* any need for locking when accessing a slot.
|
|
||||||
*
|
|
||||||
* A writer thread will scan every slot in the table to determine the oldest
|
|
||||||
* outstanding reader transaction. Any freed pages older than this will be
|
|
||||||
* reclaimed by the writer. The writer doesn't use any locks when scanning
|
|
||||||
* this table. This means that there's no guarantee that the writer will
|
|
||||||
* see the most up-to-date reader info, but that's not required for correct
|
|
||||||
* operation - all we need is to know the upper bound on the oldest reader,
|
|
||||||
* we don't care at all about the newest reader. So the only consequence of
|
|
||||||
* reading stale information here is that old pages might hang around a
|
|
||||||
* while longer before being reclaimed. That's actually good anyway, because
|
|
||||||
* the longer we delay reclaiming old pages, the more likely it is that a
|
|
||||||
* string of contiguous pages can be found after coalescing old pages from
|
|
||||||
* many old transactions together. */
|
|
||||||
|
|
||||||
/* The actual reader record, with cacheline padding. */
|
|
||||||
typedef struct MDBX_reader {
|
|
||||||
/* Current Transaction ID when this transaction began, or (txnid_t)-1.
|
|
||||||
* Multiple readers that start at the same time will probably have the
|
|
||||||
* same ID here. Again, it's not important to exclude them from
|
|
||||||
* anything; all we need to know is which version of the DB they
|
|
||||||
* started from so we can avoid overwriting any data used in that
|
|
||||||
* particular version. */
|
|
||||||
volatile txnid_t mr_txnid;
|
|
||||||
|
|
||||||
/* The information we store in a single slot of the reader table.
|
|
||||||
* In addition to a transaction ID, we also record the process and
|
|
||||||
* thread ID that owns a slot, so that we can detect stale information,
|
|
||||||
* e.g. threads or processes that went away without cleaning up.
|
|
||||||
*
|
|
||||||
* NOTE: We currently don't check for stale records.
|
|
||||||
* We simply re-init the table when we know that we're the only process
|
|
||||||
* opening the lock file. */
|
|
||||||
|
|
||||||
/* The process ID of the process owning this reader txn. */
|
|
||||||
volatile mdbx_pid_t mr_pid;
|
|
||||||
/* The thread ID of the thread owning this txn. */
|
|
||||||
volatile mdbx_tid_t mr_tid;
|
|
||||||
/* The number of pages used in the reader's MVCC snapshot,
|
|
||||||
* i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
|
|
||||||
volatile pgno_t mr_snapshot_pages;
|
|
||||||
|
|
||||||
/* cache line alignment */
|
|
||||||
uint8_t pad[MDBX_CACHELINE_SIZE - (sizeof(txnid_t) + sizeof(mdbx_pid_t) +
|
|
||||||
sizeof(mdbx_tid_t) + sizeof(pgno_t)) %
|
|
||||||
MDBX_CACHELINE_SIZE];
|
|
||||||
} MDBX_reader;
|
|
||||||
|
|
||||||
/* Information about a single database in the environment. */
|
/* Information about a single database in the environment. */
|
||||||
typedef struct MDBX_db {
|
typedef struct MDBX_db {
|
||||||
uint16_t md_flags; /* see mdbx_dbi_open */
|
uint16_t md_flags; /* see mdbx_dbi_open */
|
||||||
@ -328,6 +257,12 @@ typedef struct MDBX_meta {
|
|||||||
|
|
||||||
/* txnid that committed this page, the second of a two-phase-update pair */
|
/* txnid that committed this page, the second of a two-phase-update pair */
|
||||||
volatile txnid_t mm_txnid_b;
|
volatile txnid_t mm_txnid_b;
|
||||||
|
|
||||||
|
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
|
||||||
|
* DB was previously handled by libmdbx without corresponding feature.
|
||||||
|
* This value, coupled with mr_snapshot_pages_retired, allows fast estimation
|
||||||
|
* of "how much reader is restraining GC recycling". */
|
||||||
|
uint64_t mm_pages_retired;
|
||||||
} MDBX_meta;
|
} MDBX_meta;
|
||||||
|
|
||||||
/* Common header for all page types. The page type depends on mp_flags.
|
/* Common header for all page types. The page type depends on mp_flags.
|
||||||
@ -416,6 +351,84 @@ typedef struct MDBX_page {
|
|||||||
#define MAX_MAPSIZE MAX_MAPSIZE32
|
#define MAX_MAPSIZE MAX_MAPSIZE32
|
||||||
#endif /* MDBX_WORDBITS */
|
#endif /* MDBX_WORDBITS */
|
||||||
|
|
||||||
|
#pragma pack(pop)
|
||||||
|
|
||||||
|
/* Reader Lock Table
|
||||||
|
*
|
||||||
|
* Readers don't acquire any locks for their data access. Instead, they
|
||||||
|
* simply record their transaction ID in the reader table. The reader
|
||||||
|
* mutex is needed just to find an empty slot in the reader table. The
|
||||||
|
* slot's address is saved in thread-specific data so that subsequent
|
||||||
|
* read transactions started by the same thread need no further locking to
|
||||||
|
* proceed.
|
||||||
|
*
|
||||||
|
* If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
|
||||||
|
* No reader table is used if the database is on a read-only filesystem.
|
||||||
|
*
|
||||||
|
* Since the database uses multi-version concurrency control, readers don't
|
||||||
|
* actually need any locking. This table is used to keep track of which
|
||||||
|
* readers are using data from which old transactions, so that we'll know
|
||||||
|
* when a particular old transaction is no longer in use. Old transactions
|
||||||
|
* that have discarded any data pages can then have those pages reclaimed
|
||||||
|
* for use by a later write transaction.
|
||||||
|
*
|
||||||
|
* The lock table is constructed such that reader slots are aligned with the
|
||||||
|
* processor's cache line size. Any slot is only ever used by one thread.
|
||||||
|
* This alignment guarantees that there will be no contention or cache
|
||||||
|
* thrashing as threads update their own slot info, and also eliminates
|
||||||
|
* any need for locking when accessing a slot.
|
||||||
|
*
|
||||||
|
* A writer thread will scan every slot in the table to determine the oldest
|
||||||
|
* outstanding reader transaction. Any freed pages older than this will be
|
||||||
|
* reclaimed by the writer. The writer doesn't use any locks when scanning
|
||||||
|
* this table. This means that there's no guarantee that the writer will
|
||||||
|
* see the most up-to-date reader info, but that's not required for correct
|
||||||
|
* operation - all we need is to know the upper bound on the oldest reader,
|
||||||
|
* we don't care at all about the newest reader. So the only consequence of
|
||||||
|
* reading stale information here is that old pages might hang around a
|
||||||
|
* while longer before being reclaimed. That's actually good anyway, because
|
||||||
|
* the longer we delay reclaiming old pages, the more likely it is that a
|
||||||
|
* string of contiguous pages can be found after coalescing old pages from
|
||||||
|
* many old transactions together. */
|
||||||
|
|
||||||
|
/* The actual reader record, with cacheline padding. */
|
||||||
|
typedef struct MDBX_reader {
|
||||||
|
/* Current Transaction ID when this transaction began, or (txnid_t)-1.
|
||||||
|
* Multiple readers that start at the same time will probably have the
|
||||||
|
* same ID here. Again, it's not important to exclude them from
|
||||||
|
* anything; all we need to know is which version of the DB they
|
||||||
|
* started from so we can avoid overwriting any data used in that
|
||||||
|
* particular version. */
|
||||||
|
volatile txnid_t mr_txnid;
|
||||||
|
|
||||||
|
/* The information we store in a single slot of the reader table.
|
||||||
|
* In addition to a transaction ID, we also record the process and
|
||||||
|
* thread ID that owns a slot, so that we can detect stale information,
|
||||||
|
* e.g. threads or processes that went away without cleaning up.
|
||||||
|
*
|
||||||
|
* NOTE: We currently don't check for stale records.
|
||||||
|
* We simply re-init the table when we know that we're the only process
|
||||||
|
* opening the lock file. */
|
||||||
|
|
||||||
|
/* The thread ID of the thread owning this txn. */
|
||||||
|
union {
|
||||||
|
volatile mdbx_tid_t mr_tid;
|
||||||
|
volatile uint64_t mr_tid_u64;
|
||||||
|
};
|
||||||
|
/* The process ID of the process owning this reader txn. */
|
||||||
|
union {
|
||||||
|
volatile mdbx_pid_t mr_pid;
|
||||||
|
volatile uint32_t mr_pid_u32;
|
||||||
|
};
|
||||||
|
/* The number of pages used in the reader's MVCC snapshot,
|
||||||
|
* i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
|
||||||
|
volatile pgno_t mr_snapshot_pages_used;
|
||||||
|
/* Number of retired pages at the time this reader starts transaction. So,
|
||||||
|
* at any time the difference mm_pages_retired - mr_snapshot_pages_retired
|
||||||
|
* will give the number of pages which this reader is restraining from reuse. */
|
||||||
|
volatile uint64_t mr_snapshot_pages_retired;
|
||||||
|
} MDBX_reader;
|
||||||
|
|
||||||
/* The header for the reader table (a memory-mapped lock file). */
|
/* The header for the reader table (a memory-mapped lock file). */
|
||||||
typedef struct MDBX_lockinfo {
|
typedef struct MDBX_lockinfo {
|
||||||
/* Stamp identifying this as an MDBX file.
|
/* Stamp identifying this as an MDBX file.
|
||||||
@ -428,68 +441,62 @@ typedef struct MDBX_lockinfo {
|
|||||||
/* Flags which environment was opened. */
|
/* Flags which environment was opened. */
|
||||||
volatile uint32_t mti_envmode;
|
volatile uint32_t mti_envmode;
|
||||||
|
|
||||||
#ifdef MDBX_OSAL_LOCK
|
/* Threshold of un-synced-with-disk pages for auto-sync feature,
|
||||||
/* Mutex protecting write-txn. */
|
* zero means no-threshold, i.e. auto-sync is disabled. */
|
||||||
union {
|
volatile pgno_t mti_autosync_threshold;
|
||||||
MDBX_OSAL_LOCK mti_wmutex;
|
/* Period for timed auto-sync feature, i.e. at every steady checkpoint
|
||||||
uint8_t pad_mti_wmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)];
|
* mti_unsynced_timeout is set to current_time + mti_autosync_period.
|
||||||
};
|
* The time value is represented in a suitable system-dependent form, for
|
||||||
#endif
|
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
|
||||||
#define MDBX_lockinfo_SIZE_A \
|
* Zero means timed auto-sync is disabled. */
|
||||||
(8 /* mti_magic_and_version */ + 4 /* mti_os_and_format */ + \
|
volatile uint64_t mti_autosync_period;
|
||||||
4 /* mti_envmode */ + MDBX_OSAL_LOCK_SIZE /* mti_wmutex */ + \
|
|
||||||
MDBX_OSAL_LOCK_SIZE % sizeof(size_t) /* pad_mti_wmutex */)
|
|
||||||
|
|
||||||
/* cache-line alignment */
|
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
||||||
uint8_t
|
#ifdef MDBX_OSAL_LOCK
|
||||||
pad_a[MDBX_CACHELINE_SIZE - MDBX_lockinfo_SIZE_A % MDBX_CACHELINE_SIZE];
|
/* Mutex protecting write-txn. */
|
||||||
|
MDBX_OSAL_LOCK mti_wmutex;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
volatile txnid_t mti_oldest_reader;
|
||||||
|
|
||||||
|
/* Timestamp for auto-sync feature, i.e. the steady checkpoint should be
|
||||||
|
* created at the first commit that occurs no earlier than this timestamp.
|
||||||
|
* The time value is represented in a suitable system-dependent form, for
|
||||||
|
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
|
||||||
|
* Zero means timed auto-sync is not pending. */
|
||||||
|
volatile uint64_t mti_unsynced_timeout;
|
||||||
|
|
||||||
|
/* Number of un-synced-with-disk pages for auto-sync feature. */
|
||||||
|
volatile pgno_t mti_unsynced_pages;
|
||||||
|
|
||||||
|
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
||||||
|
|
||||||
|
#ifdef MDBX_OSAL_LOCK
|
||||||
|
/* Mutex protecting readers registration access to this table. */
|
||||||
|
MDBX_OSAL_LOCK mti_rmutex;
|
||||||
|
#endif
|
||||||
|
|
||||||
/* The number of slots that have been used in the reader table.
|
/* The number of slots that have been used in the reader table.
|
||||||
* This always records the maximum count, it is not decremented
|
* This always records the maximum count, it is not decremented
|
||||||
* when readers release their slots. */
|
* when readers release their slots. */
|
||||||
volatile unsigned mti_numreaders;
|
volatile unsigned mti_numreaders;
|
||||||
|
volatile unsigned mti_readers_refresh_flag;
|
||||||
|
|
||||||
#ifdef MDBX_OSAL_LOCK
|
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
||||||
/* Mutex protecting readers registration access to this table. */
|
MDBX_reader mti_readers[1];
|
||||||
union {
|
|
||||||
MDBX_OSAL_LOCK mti_rmutex;
|
|
||||||
uint8_t pad_mti_rmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)];
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
volatile txnid_t mti_oldest;
|
|
||||||
volatile uint32_t mti_readers_refresh_flag;
|
|
||||||
|
|
||||||
#define MDBX_lockinfo_SIZE_B \
|
|
||||||
(sizeof(unsigned) /* mti_numreaders */ + \
|
|
||||||
MDBX_OSAL_LOCK_SIZE /* mti_rmutex */ + sizeof(txnid_t) /* mti_oldest */ + \
|
|
||||||
sizeof(uint32_t) /* mti_readers_refresh_flag */ + \
|
|
||||||
MDBX_OSAL_LOCK_SIZE % sizeof(size_t) /* pad_mti_rmutex */)
|
|
||||||
|
|
||||||
/* cache-line alignment */
|
|
||||||
uint8_t
|
|
||||||
pad_b[MDBX_CACHELINE_SIZE - MDBX_lockinfo_SIZE_B % MDBX_CACHELINE_SIZE];
|
|
||||||
|
|
||||||
MDBX_reader mti_readers[1];
|
|
||||||
|
|
||||||
} MDBX_lockinfo;
|
} MDBX_lockinfo;
|
||||||
|
|
||||||
#pragma pack(pop)
|
|
||||||
|
|
||||||
#define MDBX_LOCKINFO_WHOLE_SIZE \
|
|
||||||
((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \
|
|
||||||
~((size_t)MDBX_CACHELINE_SIZE - 1))
|
|
||||||
|
|
||||||
/* Lockfile format signature: version, features and field layout */
|
/* Lockfile format signature: version, features and field layout */
|
||||||
#define MDBX_LOCK_FORMAT \
|
#define MDBX_LOCK_FORMAT \
|
||||||
((MDBX_OSAL_LOCK_SIGN << 16) + \
|
(MDBX_OSAL_LOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 + \
|
||||||
(uint16_t)(MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1))
|
(unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 + \
|
||||||
|
(unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 + \
|
||||||
|
(unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 29)
|
||||||
|
|
||||||
#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
|
#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
|
||||||
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
|
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
|
||||||
|
|
||||||
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
|
#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
|
||||||
#define MDBX_LOCK_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)
|
|
||||||
|
|
||||||
#ifndef MDBX_ASSUME_MALLOC_OVERHEAD
|
#ifndef MDBX_ASSUME_MALLOC_OVERHEAD
|
||||||
#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
|
#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
|
||||||
@ -772,8 +779,6 @@ struct MDBX_env {
|
|||||||
unsigned me_psize2log; /* log2 of DB page size */
|
unsigned me_psize2log; /* log2 of DB page size */
|
||||||
unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */
|
unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */
|
||||||
unsigned me_maxreaders; /* size of the reader table */
|
unsigned me_maxreaders; /* size of the reader table */
|
||||||
/* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */
|
|
||||||
unsigned me_close_readers;
|
|
||||||
mdbx_fastmutex_t me_dbi_lock;
|
mdbx_fastmutex_t me_dbi_lock;
|
||||||
MDBX_dbi me_numdbs; /* number of DBs opened */
|
MDBX_dbi me_numdbs; /* number of DBs opened */
|
||||||
MDBX_dbi me_maxdbs; /* size of the DB table */
|
MDBX_dbi me_maxdbs; /* size of the DB table */
|
||||||
|
@ -41,7 +41,7 @@ uint32_t linux_kernel_version;
|
|||||||
static int op_setlk = F_SETLK, op_setlkw = F_SETLKW, op_getlk = F_GETLK;
|
static int op_setlk = F_SETLK, op_setlkw = F_SETLKW, op_getlk = F_GETLK;
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------*/
|
/*----------------------------------------------------------------------------*/
|
||||||
/* rthc */
|
/* global constructor/destructor */
|
||||||
|
|
||||||
static __cold __attribute__((constructor)) void mdbx_global_constructor(void) {
|
static __cold __attribute__((constructor)) void mdbx_global_constructor(void) {
|
||||||
struct utsname buffer;
|
struct utsname buffer;
|
||||||
@ -72,6 +72,7 @@ static __cold __attribute__((constructor)) void mdbx_global_constructor(void) {
|
|||||||
op_getlk = F_OFD_GETLK;
|
op_getlk = F_OFD_GETLK;
|
||||||
}
|
}
|
||||||
#endif /* OFD locks */
|
#endif /* OFD locks */
|
||||||
|
|
||||||
mdbx_rthc_global_init();
|
mdbx_rthc_global_init();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
/*
|
/*
|
||||||
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
|
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
|
||||||
* and other libmdbx authors: please see AUTHORS file.
|
* and other libmdbx authors: please see AUTHORS file.
|
||||||
* All rights reserved.
|
* All rights reserved.
|
||||||
@ -179,7 +179,7 @@ void mdbx_txn_unlock(MDBX_env *env) {
|
|||||||
#define LCK_LO_OFFSET 0
|
#define LCK_LO_OFFSET 0
|
||||||
#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
|
#define LCK_LO_LEN offsetof(MDBX_lockinfo, mti_numreaders)
|
||||||
#define LCK_UP_OFFSET LCK_LO_LEN
|
#define LCK_UP_OFFSET LCK_LO_LEN
|
||||||
#define LCK_UP_LEN (MDBX_LOCKINFO_WHOLE_SIZE - LCK_UP_OFFSET)
|
#define LCK_UP_LEN (sizeof(MDBX_lockinfo) - LCK_UP_OFFSET)
|
||||||
#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
|
#define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN
|
||||||
#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
|
#define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN
|
||||||
|
|
||||||
@ -414,7 +414,9 @@ int mdbx_lck_seize(MDBX_env *env) {
|
|||||||
|
|
||||||
assert(env->me_fd != INVALID_HANDLE_VALUE);
|
assert(env->me_fd != INVALID_HANDLE_VALUE);
|
||||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||||
return MDBX_RESULT_TRUE /* files were must be opened non-shareable */;
|
return MDBX_RESULT_TRUE /* nope since files were must be opened
|
||||||
|
non-shareable */
|
||||||
|
;
|
||||||
|
|
||||||
if (env->me_lfd == INVALID_HANDLE_VALUE) {
|
if (env->me_lfd == INVALID_HANDLE_VALUE) {
|
||||||
/* LY: without-lck mode (e.g. on read-only filesystem) */
|
/* LY: without-lck mode (e.g. on read-only filesystem) */
|
||||||
@ -459,7 +461,8 @@ int mdbx_lck_downgrade(MDBX_env *env, bool complete) {
|
|||||||
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
assert(env->me_lfd != INVALID_HANDLE_VALUE);
|
||||||
|
|
||||||
if (env->me_flags & MDBX_EXCLUSIVE)
|
if (env->me_flags & MDBX_EXCLUSIVE)
|
||||||
return MDBX_SUCCESS /* files were must be opened non-shareable */;
|
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
|
||||||
|
;
|
||||||
|
|
||||||
/* 1) must be at E-E (exclusive-write) */
|
/* 1) must be at E-E (exclusive-write) */
|
||||||
if (!complete) {
|
if (!complete) {
|
||||||
|
35
src/mdbx.c
35
src/mdbx.c
@ -2177,7 +2177,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
|
|||||||
if (unlikely(lck == NULL /* exclusive mode */))
|
if (unlikely(lck == NULL /* exclusive mode */))
|
||||||
return env->me_oldest_stub = edge;
|
return env->me_oldest_stub = edge;
|
||||||
|
|
||||||
const txnid_t last_oldest = lck->mti_oldest;
|
const txnid_t last_oldest = lck->mti_oldest_reader;
|
||||||
mdbx_tassert(txn, edge >= last_oldest);
|
mdbx_tassert(txn, edge >= last_oldest);
|
||||||
if (likely(last_oldest == edge))
|
if (likely(last_oldest == edge))
|
||||||
return edge;
|
return edge;
|
||||||
@ -2206,8 +2206,8 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
|
|||||||
|
|
||||||
if (oldest != last_oldest) {
|
if (oldest != last_oldest) {
|
||||||
mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest);
|
mdbx_notice("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, oldest);
|
||||||
mdbx_tassert(txn, oldest >= lck->mti_oldest);
|
mdbx_tassert(txn, oldest >= lck->mti_oldest_reader);
|
||||||
lck->mti_oldest = oldest;
|
lck->mti_oldest_reader = oldest;
|
||||||
}
|
}
|
||||||
return oldest;
|
return oldest;
|
||||||
}
|
}
|
||||||
@ -2221,14 +2221,14 @@ static __cold pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
|
|||||||
retry:
|
retry:
|
||||||
if (lck->mti_readers[i].mr_pid) {
|
if (lck->mti_readers[i].mr_pid) {
|
||||||
/* mdbx_jitter4testing(true); */
|
/* mdbx_jitter4testing(true); */
|
||||||
const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages;
|
const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages_used;
|
||||||
const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid;
|
const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid;
|
||||||
mdbx_memory_barrier();
|
mdbx_memory_barrier();
|
||||||
if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages ||
|
if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages_used ||
|
||||||
snap_txnid != lck->mti_readers[i].mr_txnid))
|
snap_txnid != lck->mti_readers[i].mr_txnid))
|
||||||
goto retry;
|
goto retry;
|
||||||
if (largest < snap_pages &&
|
if (largest < snap_pages &&
|
||||||
lck->mti_oldest <= /* ignore pending updates */ snap_txnid &&
|
lck->mti_oldest_reader <= /* ignore pending updates */ snap_txnid &&
|
||||||
snap_txnid <= env->me_txn0->mt_txnid)
|
snap_txnid <= env->me_txn0->mt_txnid)
|
||||||
largest = snap_pages;
|
largest = snap_pages;
|
||||||
}
|
}
|
||||||
@ -3176,9 +3176,16 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
|
|||||||
}
|
}
|
||||||
#endif /* MDBX_TXN_CHECKPID */
|
#endif /* MDBX_TXN_CHECKPID */
|
||||||
|
|
||||||
STATIC_ASSERT(sizeof(MDBX_reader) == MDBX_CACHELINE_SIZE);
|
STATIC_ASSERT(sizeof(MDBX_reader) == 32);
|
||||||
|
#ifdef MDBX_OSAL_LOCK
|
||||||
|
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_wmutex) % MDBX_CACHELINE_SIZE == 0);
|
||||||
|
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_rmutex) % MDBX_CACHELINE_SIZE == 0);
|
||||||
|
#else
|
||||||
|
STATIC_ASSERT(
|
||||||
|
offsetof(MDBX_lockinfo, mti_oldest_reader) % MDBX_CACHELINE_SIZE == 0);
|
||||||
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE ==
|
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_numreaders) % MDBX_CACHELINE_SIZE ==
|
||||||
0);
|
0);
|
||||||
|
#endif
|
||||||
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE ==
|
STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE ==
|
||||||
0);
|
0);
|
||||||
|
|
||||||
@ -3258,8 +3265,6 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
|
|||||||
mdbx_flush_noncoherent_cpu_writeback();
|
mdbx_flush_noncoherent_cpu_writeback();
|
||||||
if (slot == nreaders)
|
if (slot == nreaders)
|
||||||
env->me_lck->mti_numreaders = ++nreaders;
|
env->me_lck->mti_numreaders = ++nreaders;
|
||||||
if (env->me_close_readers < nreaders)
|
|
||||||
env->me_close_readers = nreaders;
|
|
||||||
r->mr_pid = env->me_pid;
|
r->mr_pid = env->me_pid;
|
||||||
mdbx_rdt_unlock(env);
|
mdbx_rdt_unlock(env);
|
||||||
|
|
||||||
@ -3275,7 +3280,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
|
|||||||
const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
|
const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
|
||||||
mdbx_jitter4testing(false);
|
mdbx_jitter4testing(false);
|
||||||
if (likely(r)) {
|
if (likely(r)) {
|
||||||
r->mr_snapshot_pages = meta->mm_geo.next;
|
r->mr_snapshot_pages_used = meta->mm_geo.next;
|
||||||
r->mr_txnid = snap;
|
r->mr_txnid = snap;
|
||||||
mdbx_jitter4testing(false);
|
mdbx_jitter4testing(false);
|
||||||
mdbx_assert(env, r->mr_pid == mdbx_getpid());
|
mdbx_assert(env, r->mr_pid == mdbx_getpid());
|
||||||
@ -3679,8 +3684,9 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
|
|||||||
if (txn->mt_ro_reader) {
|
if (txn->mt_ro_reader) {
|
||||||
mdbx_ensure(env, /* paranoia is appropriate here */
|
mdbx_ensure(env, /* paranoia is appropriate here */
|
||||||
txn->mt_txnid == txn->mt_ro_reader->mr_txnid &&
|
txn->mt_txnid == txn->mt_ro_reader->mr_txnid &&
|
||||||
txn->mt_ro_reader->mr_txnid >= env->me_lck->mti_oldest);
|
txn->mt_ro_reader->mr_txnid >=
|
||||||
txn->mt_ro_reader->mr_snapshot_pages = 0;
|
env->me_lck->mti_oldest_reader);
|
||||||
|
txn->mt_ro_reader->mr_snapshot_pages_used = 0;
|
||||||
txn->mt_ro_reader->mr_txnid = ~(txnid_t)0;
|
txn->mt_ro_reader->mr_txnid = ~(txnid_t)0;
|
||||||
mdbx_memory_barrier();
|
mdbx_memory_barrier();
|
||||||
env->me_lck->mti_readers_refresh_flag = true;
|
env->me_lck->mti_readers_refresh_flag = true;
|
||||||
@ -6510,8 +6516,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
|
|||||||
env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
|
env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
|
||||||
env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT;
|
env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT;
|
||||||
} else {
|
} else {
|
||||||
if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC &&
|
if (env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC) {
|
||||||
env->me_lck->mti_magic_and_version != MDBX_LOCK_MAGIC_DEVEL) {
|
|
||||||
mdbx_error("lock region has invalid magic/version");
|
mdbx_error("lock region has invalid magic/version");
|
||||||
return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC)
|
return ((env->me_lck->mti_magic_and_version >> 8) != MDBX_MAGIC)
|
||||||
? MDBX_INVALID
|
? MDBX_INVALID
|
||||||
@ -6525,7 +6530,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
|
|||||||
}
|
}
|
||||||
|
|
||||||
mdbx_assert(env, !MDBX_IS_ERROR(rc));
|
mdbx_assert(env, !MDBX_IS_ERROR(rc));
|
||||||
env->me_oldest = &env->me_lck->mti_oldest;
|
env->me_oldest = &env->me_lck->mti_oldest_reader;
|
||||||
#ifdef MDBX_OSAL_LOCK
|
#ifdef MDBX_OSAL_LOCK
|
||||||
env->me_wmutex = &env->me_lck->mti_wmutex;
|
env->me_wmutex = &env->me_lck->mti_wmutex;
|
||||||
#endif
|
#endif
|
||||||
|
26
src/osal.h
26
src/osal.h
@ -41,7 +41,6 @@
|
|||||||
|
|
||||||
/*----------------------------------------------------------------------------*/
|
/*----------------------------------------------------------------------------*/
|
||||||
/* C99 includes */
|
/* C99 includes */
|
||||||
|
|
||||||
#include <inttypes.h>
|
#include <inttypes.h>
|
||||||
#include <stddef.h>
|
#include <stddef.h>
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
@ -59,6 +58,22 @@
|
|||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
#endif /* xBSD */
|
#endif /* xBSD */
|
||||||
|
|
||||||
|
/* C11 stdalign.h */
|
||||||
|
#if __has_include(<stdalign.h>)
|
||||||
|
#include <stdalign.h>
|
||||||
|
#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L
|
||||||
|
#define alignas(N) _Alignas(N)
|
||||||
|
#elif defined(_MSC_VER)
|
||||||
|
#define alignas(N) __declspec(align(N))
|
||||||
|
#elif __has_attribute(aligned) || defined(__GNUC__)
|
||||||
|
#define alignas(N) __attribute__((aligned(N)))
|
||||||
|
#else
|
||||||
|
#error "FIXME: Required _alignas() or equivalent."
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*----------------------------------------------------------------------------*/
|
||||||
|
/* Systems includes */
|
||||||
|
|
||||||
#ifndef _POSIX_C_SOURCE
|
#ifndef _POSIX_C_SOURCE
|
||||||
#ifdef _POSIX_SOURCE
|
#ifdef _POSIX_SOURCE
|
||||||
#define _POSIX_C_SOURCE 1
|
#define _POSIX_C_SOURCE 1
|
||||||
@ -71,9 +86,6 @@
|
|||||||
#define _XOPEN_SOURCE 0
|
#define _XOPEN_SOURCE 0
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------*/
|
|
||||||
/* Systems includes */
|
|
||||||
|
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
#define WIN32_LEAN_AND_MEAN
|
#define WIN32_LEAN_AND_MEAN
|
||||||
#include <tlhelp32.h>
|
#include <tlhelp32.h>
|
||||||
@ -599,12 +611,6 @@ void mdbx_osal_jitter(bool tiny);
|
|||||||
#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017)
|
#define MDBX_OSAL_LOCK_SIGN UINT32_C(0x8017)
|
||||||
#endif /* MDBX_OSAL_LOCK */
|
#endif /* MDBX_OSAL_LOCK */
|
||||||
|
|
||||||
#ifdef MDBX_OSAL_LOCK
|
|
||||||
#define MDBX_OSAL_LOCK_SIZE sizeof(MDBX_OSAL_LOCK)
|
|
||||||
#else
|
|
||||||
#define MDBX_OSAL_LOCK_SIZE 0
|
|
||||||
#endif /* MDBX_OSAL_LOCK_SIZE */
|
|
||||||
|
|
||||||
/// \brief Инициализация объектов синхронизации внутри текущего процесса
|
/// \brief Инициализация объектов синхронизации внутри текущего процесса
|
||||||
/// связанных с экземпляром MDBX_env.
|
/// связанных с экземпляром MDBX_env.
|
||||||
/// \return Код ошибки или 0 в случае успеха.
|
/// \return Код ошибки или 0 в случае успеха.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user