mdbx: backport - fix DB-shrinking race with copy-asis & readers.

Change-Id: I893b388d186b6425ab60be4b7cc6bf9b67142def
This commit is contained in:
Leonid Yuriev 2019-07-09 15:23:19 +03:00
parent 961f08a5d2
commit 1a123b5395
2 changed files with 61 additions and 18 deletions

View File

@ -253,11 +253,14 @@ typedef struct MDBX_reader {
volatile mdbx_pid_t mr_pid; volatile mdbx_pid_t mr_pid;
/* The thread ID of the thread owning this txn. */ /* The thread ID of the thread owning this txn. */
volatile mdbx_tid_t mr_tid; volatile mdbx_tid_t mr_tid;
/* The number of pages used in the reader's MVCC snapshot,
* i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
volatile pgno_t mr_snapshot_pages;
/* cache line alignment */ /* cache line alignment */
uint8_t pad[MDBX_CACHELINE_SIZE - uint8_t pad[MDBX_CACHELINE_SIZE - (sizeof(txnid_t) + sizeof(mdbx_pid_t) +
(sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % sizeof(mdbx_tid_t) + sizeof(pgno_t)) %
MDBX_CACHELINE_SIZE]; MDBX_CACHELINE_SIZE];
} MDBX_reader; } MDBX_reader;
/* Information about a single database in the environment. */ /* Information about a single database in the environment. */

View File

@ -2163,7 +2163,7 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
mdbx_tassert(txn, edge <= txn->mt_txnid - 1); mdbx_tassert(txn, edge <= txn->mt_txnid - 1);
MDBX_lockinfo *const lck = env->me_lck; MDBX_lockinfo *const lck = env->me_lck;
if (unlikely(env->me_lck == NULL /* exclusive mode */)) if (unlikely(lck == NULL /* exclusive mode */))
return env->me_oldest_stub = edge; return env->me_oldest_stub = edge;
const txnid_t last_oldest = lck->mti_oldest; const txnid_t last_oldest = lck->mti_oldest;
@ -2201,6 +2201,32 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) {
return oldest; return oldest;
} }
/* Find the largest MVCC snapshot, measured in pages, that is still
 * referenced by a reader.
 *
 * Walks the reader slots of the lock table and returns the maximum of
 * `largest` and the mr_snapshot_pages value of every slot that passes
 * the checks below.
 *
 * env     - environment; env->me_lck and env->me_txn0->mt_txnid are read.
 * largest - starting lower bound for the result; it is returned unchanged
 *           when env->me_lck is NULL or when no slot qualifies.
 *
 * Returns the resulting page count. */
static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) {
MDBX_lockinfo *const lck = env->me_lck;
if (likely(lck != NULL /* NULL means exclusive mode */)) {
/* Take the slot count once so the loop bound stays fixed for the scan. */
const unsigned snap_nreaders = lck->mti_numreaders;
for (unsigned i = 0; i < snap_nreaders; ++i) {
retry:
/* Only slots with a non-zero mr_pid are examined
 * (presumably zero marks an unused slot - TODO confirm). */
if (lck->mti_readers[i].mr_pid) {
/* mdbx_jitter4testing(true); */
/* Read both fields, then re-read them after the barrier. If either value
 * differs, the pair may not belong together, so the slot is examined
 * again from the mr_pid test. */
const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages;
const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid;
mdbx_memory_barrier();
if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages ||
snap_txnid != lck->mti_readers[i].mr_txnid))
goto retry;
/* Accept the snapshot only when it raises the result and its txnid lies
 * within [lck->mti_oldest, env->me_txn0->mt_txnid].
 * NOTE(review): a slot whose mr_txnid is ~(txnid_t)0 is expected to fail
 * the upper bound, which looks like how finished readers are skipped -
 * confirm against the code that resets the slot. */
if (largest < snap_pages &&
lck->mti_oldest <= /* ignore pending updates */ snap_txnid &&
snap_txnid <= env->me_txn0->mt_txnid)
largest = snap_pages;
}
}
}
return largest;
}
/* Add a page to the txn's dirty list */ /* Add a page to the txn's dirty list */
static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) {
int (*const adder)(MDBX_DPL, pgno_t pgno, MDBX_page * page) = int (*const adder)(MDBX_DPL, pgno_t pgno, MDBX_page * page) =
@ -3231,6 +3257,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); const txnid_t snap = mdbx_meta_txnid_fluid(env, meta);
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
if (r) { if (r) {
r->mr_snapshot_pages = meta->mm_geo.next;
r->mr_txnid = snap; r->mr_txnid = snap;
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
mdbx_assert(env, r->mr_pid == mdbx_getpid()); mdbx_assert(env, r->mr_pid == mdbx_getpid());
@ -3267,6 +3294,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) {
mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); mdbx_assert(env, txn->mt_txnid >= *env->me_oldest);
txn->mt_ro_reader = r; txn->mt_ro_reader = r;
txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */
mdbx_ensure(env, txn->mt_txnid >=
/* paranoia is appropriate here */ *env->me_oldest);
} else { } else {
/* Not yet touching txn == env->me_txn0, it may be active */ /* Not yet touching txn == env->me_txn0, it may be active */
mdbx_jitter4testing(false); mdbx_jitter4testing(false);
@ -3616,13 +3645,20 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) {
(void *)env, txn->mt_dbs[MAIN_DBI].md_root, (void *)env, txn->mt_dbs[MAIN_DBI].md_root,
txn->mt_dbs[FREE_DBI].md_root); txn->mt_dbs[FREE_DBI].md_root);
mdbx_ensure(env, txn->mt_txnid >=
/* paranoia is appropriate here */ *env->me_oldest);
if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) {
#if defined(_WIN32) || defined(_WIN64) #if defined(_WIN32) || defined(_WIN64)
if (txn->mt_flags & MDBX_SHRINK_ALLOWED) if (txn->mt_flags & MDBX_SHRINK_ALLOWED)
mdbx_srwlock_ReleaseShared(&env->me_remap_guard); mdbx_srwlock_ReleaseShared(&env->me_remap_guard);
#endif #endif
if (txn->mt_ro_reader) { if (txn->mt_ro_reader) {
mdbx_ensure(env, /* paranoia is appropriate here */
txn->mt_txnid == txn->mt_ro_reader->mr_txnid &&
txn->mt_ro_reader->mr_txnid >= env->me_lck->mti_oldest);
txn->mt_ro_reader->mr_snapshot_pages = 0;
txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; txn->mt_ro_reader->mr_txnid = ~(txnid_t)0;
mdbx_memory_barrier();
env->me_lck->mti_readers_refresh_flag = true; env->me_lck->mti_readers_refresh_flag = true;
if (mode & MDBX_END_SLOT) { if (mode & MDBX_END_SLOT) {
if ((env->me_flags & MDBX_ENV_TXKEY) == 0) if ((env->me_flags & MDBX_ENV_TXKEY) == 0)
@ -5311,19 +5347,23 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink &&
pending->mm_geo.now - pending->mm_geo.next > pending->mm_geo.now - pending->mm_geo.next >
pending->mm_geo.shrink + backlog_gap) { pending->mm_geo.shrink + backlog_gap) {
const pgno_t aligner = const pgno_t largest = mdbx_find_largest(env, pending->mm_geo.next);
pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; if (pending->mm_geo.now > largest &&
const pgno_t with_backlog_gap = pending->mm_geo.next + backlog_gap; pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) {
const pgno_t aligned = pgno_align2os_pgno( const pgno_t aligner =
env, with_backlog_gap + aligner - with_backlog_gap % aligner); pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink;
const pgno_t bottom = const pgno_t with_backlog_gap = largest + backlog_gap;
(aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; const pgno_t aligned = pgno_align2os_pgno(
if (pending->mm_geo.now > bottom) { env, with_backlog_gap + aligner - with_backlog_gap % aligner);
flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ const pgno_t bottom =
shrink = pending->mm_geo.now - bottom; (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower;
pending->mm_geo.now = bottom; if (pending->mm_geo.now > bottom) {
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */
mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); shrink = pending->mm_geo.now - bottom;
pending->mm_geo.now = bottom;
if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a)
mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1);
}
} }
} }
@ -5974,7 +6014,7 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) {
} }
/* Further setup required for opening an MDBX environment */ /* Further setup required for opening an MDBX environment */
static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
uint64_t filesize_before_mmap; uint64_t filesize_before_mmap;
MDBX_meta meta; MDBX_meta meta;
int rc = MDBX_RESULT_FALSE; int rc = MDBX_RESULT_FALSE;