From ce75405ccc62947938f670195cbc534a31b4348f Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Tue, 9 Jul 2019 15:23:19 +0300 Subject: [PATCH] mdbx: backport - fix DB-shrinking race with copy-asis & readers. Change-Id: I893b388d186b6425ab60be4b7cc6bf9b67142def --- src/bits.h | 9 +++++--- src/mdbx.c | 68 +++++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 60 insertions(+), 17 deletions(-) diff --git a/src/bits.h b/src/bits.h index b7094ef2..d23d72d5 100644 --- a/src/bits.h +++ b/src/bits.h @@ -244,11 +244,14 @@ typedef struct MDBX_reader { volatile mdbx_pid_t mr_pid; /* The thread ID of the thread owning this txn. */ volatile mdbx_tid_t mr_tid; + /* The number of pages used in the reader's MVCC snapshot, + * i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */ + volatile pgno_t mr_snapshot_pages; /* cache line alignment */ - uint8_t pad[MDBX_CACHELINE_SIZE - - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + sizeof(mdbx_tid_t)) % - MDBX_CACHELINE_SIZE]; + uint8_t pad[MDBX_CACHELINE_SIZE - (sizeof(txnid_t) + sizeof(mdbx_pid_t) + + sizeof(mdbx_tid_t) + sizeof(pgno_t)) % + MDBX_CACHELINE_SIZE]; } MDBX_reader; /* Information about a single database in the environment. */ diff --git a/src/mdbx.c b/src/mdbx.c index 104d9ea7..b1b7c391 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1934,6 +1934,32 @@ static txnid_t mdbx_find_oldest(MDBX_txn *txn) { return oldest; } +/* Find largest mvcc-snapshot still referenced. */ +static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { + MDBX_lockinfo *const lck = env->me_lck; + if (likely(lck != NULL /* exclusive mode */)) { + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; ++i) { + retry: + if (lck->mti_readers[i].mr_pid) { + /* mdbx_jitter4testing(true); */ + const pgno_t snap_pages = lck->mti_readers[i].mr_snapshot_pages; + const txnid_t snap_txnid = lck->mti_readers[i].mr_txnid; + mdbx_memory_barrier(); + if (unlikely(snap_pages != lck->mti_readers[i].mr_snapshot_pages || + snap_txnid != lck->mti_readers[i].mr_txnid)) + goto retry; + if (largest < snap_pages && + lck->mti_oldest <= /* ignore pending updates */ snap_txnid && + snap_txnid <= env->me_txn0->mt_txnid) + largest = snap_pages; + } + } + } + + return largest; +} + /* Add a page to the txn's dirty list */ static void mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp) { MDBX_ID2 mid; @@ -2962,6 +2988,7 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { const txnid_t snap = mdbx_meta_txnid_fluid(env, meta); mdbx_jitter4testing(false); if (r) { + r->mr_snapshot_pages = meta->mm_geo.next; r->mr_txnid = snap; mdbx_jitter4testing(false); mdbx_assert(env, r->mr_pid == mdbx_getpid()); @@ -2998,6 +3025,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { mdbx_assert(env, txn->mt_txnid >= *env->me_oldest); txn->mt_ro_reader = r; txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); } else { /* Not yet touching txn == env->me_txn0, it may be active */ mdbx_jitter4testing(false); @@ -3338,9 +3367,16 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode) { (void *)env, txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + mdbx_ensure(env, txn->mt_txnid >= + /* paranoia is appropriate here */ *env->me_oldest); if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { if (txn->mt_ro_reader) { + mdbx_ensure(env, /* paranoia is appropriate here */ + txn->mt_txnid == txn->mt_ro_reader->mr_txnid && + txn->mt_ro_reader->mr_txnid >= env->me_lck->mti_oldest); + txn->mt_ro_reader->mr_snapshot_pages = 0; txn->mt_ro_reader->mr_txnid = ~(txnid_t)0; + mdbx_memory_barrier(); env->me_lck->mti_readers_refresh_flag = true; if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -4836,19 +4872,23 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & MDBX_SHRINK_ALLOWED) && pending->mm_geo.shrink && pending->mm_geo.now - pending->mm_geo.next > pending->mm_geo.shrink + backlog_gap) { - const pgno_t aligner = - pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; - const pgno_t with_backlog_gap = pending->mm_geo.next + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) - mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + const pgno_t largest = mdbx_find_largest(env, pending->mm_geo.next); + if (pending->mm_geo.now > largest && + pending->mm_geo.now - largest > pending->mm_geo.shrink + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow ? pending->mm_geo.grow : pending->mm_geo.shrink; + const pgno_t with_backlog_gap = largest + backlog_gap; + const pgno_t aligned = pgno_align2os_pgno( + env, with_backlog_gap + aligner - with_backlog_gap % aligner); + const pgno_t bottom = + (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (mdbx_meta_txnid_stable(env, head) == pending->mm_txnid_a) + mdbx_meta_set_txnid(env, pending, pending->mm_txnid_a + 1); + } } } @@ -5467,7 +5507,7 @@ int __cold mdbx_env_get_maxreaders(MDBX_env *env, unsigned *readers) { } /* Further setup required for opening an MDBX environment */ -static int __cold mdbx_setup_dxb(MDBX_env *env, int lck_rc) { +static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) { uint64_t filesize_before_mmap; MDBX_meta meta; int rc = MDBX_RESULT_FALSE;