libmdbx/src/mvcc-readers.c

415 lines
14 KiB
C
Raw Normal View History

/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#include "internals.h"
/* Claims a slot of the shared reader table for the caller.
 *
 * The reader-table lock is taken with lck_rdt_lock() and held while the table
 * is searched for the first entry whose pid is zero; when no such entry exists
 * the table is extended by one entry, provided the index stays below
 * env->max_readers. If the table is exhausted, mvcc_cleanup_dead() is called
 * and the search is repeated only when it returns MDBX_RESULT_TRUE.
 *
 * Returns a bsr_t:
 *  - on success `rslot` points to the claimed slot; `err` is MDBX_SUCCESS
 *    unless a retry happened, in which case the MDBX_RESULT_TRUE returned by
 *    mvcc_cleanup_dead() is left in `err`;
 *  - on failure `rslot` is nullptr and `err` is the error from lck_rdt_lock()
 *    or lck_rpid_set(), MDBX_PANIC (ENV_FATAL_ERROR is set), MDBX_EPERM
 *    (env->dxb_mmap.base is null), MDBX_READERS_FULL, or the value returned by
 *    mvcc_cleanup_dead(). */
bsr_t mvcc_bind_slot(MDBX_env *env) {
  eASSERT(env, env->lck_mmap.lck);
  eASSERT(env, env->lck->magic_and_version == MDBX_LOCK_MAGIC);
  eASSERT(env, env->lck->os_and_format == MDBX_LOCK_FORMAT);

  bsr_t result = {lck_rdt_lock(env), nullptr};
  if (unlikely(MDBX_IS_ERROR(result.err)))
    return result;

  /* From here on the reader-table lock is held: every failure leaves through
   * the `bailout` label, which releases it. */
  if (unlikely(env->flags & ENV_FATAL_ERROR)) {
    result.err = MDBX_PANIC;
    goto bailout;
  }
  if (unlikely(!env->dxb_mmap.base)) {
    result.err = MDBX_EPERM;
    goto bailout;
  }

  if (unlikely(env->registered_reader_pid != env->pid)) {
    result.err = lck_rpid_set(env);
    if (unlikely(result.err != MDBX_SUCCESS))
      goto bailout;
    env->registered_reader_pid = env->pid;
  }
  result.err = MDBX_SUCCESS;

  size_t idx, table_len;
  for (;;) {
    /* Look for the first unused entry among the published ones. */
    table_len = env->lck->rdt_length.weak;
    idx = 0;
    while (idx < table_len && atomic_load32(&env->lck->rdt[idx].pid, mo_AcquireRelease) != 0)
      ++idx;

    /* Either a free entry was found or the table may still grow. */
    if (likely(idx < env->max_readers))
      break;

    /* The table is full: retry only if the cleanup asks for it. */
    result.err = mvcc_cleanup_dead(env, true, nullptr);
    if (result.err != MDBX_RESULT_TRUE) {
      if (result.err == MDBX_SUCCESS)
        result.err = MDBX_READERS_FULL;
      goto bailout;
    }
  }

  result.rslot = &env->lck->rdt[idx];
  /* The reader table is also accessed without the mutex by other code, so the
   * slot is claimed in a strict order: reset the slot first, then publish it
   * through lck->rdt_length (after which mdbx_env_close() may touch it), and
   * only at the very end store our pid to finally take ownership. */
  atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
  safe64_reset(&result.rslot->txnid, true);
  if (idx == table_len)
    env->lck->rdt_length.weak = (uint32_t)++table_len;
  result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
  atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
  lck_rdt_unlock(env);

  if (likely(env->flags & ENV_TXKEY)) {
    eASSERT(env, env->registered_reader_pid == env->pid);
    thread_rthc_set(env->me_txkey, result.rslot);
  }
  return result;

bailout:
  lck_rdt_unlock(env);
  return result;
}
/* Recomputes the oldest transaction id still referenced by readers and caches
 * it in lck->cached_oldest.
 *
 * `steady` is the upper bound for the result (asserted to be no greater than
 * env->basal_txn->txnid and no less than the previously cached value).
 *
 * The reader table is rescanned only while lck->rdt_refresh_flag differs from
 * the `nothing_changed` marker; if the flag already holds the marker on entry
 * the previously cached value is returned as is.
 *
 * Returns the new oldest txnid (the minimum of `steady` and the txnids of the
 * scanned slots that are not below the previously cached value). */
__hot txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady) {
/* Marker stored into rdt_refresh_flag once a scan has been started; other code
 * in this file stores `true` there to request a new scan. */
const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
eASSERT(env, steady <= env->basal_txn->txnid);
lck_t *const lck = env->lck_mmap.lck;
if (unlikely(lck == nullptr /* exclusive without-lck mode */)) {
/* No shared lock region is mapped: update the stub and report `steady`. */
eASSERT(env, env->lck == lckless_stub(env));
env->lck->rdt_refresh_flag.weak = nothing_changed;
return env->lck->cached_oldest.weak = steady;
}
const txnid_t prev_oldest = atomic_load64(&lck->cached_oldest, mo_AcquireRelease);
eASSERT(env, steady >= prev_oldest);
txnid_t new_oldest = prev_oldest;
/* Repeat the scan until the flag survives a whole pass unchanged: a store to
 * the flag made during the pass is seen by the loop condition and triggers
 * another pass. */
while (nothing_changed != atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) {
lck->rdt_refresh_flag.weak = nothing_changed;
jitter4testing(false);
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
/* Every pass starts from the upper bound and lowers it per active reader. */
new_oldest = steady;
for (size_t i = 0; i < snap_nreaders; ++i) {
const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
if (!pid)
continue;
jitter4testing(true);
const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid);
if (unlikely(rtxn < prev_oldest)) {
/* A slot older than the previously published oldest is never taken into
 * account. If the flag is still unchanged, safe64_reset_compare() is tried
 * on the slot's txnid and a success is logged as a "kick"
 * (NOTE(review): presumably it resets the txnid only if it still equals
 * rtxn — confirm against its definition). */
if (unlikely(nothing_changed == atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) &&
safe64_reset_compare(&lck->rdt[i].txnid, rtxn)) {
NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN,
i, snap_nreaders, pid, rtxn, prev_oldest, steady);
}
continue;
}
if (rtxn < new_oldest) {
new_oldest = rtxn;
/* The result cannot get lower than prev_oldest, so in builds without
 * debug/assertions the rest of the slots are not scanned. */
if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest)
break;
}
}
}
/* Publish the value only when it actually changed. */
if (new_oldest != prev_oldest) {
VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest);
eASSERT(env, new_oldest >= lck->cached_oldest.weak);
atomic_store64(&lck->cached_oldest, new_oldest, mo_Relaxed);
}
return new_oldest;
}
/* Returns the maximum of `last_used_page` and the `snapshot_pages_used` values
 * of all reader slots that have a non-zero pid and whose txnid does not exceed
 * env->basal_txn->txnid.
 *
 * Without a mapped lock region (exclusive without-lck mode) the argument is
 * returned unchanged. */
pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page) {
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr /* exclusive without-lck mode */))
    return last_used_page;

  size_t n, count;
  do {
    count = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (n = 0; n < count; ++n) {
      if (!atomic_load32(&lck->rdt[n].pid, mo_AcquireRelease))
        continue;

      /* jitter4testing(true); */
      /* Both fields are read twice; a mismatch means the slot changed in
       * between, so the whole table is rescanned from the start (the value
       * accumulated so far is kept). */
      const pgno_t pages = atomic_load32(&lck->rdt[n].snapshot_pages_used, mo_Relaxed);
      const txnid_t txnid = safe64_read(&lck->rdt[n].txnid);
      if (unlikely(pages != atomic_load32(&lck->rdt[n].snapshot_pages_used, mo_AcquireRelease) ||
                   txnid != safe64_read(&lck->rdt[n].txnid)))
        break; /* leaves n < count, which restarts the scan */

      if (last_used_page < pages && txnid <= env->basal_txn->txnid)
        last_used_page = pages;
    }
  } while (n < count);

  return last_used_page;
}
/* Finds the largest mvcc-snapshot still referenced by this process.
 *
 * Only reader slots whose pid equals env->pid are considered. A slot raises
 * the result when its `snapshot_pages_used` exceeds `largest` and its txnid
 * lies between lck->cached_oldest and MAX_TXNID (inclusive).
 *
 * Returns the updated `largest`; without a mapped lock region the argument is
 * returned unchanged. */
pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest) {
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr /* exclusive mode */))
    return largest;

  const size_t count = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
  for (size_t n = 0; n < count; ++n) {
    /* The loop body runs once per consistent read of the slot; an inconsistent
     * read re-evaluates the ownership check and reads the same slot again. */
    while (atomic_load32(&lck->rdt[n].pid, mo_AcquireRelease) == env->pid) {
      /* jitter4testing(true); */
      const pgno_t pages = atomic_load32(&lck->rdt[n].snapshot_pages_used, mo_Relaxed);
      const txnid_t txnid = safe64_read(&lck->rdt[n].txnid);
      if (unlikely(pages != atomic_load32(&lck->rdt[n].snapshot_pages_used, mo_AcquireRelease) ||
                   txnid != safe64_read(&lck->rdt[n].txnid)))
        continue;

      if (largest < pages &&
          atomic_load64(&lck->cached_oldest, mo_AcquireRelease) <=
              /* ignore pending updates */ txnid &&
          txnid <= MAX_TXNID)
        largest = pages;
      break;
    }
  }
  return largest;
}
/* Inserts `pid` into a sorted set of unique pids.
 *
 * Layout of `list`: list[0] is the current number of stored pids and
 * list[1] .. list[list[0]] are the pids in ascending (unsigned) order.
 * The caller must provide room for one more element.
 *
 * Returns true if the pid has been inserted, or false if it was already
 * present (the list is left untouched in that case). */
static bool pid_insert(uint32_t *list, uint32_t pid) {
  /* Binary search over the half-open index range [lo, hi).
   *
   * Elements are compared directly as unsigned values. Ordering by the sign
   * of `(int32_t)(pid - list[i])` is not a total order once two pids differ by
   * 2^31 or more: the search could then walk past an existing entry and
   * insert a duplicate. */
  size_t lo = 1;
  size_t hi = (size_t)list[0] + 1;
  while (lo < hi) {
    const size_t mid = lo + ((hi - lo) >> 1);
    const uint32_t probe = list[mid];
    if (probe == pid)
      return false; /* found, so it's a duplicate */
    if (probe < pid)
      lo = mid + 1;
    else
      hi = mid;
  }

  /* `lo` is the insertion point: grow the list, shift the tail up by one
   * position and store the new pid. */
  for (size_t n = ++list[0]; n > lo; --n)
    list[n] = list[n - 1];
  list[lo] = pid;
  return true;
}
/* Scans the reader table and clears the slots that belong to processes
 * reported as not alive by lck_rpid_check().
 *
 * env         environment; validated with check_env() first.
 * rdt_locked  non-zero when the caller already holds the reader-table lock
 *             (must not be negative); zero makes the function take the lock
 *             itself, lazily, once the first stale pid is found.
 * dead        optional out-parameter receiving the number of cleared slots;
 *             it is not written when check_env() fails or on MDBX_ENOMEM.
 *
 * Returns the check_env() error, MDBX_SUCCESS when there is no mapped lock
 * region, MDBX_ENOMEM, an error from lck_rpid_check()/lck_rdt_lock(),
 * MDBX_RESULT_TRUE when lck_rdt_lock() reported it (mutex recovered), and
 * otherwise the successful result of check_env().
 *
 * NOTE(review): finding and clearing stale slots does not by itself change the
 * return value — MDBX_RESULT_TRUE is produced only on the mutex-recovery path.
 * Confirm this matches what callers that test for MDBX_RESULT_TRUE expect. */
__cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, int *dead) {
int rc = check_env(env, true);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
eASSERT(env, rdt_locked >= 0);
lck_t *const lck = env->lck_mmap.lck;
if (unlikely(lck == nullptr)) {
/* exclusive mode */
if (dead)
*dead = 0;
return MDBX_SUCCESS;
}
const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
/* Set of pids already examined, in the pid_insert() layout: element 0 is the
 * count, so snap_nreaders + 1 elements are needed. The on-stack buffer is used
 * when it is large enough, otherwise the set is heap-allocated. */
uint32_t pidsbuf_onstask[142];
uint32_t *const pids = (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask))
? pidsbuf_onstask
: osal_malloc((snap_nreaders + 1) * sizeof(uint32_t));
if (unlikely(!pids))
return MDBX_ENOMEM;
pids[0] = 0;
int count = 0;
for (size_t i = 0; i < snap_nreaders; i++) {
const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
if (pid == 0)
continue /* skip empty */;
if (pid == env->pid)
continue /* skip self */;
if (!pid_insert(pids, pid))
continue /* such pid already processed */;
int err = lck_rpid_check(env, pid);
if (err == MDBX_RESULT_TRUE)
continue /* reader is live */;
if (err != MDBX_SUCCESS) {
rc = err;
break /* lck_rpid_check() failed */;
}
/* stale reader found */
if (!rdt_locked) {
/* The lock is taken only now, on the first stale pid. */
err = lck_rdt_lock(env);
if (MDBX_IS_ERROR(err)) {
rc = err;
break;
}
/* A negative value marks "locked by this function", so that the lock is
 * released before returning. */
rdt_locked = -1;
if (err == MDBX_RESULT_TRUE) {
/* mutex recovered, the mdbx_ipclock_failed() checked all readers */
rc = MDBX_RESULT_TRUE;
break;
}
/* another process may have cleaned and reused the slot, recheck */
if (lck->rdt[i].pid.weak != pid)
continue;
err = lck_rpid_check(env, pid);
if (MDBX_IS_ERROR(err)) {
rc = err;
break;
}
if (err != MDBX_SUCCESS)
continue /* the race with other process, slot reused */;
}
/* clean it: clear every slot from the current index onwards that holds this
 * pid, and set rdt_refresh_flag for each of them */
for (size_t ii = i; ii < snap_nreaders; ii++) {
if (lck->rdt[ii].pid.weak == pid) {
DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, lck->rdt[ii].txnid.weak);
atomic_store32(&lck->rdt[ii].pid, 0, mo_Relaxed);
atomic_store32(&lck->rdt_refresh_flag, true, mo_AcquireRelease);
count++;
}
}
}
/* Record the time of this check unless it ended with an error. */
if (likely(!MDBX_IS_ERROR(rc)))
atomic_store64(&lck->readers_check_timestamp, osal_monotime(), mo_Relaxed);
/* Release the lock only if it was taken here. */
if (rdt_locked < 0)
lck_rdt_unlock(env);
if (pids != pidsbuf_onstask)
osal_free(pids);
if (dead)
*dead = count;
return rc;
}
/* Tries to get rid of the readers that hold the snapshot `straggler`.
 *
 * Each round of the loop:
 *  1. forces a refresh and recomputes the oldest snapshot with
 *     mvcc_shapshot_oldest(); stops if it equals the steady txnid, has already
 *     moved past `straggler`, or there is no mapped lock region;
 *  2. calls mvcc_cleanup_dead(); stops on error;
 *  3. walks the reader table: slots at `straggler` whose tid is
 *     MDBX_TID_TXN_PARKED are ousted in place, the last other slot at
 *     `straggler` is remembered as `stucked`;
 *  4. if env->hsr_callback is set and a stuck slot was found, calls the
 *     callback and acts on its result (see the comments below).
 * `retry` counts the rounds and is passed to the callback.
 *
 * If the callback returned 0 at least once, it is called one more time after
 * the loop with pid 0, tid 0 and `-retry` to signal the end of the loop.
 *
 * Returns the oldest txnid computed by the last round. */
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
osal_memory_fence(mo_AcquireRelease, false);
MDBX_hsr_func *const callback = env->hsr_callback;
txnid_t oldest = 0;
bool notify_eof_of_loop = false;
int retry = 0;
do {
const txnid_t steady = env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
oldest = mvcc_shapshot_oldest(env, steady);
eASSERT(env, oldest < env->basal_txn->txnid);
eASSERT(env, oldest >= straggler);
eASSERT(env, oldest >= env->lck->cached_oldest.weak);
lck_t *const lck = env->lck_mmap.lck;
if (oldest == steady || oldest > straggler || /* without-LCK mode */ !lck)
break;
if (MDBX_IS_ERROR(mvcc_cleanup_dead(env, false, nullptr)))
break;
reader_slot_t *stucked = nullptr;
uint64_t hold_retired = 0;
for (size_t i = 0; i < lck->rdt_length.weak; ++i) {
uint32_t pid;
reader_slot_t *const rslot = &lck->rdt[i];
txnid_t rtxn = safe64_read(&rslot->txnid);
retry:
if (rtxn == straggler && (pid = atomic_load32(&rslot->pid, mo_AcquireRelease)) != 0) {
const uint64_t tid = safe64_read(&rslot->tid);
if (tid == MDBX_TID_TXN_PARKED) {
/* The read transaction was marked by its owner as "parked",
 * i.e. subject to an asynchronous abort, or to being resumed
 * upon activity of the reader.
 *
 * If the first CAS(slot->tid) succeeds, then
 * safe64_reset_compare() will either safely clear the txnid, or refuse
 * because the reader has reset and/or restarted the transaction.
 * In that case the reader may not notice the ousting if it is about
 * to finish the transaction. All of these outcomes suit us.
 *
 * If the first CAS(slot->tid) does NOT succeed, it means the reader
 * has resumed the transaction, or finished it, or even released the slot.
 */
bool ousted =
#if MDBX_64BIT_CAS
atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, MDBX_TID_TXN_OUSTED);
#else
atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)MDBX_TID_TXN_OUSTED);
#endif
if (likely(ousted)) {
ousted = safe64_reset_compare(&rslot->txnid, rtxn);
NOTICE("ousted-%s parked read-txn %" PRIaTXN ", pid %u, tid 0x%" PRIx64, ousted ? "complete" : "half", rtxn,
pid, tid);
eASSERT(env, ousted || safe64_read(&rslot->txnid) > straggler);
continue;
}
/* CAS failed: re-read the txnid and examine the same slot again. */
rtxn = safe64_read(&rslot->txnid);
goto retry;
}
/* Not parked: remember this slot (the last such one wins) together with
 * its snapshot_pages_retired value. */
hold_retired = atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed);
stucked = rslot;
}
}
if (!callback || !stucked)
break;
uint32_t pid = atomic_load32(&stucked->pid, mo_AcquireRelease);
uint64_t tid = safe64_read(&stucked->tid);
/* The slot has changed since the scan: start the next round (`continue` in a
 * do-while goes to the `++retry` condition). */
if (safe64_read(&stucked->txnid) != straggler || !pid)
continue;
/* Arguments for the callback: how many transactions the reader lags behind
 * the most recent meta, and how many bytes of pages were retired since the
 * reader's snapshot. */
const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
const uint64_t head_retired = unaligned_peek_u64(4, head.ptr_c->pages_retired);
const size_t space = (head_retired > hold_retired) ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0;
int rc = callback(env, env->txn, pid, (mdbx_tid_t)((intptr_t)tid), straggler,
(gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
if (rc < 0)
/* hsr returned error and/or agree MDBX_MAP_FULL error */
break;
if (rc > 0) {
if (rc == 1) {
/* hsr reported transaction (will be) aborted asynchronous */
safe64_reset_compare(&stucked->txnid, straggler);
} else {
/* hsr reported reader process was killed and slot should be cleared */
safe64_reset(&stucked->txnid, true);
atomic_store64(&stucked->tid, 0, mo_Relaxed);
atomic_store32(&stucked->pid, 0, mo_AcquireRelease);
}
} else if (!notify_eof_of_loop) {
/* First zero result from the callback: remember that the end-of-loop
 * notification has to be sent. */
#if MDBX_ENABLE_PROFGC
env->lck->pgops.gc_prof.kicks += 1;
#endif /* MDBX_ENABLE_PROFGC */
notify_eof_of_loop = true;
}
} while (++retry < INT_MAX);
if (notify_eof_of_loop) {
/* notify end of hsr-loop */
const txnid_t turn = oldest - straggler;
if (turn)
NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, straggler, oldest, turn);
callback(env, env->txn, 0, 0, straggler, (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
}
return oldest;
}