/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev \date 2015-2024

#include "internals.h"

/* Acquire a free slot in the shared reader table for the calling
 * thread/process and publish it.  Returns {err, rslot}: on success err is
 * MDBX_SUCCESS and rslot points at the claimed slot; on failure rslot is
 * nullptr.  Takes and releases the reader-table lock internally. */
bsr_t mvcc_bind_slot(MDBX_env *env) {
  eASSERT(env, env->lck_mmap.lck);
  eASSERT(env, env->lck->magic_and_version == MDBX_LOCK_MAGIC);
  eASSERT(env, env->lck->os_and_format == MDBX_LOCK_FORMAT);

  bsr_t result = {lck_rdt_lock(env), nullptr};
  if (unlikely(MDBX_IS_ERROR(result.err)))
    return result;
  if (unlikely(env->flags & ENV_FATAL_ERROR)) {
    lck_rdt_unlock(env);
    result.err = MDBX_PANIC;
    return result;
  }
  if (unlikely(!env->dxb_mmap.base)) {
    lck_rdt_unlock(env);
    result.err = MDBX_EPERM;
    return result;
  }

  /* Lazily register this process' pid in the lock file, once per process. */
  if (unlikely(env->registered_reader_pid != env->pid)) {
    result.err = lck_rpid_set(env);
    if (unlikely(result.err != MDBX_SUCCESS)) {
      lck_rdt_unlock(env);
      return result;
    }
    env->registered_reader_pid = env->pid;
  }

  result.err = MDBX_SUCCESS;
  size_t slot, nreaders;
  while (1) {
    /* Scan for a slot whose pid is zero (free). */
    nreaders = env->lck->rdt_length.weak;
    for (slot = 0; slot < nreaders; slot++)
      if (!atomic_load32(&env->lck->rdt[slot].pid, mo_AcquireRelease))
        break;

    if (likely(slot < env->max_readers))
      break;

    /* Table looks full: try to reclaim slots of dead readers, then rescan.
     * MDBX_RESULT_TRUE from mvcc_cleanup_dead() means "something was freed". */
    result.err = mvcc_cleanup_dead(env, true, nullptr);
    if (result.err != MDBX_RESULT_TRUE) {
      lck_rdt_unlock(env);
      result.err = (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err;
      return result;
    }
  }

  result.rslot = &env->lck->rdt[slot];
  /* Claim the reader slot, carefully since other code
   * uses the reader table un-mutexed: First reset the
   * slot, next publish it in lck->rdt_length.  After
   * that, it is safe for mdbx_env_close() to touch it.
   * When it will be closed, we can finally claim it. */
  atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
  safe64_reset(&result.rslot->txnid, true);
  if (slot == nreaders)
    env->lck->rdt_length.weak = (uint32_t)++nreaders;
  /* With MDBX_NOSTICKYTHREADS the slot is not bound to a particular thread. */
  result.rslot->tid.weak = (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
  /* Publishing a non-zero pid makes the slot visible as claimed. */
  atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
  lck_rdt_unlock(env);

  if (likely(env->flags & ENV_TXKEY)) {
    /* Remember the slot in thread-local storage for reuse by this thread. */
    eASSERT(env, env->registered_reader_pid == env->pid);
    thread_rthc_set(env->me_txkey, result.rslot);
  }
  return result;
}

/* Compute the oldest txnid still referenced by any live reader, but not newer
 * than `steady`, and refresh the cached value in lck->cached_oldest.
 * The loop re-runs while readers keep touching rdt_refresh_flag, so the
 * result is consistent against concurrent slot updates.
 * (NOTE: "shapshot" is a long-standing typo kept for interface stability.) */
__hot txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady) {
  /* Sentinel value meaning "reader table unchanged since last scan". */
  const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
  eASSERT(env, steady <= env->basal_txn->txnid);

  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr /* exclusive without-lck mode */)) {
    /* No other readers can exist; the oldest snapshot is `steady` itself. */
    eASSERT(env, env->lck == lckless_stub(env));
    env->lck->rdt_refresh_flag.weak = nothing_changed;
    return env->lck->cached_oldest.weak = steady;
  }

  const txnid_t prev_oldest = atomic_load64(&lck->cached_oldest, mo_AcquireRelease);
  eASSERT(env, steady >= prev_oldest);

  txnid_t new_oldest = prev_oldest;
  /* Repeat the scan until no reader bumps rdt_refresh_flag during it. */
  while (nothing_changed != atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) {
    lck->rdt_refresh_flag.weak = nothing_changed;
    jitter4testing(false);
    const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    new_oldest = steady;
    for (size_t i = 0; i < snap_nreaders; ++i) {
      const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
      if (!pid)
        continue;
      jitter4testing(true);
      const txnid_t rtxn = safe64_read(&lck->rdt[i].txnid);
      if (unlikely(rtxn < prev_oldest)) {
        /* Slot claims a txnid older than the already-published oldest:
         * a stuck/stale reader.  Try to reset it (only if the table is
         * otherwise quiescent) and log the kick. */
        if (unlikely(nothing_changed == atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease)) &&
            safe64_reset_compare(&lck->rdt[i].txnid, rtxn)) {
          NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN,
                 i, snap_nreaders, pid, rtxn, prev_oldest, steady);
        }
        continue;
      }
      if (rtxn < new_oldest) {
        new_oldest = rtxn;
        /* Cannot get older than prev_oldest; stop early in release builds. */
        if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest)
          break;
      }
    }
  }

  if (new_oldest != prev_oldest) {
    VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest);
    eASSERT(env, new_oldest >= lck->cached_oldest.weak);
    atomic_store64(&lck->cached_oldest, new_oldest, mo_Relaxed);
  }
  return new_oldest;
}

/* Return the largest snapshot size (in pages) referenced by any live reader,
 * i.e. max(last_used_page, per-reader snapshot_pages_used).  Each slot is
 * read twice (pages then txnid, and again) to detect a concurrent update;
 * on a torn read the whole scan restarts. */
pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page) {
  lck_t *const lck = env->lck_mmap.lck;
  if (likely(lck != nullptr /* check for exclusive without-lck mode */)) {
  retry:;
    const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (size_t i = 0; i < snap_nreaders; ++i) {
      if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
        /* jitter4testing(true); */
        const pgno_t snap_pages = atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
        /* Re-read to validate the (pages, txnid) pair was not updated mid-read. */
        if (unlikely(snap_pages != atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_AcquireRelease) ||
                     snap_txnid != safe64_read(&lck->rdt[i].txnid)))
          goto retry;
        if (last_used_page < snap_pages && snap_txnid <= env->basal_txn->txnid)
          last_used_page = snap_pages;
      }
    }
  }
  return last_used_page;
}

/* Find largest mvcc-snapshot still referenced by this process. */
pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest) {
  lck_t *const lck = env->lck_mmap.lck;
  if (likely(lck != nullptr /* exclusive mode */)) {
    const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (size_t i = 0; i < snap_nreaders; ++i) {
    retry:
      /* Only slots owned by this process (pid match) are considered. */
      if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease) == env->pid) {
        /* jitter4testing(true); */
        const pgno_t snap_pages = atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
        const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
        /* Torn-read check: retry this slot if it changed under us. */
        if (unlikely(snap_pages != atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_AcquireRelease) ||
                     snap_txnid != safe64_read(&lck->rdt[i].txnid)))
          goto retry;
        if (largest < snap_pages &&
            atomic_load64(&lck->cached_oldest, mo_AcquireRelease) <= /* ignore pending updates */ snap_txnid &&
            snap_txnid <= MAX_TXNID)
          largest = snap_pages;
      }
    }
  }
  return largest;
}

/* Insert `pid` into the sorted pid list (list[0] holds the length).
 * Returns false if pid is already present (duplicate), true after inserting.
 * NOTE(review): `val = pid - list[cursor]` relies on unsigned wrap-around
 * converted to int32_t to emulate a signed comparison — valid while pids
 * stay below 2^31; inherited from the original LMDB code. */
static bool pid_insert(uint32_t *list, uint32_t pid) {
  /* binary search of pid in list */
  size_t base = 0;
  size_t cursor = 1;
  int32_t val = 0;
  size_t n = /* length */ list[0];

  while (n > 0) {
    size_t pivot = n >> 1;
    cursor = base + pivot + 1;
    val = pid - list[cursor];

    if (val < 0) {
      n = pivot;
    } else if (val > 0) {
      base = cursor;
      n -= pivot + 1;
    } else {
      /* found, so it's a duplicate */
      return false;
    }
  }

  if (val > 0)
    ++cursor;

  /* Shift the tail right and insert, keeping the list sorted. */
  list[0]++;
  for (n = list[0]; n > cursor; n--)
    list[n] = list[n - 1];
  list[n] = pid;
  return true;
}

/* Scan the reader table for slots owned by dead processes and clear them.
 * rdt_locked: non-zero if the caller already holds the reader-table lock
 * (the function may take it lazily otherwise; a negative value tracks
 * "locked here, must unlock before return").  `dead` (optional) receives
 * the number of cleared slots.  Returns MDBX_SUCCESS, MDBX_RESULT_TRUE
 * (mutex recovered — readers already checked), or an error code. */
__cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked, int *dead) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  eASSERT(env, rdt_locked >= 0);
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr)) {
    /* exclusive mode */
    if (dead)
      *dead = 0;
    return MDBX_SUCCESS;
  }

  const size_t snap_nreaders = atomic_load32(&lck->rdt_length, mo_AcquireRelease);
  /* Stack buffer for the common case; heap only for very large tables. */
  uint32_t pidsbuf_onstask[142];
  uint32_t *const pids =
      (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) ? pidsbuf_onstask
                                                      : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t));
  if (unlikely(!pids))
    return MDBX_ENOMEM;

  pids[0] = 0;
  int count = 0;
  for (size_t i = 0; i < snap_nreaders; i++) {
    const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
    if (pid == 0)
      continue /* skip empty */;
    if (pid == env->pid)
      continue /* skip self */;
    if (!pid_insert(pids, pid))
      continue /* such pid already processed */;

    int err = lck_rpid_check(env, pid);
    if (err == MDBX_RESULT_TRUE)
      continue /* reader is live */;

    if (err != MDBX_SUCCESS) {
      rc = err;
      break /* lck_rpid_check() failed */;
    }

    /* stale reader found */
    if (!rdt_locked) {
      err = lck_rdt_lock(env);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      /* Negative marks "we locked it here": unlock on the way out. */
      rdt_locked = -1;
      if (err == MDBX_RESULT_TRUE) {
        /* mutex recovered, the mdbx_ipclock_failed() checked all readers */
        rc = MDBX_RESULT_TRUE;
        break;
      }

      /* a other process may have clean and reused slot, recheck */
      if (lck->rdt[i].pid.weak != pid)
        continue;

      err = lck_rpid_check(env, pid);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      if (err != MDBX_SUCCESS)
        continue /* the race with other process, slot reused */;
    }

    /* clean it: clear every slot belonging to the dead pid. */
    for (size_t ii = i; ii < snap_nreaders; ii++) {
      if (lck->rdt[ii].pid.weak == pid) {
        DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, lck->rdt[ii].txnid.weak);
        atomic_store32(&lck->rdt[ii].pid, 0, mo_Relaxed);
        atomic_store32(&lck->rdt_refresh_flag, true, mo_AcquireRelease);
        count++;
      }
    }
  }

  if (likely(!MDBX_IS_ERROR(rc)))
    atomic_store64(&lck->readers_check_timestamp, osal_monotime(), mo_Relaxed);

  if (rdt_locked < 0)
    lck_rdt_unlock(env);

  if (pids != pidsbuf_onstask)
    osal_free(pids);

  if (dead)
    *dead = count;
  return rc;
}

/* Park a read-only transaction: mark its reader slot as MDBX_TID_TXN_PARKED
 * so laggard-kicking may oust it asynchronously.  With autounpark set, the
 * MDBX_TXN_AUTOUNPARK flag is added so the txn may be restored on use.
 * Returns MDBX_SUCCESS, or MDBX_BAD_TXN / MDBX_PROBLEM / MDBX_BAD_RSLOT on
 * state/ownership mismatch. */
int txn_park(MDBX_txn *txn, bool autounpark) {
  reader_slot_t *const rslot = txn->to.reader;
  tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) == MDBX_TXN_RDONLY);
  tASSERT(txn, txn->to.reader->tid.weak < MDBX_TID_TXN_OUSTED);

  if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) != MDBX_TXN_RDONLY))
    return MDBX_BAD_TXN;

  /* Validate that the slot really belongs to this txn/thread/process. */
  const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
  const uint64_t tid = atomic_load64(&rslot->tid, mo_Relaxed);
  const uint64_t txnid = atomic_load64(&rslot->txnid, mo_Relaxed);
  if (unlikely(pid != txn->env->pid)) {
    ERROR("unexpected pid %u%s%u", pid, " != must ", txn->env->pid);
    return MDBX_PROBLEM;
  }
  if (unlikely(tid != txn->owner || txnid != txn->txnid)) {
    ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%0zx"
          " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
          tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
    return MDBX_BAD_RSLOT;
  }

  atomic_store64(&rslot->tid, MDBX_TID_TXN_PARKED, mo_AcquireRelease);
  atomic_store32(&txn->env->lck->rdt_refresh_flag, true, mo_Relaxed);
  txn->flags += autounpark ? MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK : MDBX_TXN_PARKED;
  return MDBX_SUCCESS;
}

/* Un-park a parked read-only transaction by CAS-ing the slot's tid back from
 * MDBX_TID_TXN_PARKED to the owner's thread id.  If the txn was ousted (or
 * its slot reset) while parked, the txn is ended and MDBX_OUSTED returned.
 * The loop retries on CAS contention (atomic_yield between attempts). */
int txn_unpark(MDBX_txn *txn) {
  if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD | MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) !=
               (MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
    return MDBX_BAD_TXN;

  for (reader_slot_t *const rslot = txn->to.reader; rslot; atomic_yield()) {
    const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
    uint64_t tid = safe64_read(&rslot->tid);
    uint64_t txnid = safe64_read(&rslot->txnid);
    if (unlikely(pid != txn->env->pid)) {
      ERROR("unexpected pid %u%s%u", pid, " != expected ", txn->env->pid);
      return MDBX_PROBLEM;
    }
    /* Ousted or slot invalidated -> fall through to end the transaction. */
    if (unlikely(tid == MDBX_TID_TXN_OUSTED || txnid >= SAFE64_INVALID_THRESHOLD))
      break;
    if (unlikely(tid != MDBX_TID_TXN_PARKED || txnid != txn->txnid)) {
      ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%" PRIx64 " and/or txn-id %" PRIaTXN "%s%" PRIaTXN, tid, " != must ",
            MDBX_TID_TXN_OUSTED, txnid, " != must ", txn->txnid);
      break;
    }
    if (unlikely((txn->flags & MDBX_TXN_ERROR)))
      break;

#if MDBX_64BIT_CAS
    if (unlikely(!atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, txn->owner)))
      continue;
#else
    /* 32-bit fallback: write the high half first, CAS the low half; on
     * failure restore the high half of the PARKED marker and retry. */
    atomic_store32(&rslot->tid.high, (uint32_t)((uint64_t)txn->owner >> 32), mo_Relaxed);
    if (unlikely(!atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)txn->owner))) {
      atomic_store32(&rslot->tid.high, (uint32_t)(MDBX_TID_TXN_PARKED >> 32), mo_AcquireRelease);
      continue;
    }
#endif

    /* Re-validate after the CAS: the slot must now carry our tid/txnid. */
    txnid = safe64_read(&rslot->txnid);
    tid = safe64_read(&rslot->tid);
    if (unlikely(txnid != txn->txnid || tid != txn->owner)) {
      ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%zx"
            " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
            tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
      break;
    }

    txn->flags &= ~(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK);
    return MDBX_SUCCESS;
  }

  int err = txn_end(txn, TXN_END_OUSTED | TXN_END_RESET | TXN_END_UPDATE);
  return err ? err : MDBX_OUSTED;
}

/* Try to push the oldest snapshot past `straggler` when the DB cannot grow:
 * oust parked readers stuck at `straggler`, then repeatedly invoke the
 * user's Handle-Slow-Readers callback for the remaining stuck reader.
 * Returns the resulting oldest txnid. */
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
  DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
  osal_memory_fence(mo_AcquireRelease, false);
  MDBX_hsr_func *const callback = env->hsr_callback;
  txnid_t oldest = 0;
  bool notify_eof_of_loop = false;
  int retry = 0;
  do {
    const txnid_t steady = env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
    env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
    oldest = mvcc_shapshot_oldest(env, steady);
    eASSERT(env, oldest < env->basal_txn->txnid);
    eASSERT(env, oldest >= straggler);
    eASSERT(env, oldest >= env->lck->cached_oldest.weak);

    lck_t *const lck = env->lck_mmap.lck;
    if (oldest == steady || oldest > straggler || /* without-LCK mode */ !lck)
      break;

    if (MDBX_IS_ERROR(mvcc_cleanup_dead(env, false, nullptr)))
      break;

    reader_slot_t *stucked = nullptr;
    uint64_t hold_retired = 0;
    for (size_t i = 0; i < lck->rdt_length.weak; ++i) {
      uint32_t pid;
      reader_slot_t *const rslot = &lck->rdt[i];
      txnid_t rtxn = safe64_read(&rslot->txnid);
    retry:
      if (rtxn == straggler && (pid = atomic_load32(&rslot->pid, mo_AcquireRelease)) != 0) {
        const uint64_t tid = safe64_read(&rslot->tid);
        if (tid == MDBX_TID_TXN_PARKED) {
          /* The reading transaction was marked by its owner as "parked",
           * i.e. it is subject to asynchronous ousting, or to restoration
           * upon reader activity.
           *
           * If the first CAS(slot->tid) succeeds, then safe64_reset_compare()
           * will either safely clear the txnid, or refuse because the reader
           * has reset and/or restarted the transaction.  In the latter case
           * the reader may not even notice the ousting if it proceeds to
           * finish the transaction.  All of these outcomes suit us.
           *
           * If the first CAS(slot->tid) does NOT succeed, it means the
           * reader has restored the transaction, or finished it, or even
           * released the slot. */
          bool ousted =
#if MDBX_64BIT_CAS
              atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, MDBX_TID_TXN_OUSTED);
#else
              atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED, (uint32_t)MDBX_TID_TXN_OUSTED);
#endif
          if (likely(ousted)) {
            ousted = safe64_reset_compare(&rslot->txnid, rtxn);
            NOTICE("ousted-%s parked read-txn %" PRIaTXN ", pid %u, tid 0x%" PRIx64, ousted ? "complete" : "half", rtxn,
                   pid, tid);
            eASSERT(env, ousted || safe64_read(&rslot->txnid) > straggler);
            continue;
          }
          rtxn = safe64_read(&rslot->txnid);
          goto retry;
        }
        /* Not parked: remember this slot as the stuck reader for the HSR
         * callback, along with its retired-pages count. */
        hold_retired = atomic_load64(&lck->rdt[i].snapshot_pages_retired, mo_Relaxed);
        stucked = rslot;
      }
    }

    if (!callback || !stucked)
      break;

    uint32_t pid = atomic_load32(&stucked->pid, mo_AcquireRelease);
    uint64_t tid = safe64_read(&stucked->tid);
    if (safe64_read(&stucked->txnid) != straggler || !pid)
      continue;

    /* Estimate how far behind the straggler is and how much space holding
     * its snapshot costs, then ask the HSR callback what to do. */
    const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
    const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
    const uint64_t head_retired = unaligned_peek_u64(4, head.ptr_c->pages_retired);
    const size_t space = (head_retired > hold_retired) ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0;
    int rc = callback(env, env->txn, pid, (mdbx_tid_t)((intptr_t)tid), straggler,
                      (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
    if (rc < 0)
      /* hsr returned error and/or agree MDBX_MAP_FULL error */
      break;

    if (rc > 0) {
      if (rc == 1) {
        /* hsr reported transaction (will be) aborted asynchronous */
        safe64_reset_compare(&stucked->txnid, straggler);
      } else {
        /* hsr reported reader process was killed and slot should be cleared */
        safe64_reset(&stucked->txnid, true);
        atomic_store64(&stucked->tid, 0, mo_Relaxed);
        atomic_store32(&stucked->pid, 0, mo_AcquireRelease);
      }
    } else if (!notify_eof_of_loop) {
#if MDBX_ENABLE_PROFGC
      env->lck->pgops.gc_prof.kicks += 1;
#endif /* MDBX_ENABLE_PROFGC */
      notify_eof_of_loop = true;
    }
  } while (++retry < INT_MAX);

  if (notify_eof_of_loop) {
    /* notify end of hsr-loop */
    const txnid_t turn = oldest - straggler;
    if (turn)
      NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, straggler, oldest, turn);
    callback(env, env->txn, 0, 0, straggler, (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
  }
  return oldest;
}

/*----------------------------------------------------------------------------*/

/* Public API: pre-bind a reader slot for the calling thread.
 * Returns MDBX_RESULT_TRUE if already registered, MDBX_SUCCESS on fresh
 * registration, or an error (MDBX_EINVAL/MDBX_EPERM for modes without
 * per-thread slots). */
__cold int mdbx_thread_register(const MDBX_env *env) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!env->lck_mmap.lck))
    return (env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM;

  if (unlikely((env->flags & ENV_TXKEY) == 0)) {
    eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
    return MDBX_EINVAL /* MDBX_NOSTICKYTHREADS mode */;
  }

  eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
  reader_slot_t *r = thread_rthc_get(env->me_txkey);
  if (unlikely(r != nullptr)) {
    eASSERT(env, r->pid.weak == env->pid);
    eASSERT(env, r->tid.weak == osal_thread_self());
    if (unlikely(r->pid.weak != env->pid))
      return MDBX_BAD_RSLOT;
    return MDBX_RESULT_TRUE /* already registered */;
  }

  return mvcc_bind_slot((MDBX_env *)env).err;
}

/* Public API: release the calling thread's reader slot.
 * Returns MDBX_RESULT_TRUE if there was nothing to release, MDBX_BUSY if a
 * read transaction is still active on the slot, MDBX_SUCCESS otherwise. */
__cold int mdbx_thread_unregister(const MDBX_env *env) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (unlikely(!env->lck_mmap.lck))
    return MDBX_RESULT_TRUE;

  if (unlikely((env->flags & ENV_TXKEY) == 0)) {
    eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
    return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */;
  }

  eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
  reader_slot_t *r = thread_rthc_get(env->me_txkey);
  if (unlikely(r == nullptr))
    return MDBX_RESULT_TRUE /* not registered */;

  eASSERT(env, r->pid.weak == env->pid);
  eASSERT(env, r->tid.weak == osal_thread_self());
  if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self()))
    return MDBX_BAD_RSLOT;

  /* An invalid (>= threshold) txnid means the slot carries no live txn. */
  eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
  if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD))
    return MDBX_BUSY /* transaction is still active */;

  atomic_store32(&r->pid, 0, mo_Relaxed);
  atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
  thread_rthc_set(env->me_txkey, nullptr);
  return MDBX_SUCCESS;
}