2024-05-19 22:07:58 +03:00
|
|
|
|
/// \copyright SPDX-License-Identifier: Apache-2.0
|
|
|
|
|
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
|
|
|
|
|
|
|
|
|
#include "internals.h"
|
|
|
|
|
|
2024-07-09 16:04:01 +03:00
|
|
|
|
/* Claims a free slot of the shared reader table for the caller.
 *
 * The reader-table lock is taken for the whole search-and-claim sequence.
 * The process pid is registered via lck_rpid_set() when
 * env->registered_reader_pid does not match env->pid yet. A slot is free
 * when its pid field is zero; when no usable slot is found,
 * mvcc_cleanup_dead() is asked to reclaim stale ones and the scan repeats
 * only if it returns MDBX_RESULT_TRUE.
 *
 * Returns a bsr_t: on success `err` is MDBX_SUCCESS and `rslot` points to the
 * claimed slot; otherwise `rslot` stays null and `err` is MDBX_PANIC,
 * MDBX_EPERM, MDBX_READERS_FULL or a code propagated from lck_rdt_lock(),
 * lck_rpid_set() or mvcc_cleanup_dead(). */
bsr_t mvcc_bind_slot(MDBX_env *env) {
  eASSERT(env, env->lck_mmap.lck);
  eASSERT(env, env->lck->magic_and_version == MDBX_LOCK_MAGIC);
  eASSERT(env, env->lck->os_and_format == MDBX_LOCK_FORMAT);

  size_t slot, nreaders;
  bsr_t result = {.err = lck_rdt_lock(env), .rslot = nullptr};
  if (unlikely(MDBX_IS_ERROR(result.err)))
    return result; /* the lock was not acquired, nothing to release */

  if (unlikely(env->flags & ENV_FATAL_ERROR)) {
    result.err = MDBX_PANIC;
    goto bailout;
  }
  if (unlikely(!env->dxb_mmap.base)) {
    result.err = MDBX_EPERM;
    goto bailout;
  }

  if (unlikely(env->registered_reader_pid != env->pid)) {
    result.err = lck_rpid_set(env);
    if (unlikely(result.err != MDBX_SUCCESS))
      goto bailout;
    env->registered_reader_pid = env->pid;
  }

  result.err = MDBX_SUCCESS;
  for (;;) {
    /* Look for the first slot with a zero pid among the published ones. */
    nreaders = env->lck->rdt_length.weak;
    slot = 0;
    while (slot < nreaders &&
           atomic_load32(&env->lck->rdt[slot].pid, mo_AcquireRelease) != 0)
      ++slot;

    if (likely(slot < env->max_readers))
      break;

    /* The table is exhausted: try to reclaim slots of dead readers. */
    result.err = mvcc_cleanup_dead(env, true, nullptr);
    if (result.err != MDBX_RESULT_TRUE) {
      if (result.err == MDBX_SUCCESS)
        result.err = MDBX_READERS_FULL;
      goto bailout;
    }
  }

  result.rslot = &env->lck->rdt[slot];
  /* Claim the reader slot carefully, since other code uses the reader table
   * un-mutexed: first reset the slot, next publish it in lck->rdt_length.
   * After that it is safe for mdbx_env_close() to touch it. Only then is the
   * slot finally claimed by storing our pid. */
  atomic_store32(&result.rslot->pid, 0, mo_AcquireRelease);
  safe64_reset(&result.rslot->txnid, true);
  if (slot == nreaders)
    env->lck->rdt_length.weak = (uint32_t)(nreaders + 1);
  result.rslot->tid.weak =
      (env->flags & MDBX_NOSTICKYTHREADS) ? 0 : osal_thread_self();
  atomic_store32(&result.rslot->pid, env->pid, mo_AcquireRelease);
  lck_rdt_unlock(env);

  if (likely(env->flags & ENV_TXKEY)) {
    /* Remember the slot in the thread-local storage keyed by me_txkey. */
    eASSERT(env, env->registered_reader_pid == env->pid);
    thread_rthc_set(env->me_txkey, result.rslot);
  }
  return result;

bailout:
  lck_rdt_unlock(env);
  return result;
}
|
|
|
|
|
|
|
|
|
|
/* Recomputes the oldest txnid still referenced by a reader and caches it.
 *
 * `steady` is the upper bound for the result (asserted to be not greater than
 * the basal txn id and not less than the cached value). Without a lock-file
 * mapping the stub lck is simply updated and `steady` is returned.
 *
 * Otherwise the reader table is rescanned for as long as rdt_refresh_flag
 * differs from the "None" marker; the marker is stored before every scan.
 * Slots with a zero pid are skipped. A slot whose txnid is below the
 * previously cached oldest is reset via safe64_reset_compare() when the flag
 * still holds the marker, and never lowers the result.
 *
 * Returns the new oldest txnid; lck->cached_oldest is updated when the value
 * changed. */
__hot txnid_t mvcc_shapshot_oldest(MDBX_env *const env, const txnid_t steady) {
  const uint32_t nothing_changed = MDBX_STRING_TETRAD("None");
  eASSERT(env, steady <= env->basal_txn->txnid);

  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr)) {
    /* exclusive without-lck mode */
    eASSERT(env, env->lck == lckless_stub(env));
    env->lck->rdt_refresh_flag.weak = nothing_changed;
    return env->lck->cached_oldest.weak = steady;
  }

  const txnid_t prev_oldest =
      atomic_load64(&lck->cached_oldest, mo_AcquireRelease);
  eASSERT(env, steady >= prev_oldest);

  txnid_t new_oldest = prev_oldest;
  while (atomic_load32(&lck->rdt_refresh_flag, mo_AcquireRelease) !=
         nothing_changed) {
    /* Mark the table as seen before scanning, so that any concurrent change
     * of the flag forces another pass. */
    lck->rdt_refresh_flag.weak = nothing_changed;
    jitter4testing(false);
    const size_t snap_nreaders =
        atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    new_oldest = steady;

    for (size_t i = 0; i < snap_nreaders; ++i) {
      reader_slot_t *const reader = &lck->rdt[i];
      const uint32_t pid = atomic_load32(&reader->pid, mo_AcquireRelease);
      if (pid == 0)
        continue;
      jitter4testing(true);

      const txnid_t rtxn = safe64_read(&reader->txnid);
      if (likely(rtxn >= prev_oldest)) {
        if (rtxn < new_oldest) {
          new_oldest = rtxn;
          /* Cannot go below the previous value, so stop early (except in
           * debug/assertion builds, where the full scan is kept). */
          if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS &&
              new_oldest == prev_oldest)
            break;
        }
        continue;
      }

      /* The reader lags behind the previously cached oldest txnid. */
      if (unlikely(nothing_changed == atomic_load32(&lck->rdt_refresh_flag,
                                                    mo_AcquireRelease)) &&
          safe64_reset_compare(&reader->txnid, rtxn)) {
        NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN
               " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN,
               i, snap_nreaders, pid, rtxn, prev_oldest, steady);
      }
    }
  }

  if (new_oldest != prev_oldest) {
    VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest);
    eASSERT(env, new_oldest >= lck->cached_oldest.weak);
    atomic_store64(&lck->cached_oldest, new_oldest, mo_Relaxed);
  }
  return new_oldest;
}
|
|
|
|
|
|
|
|
|
|
/* Returns the largest `snapshot_pages_used` value among the occupied reader
 * slots whose txnid does not exceed the basal txn id, but never less than
 * the given `last_used_page`.
 *
 * Each slot is read twice (pages and txnid); when the two reads disagree the
 * whole table scan, including the reader count, is restarted. Without a
 * lock-file mapping `last_used_page` is returned unchanged. */
pgno_t mvcc_snapshot_largest(const MDBX_env *env, pgno_t last_used_page) {
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr))
    return last_used_page; /* exclusive without-lck mode */

  bool torn;
  do {
    torn = false;
    const size_t snap_nreaders =
        atomic_load32(&lck->rdt_length, mo_AcquireRelease);
    for (size_t i = 0; i < snap_nreaders; ++i) {
      reader_slot_t *const reader = &lck->rdt[i];
      if (!atomic_load32(&reader->pid, mo_AcquireRelease))
        continue; /* empty slot */

      /* jitter4testing(true); */
      const pgno_t pages =
          atomic_load32(&reader->snapshot_pages_used, mo_Relaxed);
      const txnid_t txnid = safe64_read(&reader->txnid);
      if (unlikely(pages != atomic_load32(&reader->snapshot_pages_used,
                                          mo_AcquireRelease) ||
                   txnid != safe64_read(&reader->txnid))) {
        /* The slot changed while being read: rescan from the beginning. */
        torn = true;
        break;
      }

      if (last_used_page < pages && txnid <= env->basal_txn->txnid)
        last_used_page = pages;
    }
  } while (torn);

  return last_used_page;
}
|
|
|
|
|
|
|
|
|
|
/* Find largest mvcc-snapshot still referenced by this process. */
|
|
|
|
|
pgno_t mvcc_largest_this(MDBX_env *env, pgno_t largest) {
|
|
|
|
|
lck_t *const lck = env->lck_mmap.lck;
|
|
|
|
|
if (likely(lck != nullptr /* exclusive mode */)) {
|
|
|
|
|
const size_t snap_nreaders =
|
|
|
|
|
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
|
|
|
|
for (size_t i = 0; i < snap_nreaders; ++i) {
|
|
|
|
|
retry:
|
|
|
|
|
if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease) == env->pid) {
|
|
|
|
|
/* jitter4testing(true); */
|
|
|
|
|
const pgno_t snap_pages =
|
|
|
|
|
atomic_load32(&lck->rdt[i].snapshot_pages_used, mo_Relaxed);
|
|
|
|
|
const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
|
|
|
|
|
if (unlikely(snap_pages !=
|
|
|
|
|
atomic_load32(&lck->rdt[i].snapshot_pages_used,
|
|
|
|
|
mo_AcquireRelease) ||
|
|
|
|
|
snap_txnid != safe64_read(&lck->rdt[i].txnid)))
|
|
|
|
|
goto retry;
|
|
|
|
|
if (largest < snap_pages &&
|
|
|
|
|
atomic_load64(&lck->cached_oldest, mo_AcquireRelease) <=
|
|
|
|
|
/* ignore pending updates */ snap_txnid &&
|
|
|
|
|
snap_txnid <= MAX_TXNID)
|
|
|
|
|
largest = snap_pages;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return largest;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Inserts `pid` into a sorted set stored as a length-prefixed array.
 *
 * list[0] holds the number of stored pids; list[1..list[0]] are kept in
 * ascending unsigned order. The caller must provide room for one more
 * element.
 *
 * Returns true when `pid` was inserted, false when it was already present
 * (the list is left untouched in that case).
 *
 * The ordering is a plain unsigned three-way comparison. Using the wrapped
 * difference `pid - list[cursor]` narrowed to a signed 32-bit value is not a
 * total order for pids that are more than 2^31 apart, which lets the binary
 * search miss existing entries and insert duplicates. */
static bool pid_insert(uint32_t *list, uint32_t pid) {
  /* binary search of pid in list */
  size_t base = 0;
  size_t cursor = 1;
  int cmp = 0;
  size_t n = /* length */ list[0];

  while (n > 0) {
    const size_t pivot = n >> 1;
    cursor = base + pivot + 1;
    const uint32_t probe = list[cursor];
    cmp = (pid > probe) - (pid < probe);

    if (cmp < 0) {
      n = pivot;
    } else if (cmp > 0) {
      base = cursor;
      n -= pivot + 1;
    } else {
      /* found, so it's a duplicate */
      return false;
    }
  }

  /* The last probed element is smaller: insert right after it. */
  if (cmp > 0)
    ++cursor;

  /* Grow the list and shift the tail up to open a gap at `cursor`. */
  list[0]++;
  for (n = list[0]; n > cursor; n--)
    list[n] = list[n - 1];
  list[n] = pid;
  return true;
}
|
|
|
|
|
|
|
|
|
|
/* Clears reader-table slots that belong to processes which are not alive.
 *
 * `rdt_locked` is non-zero when the caller already holds the reader-table
 * lock; when it is zero the lock is taken here on the first stale reader and
 * released before returning. `dead`, when not null, receives the number of
 * cleared slots (it is not written on the MDBX_ENOMEM early return).
 *
 * Each distinct foreign pid is probed once with lck_rpid_check(); all slots
 * of a pid found stale are zeroed and rdt_refresh_flag is raised.
 *
 * Returns the check_env() error, MDBX_ENOMEM, an error from
 * lck_rpid_check() / lck_rdt_lock(), MDBX_RESULT_TRUE when lck_rdt_lock()
 * reported MDBX_RESULT_TRUE, or MDBX_SUCCESS otherwise. */
__cold MDBX_INTERNAL int mvcc_cleanup_dead(MDBX_env *env, int rdt_locked,
                                           int *dead) {
  int rc = check_env(env, true);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  eASSERT(env, rdt_locked >= 0);
  lck_t *const lck = env->lck_mmap.lck;
  if (unlikely(lck == nullptr)) {
    /* exclusive mode */
    if (dead)
      *dead = 0;
    return MDBX_SUCCESS;
  }

  const size_t snap_nreaders =
      atomic_load32(&lck->rdt_length, mo_AcquireRelease);

  /* Set of already probed pids: small tables use the on-stack buffer,
   * larger ones a heap allocation (one extra element for the length). */
  uint32_t pids_onstack[142];
  uint32_t *pids = pids_onstack;
  if (snap_nreaders >= ARRAY_LENGTH(pids_onstack)) {
    pids = osal_malloc((snap_nreaders + 1) * sizeof(uint32_t));
    if (unlikely(!pids))
      return MDBX_ENOMEM;
  }

  pids[0] = 0;
  int cleared = 0;
  for (size_t i = 0; i < snap_nreaders; i++) {
    const uint32_t pid = atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease);
    /* skip empty slots, our own slots and pids that were processed already */
    if (pid == 0 || pid == env->pid || !pid_insert(pids, pid))
      continue;

    int err = lck_rpid_check(env, pid);
    if (err == MDBX_RESULT_TRUE)
      continue /* reader is live */;

    if (err != MDBX_SUCCESS) {
      rc = err;
      break /* lck_rpid_check() failed */;
    }

    /* stale reader found */
    if (!rdt_locked) {
      err = lck_rdt_lock(env);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      rdt_locked = -1; /* negative means: locked here, unlock on exit */
      if (err == MDBX_RESULT_TRUE) {
        /* mutex recovered, the mdbx_ipclock_failed() checked all readers */
        rc = MDBX_RESULT_TRUE;
        break;
      }

      /* another process may have cleaned and reused the slot, recheck */
      if (lck->rdt[i].pid.weak != pid)
        continue;

      err = lck_rpid_check(env, pid);
      if (MDBX_IS_ERROR(err)) {
        rc = err;
        break;
      }

      if (err != MDBX_SUCCESS)
        continue /* the race with other process, slot reused */;
    }

    /* clean every slot owned by the stale pid, starting from this one */
    for (size_t k = i; k < snap_nreaders; k++) {
      reader_slot_t *const victim = &lck->rdt[k];
      if (victim->pid.weak != pid)
        continue;
      DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid,
            victim->txnid.weak);
      atomic_store32(&victim->pid, 0, mo_Relaxed);
      atomic_store32(&lck->rdt_refresh_flag, true, mo_AcquireRelease);
      cleared++;
    }
  }

  if (likely(!MDBX_IS_ERROR(rc)))
    atomic_store64(&lck->readers_check_timestamp, osal_monotime(), mo_Relaxed);

  if (rdt_locked < 0)
    lck_rdt_unlock(env);

  if (pids != pids_onstack)
    osal_free(pids);

  if (dead)
    *dead = cleared;
  return rc;
}
|
|
|
|
|
|
2024-07-09 16:04:01 +03:00
|
|
|
|
/* Marks a read-only transaction as parked.
 *
 * The transaction must be read-only, not finished and not parked yet,
 * otherwise MDBX_BAD_TXN is returned. The reader slot is verified to belong
 * to this process (MDBX_PROBLEM on mismatch) and to hold the owner thread id
 * and txnid of the transaction (MDBX_BAD_RSLOT on mismatch). On success the
 * slot's tid is replaced by MDBX_TID_TXN_PARKED, rdt_refresh_flag is raised
 * and MDBX_TXN_PARKED (plus MDBX_TXN_AUTOUNPARK when `autounpark` is true)
 * is set in txn->flags.
 *
 * Returns MDBX_SUCCESS or one of the error codes above. */
int txn_park(MDBX_txn *txn, bool autounpark) {
  reader_slot_t *const rslot = txn->to.reader;
  tASSERT(txn, (txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY |
                              MDBX_TXN_PARKED)) == MDBX_TXN_RDONLY);
  tASSERT(txn, txn->to.reader->tid.weak < MDBX_TID_TXN_OUSTED);
  if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY |
                              MDBX_TXN_PARKED)) != MDBX_TXN_RDONLY))
    return MDBX_BAD_TXN;

  const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
  const uint64_t tid = atomic_load64(&rslot->tid, mo_Relaxed);
  const uint64_t txnid = atomic_load64(&rslot->txnid, mo_Relaxed);
  if (unlikely(pid != txn->env->pid)) {
    ERROR("unexpected pid %u%s%u", pid, " != must ", txn->env->pid);
    return MDBX_PROBLEM;
  }
  if (unlikely(tid != txn->owner || txnid != txn->txnid)) {
    ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%0zx"
          " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
          tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
    return MDBX_BAD_RSLOT;
  }

  atomic_store64(&rslot->tid, MDBX_TID_TXN_PARKED, mo_AcquireRelease);
  atomic_store32(&txn->env->lck->rdt_refresh_flag, true, mo_Relaxed);
  /* Set the bits with OR rather than addition: the entry check does not
   * cover MDBX_TXN_AUTOUNPARK, and adding an already-set bit would carry
   * into a neighbouring flag. */
  txn->flags |=
      autounpark ? (MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK) : MDBX_TXN_PARKED;
  return MDBX_SUCCESS;
}
|
|
|
|
|
|
|
|
|
|
/* Resumes a parked read-only transaction, or finishes it as ousted.
 *
 * The transaction must be read-only and parked, with neither the finished
 * nor the has-child flag set; otherwise MDBX_BAD_TXN is returned.
 *
 * The loop tries to swap the slot's tid from MDBX_TID_TXN_PARKED back to
 * txn->owner, yielding and retrying when the CAS fails. On success the
 * parked/auto-unpark flags are cleared and MDBX_SUCCESS is returned.
 * MDBX_PROBLEM is returned when the slot holds a foreign pid.
 *
 * In every other case (slot ousted or invalidated, unexpected tid/txnid,
 * transaction in error state, or a null reader slot) the transaction is
 * ended via txn_end() and the result is the txn_end() error, or MDBX_OUSTED
 * when txn_end() returned zero. */
int txn_unpark(MDBX_txn *txn) {
  if (unlikely((txn->flags & (MDBX_TXN_FINISHED | MDBX_TXN_HAS_CHILD |
                              MDBX_TXN_RDONLY | MDBX_TXN_PARKED)) !=
               (MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
    return MDBX_BAD_TXN;

  for (reader_slot_t *const rslot = txn->to.reader; rslot; atomic_yield()) {
    const uint32_t pid = atomic_load32(&rslot->pid, mo_Relaxed);
    uint64_t tid = safe64_read(&rslot->tid);
    uint64_t txnid = safe64_read(&rslot->txnid);
    if (unlikely(pid != txn->env->pid)) {
      ERROR("unexpected pid %u%s%u", pid, " != expected ", txn->env->pid);
      return MDBX_PROBLEM;
    }
    if (unlikely(tid == MDBX_TID_TXN_OUSTED ||
                 txnid >= SAFE64_INVALID_THRESHOLD))
      break; /* ousted, or the slot's txnid was invalidated */
    if (unlikely(tid != MDBX_TID_TXN_PARKED || txnid != txn->txnid)) {
      /* Report the value that was actually required here, i.e. the
       * "parked" marker (not the "ousted" one). */
      ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%" PRIx64
            " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
            tid, " != must ", (uint64_t)MDBX_TID_TXN_PARKED, txnid,
            " != must ", txn->txnid);
      break;
    }
    if (unlikely((txn->flags & MDBX_TXN_ERROR)))
      break;

#if MDBX_64BIT_CAS
    if (unlikely(!atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED, txn->owner)))
      continue;
#else
    /* Without a 64-bit CAS: store the high half first, CAS the low half,
     * and restore the high half of the "parked" marker if the CAS failed. */
    atomic_store32(&rslot->tid.high, (uint32_t)((uint64_t)txn->owner >> 32),
                   mo_Relaxed);
    if (unlikely(!atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED,
                               (uint32_t)txn->owner))) {
      atomic_store32(&rslot->tid.high, (uint32_t)(MDBX_TID_TXN_PARKED >> 32),
                     mo_AcquireRelease);
      continue;
    }
#endif
    /* Re-read the slot to make sure it still describes this transaction. */
    txnid = safe64_read(&rslot->txnid);
    tid = safe64_read(&rslot->tid);
    if (unlikely(txnid != txn->txnid || tid != txn->owner)) {
      ERROR("unexpected thread-id 0x%" PRIx64 "%s0x%zx"
            " and/or txn-id %" PRIaTXN "%s%" PRIaTXN,
            tid, " != must ", txn->owner, txnid, " != must ", txn->txnid);
      break;
    }
    txn->flags &= ~(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK);
    return MDBX_SUCCESS;
  }

  int err = txn_end(txn, TXN_END_OUSTED | TXN_END_RESET | TXN_END_UPDATE);
  return err ? err : MDBX_OUSTED;
}
|
|
|
|
|
|
2024-05-19 22:07:58 +03:00
|
|
|
|
/* Tries to move the oldest reader snapshot past `straggler`.
 *
 * Every round forces a refresh of the cached oldest txnid and stops when the
 * oldest already equals the steady txnid, is above `straggler`, or there is
 * no lock-file mapping. Otherwise dead readers are cleaned up, parked
 * readers holding `straggler` are ousted, and for a remaining (non-parked)
 * reader the user-supplied env->hsr_callback is invoked:
 *   rc < 0  - stop;
 *   rc == 1 - the reader's txnid is reset if it still equals `straggler`;
 *   rc > 1  - the reader slot is cleared completely;
 *   rc == 0 - the round is repeated, and the callback is notified once more
 *             after the loop with pid 0 and a negated retry counter.
 *
 * Returns the last computed oldest txnid. */
__cold txnid_t mvcc_kick_laggards(MDBX_env *env, const txnid_t straggler) {
  DEBUG("DB size maxed out by reading #%" PRIaTXN, straggler);
  osal_memory_fence(mo_AcquireRelease, false);
  MDBX_hsr_func *const callback = env->hsr_callback;
  txnid_t oldest = 0;
  bool notify_eof_of_loop = false;
  int retry = 0;
  do {
    const txnid_t steady =
        env->txn->tw.troika.txnid[env->txn->tw.troika.prefer_steady];
    env->lck->rdt_refresh_flag.weak = /* force refresh */ true;
    oldest = mvcc_shapshot_oldest(env, steady);
    eASSERT(env, oldest < env->basal_txn->txnid);
    eASSERT(env, oldest >= straggler);
    eASSERT(env, oldest >= env->lck->cached_oldest.weak);

    lck_t *const lck = env->lck_mmap.lck;
    if (oldest == steady || oldest > straggler || /* without-LCK mode */ !lck)
      break;

    if (MDBX_IS_ERROR(mvcc_cleanup_dead(env, false, nullptr)))
      break;

    reader_slot_t *stucked = nullptr;
    uint64_t hold_retired = 0;
    for (size_t i = 0; i < lck->rdt_length.weak; ++i) {
      uint32_t pid;
      reader_slot_t *const rslot = &lck->rdt[i];
      txnid_t rtxn = safe64_read(&rslot->txnid);
      /* Re-examine the slot for as long as it holds the straggler txnid and
       * an attempt to oust it loses the race with the reader. */
      while (rtxn == straggler &&
             (pid = atomic_load32(&rslot->pid, mo_AcquireRelease)) != 0) {
        const uint64_t tid = safe64_read(&rslot->tid);
        if (tid != MDBX_TID_TXN_PARKED) {
          /* An active (not parked) reader: remember it for the callback. */
          hold_retired =
              atomic_load64(&rslot->snapshot_pages_retired, mo_Relaxed);
          stucked = rslot;
          break;
        }

        /* The read transaction was marked by its owner as "parked", i.e.
         * it may be aborted asynchronously, or restored upon reader
         * activity.
         *
         * If the first CAS(slot->tid) succeeds, then
         * safe64_reset_compare() will either safely clear the txnid, or
         * refuse because the reader has reset and/or restarted the
         * transaction. In that case the reader may not notice the ousting
         * if it proceeds to finish the transaction. All of these outcomes
         * are acceptable.
         *
         * If the first CAS(slot->tid) does NOT succeed, then the reader
         * has restored the transaction, or finished it, or even released
         * the slot. */
        bool ousted =
#if MDBX_64BIT_CAS
            atomic_cas64(&rslot->tid, MDBX_TID_TXN_PARKED,
                         MDBX_TID_TXN_OUSTED);
#else
            atomic_cas32(&rslot->tid.low, (uint32_t)MDBX_TID_TXN_PARKED,
                         (uint32_t)MDBX_TID_TXN_OUSTED);
#endif
        if (likely(ousted)) {
          ousted = safe64_reset_compare(&rslot->txnid, rtxn);
          NOTICE("ousted-%s parked read-txn %" PRIaTXN
                 ", pid %u, tid 0x%" PRIx64,
                 ousted ? "complete" : "half", rtxn, pid, tid);
          eASSERT(env, ousted || safe64_read(&rslot->txnid) > straggler);
          break;
        }
        rtxn = safe64_read(&rslot->txnid);
      }
    }

    if (!callback || !stucked)
      break;

    uint32_t pid = atomic_load32(&stucked->pid, mo_AcquireRelease);
    uint64_t tid = safe64_read(&stucked->tid);
    if (safe64_read(&stucked->txnid) != straggler || !pid)
      continue; /* the reader has moved on or gone: start the next round */

    const meta_ptr_t head = meta_recent(env, &env->txn->tw.troika);
    const txnid_t gap = (head.txnid - straggler) / xMDBX_TXNID_STEP;
    const uint64_t head_retired =
        unaligned_peek_u64(4, head.ptr_c->pages_retired);
    const size_t space =
        (head_retired > hold_retired)
            ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired))
            : 0;
    const int rc =
        callback(env, env->txn, pid, (mdbx_tid_t)((intptr_t)tid), straggler,
                 (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry);
    if (rc < 0)
      /* hsr returned error and/or agree MDBX_MAP_FULL error */
      break;

    if (rc == 1) {
      /* hsr reported transaction (will be) aborted asynchronous */
      safe64_reset_compare(&stucked->txnid, straggler);
    } else if (rc > 1) {
      /* hsr reported reader process was killed and slot should be cleared */
      safe64_reset(&stucked->txnid, true);
      atomic_store64(&stucked->tid, 0, mo_Relaxed);
      atomic_store32(&stucked->pid, 0, mo_AcquireRelease);
    } else if (!notify_eof_of_loop) {
#if MDBX_ENABLE_PROFGC
      env->lck->pgops.gc_prof.kicks += 1;
#endif /* MDBX_ENABLE_PROFGC */
      notify_eof_of_loop = true;
    }

  } while (++retry < INT_MAX);

  if (notify_eof_of_loop) {
    /* notify end of hsr-loop */
    const txnid_t turn = oldest - straggler;
    if (turn)
      NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN,
             straggler, oldest, turn);
    callback(env, env->txn, 0, 0, straggler,
             (turn < UINT_MAX) ? (unsigned)turn : UINT_MAX, 0, -retry);
  }
  return oldest;
}
|
|
|
|
|
|
|
|
|
|
/*----------------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
|
|
__cold int mdbx_thread_register(const MDBX_env *env) {
|
|
|
|
|
int rc = check_env(env, true);
|
|
|
|
|
if (unlikely(rc != MDBX_SUCCESS))
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(rc);
|
2024-05-19 22:07:58 +03:00
|
|
|
|
|
|
|
|
|
if (unlikely(!env->lck_mmap.lck))
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR((env->flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM);
|
2024-05-19 22:07:58 +03:00
|
|
|
|
|
|
|
|
|
if (unlikely((env->flags & ENV_TXKEY) == 0)) {
|
|
|
|
|
eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(MDBX_EINVAL) /* MDBX_NOSTICKYTHREADS mode */;
|
2024-05-19 22:07:58 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
|
|
|
|
|
reader_slot_t *r = thread_rthc_get(env->me_txkey);
|
|
|
|
|
if (unlikely(r != nullptr)) {
|
|
|
|
|
eASSERT(env, r->pid.weak == env->pid);
|
|
|
|
|
eASSERT(env, r->tid.weak == osal_thread_self());
|
|
|
|
|
if (unlikely(r->pid.weak != env->pid))
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(MDBX_BAD_RSLOT);
|
2024-05-19 22:07:58 +03:00
|
|
|
|
return MDBX_RESULT_TRUE /* already registered */;
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(mvcc_bind_slot((MDBX_env *)env).err);
|
2024-05-19 22:07:58 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
__cold int mdbx_thread_unregister(const MDBX_env *env) {
|
|
|
|
|
int rc = check_env(env, true);
|
|
|
|
|
if (unlikely(rc != MDBX_SUCCESS))
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(rc);
|
2024-05-19 22:07:58 +03:00
|
|
|
|
|
|
|
|
|
if (unlikely(!env->lck_mmap.lck))
|
|
|
|
|
return MDBX_RESULT_TRUE;
|
|
|
|
|
|
|
|
|
|
if (unlikely((env->flags & ENV_TXKEY) == 0)) {
|
|
|
|
|
eASSERT(env, env->flags & MDBX_NOSTICKYTHREADS);
|
|
|
|
|
return MDBX_RESULT_TRUE /* MDBX_NOSTICKYTHREADS mode */;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
eASSERT(env, (env->flags & (MDBX_NOSTICKYTHREADS | ENV_TXKEY)) == ENV_TXKEY);
|
|
|
|
|
reader_slot_t *r = thread_rthc_get(env->me_txkey);
|
|
|
|
|
if (unlikely(r == nullptr))
|
|
|
|
|
return MDBX_RESULT_TRUE /* not registered */;
|
|
|
|
|
|
|
|
|
|
eASSERT(env, r->pid.weak == env->pid);
|
|
|
|
|
eASSERT(env, r->tid.weak == osal_thread_self());
|
|
|
|
|
if (unlikely(r->pid.weak != env->pid || r->tid.weak != osal_thread_self()))
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(MDBX_BAD_RSLOT);
|
2024-05-19 22:07:58 +03:00
|
|
|
|
|
|
|
|
|
eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
|
|
|
|
|
if (unlikely(r->txnid.weak < SAFE64_INVALID_THRESHOLD))
|
2024-11-25 21:20:02 +03:00
|
|
|
|
return LOG_IFERR(MDBX_BUSY) /* transaction is still active */;
|
2024-05-19 22:07:58 +03:00
|
|
|
|
|
|
|
|
|
atomic_store32(&r->pid, 0, mo_Relaxed);
|
|
|
|
|
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
|
|
|
|
|
thread_rthc_set(env->me_txkey, nullptr);
|
|
|
|
|
return MDBX_SUCCESS;
|
|
|
|
|
}
|