mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-20 05:18:21 +08:00
b054a69e72
Сценарий достаточно запутанный/сложный.
2037 lines
76 KiB
C
2037 lines
76 KiB
C
/// \copyright SPDX-License-Identifier: Apache-2.0
|
||
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
|
||
|
||
#include "internals.h"
|
||
|
||
__hot txnid_t txn_snapshot_oldest(const MDBX_txn *const txn) {
|
||
return mvcc_shapshot_oldest(
|
||
txn->env, txn->tw.troika.txnid[txn->tw.troika.prefer_steady]);
|
||
}
|
||
|
||
static void done_cursors(MDBX_txn *txn, const bool merge) {
|
||
tASSERT(txn, txn->cursors[FREE_DBI] == nullptr);
|
||
TXN_FOREACH_DBI_FROM(txn, i, /* skip FREE_DBI */ 1) {
|
||
MDBX_cursor *mc = txn->cursors[i];
|
||
if (mc) {
|
||
txn->cursors[i] = nullptr;
|
||
do {
|
||
MDBX_cursor *const next = mc->next;
|
||
cursor_eot(mc, merge);
|
||
mc = next;
|
||
} while (mc);
|
||
}
|
||
}
|
||
}
|
||
|
||
int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) {
|
||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||
dpl_t *const dl = dpl_sort(txn);
|
||
int rc = MDBX_SUCCESS;
|
||
size_t r, w, total_npages = 0;
|
||
for (w = 0, r = 1; r <= dl->length; ++r) {
|
||
page_t *dp = dl->items[r].ptr;
|
||
if (dp->flags & P_LOOSE) {
|
||
dl->items[++w] = dl->items[r];
|
||
continue;
|
||
}
|
||
unsigned npages = dpl_npages(dl, r);
|
||
total_npages += npages;
|
||
rc = iov_page(txn, ctx, dp, npages);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
}
|
||
|
||
if (!iov_empty(ctx)) {
|
||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||
rc = iov_write(ctx);
|
||
}
|
||
|
||
if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->env->lazy_fd) {
|
||
txn->env->lck->unsynced_pages.weak += total_npages;
|
||
if (!txn->env->lck->eoos_timestamp.weak)
|
||
txn->env->lck->eoos_timestamp.weak = osal_monotime();
|
||
}
|
||
|
||
txn->tw.dirtylist->pages_including_loose -= total_npages;
|
||
while (r <= dl->length)
|
||
dl->items[++w] = dl->items[r++];
|
||
|
||
dl->sorted = dpl_setlen(dl, w);
|
||
txn->tw.dirtyroom += r - 1 - w;
|
||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||
(txn->parent ? txn->parent->tw.dirtyroom
|
||
: txn->env->options.dp_limit));
|
||
tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count);
|
||
tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count);
|
||
return rc;
|
||
}
|
||
|
||
/* Merge child txn into parent */
|
||
static void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn,
|
||
const size_t parent_retired_len) {
|
||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0);
|
||
dpl_t *const src = dpl_sort(txn);
|
||
|
||
/* Remove refunded pages from parent's dirty list */
|
||
dpl_t *const dst = dpl_sort(parent);
|
||
if (MDBX_ENABLE_REFUND) {
|
||
size_t n = dst->length;
|
||
while (n && dst->items[n].pgno >= parent->geo.first_unallocated) {
|
||
const unsigned npages = dpl_npages(dst, n);
|
||
page_shadow_release(txn->env, dst->items[n].ptr, npages);
|
||
--n;
|
||
}
|
||
parent->tw.dirtyroom += dst->sorted - n;
|
||
dst->sorted = dpl_setlen(dst, n);
|
||
tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length ==
|
||
(parent->parent ? parent->parent->tw.dirtyroom
|
||
: parent->env->options.dp_limit));
|
||
}
|
||
|
||
/* Remove reclaimed pages from parent's dirty list */
|
||
const pnl_t reclaimed_list = parent->tw.relist;
|
||
dpl_sift(parent, reclaimed_list, false);
|
||
|
||
/* Move retired pages from parent's dirty & spilled list to reclaimed */
|
||
size_t r, w, d, s, l;
|
||
for (r = w = parent_retired_len;
|
||
++r <= MDBX_PNL_GETSIZE(parent->tw.retired_pages);) {
|
||
const pgno_t pgno = parent->tw.retired_pages[r];
|
||
const size_t di = dpl_exist(parent, pgno);
|
||
const size_t si = !di ? spill_search(parent, pgno) : 0;
|
||
unsigned npages;
|
||
const char *kind;
|
||
if (di) {
|
||
page_t *dp = dst->items[di].ptr;
|
||
tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE |
|
||
P_SPILLED)) == 0);
|
||
npages = dpl_npages(dst, di);
|
||
page_wash(parent, di, dp, npages);
|
||
kind = "dirty";
|
||
l = 1;
|
||
if (unlikely(npages > l)) {
|
||
/* OVERFLOW-страница могла быть переиспользована по частям. Тогда
|
||
* в retired-списке может быть только начало последовательности,
|
||
* а остаток растащен по dirty, spilled и reclaimed спискам. Поэтому
|
||
* переносим в reclaimed с проверкой на обрыв последовательности.
|
||
* В любом случае, все осколки будут учтены и отфильтрованы, т.е. если
|
||
* страница была разбита на части, то важно удалить dirty-элемент,
|
||
* а все осколки будут учтены отдельно. */
|
||
|
||
/* Список retired страниц не сортирован, но для ускорения сортировки
|
||
* дополняется в соответствии с MDBX_PNL_ASCENDING */
|
||
#if MDBX_PNL_ASCENDING
|
||
const size_t len = MDBX_PNL_GETSIZE(parent->tw.retired_pages);
|
||
while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) {
|
||
++r;
|
||
if (++l == npages)
|
||
break;
|
||
}
|
||
#else
|
||
while (w > parent_retired_len &&
|
||
parent->tw.retired_pages[w - 1] == pgno + l) {
|
||
--w;
|
||
if (++l == npages)
|
||
break;
|
||
}
|
||
#endif
|
||
}
|
||
} else if (unlikely(si)) {
|
||
l = npages = 1;
|
||
spill_remove(parent, si, 1);
|
||
kind = "spilled";
|
||
} else {
|
||
parent->tw.retired_pages[++w] = pgno;
|
||
continue;
|
||
}
|
||
|
||
DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l,
|
||
kind, pgno);
|
||
int err = pnl_insert_span(&parent->tw.relist, pgno, l);
|
||
ENSURE(txn->env, err == MDBX_SUCCESS);
|
||
}
|
||
MDBX_PNL_SETSIZE(parent->tw.retired_pages, w);
|
||
|
||
/* Filter-out parent spill list */
|
||
if (parent->tw.spilled.list &&
|
||
MDBX_PNL_GETSIZE(parent->tw.spilled.list) > 0) {
|
||
const pnl_t sl = spill_purge(parent);
|
||
size_t len = MDBX_PNL_GETSIZE(sl);
|
||
if (len) {
|
||
/* Remove refunded pages from parent's spill list */
|
||
if (MDBX_ENABLE_REFUND &&
|
||
MDBX_PNL_MOST(sl) >= (parent->geo.first_unallocated << 1)) {
|
||
#if MDBX_PNL_ASCENDING
|
||
size_t i = MDBX_PNL_GETSIZE(sl);
|
||
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl));
|
||
do {
|
||
if ((sl[i] & 1) == 0)
|
||
DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
|
||
i -= 1;
|
||
} while (i && sl[i] >= (parent->geo.first_unallocated << 1));
|
||
MDBX_PNL_SETSIZE(sl, i);
|
||
#else
|
||
assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl));
|
||
size_t i = 0;
|
||
do {
|
||
++i;
|
||
if ((sl[i] & 1) == 0)
|
||
DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1);
|
||
} while (i < len && sl[i + 1] >= (parent->geo.first_unallocated << 1));
|
||
MDBX_PNL_SETSIZE(sl, len -= i);
|
||
memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0]));
|
||
#endif
|
||
}
|
||
tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->geo.first_unallocated
|
||
<< 1));
|
||
|
||
/* Remove reclaimed pages from parent's spill list */
|
||
s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list);
|
||
/* Scanning from end to begin */
|
||
while (s && r) {
|
||
if (sl[s] & 1) {
|
||
--s;
|
||
continue;
|
||
}
|
||
const pgno_t spilled_pgno = sl[s] >> 1;
|
||
const pgno_t reclaimed_pgno = reclaimed_list[r];
|
||
if (reclaimed_pgno != spilled_pgno) {
|
||
const bool cmp = MDBX_PNL_ORDERED(spilled_pgno, reclaimed_pgno);
|
||
s -= !cmp;
|
||
r -= cmp;
|
||
} else {
|
||
DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO,
|
||
reclaimed_pgno);
|
||
spill_remove(parent, s, 1);
|
||
--s;
|
||
--r;
|
||
}
|
||
}
|
||
|
||
/* Remove anything in our dirty list from parent's spill list */
|
||
/* Scanning spill list in descend order */
|
||
const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1;
|
||
s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1;
|
||
d = src->length;
|
||
while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_GETSIZE(sl))) {
|
||
if (sl[s] & 1) {
|
||
s += step;
|
||
continue;
|
||
}
|
||
const pgno_t spilled_pgno = sl[s] >> 1;
|
||
const pgno_t dirty_pgno_form = src->items[d].pgno;
|
||
const unsigned npages = dpl_npages(src, d);
|
||
const pgno_t dirty_pgno_to = dirty_pgno_form + npages;
|
||
if (dirty_pgno_form > spilled_pgno) {
|
||
--d;
|
||
continue;
|
||
}
|
||
if (dirty_pgno_to <= spilled_pgno) {
|
||
s += step;
|
||
continue;
|
||
}
|
||
|
||
DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages,
|
||
dirty_pgno_form);
|
||
spill_remove(parent, s, 1);
|
||
s += step;
|
||
}
|
||
|
||
/* Squash deleted pagenums if we deleted any */
|
||
spill_purge(parent);
|
||
}
|
||
}
|
||
|
||
/* Remove anything in our spill list from parent's dirty list */
|
||
if (txn->tw.spilled.list) {
|
||
tASSERT(txn,
|
||
pnl_check_allocated(txn->tw.spilled.list,
|
||
(size_t)parent->geo.first_unallocated << 1));
|
||
dpl_sift(parent, txn->tw.spilled.list, true);
|
||
tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length ==
|
||
(parent->parent ? parent->parent->tw.dirtyroom
|
||
: parent->env->options.dp_limit));
|
||
}
|
||
|
||
/* Find length of merging our dirty list with parent's and release
|
||
* filter-out pages */
|
||
for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) {
|
||
page_t *sp = src->items[s].ptr;
|
||
tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE |
|
||
P_LOOSE | P_SPILLED)) == 0);
|
||
const unsigned s_npages = dpl_npages(src, s);
|
||
const pgno_t s_pgno = src->items[s].pgno;
|
||
|
||
page_t *dp = dst->items[d].ptr;
|
||
tASSERT(parent, (dp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE |
|
||
P_SPILLED)) == 0);
|
||
const unsigned d_npages = dpl_npages(dst, d);
|
||
const pgno_t d_pgno = dst->items[d].pgno;
|
||
|
||
if (d_pgno >= s_pgno + s_npages) {
|
||
--d;
|
||
++l;
|
||
} else if (d_pgno + d_npages <= s_pgno) {
|
||
if (sp->flags != P_LOOSE) {
|
||
sp->txnid = parent->front_txnid;
|
||
sp->flags &= ~P_SPILLED;
|
||
}
|
||
--s;
|
||
++l;
|
||
} else {
|
||
dst->items[d--].ptr = nullptr;
|
||
page_shadow_release(txn->env, dp, d_npages);
|
||
}
|
||
}
|
||
assert(dst->sorted == dst->length);
|
||
tASSERT(parent, dst->detent >= l + d + s);
|
||
dst->sorted = l + d + s; /* the merged length */
|
||
|
||
while (s > 0) {
|
||
page_t *sp = src->items[s].ptr;
|
||
tASSERT(parent, (sp->flags & ~(P_LEAF | P_DUPFIX | P_BRANCH | P_LARGE |
|
||
P_LOOSE | P_SPILLED)) == 0);
|
||
if (sp->flags != P_LOOSE) {
|
||
sp->txnid = parent->front_txnid;
|
||
sp->flags &= ~P_SPILLED;
|
||
}
|
||
--s;
|
||
}
|
||
|
||
/* Merge our dirty list into parent's, i.e. merge(dst, src) -> dst */
|
||
if (dst->sorted >= dst->length) {
|
||
/* from end to begin with dst extending */
|
||
for (l = dst->sorted, s = src->length, d = dst->length; s > 0 && d > 0;) {
|
||
if (unlikely(l <= d)) {
|
||
/* squash to get a gap of free space for merge */
|
||
for (r = w = 1; r <= d; ++r)
|
||
if (dst->items[r].ptr) {
|
||
if (w != r) {
|
||
dst->items[w] = dst->items[r];
|
||
dst->items[r].ptr = nullptr;
|
||
}
|
||
++w;
|
||
}
|
||
VERBOSE("squash to begin for extending-merge %zu -> %zu", d, w - 1);
|
||
d = w - 1;
|
||
continue;
|
||
}
|
||
assert(l > d);
|
||
if (dst->items[d].ptr) {
|
||
dst->items[l--] = (dst->items[d].pgno > src->items[s].pgno)
|
||
? dst->items[d--]
|
||
: src->items[s--];
|
||
} else
|
||
--d;
|
||
}
|
||
if (s > 0) {
|
||
assert(l == s);
|
||
while (d > 0) {
|
||
assert(dst->items[d].ptr == nullptr);
|
||
--d;
|
||
}
|
||
do {
|
||
assert(l > 0);
|
||
dst->items[l--] = src->items[s--];
|
||
} while (s > 0);
|
||
} else {
|
||
assert(l == d);
|
||
while (l > 0) {
|
||
assert(dst->items[l].ptr != nullptr);
|
||
--l;
|
||
}
|
||
}
|
||
} else {
|
||
/* from begin to end with shrinking (a lot of new large/overflow pages) */
|
||
for (l = s = d = 1; s <= src->length && d <= dst->length;) {
|
||
if (unlikely(l >= d)) {
|
||
/* squash to get a gap of free space for merge */
|
||
for (r = w = dst->length; r >= d; --r)
|
||
if (dst->items[r].ptr) {
|
||
if (w != r) {
|
||
dst->items[w] = dst->items[r];
|
||
dst->items[r].ptr = nullptr;
|
||
}
|
||
--w;
|
||
}
|
||
VERBOSE("squash to end for shrinking-merge %zu -> %zu", d, w + 1);
|
||
d = w + 1;
|
||
continue;
|
||
}
|
||
assert(l < d);
|
||
if (dst->items[d].ptr) {
|
||
dst->items[l++] = (dst->items[d].pgno < src->items[s].pgno)
|
||
? dst->items[d++]
|
||
: src->items[s++];
|
||
} else
|
||
++d;
|
||
}
|
||
if (s <= src->length) {
|
||
assert(dst->sorted - l == src->length - s);
|
||
while (d <= dst->length) {
|
||
assert(dst->items[d].ptr == nullptr);
|
||
--d;
|
||
}
|
||
do {
|
||
assert(l <= dst->sorted);
|
||
dst->items[l++] = src->items[s++];
|
||
} while (s <= src->length);
|
||
} else {
|
||
assert(dst->sorted - l == dst->length - d);
|
||
while (l <= dst->sorted) {
|
||
assert(l <= d && d <= dst->length && dst->items[d].ptr);
|
||
dst->items[l++] = dst->items[d++];
|
||
}
|
||
}
|
||
}
|
||
parent->tw.dirtyroom -= dst->sorted - dst->length;
|
||
assert(parent->tw.dirtyroom <= parent->env->options.dp_limit);
|
||
dpl_setlen(dst, dst->sorted);
|
||
parent->tw.dirtylru = txn->tw.dirtylru;
|
||
|
||
/* В текущем понимании выгоднее пересчитать кол-во страниц,
|
||
* чем подмешивать лишние ветвления и вычисления в циклы выше. */
|
||
dst->pages_including_loose = 0;
|
||
for (r = 1; r <= dst->length; ++r)
|
||
dst->pages_including_loose += dpl_npages(dst, r);
|
||
|
||
tASSERT(parent, dpl_check(parent));
|
||
dpl_free(txn);
|
||
|
||
if (txn->tw.spilled.list) {
|
||
if (parent->tw.spilled.list) {
|
||
/* Must not fail since space was preserved above. */
|
||
pnl_merge(parent->tw.spilled.list, txn->tw.spilled.list);
|
||
pnl_free(txn->tw.spilled.list);
|
||
} else {
|
||
parent->tw.spilled.list = txn->tw.spilled.list;
|
||
parent->tw.spilled.least_removed = txn->tw.spilled.least_removed;
|
||
}
|
||
tASSERT(parent, dpl_check(parent));
|
||
}
|
||
|
||
parent->flags &= ~MDBX_TXN_HAS_CHILD;
|
||
if (parent->tw.spilled.list) {
|
||
assert(pnl_check_allocated(parent->tw.spilled.list,
|
||
(size_t)parent->geo.first_unallocated << 1));
|
||
if (MDBX_PNL_GETSIZE(parent->tw.spilled.list))
|
||
parent->flags |= MDBX_TXN_SPILLS;
|
||
}
|
||
}
|
||
|
||
static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||
MDBX_env *const env = txn->env;
|
||
if (MDBX_ENABLE_PROFGC) {
|
||
pgop_stat_t *const ptr = &env->lck->pgops;
|
||
latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter;
|
||
latency->gc_prof.work_rtime_monotonic =
|
||
osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic);
|
||
latency->gc_prof.work_xtime_cpu =
|
||
osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu);
|
||
latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps;
|
||
latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages;
|
||
latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt;
|
||
|
||
latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter;
|
||
latency->gc_prof.self_rtime_monotonic =
|
||
osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic);
|
||
latency->gc_prof.self_xtime_cpu =
|
||
osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu);
|
||
latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps;
|
||
latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages;
|
||
latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt;
|
||
|
||
latency->gc_prof.wloops = ptr->gc_prof.wloops;
|
||
latency->gc_prof.coalescences = ptr->gc_prof.coalescences;
|
||
latency->gc_prof.wipes = ptr->gc_prof.wipes;
|
||
latency->gc_prof.flushes = ptr->gc_prof.flushes;
|
||
latency->gc_prof.kicks = ptr->gc_prof.kicks;
|
||
if (txn == env->basal_txn)
|
||
memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof));
|
||
} else
|
||
memset(&latency->gc_prof, 0, sizeof(latency->gc_prof));
|
||
}
|
||
|
||
int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) {
|
||
STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD -
|
||
MDBX_TXN_ERROR - MDBX_TXN_PARKED);
|
||
const uint64_t ts_0 = latency ? osal_monotime() : 0;
|
||
uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0;
|
||
|
||
int rc = check_txn(txn, MDBX_TXN_FINISHED);
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
if (latency)
|
||
memset(latency, 0, sizeof(*latency));
|
||
return rc;
|
||
}
|
||
|
||
MDBX_env *const env = txn->env;
|
||
#if MDBX_ENV_CHECKPID
|
||
if (unlikely(env->pid != osal_getpid())) {
|
||
env->flags |= ENV_FATAL_ERROR;
|
||
if (latency)
|
||
memset(latency, 0, sizeof(*latency));
|
||
return MDBX_PANIC;
|
||
}
|
||
#endif /* MDBX_ENV_CHECKPID */
|
||
|
||
if (unlikely(txn->flags & MDBX_TXN_ERROR)) {
|
||
rc = MDBX_RESULT_TRUE;
|
||
goto fail;
|
||
}
|
||
|
||
/* txn_end() mode for a commit which writes nothing */
|
||
unsigned end_mode =
|
||
TXN_END_PURE_COMMIT | TXN_END_UPDATE | TXN_END_SLOT | TXN_END_FREE;
|
||
if (unlikely(txn->flags & MDBX_TXN_RDONLY))
|
||
goto done;
|
||
|
||
if ((txn->flags & MDBX_NOSTICKYTHREADS) &&
|
||
unlikely(txn->owner != osal_thread_self())) {
|
||
rc = MDBX_THREAD_MISMATCH;
|
||
goto fail;
|
||
}
|
||
|
||
if (txn->nested) {
|
||
rc = mdbx_txn_commit_ex(txn->nested, nullptr);
|
||
tASSERT(txn, txn->nested == nullptr);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
}
|
||
|
||
if (unlikely(txn != env->txn)) {
|
||
DEBUG("%s", "attempt to commit unknown transaction");
|
||
rc = MDBX_EINVAL;
|
||
goto fail;
|
||
}
|
||
|
||
if (txn->parent) {
|
||
tASSERT(txn, audit_ex(txn, 0, false) == 0);
|
||
eASSERT(env, txn != env->basal_txn);
|
||
MDBX_txn *const parent = txn->parent;
|
||
eASSERT(env, parent->signature == txn_signature);
|
||
eASSERT(env,
|
||
parent->nested == txn && (parent->flags & MDBX_TXN_HAS_CHILD) != 0);
|
||
eASSERT(env, dpl_check(txn));
|
||
|
||
if (txn->tw.dirtylist->length == 0 && !(txn->flags & MDBX_TXN_DIRTY) &&
|
||
parent->n_dbi == txn->n_dbi) {
|
||
TXN_FOREACH_DBI_ALL(txn, i) {
|
||
tASSERT(txn, (txn->dbi_state[i] & DBI_DIRTY) == 0);
|
||
if ((txn->dbi_state[i] & DBI_STALE) &&
|
||
!(parent->dbi_state[i] & DBI_STALE))
|
||
tASSERT(txn,
|
||
memcmp(&parent->dbs[i], &txn->dbs[i], sizeof(tree_t)) == 0);
|
||
}
|
||
|
||
tASSERT(txn, memcmp(&parent->geo, &txn->geo, sizeof(parent->geo)) == 0);
|
||
tASSERT(txn, memcmp(&parent->canary, &txn->canary,
|
||
sizeof(parent->canary)) == 0);
|
||
tASSERT(txn, !txn->tw.spilled.list ||
|
||
MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0);
|
||
tASSERT(txn, txn->tw.loose_count == 0);
|
||
|
||
/* fast completion of pure nested transaction */
|
||
VERBOSE("fast-complete pure nested txn %" PRIaTXN, txn->txnid);
|
||
end_mode = TXN_END_PURE_COMMIT | TXN_END_SLOT | TXN_END_FREE;
|
||
goto done;
|
||
}
|
||
|
||
/* Preserve space for spill list to avoid parent's state corruption
|
||
* if allocation fails. */
|
||
const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages;
|
||
tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages));
|
||
const size_t retired_delta =
|
||
MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len;
|
||
if (retired_delta) {
|
||
rc = pnl_need(&txn->tw.relist, retired_delta);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
}
|
||
|
||
if (txn->tw.spilled.list) {
|
||
if (parent->tw.spilled.list) {
|
||
rc = pnl_need(&parent->tw.spilled.list,
|
||
MDBX_PNL_GETSIZE(txn->tw.spilled.list));
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
}
|
||
spill_purge(txn);
|
||
}
|
||
|
||
if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length >
|
||
parent->tw.dirtylist->detent &&
|
||
!dpl_reserve(parent, txn->tw.dirtylist->length +
|
||
parent->tw.dirtylist->length))) {
|
||
rc = MDBX_ENOMEM;
|
||
goto fail;
|
||
}
|
||
|
||
//-------------------------------------------------------------------------
|
||
|
||
parent->tw.gc.reclaimed = txn->tw.gc.reclaimed;
|
||
txn->tw.gc.reclaimed = nullptr;
|
||
|
||
parent->tw.retired_pages = txn->tw.retired_pages;
|
||
txn->tw.retired_pages = nullptr;
|
||
|
||
pnl_free(parent->tw.relist);
|
||
parent->tw.relist = txn->tw.relist;
|
||
txn->tw.relist = nullptr;
|
||
parent->tw.gc.time_acc = txn->tw.gc.time_acc;
|
||
parent->tw.gc.last_reclaimed = txn->tw.gc.last_reclaimed;
|
||
|
||
parent->geo = txn->geo;
|
||
parent->canary = txn->canary;
|
||
parent->flags |= txn->flags & MDBX_TXN_DIRTY;
|
||
|
||
/* Move loose pages to parent */
|
||
#if MDBX_ENABLE_REFUND
|
||
parent->tw.loose_refund_wl = txn->tw.loose_refund_wl;
|
||
#endif /* MDBX_ENABLE_REFUND */
|
||
parent->tw.loose_count = txn->tw.loose_count;
|
||
parent->tw.loose_pages = txn->tw.loose_pages;
|
||
|
||
/* Merge our cursors into parent's and close them */
|
||
done_cursors(txn, true);
|
||
end_mode |= TXN_END_EOTDONE;
|
||
|
||
/* Update parent's DBs array */
|
||
eASSERT(env, parent->n_dbi == txn->n_dbi);
|
||
TXN_FOREACH_DBI_ALL(txn, dbi) {
|
||
if (txn->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)) {
|
||
parent->dbs[dbi] = txn->dbs[dbi];
|
||
/* preserve parent's status */
|
||
const uint8_t state =
|
||
txn->dbi_state[dbi] |
|
||
(parent->dbi_state[dbi] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY));
|
||
DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", dbi,
|
||
(parent->dbi_state[dbi] != state) ? "update" : "still",
|
||
parent->dbi_state[dbi], state);
|
||
parent->dbi_state[dbi] = state;
|
||
} else {
|
||
eASSERT(env,
|
||
txn->dbi_state[dbi] == (parent->dbi_state[dbi] &
|
||
~(DBI_FRESH | DBI_CREAT | DBI_DIRTY)));
|
||
}
|
||
}
|
||
|
||
if (latency) {
|
||
ts_1 = osal_monotime();
|
||
ts_2 = /* no gc-update */ ts_1;
|
||
ts_3 = /* no audit */ ts_2;
|
||
ts_4 = /* no write */ ts_3;
|
||
ts_5 = /* no sync */ ts_4;
|
||
}
|
||
txn_merge(parent, txn, parent_retired_len);
|
||
env->txn = parent;
|
||
parent->nested = nullptr;
|
||
tASSERT(parent, dpl_check(parent));
|
||
|
||
#if MDBX_ENABLE_REFUND
|
||
txn_refund(parent);
|
||
if (ASSERT_ENABLED()) {
|
||
/* Check parent's loose pages not suitable for refund */
|
||
for (page_t *lp = parent->tw.loose_pages; lp; lp = page_next(lp)) {
|
||
tASSERT(parent, lp->pgno < parent->tw.loose_refund_wl &&
|
||
lp->pgno + 1 < parent->geo.first_unallocated);
|
||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||
}
|
||
/* Check parent's reclaimed pages not suitable for refund */
|
||
if (MDBX_PNL_GETSIZE(parent->tw.relist))
|
||
tASSERT(parent, MDBX_PNL_MOST(parent->tw.relist) + 1 <
|
||
parent->geo.first_unallocated);
|
||
}
|
||
#endif /* MDBX_ENABLE_REFUND */
|
||
|
||
txn->signature = 0;
|
||
osal_free(txn);
|
||
tASSERT(parent, audit_ex(parent, 0, false) == 0);
|
||
rc = MDBX_SUCCESS;
|
||
goto provide_latency;
|
||
}
|
||
|
||
if (!txn->tw.dirtylist) {
|
||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||
} else {
|
||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||
(txn->parent ? txn->parent->tw.dirtyroom
|
||
: env->options.dp_limit));
|
||
}
|
||
done_cursors(txn, false);
|
||
end_mode |= TXN_END_EOTDONE;
|
||
|
||
if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) &&
|
||
(txn->flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) {
|
||
TXN_FOREACH_DBI_ALL(txn, i) {
|
||
tASSERT(txn, !(txn->dbi_state[i] & DBI_DIRTY));
|
||
}
|
||
#if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT
|
||
rc = txn_end(txn, end_mode);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
rc = MDBX_RESULT_TRUE;
|
||
goto provide_latency;
|
||
#else
|
||
goto done;
|
||
#endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */
|
||
}
|
||
|
||
DEBUG("committing txn %" PRIaTXN " %p on env %p, root page %" PRIaPGNO
|
||
"/%" PRIaPGNO,
|
||
txn->txnid, (void *)txn, (void *)env, txn->dbs[MAIN_DBI].root,
|
||
txn->dbs[FREE_DBI].root);
|
||
|
||
if (txn->n_dbi > CORE_DBS) {
|
||
/* Update table root pointers */
|
||
cursor_couple_t cx;
|
||
rc = cursor_init(&cx.outer, txn, MAIN_DBI);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
cx.outer.next = txn->cursors[MAIN_DBI];
|
||
txn->cursors[MAIN_DBI] = &cx.outer;
|
||
TXN_FOREACH_DBI_USER(txn, i) {
|
||
if ((txn->dbi_state[i] & DBI_DIRTY) == 0)
|
||
continue;
|
||
tree_t *const db = &txn->dbs[i];
|
||
DEBUG("update main's entry for sub-db %zu, mod_txnid %" PRIaTXN
|
||
" -> %" PRIaTXN,
|
||
i, db->mod_txnid, txn->txnid);
|
||
/* Может быть mod_txnid > front после коммита вложенных тразакций */
|
||
db->mod_txnid = txn->txnid;
|
||
MDBX_val data = {db, sizeof(tree_t)};
|
||
rc = cursor_put(&cx.outer, &env->kvs[i].name, &data, N_TREE);
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||
goto fail;
|
||
}
|
||
}
|
||
txn->cursors[MAIN_DBI] = cx.outer.next;
|
||
}
|
||
|
||
ts_1 = latency ? osal_monotime() : 0;
|
||
|
||
gcu_t gcu_ctx;
|
||
gc_cputime = latency ? osal_cputime(nullptr) : 0;
|
||
rc = gc_update_init(txn, &gcu_ctx);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
rc = gc_update(txn, &gcu_ctx);
|
||
gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0;
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
|
||
tASSERT(txn, txn->tw.loose_count == 0);
|
||
txn->dbs[FREE_DBI].mod_txnid = (txn->dbi_state[FREE_DBI] & DBI_DIRTY)
|
||
? txn->txnid
|
||
: txn->dbs[FREE_DBI].mod_txnid;
|
||
|
||
txn->dbs[MAIN_DBI].mod_txnid = (txn->dbi_state[MAIN_DBI] & DBI_DIRTY)
|
||
? txn->txnid
|
||
: txn->dbs[MAIN_DBI].mod_txnid;
|
||
|
||
ts_2 = latency ? osal_monotime() : 0;
|
||
ts_3 = ts_2;
|
||
if (AUDIT_ENABLED()) {
|
||
rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true);
|
||
ts_3 = osal_monotime();
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto fail;
|
||
}
|
||
|
||
bool need_flush_for_nometasync = false;
|
||
const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
|
||
const uint32_t meta_sync_txnid =
|
||
atomic_load32(&env->lck->meta_sync_txnid, mo_Relaxed);
|
||
/* sync prev meta */
|
||
if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) {
|
||
/* Исправление унаследованного от LMDB недочета:
|
||
*
|
||
* Всё хорошо, если все процессы работающие с БД не используют WRITEMAP.
|
||
* Тогда мета-страница (обновленная, но не сброшенная на диск) будет
|
||
* сохранена в результате fdatasync() при записи данных этой транзакции.
|
||
*
|
||
* Всё хорошо, если все процессы работающие с БД используют WRITEMAP
|
||
* без MDBX_AVOID_MSYNC.
|
||
* Тогда мета-страница (обновленная, но не сброшенная на диск) будет
|
||
* сохранена в результате msync() при записи данных этой транзакции.
|
||
*
|
||
* Если же в процессах работающих с БД используется оба метода, как sync()
|
||
* в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то
|
||
* становится невозможным обеспечить фиксацию на диске мета-страницы
|
||
* предыдущей транзакции и данных текущей транзакции, за счет одной
|
||
* sync-операцией выполняемой после записи данных текущей транзакции.
|
||
* Соответственно, требуется явно обновлять мета-страницу, что полностью
|
||
* уничтожает выгоду от NOMETASYNC. */
|
||
const uint32_t txnid_dist =
|
||
((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC)
|
||
? MDBX_NOMETASYNC_LAZY_FD
|
||
: MDBX_NOMETASYNC_LAZY_WRITEMAP;
|
||
/* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync()
|
||
* или msync() для гарантированной фиксации на диске мета-страницы,
|
||
* которая была "лениво" отправлена на запись в предыдущей транзакции,
|
||
* но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */
|
||
if (
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
!env->ioring.overlapped_fd &&
|
||
#endif
|
||
meta_sync_txnid == (uint32_t)head.txnid - txnid_dist)
|
||
need_flush_for_nometasync = true;
|
||
else {
|
||
rc = meta_sync(env, head);
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
ERROR("txn-%s: error %d", "presync-meta", rc);
|
||
goto fail;
|
||
}
|
||
}
|
||
}
|
||
|
||
if (txn->tw.dirtylist) {
|
||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC);
|
||
tASSERT(txn, txn->tw.loose_count == 0);
|
||
|
||
mdbx_filehandle_t fd =
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
env->ioring.overlapped_fd ? env->ioring.overlapped_fd : env->lazy_fd;
|
||
(void)need_flush_for_nometasync;
|
||
#else
|
||
(need_flush_for_nometasync || env->dsync_fd == INVALID_HANDLE_VALUE ||
|
||
txn->tw.dirtylist->length > env->options.writethrough_threshold ||
|
||
atomic_load64(&env->lck->unsynced_pages, mo_Relaxed))
|
||
? env->lazy_fd
|
||
: env->dsync_fd;
|
||
#endif /* Windows */
|
||
|
||
iov_ctx_t write_ctx;
|
||
rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length,
|
||
txn->tw.dirtylist->pages_including_loose, fd, false);
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
ERROR("txn-%s: error %d", "iov-init", rc);
|
||
goto fail;
|
||
}
|
||
|
||
rc = txn_write(txn, &write_ctx);
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
ERROR("txn-%s: error %d", "write", rc);
|
||
goto fail;
|
||
}
|
||
} else {
|
||
tASSERT(txn, (txn->flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC);
|
||
env->lck->unsynced_pages.weak += txn->tw.writemap_dirty_npages;
|
||
if (!env->lck->eoos_timestamp.weak)
|
||
env->lck->eoos_timestamp.weak = osal_monotime();
|
||
}
|
||
|
||
/* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */
|
||
ts_4 = latency ? osal_monotime() : 0;
|
||
|
||
meta_t meta;
|
||
memcpy(meta.magic_and_version, head.ptr_c->magic_and_version, 8);
|
||
meta.reserve16 = head.ptr_c->reserve16;
|
||
meta.validator_id = head.ptr_c->validator_id;
|
||
meta.extra_pagehdr = head.ptr_c->extra_pagehdr;
|
||
unaligned_poke_u64(4, meta.pages_retired,
|
||
unaligned_peek_u64(4, head.ptr_c->pages_retired) +
|
||
MDBX_PNL_GETSIZE(txn->tw.retired_pages));
|
||
meta.geometry = txn->geo;
|
||
meta.trees.gc = txn->dbs[FREE_DBI];
|
||
meta.trees.main = txn->dbs[MAIN_DBI];
|
||
meta.canary = txn->canary;
|
||
memcpy(&meta.dxbid, &head.ptr_c->dxbid, sizeof(meta.dxbid));
|
||
|
||
txnid_t commit_txnid = txn->txnid;
|
||
#if MDBX_ENABLE_BIGFOOT
|
||
if (gcu_ctx.bigfoot > txn->txnid) {
|
||
commit_txnid = gcu_ctx.bigfoot;
|
||
TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid,
|
||
(size_t)(commit_txnid - txn->txnid));
|
||
}
|
||
#endif
|
||
meta.unsafe_sign = DATASIGN_NONE;
|
||
meta_set_txnid(env, &meta, commit_txnid);
|
||
|
||
rc = dxb_sync_locked(env, env->flags | txn->flags | txn_shrink_allowed, &meta,
|
||
&txn->tw.troika);
|
||
|
||
ts_5 = latency ? osal_monotime() : 0;
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
env->flags |= ENV_FATAL_ERROR;
|
||
ERROR("txn-%s: error %d", "sync", rc);
|
||
goto fail;
|
||
}
|
||
|
||
end_mode = TXN_END_COMMITTED | TXN_END_UPDATE | TXN_END_EOTDONE;
|
||
|
||
done:
|
||
if (latency)
|
||
take_gcprof(txn, latency);
|
||
rc = txn_end(txn, end_mode);
|
||
|
||
provide_latency:
|
||
if (latency) {
|
||
latency->preparation = ts_1 ? osal_monotime_to_16dot16(ts_1 - ts_0) : 0;
|
||
latency->gc_wallclock =
|
||
(ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0;
|
||
latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0;
|
||
latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0;
|
||
latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0;
|
||
latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0;
|
||
const uint64_t ts_6 = osal_monotime();
|
||
latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0;
|
||
latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0);
|
||
}
|
||
return rc;
|
||
|
||
fail:
|
||
txn->flags |= MDBX_TXN_ERROR;
|
||
if (latency)
|
||
take_gcprof(txn, latency);
|
||
txn_abort(txn);
|
||
goto provide_latency;
|
||
}
|
||
|
||
int txn_abort(MDBX_txn *txn) {
|
||
if (txn->flags & MDBX_TXN_RDONLY)
|
||
/* LY: don't close DBI-handles */
|
||
return txn_end(txn, TXN_END_ABORT | TXN_END_UPDATE | TXN_END_SLOT |
|
||
TXN_END_FREE);
|
||
|
||
if (unlikely(txn->flags & MDBX_TXN_FINISHED))
|
||
return MDBX_BAD_TXN;
|
||
|
||
if (txn->nested)
|
||
txn_abort(txn->nested);
|
||
|
||
tASSERT(txn, (txn->flags & MDBX_TXN_ERROR) || dpl_check(txn));
|
||
return txn_end(txn, TXN_END_ABORT | TXN_END_SLOT | TXN_END_FREE);
|
||
}
|
||
|
||
int txn_renew(MDBX_txn *txn, unsigned flags) {
|
||
MDBX_env *const env = txn->env;
|
||
int rc;
|
||
|
||
#if MDBX_ENV_CHECKPID
|
||
if (unlikely(env->pid != osal_getpid())) {
|
||
env->flags |= ENV_FATAL_ERROR;
|
||
return MDBX_PANIC;
|
||
}
|
||
#endif /* MDBX_ENV_CHECKPID */
|
||
|
||
flags |= env->flags & (MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP);
|
||
if (flags & MDBX_TXN_RDONLY) {
|
||
eASSERT(env, (flags & ~(txn_ro_begin_flags | MDBX_WRITEMAP |
|
||
MDBX_NOSTICKYTHREADS)) == 0);
|
||
txn->flags = flags;
|
||
reader_slot_t *r = txn->to.reader;
|
||
STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->tid));
|
||
if (likely(env->flags & ENV_TXKEY)) {
|
||
eASSERT(env, !(env->flags & MDBX_NOSTICKYTHREADS));
|
||
r = thread_rthc_get(env->me_txkey);
|
||
if (likely(r)) {
|
||
if (unlikely(!r->pid.weak) &&
|
||
(globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) {
|
||
thread_rthc_set(env->me_txkey, nullptr);
|
||
r = nullptr;
|
||
} else {
|
||
eASSERT(env, r->pid.weak == env->pid);
|
||
eASSERT(env, r->tid.weak == osal_thread_self());
|
||
}
|
||
}
|
||
} else {
|
||
eASSERT(env, !env->lck_mmap.lck || (env->flags & MDBX_NOSTICKYTHREADS));
|
||
}
|
||
|
||
if (likely(r)) {
|
||
if (unlikely(r->pid.weak != env->pid ||
|
||
r->txnid.weak < SAFE64_INVALID_THRESHOLD))
|
||
return MDBX_BAD_RSLOT;
|
||
} else if (env->lck_mmap.lck) {
|
||
bsr_t brs = mvcc_bind_slot(env);
|
||
if (unlikely(brs.err != MDBX_SUCCESS))
|
||
return brs.err;
|
||
r = brs.rslot;
|
||
}
|
||
txn->to.reader = r;
|
||
STATIC_ASSERT(MDBX_TXN_RDONLY_PREPARE > MDBX_TXN_RDONLY);
|
||
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) {
|
||
eASSERT(env, txn->txnid == 0);
|
||
eASSERT(env, txn->owner == 0);
|
||
eASSERT(env, txn->n_dbi == 0);
|
||
if (likely(r)) {
|
||
eASSERT(env, r->snapshot_pages_used.weak == 0);
|
||
eASSERT(env, r->txnid.weak >= SAFE64_INVALID_THRESHOLD);
|
||
atomic_store32(&r->snapshot_pages_used, 0, mo_Relaxed);
|
||
}
|
||
txn->flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED;
|
||
return MDBX_SUCCESS;
|
||
}
|
||
txn->owner = (uintptr_t)r->tid.weak;
|
||
if ((env->flags & MDBX_NOSTICKYTHREADS) == 0 && env->txn &&
|
||
unlikely(env->basal_txn->owner == txn->owner) &&
|
||
(globals.runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0)
|
||
return MDBX_TXN_OVERLAPPING;
|
||
|
||
/* Seek & fetch the last meta */
|
||
uint64_t timestamp = 0;
|
||
size_t loop = 0;
|
||
troika_t troika = meta_tap(env);
|
||
while (1) {
|
||
const meta_ptr_t head =
|
||
likely(env->stuck_meta < 0)
|
||
? /* regular */ meta_recent(env, &troika)
|
||
: /* recovery mode */ meta_ptr(env, env->stuck_meta);
|
||
if (likely(r != nullptr)) {
|
||
safe64_reset(&r->txnid, true);
|
||
atomic_store32(&r->snapshot_pages_used,
|
||
head.ptr_v->geometry.first_unallocated, mo_Relaxed);
|
||
atomic_store64(
|
||
&r->snapshot_pages_retired,
|
||
unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired),
|
||
mo_Relaxed);
|
||
safe64_write(&r->txnid, head.txnid);
|
||
eASSERT(env, r->pid.weak == osal_getpid());
|
||
eASSERT(env, r->tid.weak == ((env->flags & MDBX_NOSTICKYTHREADS)
|
||
? 0
|
||
: osal_thread_self()));
|
||
eASSERT(env, r->txnid.weak == head.txnid ||
|
||
(r->txnid.weak >= SAFE64_INVALID_THRESHOLD &&
|
||
head.txnid < env->lck->cached_oldest.weak));
|
||
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_AcquireRelease);
|
||
} else {
|
||
/* exclusive mode without lck */
|
||
eASSERT(env, !env->lck_mmap.lck && env->lck == lckless_stub(env));
|
||
}
|
||
jitter4testing(true);
|
||
|
||
if (unlikely(meta_should_retry(env, &troika))) {
|
||
retry:
|
||
if (likely(++loop < 42)) {
|
||
timestamp = 0;
|
||
continue;
|
||
}
|
||
ERROR("bailout waiting for valid snapshot (%s)",
|
||
"meta-pages are too volatile");
|
||
rc = MDBX_PROBLEM;
|
||
goto read_failed;
|
||
}
|
||
|
||
/* Snap the state from current meta-head */
|
||
rc = coherency_fetch_head(txn, head, ×tamp);
|
||
jitter4testing(false);
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
if (rc == MDBX_RESULT_TRUE)
|
||
goto retry;
|
||
else
|
||
goto read_failed;
|
||
}
|
||
|
||
const uint64_t snap_oldest =
|
||
atomic_load64(&env->lck->cached_oldest, mo_AcquireRelease);
|
||
if (unlikely(txn->txnid < snap_oldest)) {
|
||
if (env->stuck_meta < 0)
|
||
goto retry;
|
||
ERROR("target meta-page %i is referenced to an obsolete MVCC-snapshot "
|
||
"%" PRIaTXN " < cached-oldest %" PRIaTXN,
|
||
env->stuck_meta, txn->txnid, snap_oldest);
|
||
rc = MDBX_MVCC_RETARDED;
|
||
goto read_failed;
|
||
}
|
||
|
||
if (likely(r != nullptr) &&
|
||
unlikely(txn->txnid != atomic_load64(&r->txnid, mo_Relaxed)))
|
||
goto retry;
|
||
break;
|
||
}
|
||
|
||
if (unlikely(txn->txnid < MIN_TXNID || txn->txnid > MAX_TXNID)) {
|
||
ERROR("%s", "environment corrupted by died writer, must shutdown!");
|
||
rc = MDBX_CORRUPTED;
|
||
read_failed:
|
||
txn->txnid = INVALID_TXNID;
|
||
if (likely(r != nullptr))
|
||
safe64_reset(&r->txnid, true);
|
||
goto bailout;
|
||
}
|
||
|
||
tASSERT(txn, rc == MDBX_SUCCESS);
|
||
ENSURE(env,
|
||
txn->txnid >=
|
||
/* paranoia is appropriate here */ env->lck->cached_oldest.weak);
|
||
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
|
||
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
|
||
} else {
|
||
eASSERT(env, (flags & ~(txn_rw_begin_flags | MDBX_TXN_SPILLS |
|
||
MDBX_WRITEMAP | MDBX_NOSTICKYTHREADS)) == 0);
|
||
const uintptr_t tid = osal_thread_self();
|
||
if (unlikely(txn->owner == tid ||
|
||
/* not recovery mode */ env->stuck_meta >= 0))
|
||
return MDBX_BUSY;
|
||
lck_t *const lck = env->lck_mmap.lck;
|
||
if (lck && (env->flags & MDBX_NOSTICKYTHREADS) == 0 &&
|
||
(globals.runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) {
|
||
const size_t snap_nreaders =
|
||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) == env->pid &&
|
||
unlikely(atomic_load64(&lck->rdt[i].tid, mo_Relaxed) == tid)) {
|
||
const txnid_t txnid = safe64_read(&lck->rdt[i].txnid);
|
||
if (txnid >= MIN_TXNID && txnid <= MAX_TXNID)
|
||
return MDBX_TXN_OVERLAPPING;
|
||
}
|
||
}
|
||
}
|
||
|
||
/* Not yet touching txn == env->basal_txn, it may be active */
|
||
jitter4testing(false);
|
||
rc = lck_txn_lock(env, !!(flags & MDBX_TXN_TRY));
|
||
if (unlikely(rc))
|
||
return rc;
|
||
if (unlikely(env->flags & ENV_FATAL_ERROR)) {
|
||
lck_txn_unlock(env);
|
||
return MDBX_PANIC;
|
||
}
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
if (unlikely(!env->dxb_mmap.base)) {
|
||
lck_txn_unlock(env);
|
||
return MDBX_EPERM;
|
||
}
|
||
#endif /* Windows */
|
||
|
||
txn->tw.troika = meta_tap(env);
|
||
const meta_ptr_t head = meta_recent(env, &txn->tw.troika);
|
||
uint64_t timestamp = 0;
|
||
while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") {
|
||
rc = coherency_fetch_head(txn, head, ×tamp);
|
||
if (likely(rc == MDBX_SUCCESS))
|
||
break;
|
||
if (unlikely(rc != MDBX_RESULT_TRUE))
|
||
goto bailout;
|
||
}
|
||
eASSERT(env, meta_txnid(head.ptr_v) == txn->txnid);
|
||
txn->txnid = safe64_txnid_next(txn->txnid);
|
||
if (unlikely(txn->txnid > MAX_TXNID)) {
|
||
rc = MDBX_TXN_FULL;
|
||
ERROR("txnid overflow, raise %d", rc);
|
||
goto bailout;
|
||
}
|
||
|
||
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
|
||
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
|
||
txn->flags = flags;
|
||
txn->nested = nullptr;
|
||
txn->tw.loose_pages = nullptr;
|
||
txn->tw.loose_count = 0;
|
||
#if MDBX_ENABLE_REFUND
|
||
txn->tw.loose_refund_wl = 0;
|
||
#endif /* MDBX_ENABLE_REFUND */
|
||
MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0);
|
||
txn->tw.spilled.list = nullptr;
|
||
txn->tw.spilled.least_removed = 0;
|
||
txn->tw.gc.time_acc = 0;
|
||
txn->tw.gc.last_reclaimed = 0;
|
||
if (txn->tw.gc.reclaimed)
|
||
MDBX_PNL_SETSIZE(txn->tw.gc.reclaimed, 0);
|
||
env->txn = txn;
|
||
|
||
if ((txn->flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) {
|
||
rc = dpl_alloc(txn);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto bailout;
|
||
txn->tw.dirtyroom = txn->env->options.dp_limit;
|
||
txn->tw.dirtylru = MDBX_DEBUG ? UINT32_MAX / 3 - 42 : 0;
|
||
} else {
|
||
tASSERT(txn, txn->tw.dirtylist == nullptr);
|
||
txn->tw.dirtylist = nullptr;
|
||
txn->tw.dirtyroom = MAX_PAGENO;
|
||
txn->tw.dirtylru = 0;
|
||
}
|
||
eASSERT(env, txn->tw.writemap_dirty_npages == 0);
|
||
eASSERT(env, txn->tw.writemap_spilled_npages == 0);
|
||
}
|
||
|
||
txn->front_txnid =
|
||
txn->txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0);
|
||
|
||
/* Setup db info */
|
||
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
|
||
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
|
||
VALGRIND_MAKE_MEM_UNDEFINED(txn->dbi_state, env->max_dbi);
|
||
#if MDBX_ENABLE_DBI_SPARSE
|
||
txn->n_dbi = CORE_DBS;
|
||
VALGRIND_MAKE_MEM_UNDEFINED(
|
||
txn->dbi_sparse,
|
||
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) /
|
||
CHAR_BIT);
|
||
txn->dbi_sparse[0] = (1 << CORE_DBS) - 1;
|
||
#else
|
||
txn->n_dbi = (env->n_dbi < 8) ? env->n_dbi : 8;
|
||
if (txn->n_dbi > CORE_DBS)
|
||
memset(txn->dbi_state + CORE_DBS, 0, txn->n_dbi - CORE_DBS);
|
||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||
txn->dbi_state[FREE_DBI] = DBI_LINDO | DBI_VALID;
|
||
txn->dbi_state[MAIN_DBI] = DBI_LINDO | DBI_VALID;
|
||
txn->cursors[FREE_DBI] = nullptr;
|
||
txn->cursors[MAIN_DBI] = nullptr;
|
||
txn->dbi_seqs[FREE_DBI] = 0;
|
||
txn->dbi_seqs[MAIN_DBI] =
|
||
atomic_load32(&env->dbi_seqs[MAIN_DBI], mo_AcquireRelease);
|
||
|
||
if (unlikely(env->dbs_flags[MAIN_DBI] !=
|
||
(DB_VALID | txn->dbs[MAIN_DBI].flags))) {
|
||
const bool need_txn_lock =
|
||
env->basal_txn && env->basal_txn->owner != osal_thread_self();
|
||
bool should_unlock = false;
|
||
if (need_txn_lock) {
|
||
rc = lck_txn_lock(env, true);
|
||
if (rc == MDBX_SUCCESS)
|
||
should_unlock = true;
|
||
else if (rc != MDBX_BUSY && rc != MDBX_EDEADLK)
|
||
goto bailout;
|
||
}
|
||
rc = osal_fastmutex_acquire(&env->dbi_lock);
|
||
if (likely(rc == MDBX_SUCCESS)) {
|
||
uint32_t seq = dbi_seq_next(env, MAIN_DBI);
|
||
/* проверяем повторно после захвата блокировки */
|
||
if (env->dbs_flags[MAIN_DBI] != (DB_VALID | txn->dbs[MAIN_DBI].flags)) {
|
||
if (!need_txn_lock || should_unlock ||
|
||
/* если нет активной пишущей транзакции,
|
||
* то следующая будет ждать на dbi_lock */
|
||
!env->txn) {
|
||
if (env->dbs_flags[MAIN_DBI] != 0 || MDBX_DEBUG)
|
||
NOTICE("renew MainDB for %s-txn %" PRIaTXN
|
||
" since db-flags changes 0x%x -> 0x%x",
|
||
(txn->flags & MDBX_TXN_RDONLY) ? "ro" : "rw", txn->txnid,
|
||
env->dbs_flags[MAIN_DBI] & ~DB_VALID,
|
||
txn->dbs[MAIN_DBI].flags);
|
||
env->dbs_flags[MAIN_DBI] = DB_POISON;
|
||
atomic_store32(&env->dbi_seqs[MAIN_DBI], seq, mo_AcquireRelease);
|
||
rc = tbl_setup(env, &env->kvs[MAIN_DBI], &txn->dbs[MAIN_DBI]);
|
||
if (likely(rc == MDBX_SUCCESS)) {
|
||
seq = dbi_seq_next(env, MAIN_DBI);
|
||
env->dbs_flags[MAIN_DBI] = DB_VALID | txn->dbs[MAIN_DBI].flags;
|
||
txn->dbi_seqs[MAIN_DBI] = atomic_store32(&env->dbi_seqs[MAIN_DBI],
|
||
seq, mo_AcquireRelease);
|
||
}
|
||
} else {
|
||
ERROR("MainDB db-flags changes 0x%x -> 0x%x ahead of read-txn "
|
||
"%" PRIaTXN,
|
||
txn->dbs[MAIN_DBI].flags, env->dbs_flags[MAIN_DBI] & ~DB_VALID,
|
||
txn->txnid);
|
||
rc = MDBX_INCOMPATIBLE;
|
||
}
|
||
}
|
||
ENSURE(env, osal_fastmutex_release(&env->dbi_lock) == MDBX_SUCCESS);
|
||
} else {
|
||
DEBUG("dbi_lock failed, err %d", rc);
|
||
}
|
||
if (should_unlock)
|
||
lck_txn_unlock(env);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto bailout;
|
||
}
|
||
|
||
if (unlikely(txn->dbs[FREE_DBI].flags != MDBX_INTEGERKEY)) {
|
||
ERROR("unexpected/invalid db-flags 0x%x for %s", txn->dbs[FREE_DBI].flags,
|
||
"GC/FreeDB");
|
||
rc = MDBX_INCOMPATIBLE;
|
||
goto bailout;
|
||
}
|
||
|
||
tASSERT(txn, txn->dbs[FREE_DBI].flags == MDBX_INTEGERKEY);
|
||
tASSERT(txn, check_table_flags(txn->dbs[MAIN_DBI].flags));
|
||
if (unlikely(env->flags & ENV_FATAL_ERROR)) {
|
||
WARNING("%s", "environment had fatal error, must shutdown!");
|
||
rc = MDBX_PANIC;
|
||
} else {
|
||
const size_t size_bytes = pgno2bytes(env, txn->geo.end_pgno);
|
||
const size_t used_bytes = pgno2bytes(env, txn->geo.first_unallocated);
|
||
const size_t required_bytes =
|
||
(txn->flags & MDBX_TXN_RDONLY) ? used_bytes : size_bytes;
|
||
eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current);
|
||
if (unlikely(required_bytes > env->dxb_mmap.current)) {
|
||
/* Размер БД (для пишущих транзакций) или используемых данных (для
|
||
* читающих транзакций) больше предыдущего/текущего размера внутри
|
||
* процесса, увеличиваем. Сюда также попадает случай увеличения верхней
|
||
* границы размера БД и отображения. В читающих транзакциях нельзя
|
||
* изменять размер файла, который может быть больше необходимого этой
|
||
* транзакции. */
|
||
if (txn->geo.upper > MAX_PAGENO + 1 ||
|
||
bytes2pgno(env, pgno2bytes(env, txn->geo.upper)) != txn->geo.upper) {
|
||
rc = MDBX_UNABLE_EXTEND_MAPSIZE;
|
||
goto bailout;
|
||
}
|
||
rc = dxb_resize(env, txn->geo.first_unallocated, txn->geo.end_pgno,
|
||
txn->geo.upper, implicit_grow);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto bailout;
|
||
eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current);
|
||
} else if (unlikely(size_bytes < env->dxb_mmap.current)) {
|
||
/* Размер БД меньше предыдущего/текущего размера внутри процесса, можно
|
||
* уменьшить, но всё сложнее:
|
||
* - размер файла согласован со всеми читаемыми снимками на момент
|
||
* коммита последней транзакции;
|
||
* - в читающей транзакции размер файла может быть больше и него нельзя
|
||
* изменять, в том числе менять madvise (меньша размера файла нельзя,
|
||
* а за размером нет смысла).
|
||
* - в пишущей транзакции уменьшать размер файла можно только после
|
||
* проверки размера читаемых снимков, но в этом нет смысла, так как
|
||
* это будет сделано при фиксации транзакции.
|
||
*
|
||
* В сухом остатке, можно только установить dxb_mmap.current равным
|
||
* размеру файла, а это проще сделать без вызова dxb_resize() и усложения
|
||
* внутренней логики.
|
||
*
|
||
* В этой тактике есть недостаток: если пишущите транзакции не регулярны,
|
||
* и при завершении такой транзакции файл БД остаётся не-уменьшеным из-за
|
||
* читающих транзакций использующих предыдущие снимки. */
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
imports.srwl_AcquireShared(&env->remap_guard);
|
||
#else
|
||
rc = osal_fastmutex_acquire(&env->remap_guard);
|
||
#endif
|
||
if (likely(rc == MDBX_SUCCESS)) {
|
||
eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current);
|
||
rc = osal_filesize(env->dxb_mmap.fd, &env->dxb_mmap.filesize);
|
||
if (likely(rc == MDBX_SUCCESS)) {
|
||
eASSERT(env, env->dxb_mmap.filesize >= required_bytes);
|
||
if (env->dxb_mmap.current > env->dxb_mmap.filesize)
|
||
env->dxb_mmap.current =
|
||
(env->dxb_mmap.limit < env->dxb_mmap.filesize)
|
||
? env->dxb_mmap.limit
|
||
: (size_t)env->dxb_mmap.filesize;
|
||
}
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
imports.srwl_ReleaseShared(&env->remap_guard);
|
||
#else
|
||
int err = osal_fastmutex_release(&env->remap_guard);
|
||
if (unlikely(err) && likely(rc == MDBX_SUCCESS))
|
||
rc = err;
|
||
#endif
|
||
}
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto bailout;
|
||
}
|
||
eASSERT(env, pgno2bytes(env, txn->geo.first_unallocated) <=
|
||
env->dxb_mmap.current);
|
||
eASSERT(env, env->dxb_mmap.limit >= env->dxb_mmap.current);
|
||
if (txn->flags & MDBX_TXN_RDONLY) {
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
if (((used_bytes > env->geo_in_bytes.lower && env->geo_in_bytes.shrink) ||
|
||
(globals.running_under_Wine &&
|
||
/* under Wine acquisition of remap_guard is always required,
|
||
* since Wine don't support section extending,
|
||
* i.e. in both cases unmap+map are required. */
|
||
used_bytes < env->geo_in_bytes.upper && env->geo_in_bytes.grow)) &&
|
||
/* avoid recursive use SRW */ (txn->flags & MDBX_NOSTICKYTHREADS) ==
|
||
0) {
|
||
txn->flags |= txn_shrink_allowed;
|
||
imports.srwl_AcquireShared(&env->remap_guard);
|
||
}
|
||
#endif /* Windows */
|
||
} else {
|
||
tASSERT(txn, txn == env->basal_txn);
|
||
MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn));
|
||
rc = cursor_init(gc, txn, FREE_DBI);
|
||
if (rc != MDBX_SUCCESS)
|
||
goto bailout;
|
||
}
|
||
dxb_sanitize_tail(env, txn);
|
||
return MDBX_SUCCESS;
|
||
}
|
||
bailout:
|
||
tASSERT(txn, rc != MDBX_SUCCESS);
|
||
txn_end(txn, TXN_END_SLOT | TXN_END_EOTDONE | TXN_END_FAIL_BEGIN);
|
||
return rc;
|
||
}
|
||
|
||
int txn_end(MDBX_txn *txn, unsigned mode) {
|
||
MDBX_env *env = txn->env;
|
||
static const char *const names[] = TXN_END_NAMES;
|
||
|
||
DEBUG("%s txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
|
||
"/%" PRIaPGNO,
|
||
names[mode & TXN_END_OPMASK], txn->txnid,
|
||
(txn->flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env,
|
||
txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
|
||
|
||
if (!(mode & TXN_END_EOTDONE)) /* !(already closed cursors) */
|
||
done_cursors(txn, false);
|
||
|
||
int rc = MDBX_SUCCESS;
|
||
if (txn->flags & MDBX_TXN_RDONLY) {
|
||
if (txn->to.reader) {
|
||
reader_slot_t *slot = txn->to.reader;
|
||
eASSERT(env, slot->pid.weak == env->pid);
|
||
if (likely(!(txn->flags & MDBX_TXN_FINISHED))) {
|
||
if (likely((txn->flags & MDBX_TXN_PARKED) == 0)) {
|
||
ENSURE(env, txn->txnid >=
|
||
/* paranoia is appropriate here */ env->lck
|
||
->cached_oldest.weak);
|
||
eASSERT(env, txn->txnid == slot->txnid.weak &&
|
||
slot->txnid.weak >= env->lck->cached_oldest.weak);
|
||
} else {
|
||
if ((mode & TXN_END_OPMASK) != TXN_END_OUSTED &&
|
||
safe64_read(&slot->tid) == MDBX_TID_TXN_OUSTED)
|
||
mode = (mode & TXN_END_OPMASK) | TXN_END_OUSTED;
|
||
do {
|
||
safe64_reset(&slot->txnid, false);
|
||
atomic_store64(&slot->tid, txn->owner, mo_AcquireRelease);
|
||
atomic_yield();
|
||
} while (
|
||
unlikely(safe64_read(&slot->txnid) < SAFE64_INVALID_THRESHOLD ||
|
||
safe64_read(&slot->tid) != txn->owner));
|
||
}
|
||
dxb_sanitize_tail(env, nullptr);
|
||
atomic_store32(&slot->snapshot_pages_used, 0, mo_Relaxed);
|
||
safe64_reset(&slot->txnid, true);
|
||
atomic_store32(&env->lck->rdt_refresh_flag, true, mo_Relaxed);
|
||
} else {
|
||
eASSERT(env, slot->pid.weak == env->pid);
|
||
eASSERT(env, slot->txnid.weak >= SAFE64_INVALID_THRESHOLD);
|
||
}
|
||
if (mode & TXN_END_SLOT) {
|
||
if ((env->flags & ENV_TXKEY) == 0)
|
||
atomic_store32(&slot->pid, 0, mo_Relaxed);
|
||
txn->to.reader = nullptr;
|
||
}
|
||
}
|
||
#if defined(_WIN32) || defined(_WIN64)
|
||
if (txn->flags & txn_shrink_allowed)
|
||
imports.srwl_ReleaseShared(&env->remap_guard);
|
||
#endif
|
||
txn->n_dbi = 0; /* prevent further DBI activity */
|
||
txn->flags = ((mode & TXN_END_OPMASK) != TXN_END_OUSTED)
|
||
? MDBX_TXN_RDONLY | MDBX_TXN_FINISHED
|
||
: MDBX_TXN_RDONLY | MDBX_TXN_FINISHED | MDBX_TXN_OUSTED;
|
||
txn->owner = 0;
|
||
} else if (!(txn->flags & MDBX_TXN_FINISHED)) {
|
||
ENSURE(env,
|
||
txn->txnid >=
|
||
/* paranoia is appropriate here */ env->lck->cached_oldest.weak);
|
||
if (txn == env->basal_txn)
|
||
dxb_sanitize_tail(env, nullptr);
|
||
|
||
txn->flags = MDBX_TXN_FINISHED;
|
||
env->txn = txn->parent;
|
||
pnl_free(txn->tw.spilled.list);
|
||
txn->tw.spilled.list = nullptr;
|
||
if (txn == env->basal_txn) {
|
||
eASSERT(env, txn->parent == nullptr);
|
||
/* Export or close DBI handles created in this txn */
|
||
rc = dbi_update(txn, mode & TXN_END_UPDATE);
|
||
pnl_shrink(&txn->tw.retired_pages);
|
||
pnl_shrink(&txn->tw.relist);
|
||
if (!(env->flags & MDBX_WRITEMAP))
|
||
dpl_release_shadows(txn);
|
||
/* The writer mutex was locked in mdbx_txn_begin. */
|
||
lck_txn_unlock(env);
|
||
} else {
|
||
eASSERT(env, txn->parent != nullptr);
|
||
MDBX_txn *const parent = txn->parent;
|
||
eASSERT(env, parent->signature == txn_signature);
|
||
eASSERT(env, parent->nested == txn &&
|
||
(parent->flags & MDBX_TXN_HAS_CHILD) != 0);
|
||
eASSERT(env,
|
||
pnl_check_allocated(txn->tw.relist, txn->geo.first_unallocated -
|
||
MDBX_ENABLE_REFUND));
|
||
eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika,
|
||
sizeof(troika_t)) == 0);
|
||
|
||
txn->owner = 0;
|
||
if (txn->tw.gc.reclaimed) {
|
||
eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.gc.reclaimed) >=
|
||
(uintptr_t)parent->tw.gc.reclaimed);
|
||
MDBX_PNL_SETSIZE(txn->tw.gc.reclaimed,
|
||
(uintptr_t)parent->tw.gc.reclaimed);
|
||
parent->tw.gc.reclaimed = txn->tw.gc.reclaimed;
|
||
}
|
||
|
||
if (txn->tw.retired_pages) {
|
||
eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.retired_pages) >=
|
||
(uintptr_t)parent->tw.retired_pages);
|
||
MDBX_PNL_SETSIZE(txn->tw.retired_pages,
|
||
(uintptr_t)parent->tw.retired_pages);
|
||
parent->tw.retired_pages = txn->tw.retired_pages;
|
||
}
|
||
|
||
parent->nested = nullptr;
|
||
parent->flags &= ~MDBX_TXN_HAS_CHILD;
|
||
parent->tw.dirtylru = txn->tw.dirtylru;
|
||
tASSERT(parent, dpl_check(parent));
|
||
tASSERT(parent, audit_ex(parent, 0, false) == 0);
|
||
dpl_release_shadows(txn);
|
||
dpl_free(txn);
|
||
pnl_free(txn->tw.relist);
|
||
|
||
if (parent->geo.upper != txn->geo.upper ||
|
||
parent->geo.now != txn->geo.now) {
|
||
/* undo resize performed by child txn */
|
||
rc = dxb_resize(env, parent->geo.first_unallocated, parent->geo.now,
|
||
parent->geo.upper, impilict_shrink);
|
||
if (rc == MDBX_EPERM) {
|
||
/* unable undo resize (it is regular for Windows),
|
||
* therefore promote size changes from child to the parent txn */
|
||
WARNING("unable undo resize performed by child txn, promote to "
|
||
"the parent (%u->%u, %u->%u)",
|
||
txn->geo.now, parent->geo.now, txn->geo.upper,
|
||
parent->geo.upper);
|
||
parent->geo.now = txn->geo.now;
|
||
parent->geo.upper = txn->geo.upper;
|
||
parent->flags |= MDBX_TXN_DIRTY;
|
||
rc = MDBX_SUCCESS;
|
||
} else if (unlikely(rc != MDBX_SUCCESS)) {
|
||
ERROR("error %d while undo resize performed by child txn, fail "
|
||
"the parent",
|
||
rc);
|
||
parent->flags |= MDBX_TXN_ERROR;
|
||
if (!env->dxb_mmap.base)
|
||
env->flags |= ENV_FATAL_ERROR;
|
||
}
|
||
}
|
||
}
|
||
}
|
||
|
||
eASSERT(env, txn == env->basal_txn || txn->owner == 0);
|
||
if ((mode & TXN_END_FREE) != 0 && txn != env->basal_txn) {
|
||
txn->signature = 0;
|
||
osal_free(txn);
|
||
}
|
||
|
||
return rc;
|
||
}
|
||
|
||
/*----------------------------------------------------------------------------*/
|
||
|
||
int mdbx_txn_renew(MDBX_txn *txn) {
|
||
if (unlikely(!txn))
|
||
return MDBX_EINVAL;
|
||
|
||
if (unlikely(txn->signature != txn_signature))
|
||
return MDBX_EBADSIGN;
|
||
|
||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
|
||
return MDBX_EINVAL;
|
||
|
||
if (unlikely(txn->owner != 0 || !(txn->flags & MDBX_TXN_FINISHED))) {
|
||
int rc = mdbx_txn_reset(txn);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
}
|
||
|
||
int rc = txn_renew(txn, MDBX_TXN_RDONLY);
|
||
if (rc == MDBX_SUCCESS) {
|
||
tASSERT(txn, txn->owner == (txn->flags & MDBX_NOSTICKYTHREADS)
|
||
? 0
|
||
: osal_thread_self());
|
||
DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
|
||
"/%" PRIaPGNO,
|
||
txn->txnid, (txn->flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn,
|
||
(void *)txn->env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
|
||
}
|
||
return rc;
|
||
}
|
||
|
||
int mdbx_txn_set_userctx(MDBX_txn *txn, void *ctx) {
|
||
int rc = check_txn(txn, MDBX_TXN_FINISHED);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
txn->userctx = ctx;
|
||
return MDBX_SUCCESS;
|
||
}
|
||
|
||
void *mdbx_txn_get_userctx(const MDBX_txn *txn) {
|
||
return check_txn(txn, MDBX_TXN_FINISHED) ? nullptr : txn->userctx;
|
||
}
|
||
|
||
int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags,
|
||
MDBX_txn **ret, void *context) {
|
||
if (unlikely(!ret))
|
||
return MDBX_EINVAL;
|
||
*ret = nullptr;
|
||
|
||
if (unlikely((flags & ~txn_rw_begin_flags) && (flags & ~txn_ro_begin_flags)))
|
||
return MDBX_EINVAL;
|
||
|
||
int rc = check_env(env, true);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
if (unlikely(env->flags & MDBX_RDONLY & ~flags)) /* write txn in RDONLY env */
|
||
return MDBX_EACCESS;
|
||
|
||
MDBX_txn *txn = nullptr;
|
||
if (parent) {
|
||
/* Nested transactions: Max 1 child, write txns only, no writemap */
|
||
rc = check_txn_rw(parent,
|
||
MDBX_TXN_RDONLY | MDBX_WRITEMAP | MDBX_TXN_BLOCKED);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
if (env->options.spill_parent4child_denominator) {
|
||
/* Spill dirty-pages of parent to provide dirtyroom for child txn */
|
||
rc = txn_spill(parent, nullptr,
|
||
parent->tw.dirtylist->length /
|
||
env->options.spill_parent4child_denominator);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
}
|
||
tASSERT(parent, audit_ex(parent, 0, false) == 0);
|
||
|
||
flags |= parent->flags & (txn_rw_begin_flags | MDBX_TXN_SPILLS |
|
||
MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP);
|
||
} else if ((flags & MDBX_TXN_RDONLY) == 0) {
|
||
/* Reuse preallocated write txn. However, do not touch it until
|
||
* txn_renew() succeeds, since it currently may be active. */
|
||
txn = env->basal_txn;
|
||
goto renew;
|
||
}
|
||
|
||
const intptr_t bitmap_bytes =
|
||
#if MDBX_ENABLE_DBI_SPARSE
|
||
ceil_powerof2(env->max_dbi, CHAR_BIT * sizeof(txn->dbi_sparse[0])) /
|
||
CHAR_BIT;
|
||
#else
|
||
0;
|
||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||
STATIC_ASSERT(sizeof(txn->tw) > sizeof(txn->to));
|
||
const size_t base = (flags & MDBX_TXN_RDONLY)
|
||
? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to)
|
||
: sizeof(MDBX_txn);
|
||
const size_t size =
|
||
base +
|
||
((flags & MDBX_TXN_RDONLY)
|
||
? (size_t)bitmap_bytes + env->max_dbi * sizeof(txn->dbi_seqs[0])
|
||
: 0) +
|
||
env->max_dbi * (sizeof(txn->dbs[0]) + sizeof(txn->cursors[0]) +
|
||
sizeof(txn->dbi_state[0]));
|
||
txn = osal_malloc(size);
|
||
if (unlikely(txn == nullptr)) {
|
||
DEBUG("calloc: %s", "failed");
|
||
return MDBX_ENOMEM;
|
||
}
|
||
#if MDBX_DEBUG
|
||
memset(txn, 0xCD, size);
|
||
VALGRIND_MAKE_MEM_UNDEFINED(txn, size);
|
||
#endif /* MDBX_DEBUG */
|
||
MDBX_ANALYSIS_ASSUME(size > base);
|
||
memset(txn, 0,
|
||
(MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? size : base);
|
||
txn->dbs = ptr_disp(txn, base);
|
||
txn->cursors = ptr_disp(txn->dbs, env->max_dbi * sizeof(txn->dbs[0]));
|
||
#if MDBX_DEBUG
|
||
txn->cursors[FREE_DBI] = nullptr; /* avoid SIGSEGV in an assertion later */
|
||
#endif
|
||
txn->dbi_state =
|
||
ptr_disp(txn, size - env->max_dbi * sizeof(txn->dbi_state[0]));
|
||
txn->flags = flags;
|
||
txn->env = env;
|
||
|
||
if (parent) {
|
||
tASSERT(parent, dpl_check(parent));
|
||
#if MDBX_ENABLE_DBI_SPARSE
|
||
txn->dbi_sparse = parent->dbi_sparse;
|
||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||
txn->dbi_seqs = parent->dbi_seqs;
|
||
txn->geo = parent->geo;
|
||
rc = dpl_alloc(txn);
|
||
if (likely(rc == MDBX_SUCCESS)) {
|
||
const size_t len =
|
||
MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count;
|
||
txn->tw.relist =
|
||
pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL);
|
||
if (unlikely(!txn->tw.relist))
|
||
rc = MDBX_ENOMEM;
|
||
}
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
nested_failed:
|
||
pnl_free(txn->tw.relist);
|
||
dpl_free(txn);
|
||
osal_free(txn);
|
||
return rc;
|
||
}
|
||
|
||
/* Move loose pages to reclaimed list */
|
||
if (parent->tw.loose_count) {
|
||
do {
|
||
page_t *lp = parent->tw.loose_pages;
|
||
tASSERT(parent, lp->flags == P_LOOSE);
|
||
rc = pnl_insert_span(&parent->tw.relist, lp->pgno, 1);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
goto nested_failed;
|
||
MDBX_ASAN_UNPOISON_MEMORY_REGION(&page_next(lp), sizeof(page_t *));
|
||
VALGRIND_MAKE_MEM_DEFINED(&page_next(lp), sizeof(page_t *));
|
||
parent->tw.loose_pages = page_next(lp);
|
||
/* Remove from dirty list */
|
||
page_wash(parent, dpl_exist(parent, lp->pgno), lp, 1);
|
||
} while (parent->tw.loose_pages);
|
||
parent->tw.loose_count = 0;
|
||
#if MDBX_ENABLE_REFUND
|
||
parent->tw.loose_refund_wl = 0;
|
||
#endif /* MDBX_ENABLE_REFUND */
|
||
tASSERT(parent, dpl_check(parent));
|
||
}
|
||
txn->tw.dirtyroom = parent->tw.dirtyroom;
|
||
txn->tw.dirtylru = parent->tw.dirtylru;
|
||
|
||
dpl_sort(parent);
|
||
if (parent->tw.spilled.list)
|
||
spill_purge(parent);
|
||
|
||
tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >=
|
||
MDBX_PNL_GETSIZE(parent->tw.relist));
|
||
memcpy(txn->tw.relist, parent->tw.relist,
|
||
MDBX_PNL_SIZEOF(parent->tw.relist));
|
||
eASSERT(env, pnl_check_allocated(
|
||
txn->tw.relist,
|
||
(txn->geo.first_unallocated /* LY: intentional assignment
|
||
here, only for assertion */
|
||
= parent->geo.first_unallocated) -
|
||
MDBX_ENABLE_REFUND));
|
||
|
||
txn->tw.gc.time_acc = parent->tw.gc.time_acc;
|
||
txn->tw.gc.last_reclaimed = parent->tw.gc.last_reclaimed;
|
||
if (parent->tw.gc.reclaimed) {
|
||
txn->tw.gc.reclaimed = parent->tw.gc.reclaimed;
|
||
parent->tw.gc.reclaimed =
|
||
(void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.gc.reclaimed);
|
||
}
|
||
|
||
txn->tw.retired_pages = parent->tw.retired_pages;
|
||
parent->tw.retired_pages =
|
||
(void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages);
|
||
|
||
txn->txnid = parent->txnid;
|
||
txn->front_txnid = parent->front_txnid + 1;
|
||
#if MDBX_ENABLE_REFUND
|
||
txn->tw.loose_refund_wl = 0;
|
||
#endif /* MDBX_ENABLE_REFUND */
|
||
txn->canary = parent->canary;
|
||
parent->flags |= MDBX_TXN_HAS_CHILD;
|
||
parent->nested = txn;
|
||
txn->parent = parent;
|
||
txn->owner = parent->owner;
|
||
txn->tw.troika = parent->tw.troika;
|
||
|
||
txn->cursors[FREE_DBI] = nullptr;
|
||
txn->cursors[MAIN_DBI] = nullptr;
|
||
txn->dbi_state[FREE_DBI] =
|
||
parent->dbi_state[FREE_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
|
||
txn->dbi_state[MAIN_DBI] =
|
||
parent->dbi_state[MAIN_DBI] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY);
|
||
memset(txn->dbi_state + CORE_DBS, 0,
|
||
(txn->n_dbi = parent->n_dbi) - CORE_DBS);
|
||
memcpy(txn->dbs, parent->dbs, sizeof(txn->dbs[0]) * CORE_DBS);
|
||
|
||
tASSERT(parent, parent->tw.dirtyroom + parent->tw.dirtylist->length ==
|
||
(parent->parent ? parent->parent->tw.dirtyroom
|
||
: parent->env->options.dp_limit));
|
||
tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length ==
|
||
(txn->parent ? txn->parent->tw.dirtyroom
|
||
: txn->env->options.dp_limit));
|
||
env->txn = txn;
|
||
tASSERT(parent, parent->cursors[FREE_DBI] == nullptr);
|
||
rc = parent->cursors[MAIN_DBI]
|
||
? cursor_shadow(parent->cursors[MAIN_DBI], txn, MAIN_DBI)
|
||
: MDBX_SUCCESS;
|
||
if (AUDIT_ENABLED() && ASSERT_ENABLED()) {
|
||
txn->signature = txn_signature;
|
||
tASSERT(txn, audit_ex(txn, 0, false) == 0);
|
||
}
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
txn_end(txn, TXN_END_FAIL_BEGINCHILD);
|
||
} else { /* MDBX_TXN_RDONLY */
|
||
txn->dbi_seqs =
|
||
ptr_disp(txn->cursors, env->max_dbi * sizeof(txn->cursors[0]));
|
||
#if MDBX_ENABLE_DBI_SPARSE
|
||
txn->dbi_sparse = ptr_disp(txn->dbi_state, -bitmap_bytes);
|
||
#endif /* MDBX_ENABLE_DBI_SPARSE */
|
||
renew:
|
||
rc = txn_renew(txn, flags);
|
||
}
|
||
|
||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||
if (txn != env->basal_txn)
|
||
osal_free(txn);
|
||
} else {
|
||
if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY))
|
||
eASSERT(env, txn->flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED));
|
||
else if (flags & MDBX_TXN_RDONLY)
|
||
eASSERT(env, (txn->flags &
|
||
~(MDBX_NOSTICKYTHREADS | MDBX_TXN_RDONLY | MDBX_WRITEMAP |
|
||
/* Win32: SRWL flag */ txn_shrink_allowed)) == 0);
|
||
else {
|
||
eASSERT(env, (txn->flags & ~(MDBX_NOSTICKYTHREADS | MDBX_WRITEMAP |
|
||
txn_shrink_allowed | MDBX_NOMETASYNC |
|
||
MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0);
|
||
assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed);
|
||
}
|
||
txn->signature = txn_signature;
|
||
txn->userctx = context;
|
||
*ret = txn;
|
||
DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO
|
||
"/%" PRIaPGNO,
|
||
txn->txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn,
|
||
(void *)env, txn->dbs[MAIN_DBI].root, txn->dbs[FREE_DBI].root);
|
||
}
|
||
|
||
return rc;
|
||
}
|
||
|
||
int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) {
|
||
int rc = check_txn(txn, MDBX_TXN_FINISHED);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
if (unlikely(!info))
|
||
return MDBX_EINVAL;
|
||
|
||
MDBX_env *const env = txn->env;
|
||
#if MDBX_ENV_CHECKPID
|
||
if (unlikely(env->pid != osal_getpid())) {
|
||
env->flags |= ENV_FATAL_ERROR;
|
||
return MDBX_PANIC;
|
||
}
|
||
#endif /* MDBX_ENV_CHECKPID */
|
||
|
||
info->txn_id = txn->txnid;
|
||
info->txn_space_used = pgno2bytes(env, txn->geo.first_unallocated);
|
||
|
||
if (txn->flags & MDBX_TXN_RDONLY) {
|
||
meta_ptr_t head;
|
||
uint64_t head_retired;
|
||
troika_t troika = meta_tap(env);
|
||
do {
|
||
/* fetch info from volatile head */
|
||
head = meta_recent(env, &troika);
|
||
head_retired = unaligned_peek_u64_volatile(4, head.ptr_v->pages_retired);
|
||
info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->geometry.now);
|
||
info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->geometry.upper);
|
||
info->txn_space_leftover =
|
||
pgno2bytes(env, head.ptr_v->geometry.now -
|
||
head.ptr_v->geometry.first_unallocated);
|
||
} while (unlikely(meta_should_retry(env, &troika)));
|
||
|
||
info->txn_reader_lag = head.txnid - info->txn_id;
|
||
info->txn_space_dirty = info->txn_space_retired = 0;
|
||
uint64_t reader_snapshot_pages_retired = 0;
|
||
if (txn->to.reader &&
|
||
((txn->flags & MDBX_TXN_PARKED) == 0 ||
|
||
safe64_read(&txn->to.reader->tid) != MDBX_TID_TXN_OUSTED) &&
|
||
head_retired >
|
||
(reader_snapshot_pages_retired = atomic_load64(
|
||
&txn->to.reader->snapshot_pages_retired, mo_Relaxed))) {
|
||
info->txn_space_dirty = info->txn_space_retired = pgno2bytes(
|
||
env, (pgno_t)(head_retired - reader_snapshot_pages_retired));
|
||
|
||
size_t retired_next_reader = 0;
|
||
lck_t *const lck = env->lck_mmap.lck;
|
||
if (scan_rlt && info->txn_reader_lag > 1 && lck) {
|
||
/* find next more recent reader */
|
||
txnid_t next_reader = head.txnid;
|
||
const size_t snap_nreaders =
|
||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||
retry:
|
||
if (atomic_load32(&lck->rdt[i].pid, mo_AcquireRelease)) {
|
||
jitter4testing(true);
|
||
const uint64_t snap_tid = safe64_read(&lck->rdt[i].tid);
|
||
const txnid_t snap_txnid = safe64_read(&lck->rdt[i].txnid);
|
||
const uint64_t snap_retired = atomic_load64(
|
||
&lck->rdt[i].snapshot_pages_retired, mo_AcquireRelease);
|
||
if (unlikely(snap_retired !=
|
||
atomic_load64(&lck->rdt[i].snapshot_pages_retired,
|
||
mo_Relaxed)) ||
|
||
snap_txnid != safe64_read(&lck->rdt[i].txnid) ||
|
||
snap_tid != safe64_read(&lck->rdt[i].tid))
|
||
goto retry;
|
||
if (snap_txnid <= txn->txnid) {
|
||
retired_next_reader = 0;
|
||
break;
|
||
}
|
||
if (snap_txnid < next_reader && snap_tid >= MDBX_TID_TXN_OUSTED) {
|
||
next_reader = snap_txnid;
|
||
retired_next_reader = pgno2bytes(
|
||
env, (pgno_t)(snap_retired -
|
||
atomic_load64(
|
||
&txn->to.reader->snapshot_pages_retired,
|
||
mo_Relaxed)));
|
||
}
|
||
}
|
||
}
|
||
}
|
||
info->txn_space_dirty = retired_next_reader;
|
||
}
|
||
} else {
|
||
info->txn_space_limit_soft = pgno2bytes(env, txn->geo.now);
|
||
info->txn_space_limit_hard = pgno2bytes(env, txn->geo.upper);
|
||
info->txn_space_retired =
|
||
pgno2bytes(env, txn->nested ? (size_t)txn->tw.retired_pages
|
||
: MDBX_PNL_GETSIZE(txn->tw.retired_pages));
|
||
info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom);
|
||
info->txn_space_dirty = pgno2bytes(
|
||
env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose
|
||
: (txn->tw.writemap_dirty_npages +
|
||
txn->tw.writemap_spilled_npages));
|
||
info->txn_reader_lag = INT64_MAX;
|
||
lck_t *const lck = env->lck_mmap.lck;
|
||
if (scan_rlt && lck) {
|
||
txnid_t oldest_snapshot = txn->txnid;
|
||
const size_t snap_nreaders =
|
||
atomic_load32(&lck->rdt_length, mo_AcquireRelease);
|
||
if (snap_nreaders) {
|
||
oldest_snapshot = txn_snapshot_oldest(txn);
|
||
if (oldest_snapshot == txn->txnid - 1) {
|
||
/* check if there is at least one reader */
|
||
bool exists = false;
|
||
for (size_t i = 0; i < snap_nreaders; ++i) {
|
||
if (atomic_load32(&lck->rdt[i].pid, mo_Relaxed) &&
|
||
txn->txnid > safe64_read(&lck->rdt[i].txnid)) {
|
||
exists = true;
|
||
break;
|
||
}
|
||
}
|
||
oldest_snapshot += !exists;
|
||
}
|
||
}
|
||
info->txn_reader_lag = txn->txnid - oldest_snapshot;
|
||
}
|
||
}
|
||
|
||
return MDBX_SUCCESS;
|
||
}
|
||
|
||
MDBX_env *mdbx_txn_env(const MDBX_txn *txn) {
|
||
if (unlikely(!txn || txn->signature != txn_signature ||
|
||
txn->env->signature.weak != env_signature))
|
||
return nullptr;
|
||
return txn->env;
|
||
}
|
||
|
||
uint64_t mdbx_txn_id(const MDBX_txn *txn) {
|
||
if (unlikely(!txn || txn->signature != txn_signature))
|
||
return 0;
|
||
return txn->txnid;
|
||
}
|
||
|
||
MDBX_txn_flags_t mdbx_txn_flags(const MDBX_txn *txn) {
|
||
STATIC_ASSERT(
|
||
(MDBX_TXN_INVALID &
|
||
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS |
|
||
MDBX_TXN_HAS_CHILD | txn_gc_drained | txn_shrink_allowed |
|
||
txn_rw_begin_flags | txn_ro_begin_flags)) == 0);
|
||
if (unlikely(!txn || txn->signature != txn_signature))
|
||
return MDBX_TXN_INVALID;
|
||
assert(0 == (int)(txn->flags & MDBX_TXN_INVALID));
|
||
|
||
MDBX_txn_flags_t flags = txn->flags;
|
||
if (F_ISSET(flags, MDBX_TXN_PARKED | MDBX_TXN_RDONLY) && txn->to.reader &&
|
||
safe64_read(&txn->to.reader->tid) == MDBX_TID_TXN_OUSTED)
|
||
flags |= MDBX_TXN_OUSTED;
|
||
return flags;
|
||
}
|
||
|
||
int mdbx_txn_reset(MDBX_txn *txn) {
|
||
int rc = check_txn(txn, 0);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
/* This call is only valid for read-only txns */
|
||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
|
||
return MDBX_EINVAL;
|
||
|
||
/* LY: don't close DBI-handles */
|
||
rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE);
|
||
if (rc == MDBX_SUCCESS) {
|
||
tASSERT(txn, txn->signature == txn_signature);
|
||
tASSERT(txn, txn->owner == 0);
|
||
}
|
||
return rc;
|
||
}
|
||
|
||
int mdbx_txn_break(MDBX_txn *txn) {
|
||
do {
|
||
int rc = check_txn(txn, 0);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
txn->flags |= MDBX_TXN_ERROR;
|
||
if (txn->flags & MDBX_TXN_RDONLY)
|
||
break;
|
||
txn = txn->nested;
|
||
} while (txn);
|
||
return MDBX_SUCCESS;
|
||
}
|
||
|
||
int mdbx_txn_abort(MDBX_txn *txn) {
|
||
int rc = check_txn(txn, 0);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
rc = check_env(txn->env, true);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
|
||
if ((txn->flags & (MDBX_TXN_RDONLY | MDBX_NOSTICKYTHREADS)) ==
|
||
MDBX_NOSTICKYTHREADS &&
|
||
unlikely(txn->owner != osal_thread_self())) {
|
||
mdbx_txn_break(txn);
|
||
return MDBX_THREAD_MISMATCH;
|
||
}
|
||
|
||
return txn_abort(txn);
|
||
}
|
||
|
||
int mdbx_txn_park(MDBX_txn *txn, bool autounpark) {
|
||
STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_ERROR);
|
||
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
if (unlikely((txn->flags & MDBX_TXN_RDONLY) == 0))
|
||
return MDBX_TXN_INVALID;
|
||
|
||
if (unlikely((txn->flags & MDBX_TXN_ERROR))) {
|
||
rc = txn_end(txn, TXN_END_RESET | TXN_END_UPDATE);
|
||
return rc ? rc : MDBX_OUSTED;
|
||
}
|
||
|
||
return txn_park(txn, autounpark);
|
||
}
|
||
|
||
int mdbx_txn_unpark(MDBX_txn *txn, bool restart_if_ousted) {
|
||
STATIC_ASSERT(MDBX_TXN_BLOCKED > MDBX_TXN_PARKED + MDBX_TXN_ERROR);
|
||
int rc = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_PARKED - MDBX_TXN_ERROR);
|
||
if (unlikely(rc != MDBX_SUCCESS))
|
||
return rc;
|
||
if (unlikely(!F_ISSET(txn->flags, MDBX_TXN_RDONLY | MDBX_TXN_PARKED)))
|
||
return MDBX_SUCCESS;
|
||
|
||
rc = txn_unpark(txn);
|
||
if (likely(rc != MDBX_OUSTED) || !restart_if_ousted)
|
||
return rc;
|
||
|
||
tASSERT(txn, txn->flags & MDBX_TXN_FINISHED);
|
||
rc = txn_renew(txn, MDBX_TXN_RDONLY);
|
||
return (rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : rc;
|
||
}
|
||
|
||
int txn_check_badbits_parked(const MDBX_txn *txn, int bad_bits) {
|
||
tASSERT(txn, (bad_bits & MDBX_TXN_PARKED) && (txn->flags & bad_bits));
|
||
/* Здесь осознано заложено отличие в поведении припаркованных транзакций:
|
||
* - некоторые функции (например mdbx_env_info_ex()), допускают
|
||
* использование поломанных транзакций (с флагом MDBX_TXN_ERROR), но
|
||
* не могут работать с припаркованными транзакциями (требуют распарковки).
|
||
* - но при распарковке поломанные транзакции завершаются.
|
||
* - получается что транзакцию можно припарковать, потом поломать вызвав
|
||
* mdbx_txn_break(), но далее любое её использование приведет к завершению
|
||
* при распарковке. */
|
||
if ((txn->flags & (bad_bits | MDBX_TXN_AUTOUNPARK)) !=
|
||
(MDBX_TXN_PARKED | MDBX_TXN_AUTOUNPARK))
|
||
return MDBX_BAD_TXN;
|
||
|
||
tASSERT(txn, bad_bits == MDBX_TXN_BLOCKED ||
|
||
bad_bits == MDBX_TXN_BLOCKED - MDBX_TXN_ERROR);
|
||
return mdbx_txn_unpark((MDBX_txn *)txn, false);
|
||
}
|