mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-04 18:04:13 +08:00
mdbx: переделка отслеживания mlocks для игнорирования EINVAL
от madvise()
.
This commit is contained in:
parent
9cbd4e63ca
commit
47e7a646fd
124
src/core.c
124
src/core.c
@ -5688,41 +5688,44 @@ __cold static int set_readahead(const MDBX_env *env, const pgno_t edge,
|
|||||||
}
|
}
|
||||||
#endif /* MDBX_ENABLE_MADVISE */
|
#endif /* MDBX_ENABLE_MADVISE */
|
||||||
|
|
||||||
__cold static void update_mlocked(const MDBX_env *env,
|
__cold static void update_mlcnt(const MDBX_env *env,
|
||||||
const pgno_t new_aligned_mlocked_pgno,
|
const pgno_t new_aligned_mlocked_pgno,
|
||||||
const bool lock_not_release) {
|
const bool lock_not_release) {
|
||||||
for (;;) {
|
for (;;) {
|
||||||
const pgno_t mlock_pgno_snap =
|
const pgno_t mlock_pgno_before =
|
||||||
atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease);
|
atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease);
|
||||||
eASSERT(env, pgno_align2os_pgno(env, mlock_pgno_snap) == mlock_pgno_snap);
|
eASSERT(env,
|
||||||
|
pgno_align2os_pgno(env, mlock_pgno_before) == mlock_pgno_before);
|
||||||
eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) ==
|
eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) ==
|
||||||
new_aligned_mlocked_pgno);
|
new_aligned_mlocked_pgno);
|
||||||
if (lock_not_release ? (mlock_pgno_snap >= new_aligned_mlocked_pgno)
|
if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno)
|
||||||
: (mlock_pgno_snap <= new_aligned_mlocked_pgno))
|
: (mlock_pgno_before <= new_aligned_mlocked_pgno))
|
||||||
break;
|
break;
|
||||||
if (likely(atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno,
|
if (likely(atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno,
|
||||||
mlock_pgno_snap, new_aligned_mlocked_pgno)))
|
mlock_pgno_before, new_aligned_mlocked_pgno)))
|
||||||
for (;;) {
|
for (;;) {
|
||||||
MDBX_atomic_uint32_t *const mlock_counter =
|
MDBX_atomic_uint32_t *const mlcnt = env->me_lck->mti_mlcnt;
|
||||||
&env->me_lck->mti_mlock_counter;
|
const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed);
|
||||||
const uint32_t snap_counter = atomic_load32(mlock_counter, mo_Relaxed);
|
const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed);
|
||||||
if (mlock_pgno_snap == 0 && snap_counter < INT_MAX) {
|
if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) {
|
||||||
eASSERT(env, lock_not_release);
|
eASSERT(env, lock_not_release);
|
||||||
if (unlikely(
|
if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1)))
|
||||||
!atomic_cas32(mlock_counter, snap_counter, snap_counter + 1)))
|
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (new_aligned_mlocked_pgno == 0 && snap_counter > 0) {
|
if (new_aligned_mlocked_pgno == 0 &&
|
||||||
|
(snap_locked - snap_unlocked) > 0) {
|
||||||
eASSERT(env, !lock_not_release);
|
eASSERT(env, !lock_not_release);
|
||||||
if (unlikely(
|
if (unlikely(
|
||||||
!atomic_cas32(mlock_counter, snap_counter, snap_counter - 1)))
|
!atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1)))
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u",
|
NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u",
|
||||||
lock_not_release ? "lock" : "unlock",
|
lock_not_release ? "lock" : "unlock",
|
||||||
lock_not_release ? mlock_pgno_snap : new_aligned_mlocked_pgno,
|
lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno,
|
||||||
lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_snap,
|
lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_before,
|
||||||
snap_counter, atomic_load32(mlock_counter, mo_Relaxed));
|
snap_locked - snap_unlocked,
|
||||||
|
atomic_load32(mlcnt + 0, mo_Relaxed) -
|
||||||
|
atomic_load32(mlcnt + 1, mo_Relaxed));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -5748,7 +5751,7 @@ __cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno,
|
|||||||
: MDBX_SUCCESS;
|
: MDBX_SUCCESS;
|
||||||
#endif
|
#endif
|
||||||
if (likely(err == MDBX_SUCCESS))
|
if (likely(err == MDBX_SUCCESS))
|
||||||
update_mlocked(env, aligned_pgno, false);
|
update_mlcnt(env, aligned_pgno, false);
|
||||||
else {
|
else {
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size,
|
WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size,
|
||||||
@ -5878,6 +5881,8 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
|
|||||||
NOTICE("resize-MADV_%s %u..%u",
|
NOTICE("resize-MADV_%s %u..%u",
|
||||||
(env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno,
|
(env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno,
|
||||||
bytes2pgno(env, prev_size));
|
bytes2pgno(env, prev_size));
|
||||||
|
const uint32_t munlocks_before =
|
||||||
|
atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed);
|
||||||
rc = MDBX_RESULT_TRUE;
|
rc = MDBX_RESULT_TRUE;
|
||||||
#if defined(MADV_REMOVE)
|
#if defined(MADV_REMOVE)
|
||||||
if (env->me_flags & MDBX_WRITEMAP)
|
if (env->me_flags & MDBX_WRITEMAP)
|
||||||
@ -5903,23 +5908,25 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno,
|
|||||||
prev_size - size_bytes,
|
prev_size - size_bytes,
|
||||||
POSIX_FADV_DONTNEED));
|
POSIX_FADV_DONTNEED));
|
||||||
#endif /* MADV_DONTNEED */
|
#endif /* MADV_DONTNEED */
|
||||||
uint32_t snap_mlock_counter;
|
if (unlikely(MDBX_IS_ERROR(rc))) {
|
||||||
if (unlikely(rc == MDBX_EINVAL) &&
|
const uint32_t mlocks_after =
|
||||||
(snap_mlock_counter =
|
atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed);
|
||||||
atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed)) > 0) {
|
if (rc == MDBX_EINVAL) {
|
||||||
NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have %u "
|
const int severity =
|
||||||
"mlocked-process(es))",
|
(mlocks_after - munlocks_before) ? MDBX_LOG_NOTICE : MDBX_LOG_WARN;
|
||||||
"resize", rc, snap_mlock_counter);
|
if (LOG_ENABLED(severity))
|
||||||
} else {
|
debug_log(severity, __func__, __LINE__,
|
||||||
if (unlikely(MDBX_IS_ERROR(rc))) {
|
"%s-madvise: ignore EINVAL (%d) since some pages maybe "
|
||||||
ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d",
|
"locked (%u/%u mlcnt-processes)",
|
||||||
|
"resize", rc, mlocks_after, munlocks_before);
|
||||||
|
} else {
|
||||||
|
ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d",
|
||||||
"mresize", "DONTNEED", size_bytes, prev_size - size_bytes,
|
"mresize", "DONTNEED", size_bytes, prev_size - size_bytes,
|
||||||
atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), rc);
|
mlocks_after, munlocks_before, rc);
|
||||||
goto bailout;
|
goto bailout;
|
||||||
}
|
}
|
||||||
if (env->me_lck->mti_discarded_tail.weak > size_pgno)
|
} else
|
||||||
env->me_lck->mti_discarded_tail.weak = size_pgno;
|
env->me_lck->mti_discarded_tail.weak = size_pgno;
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif /* MDBX_ENABLE_MADVISE */
|
#endif /* MDBX_ENABLE_MADVISE */
|
||||||
|
|
||||||
@ -11473,6 +11480,8 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
|||||||
ENSURE(env, prev_discarded_bytes > discard_edge_bytes);
|
ENSURE(env, prev_discarded_bytes > discard_edge_bytes);
|
||||||
munlock_after(env, discard_edge_pgno,
|
munlock_after(env, discard_edge_pgno,
|
||||||
bytes_align2os_bytes(env, env->me_dxb_mmap.current));
|
bytes_align2os_bytes(env, env->me_dxb_mmap.current));
|
||||||
|
const uint32_t munlocks_before =
|
||||||
|
atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed);
|
||||||
#if defined(MADV_DONTNEED)
|
#if defined(MADV_DONTNEED)
|
||||||
int advise = MADV_DONTNEED;
|
int advise = MADV_DONTNEED;
|
||||||
#if defined(MADV_FREE) && \
|
#if defined(MADV_FREE) && \
|
||||||
@ -11489,23 +11498,27 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
|||||||
env->me_map + discard_edge_bytes,
|
env->me_map + discard_edge_bytes,
|
||||||
prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
|
prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED));
|
||||||
#endif
|
#endif
|
||||||
uint32_t snap_mlock_counter;
|
if (unlikely(MDBX_IS_ERROR(err))) {
|
||||||
if (unlikely(err == MDBX_EINVAL) &&
|
const uint32_t mlocks_after =
|
||||||
(snap_mlock_counter = atomic_load32(&env->me_lck->mti_mlock_counter,
|
atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed);
|
||||||
mo_Relaxed)) > 0) {
|
if (err == MDBX_EINVAL) {
|
||||||
NOTICE("%s-madvise: ignore EINVAL (%d) since some pages locked (have "
|
const int severity = (mlocks_after - munlocks_before)
|
||||||
"%u mlocked-process(es))",
|
? MDBX_LOG_NOTICE
|
||||||
"shrink", err, snap_mlock_counter);
|
: MDBX_LOG_WARN;
|
||||||
} else if (unlikely(MDBX_IS_ERROR(err))) {
|
if (LOG_ENABLED(severity))
|
||||||
ERROR("%s-madvise(%s, %zu..%zu), err %d", "shrink", "DONTNEED",
|
debug_log(severity, __func__, __LINE__,
|
||||||
discard_edge_bytes, prev_discarded_bytes - discard_edge_bytes,
|
"%s-madvise: ignore EINVAL (%d) since some pages maybe "
|
||||||
err);
|
"locked (%u/%u mlcnt-processes)",
|
||||||
ERROR("%s-madvise(%s, %zu..%zu), %u mlocked-process(es), err %d",
|
"shrink", err, mlocks_after, munlocks_before);
|
||||||
"shrink", "DONTNEED", discard_edge_bytes,
|
} else {
|
||||||
prev_discarded_bytes - discard_edge_bytes,
|
ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d",
|
||||||
atomic_load32(&env->me_lck->mti_mlock_counter, mo_Relaxed), err);
|
"shrink", "DONTNEED", discard_edge_bytes,
|
||||||
return err;
|
prev_discarded_bytes - discard_edge_bytes, mlocks_after,
|
||||||
}
|
munlocks_before, err);
|
||||||
|
return err;
|
||||||
|
}
|
||||||
|
} else
|
||||||
|
env->me_lck->mti_discarded_tail.weak = discard_edge_pgno;
|
||||||
}
|
}
|
||||||
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
|
#endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */
|
||||||
|
|
||||||
@ -11517,10 +11530,9 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending,
|
|||||||
(shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) {
|
(shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) {
|
||||||
if (pending->mm_geo.now > largest_pgno &&
|
if (pending->mm_geo.now > largest_pgno &&
|
||||||
pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
|
pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) {
|
||||||
pgno_t grow_step = 0;
|
|
||||||
const pgno_t aligner =
|
const pgno_t aligner =
|
||||||
pending->mm_geo.grow_pv
|
pending->mm_geo.grow_pv
|
||||||
? (grow_step = pv2pages(pending->mm_geo.grow_pv))
|
? /* grow_step */ pv2pages(pending->mm_geo.grow_pv)
|
||||||
: shrink_step;
|
: shrink_step;
|
||||||
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
|
const pgno_t with_backlog_gap = largest_pgno + backlog_gap;
|
||||||
const pgno_t aligned = pgno_align2os_pgno(
|
const pgno_t aligned = pgno_align2os_pgno(
|
||||||
@ -23818,7 +23830,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
|
|||||||
rc = errno;
|
rc = errno;
|
||||||
WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc);
|
WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc);
|
||||||
} else {
|
} else {
|
||||||
update_mlocked(env, mlock_pgno, true);
|
update_mlcnt(env, mlock_pgno, true);
|
||||||
rc = MDBX_SUCCESS;
|
rc = MDBX_SUCCESS;
|
||||||
}
|
}
|
||||||
if (rc != EINVAL)
|
if (rc != EINVAL)
|
||||||
@ -23929,7 +23941,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
|
|||||||
atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
|
atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) {
|
||||||
#if defined(_WIN32) || defined(_WIN64)
|
#if defined(_WIN32) || defined(_WIN64)
|
||||||
if (VirtualLock(env->me_map, used_range)) {
|
if (VirtualLock(env->me_map, used_range)) {
|
||||||
update_mlocked(env, mlock_pgno, true);
|
update_mlcnt(env, mlock_pgno, true);
|
||||||
rc = MDBX_SUCCESS;
|
rc = MDBX_SUCCESS;
|
||||||
} else {
|
} else {
|
||||||
rc = (int)GetLastError();
|
rc = (int)GetLastError();
|
||||||
@ -23937,7 +23949,7 @@ __cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
|
|||||||
}
|
}
|
||||||
#elif defined(_POSIX_MEMLOCK_RANGE)
|
#elif defined(_POSIX_MEMLOCK_RANGE)
|
||||||
if (mlock(env->me_map, used_range) == 0) {
|
if (mlock(env->me_map, used_range) == 0) {
|
||||||
update_mlocked(env, mlock_pgno, true);
|
update_mlcnt(env, mlock_pgno, true);
|
||||||
rc = MDBX_SUCCESS;
|
rc = MDBX_SUCCESS;
|
||||||
} else {
|
} else {
|
||||||
rc = errno;
|
rc = errno;
|
||||||
|
@ -731,10 +731,10 @@ typedef struct MDBX_lockinfo {
|
|||||||
/* Marker to distinguish uniqueness of DB/CLK. */
|
/* Marker to distinguish uniqueness of DB/CLK. */
|
||||||
MDBX_atomic_uint64_t mti_bait_uniqueness;
|
MDBX_atomic_uint64_t mti_bait_uniqueness;
|
||||||
|
|
||||||
/* Counter of processes which had mlock()'ed some of mmapped DB pages.
|
/* Paired counter of processes that have mlock()ed part of mmapped DB.
|
||||||
* Non-zero means at least one process has locked at least one page,
|
* The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process
|
||||||
* and therefore madvise() could return EINVAL. */
|
* has locked at least one page, and therefore madvise() could return EINVAL. */
|
||||||
MDBX_atomic_uint32_t mti_mlock_counter;
|
MDBX_atomic_uint32_t mti_mlcnt[2];
|
||||||
|
|
||||||
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/
|
||||||
|
|
||||||
@ -764,7 +764,7 @@ typedef struct MDBX_lockinfo {
|
|||||||
/* Timestamp of the last readers check. */
|
/* Timestamp of the last readers check. */
|
||||||
MDBX_atomic_uint64_t mti_reader_check_timestamp;
|
MDBX_atomic_uint64_t mti_reader_check_timestamp;
|
||||||
|
|
||||||
/* Number of page which was discarded last time by madvise(MADV_FREE). */
|
/* Number of page which was discarded last time by madvise(DONTNEED). */
|
||||||
atomic_pgno_t mti_discarded_tail;
|
atomic_pgno_t mti_discarded_tail;
|
||||||
|
|
||||||
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
|
/* Shared anchor for tracking readahead edge and enabled/disabled status. */
|
||||||
|
Loading…
x
Reference in New Issue
Block a user