mirror of
https://github.com/isar/libmdbx.git
synced 2025-09-06 13:42:21 +08:00
On modern Linux, the allocation of space for a file can be deferred and/or lazy, rather than performed when its length is set using `ftruncate()`. The actual allocation of space occurs when the corresponding areas of the file are written, or when they are read (in this case, the file system fills these areas with zeros). The specific behavior depends on the type of file system and the kernel version, but the main point is that currently, when the file size is set, only the momentary ability to allocate space is checked, without any reservation. If the file system is running out of space, an `ENOSPC` error may occur while the OS kernel is processing a page fault caused by accessing one of the added pages after the database has been enlarged. In this case, the OS kernel has no alternative but to send a `SIGBUS` signal to the process. This commit fixes the problem by adding the use of system calls that explicitly allocate space for a given file size. Related-to https://github.com/erigontech/erigon/issues/16709 This is a simple improvement, which is nevertheless complicated by the need to take into account the availability of the appropriate system API and to handle non-fatal errors from file systems that do not support the appropriate operations. Therefore, there is a risk of regressions in unusual/rare situations, including when hosting databases on network media.
175 lines
6.3 KiB
C
175 lines
6.3 KiB
C
/// \copyright SPDX-License-Identifier: Apache-2.0
|
|
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2025
|
|
|
|
#include "internals.h"
|
|
|
|
/* Second stage of lck setup; lck_setup() invokes it between rthc_lock() and
 * rthc_unlock().
 *
 * Steps, in order:
 *  1. register the environment (rthc_register) and seize the lock (lck_seize);
 *  2. if there is no lck-file descriptor, install the lckless stub and stop;
 *  3. look for an in-process neighbor (rthc_uniq_check), refusing with
 *     MDBX_BUSY or downgrading the seized lock as required;
 *  4. choose the lck size (computed when the lock is exclusive, otherwise the
 *     validated size of the existing file) and derive the readers capacity;
 *  5. map the file, apply madvise hints where available, then either
 *     initialize the region (exclusive) or verify its magic/format signature;
 *  6. finish with lck_init() and publish the region in env->lck.
 *
 * Returns the (possibly downgraded) non-error lck_seize() result on success,
 * otherwise an error code. */
__cold static int lck_setup_locked(MDBX_env *env) {
  int rc = rthc_register(env);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  int seize_rc = lck_seize(env);
  if (unlikely(MDBX_IS_ERROR(seize_rc)))
    return seize_rc;

  if (env->lck_mmap.fd == INVALID_HANDLE_VALUE) {
    /* No lck-file is open: use the stub instead of a mapped region. */
    env->lck = lckless_stub(env);
    env->max_readers = UINT_MAX;
    DEBUG("lck-setup:%s%s%s", " lck-less", (env->flags & MDBX_RDONLY) ? " readonly" : "",
          (seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");
    return seize_rc;
  }

  DEBUG("lck-setup:%s%s%s", " with-lck", (env->flags & MDBX_RDONLY) ? " readonly" : "",
        (seize_rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative");

  MDBX_env *neighbor = nullptr;
  rc = rthc_uniq_check(&env->lck_mmap, &neighbor);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return rc;

  if (neighbor != nullptr) {
    /* Sharing with a neighbor requires the legacy multi-open debug flag and
     * a neighbor that is not in exclusive mode. */
    const bool multiopen_allowed =
        (globals.runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) != 0 && (neighbor->flags & MDBX_EXCLUSIVE) == 0;
    if (!multiopen_allowed)
      return MDBX_BUSY;

    if (seize_rc == MDBX_RESULT_TRUE) {
      rc = lck_downgrade(env);
      if (unlikely(rc != MDBX_SUCCESS))
        return rc;
      seize_rc = MDBX_RESULT_FALSE;
    }
  }

  uint64_t lck_bytes = 0;
  rc = osal_filesize(env->lck_mmap.fd, &lck_bytes);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

  if (seize_rc != MDBX_RESULT_TRUE) {
    /* Cooperative: exclusive mode is impossible and the existing file size
     * must be a non-zero, page-aligned value not exceeding INT_MAX. */
    if (env->flags & MDBX_EXCLUSIVE)
      return MDBX_BUSY;

    const bool misaligned = (lck_bytes & (globals.sys_pagesize - 1)) != 0;
    if (lck_bytes > INT_MAX || misaligned || lck_bytes < globals.sys_pagesize) {
      ERROR("lck-file has invalid size %" PRIu64 " bytes", lck_bytes);
      return MDBX_PROBLEM;
    }
  } else {
    /* Exclusive: the size is derived from the requested number of readers. */
    lck_bytes = ceil_powerof2(env->max_readers * sizeof(reader_slot_t) + sizeof(lck_t), globals.sys_pagesize);
    jitter4testing(false);
  }

  const size_t map_bytes = (size_t)lck_bytes;
  const size_t slots = (map_bytes - sizeof(lck_t)) / sizeof(reader_slot_t);
  if (slots < 4) {
    ERROR("lck-size too small (up to %" PRIuPTR " readers)", slots);
    return MDBX_PROBLEM;
  }
  if (slots <= MDBX_READERS_LIMIT)
    env->max_readers = (unsigned)slots;
  else
    env->max_readers = (unsigned)MDBX_READERS_LIMIT;

  /* The set-length option is requested only for a non-zero seize result. */
  rc = osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, map_bytes, map_bytes,
                 MMAP_OPTION_SEMAPHORE | (seize_rc != 0 ? MMAP_OPTION_SETLENGTH : 0), env->pathname.lck);
  if (unlikely(rc != MDBX_SUCCESS))
    return rc;

#ifdef MADV_DODUMP
  rc = MDBX_SUCCESS;
  if (madvise(env->lck_mmap.lck, map_bytes, MADV_DODUMP))
    rc = ignore_enosys_and_eagain(errno);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return rc;
#endif /* MADV_DODUMP */

#ifdef MADV_WILLNEED
  rc = MDBX_SUCCESS;
  if (madvise(env->lck_mmap.lck, map_bytes, MADV_WILLNEED))
    rc = ignore_enosys_and_eagain(errno);
  if (unlikely(MDBX_IS_ERROR(rc)))
    return rc;
#elif defined(POSIX_MADV_WILLNEED)
  rc = ignore_enosys(posix_madvise(env->lck_mmap.lck, map_bytes, POSIX_MADV_WILLNEED));
  if (unlikely(MDBX_IS_ERROR(rc)))
    return rc;
#endif /* MADV_WILLNEED */

  lck_t *region = env->lck_mmap.lck;
  if (seize_rc == MDBX_RESULT_TRUE) {
    /* The exclusive lock was obtained, so nobody else is using the lock
     * region and it has to be initialized here. */
    memset(region, 0, map_bytes);
    jitter4testing(false);
    region->magic_and_version = MDBX_LOCK_MAGIC;
    region->os_and_format = MDBX_LOCK_FORMAT;
#if MDBX_ENABLE_PGOP_STAT
    region->pgops.wops.weak = 1;
#endif /* MDBX_ENABLE_PGOP_STAT */
    rc = osal_msync(&env->lck_mmap, 0, map_bytes, MDBX_SYNC_DATA | MDBX_SYNC_SIZE);
    if (unlikely(rc != MDBX_SUCCESS)) {
      ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", rc);
      eASSERT(env, MDBX_IS_ERROR(rc));
      return rc;
    }
  } else if (region->magic_and_version != MDBX_LOCK_MAGIC) {
    /* Tell a foreign/garbage file apart from a libmdbx one of another version. */
    const bool invalid = (region->magic_and_version >> 8) != MDBX_MAGIC;
    ERROR("lock region has %s", invalid ? "invalid magic"
                                        : "incompatible version (only applications with nearly or the "
                                          "same versions of libmdbx can share the same database)");
    return invalid ? MDBX_INVALID : MDBX_VERSION_MISMATCH;
  } else if (region->os_and_format != MDBX_LOCK_FORMAT) {
    ERROR("lock region has os/format signature 0x%" PRIx32 ", expected 0x%" PRIx32, region->os_and_format,
          MDBX_LOCK_FORMAT);
    return MDBX_VERSION_MISMATCH;
  }

  rc = lck_init(env, neighbor, seize_rc);
  if (unlikely(rc != MDBX_SUCCESS)) {
    eASSERT(env, MDBX_IS_ERROR(rc));
    return rc;
  }

  env->lck = region;
  eASSERT(env, !MDBX_IS_ERROR(seize_rc));
  return seize_rc;
}
|
|
|
|
/* Opens the lck-file for `env` (creation mode `mode`) and then runs
 * lck_setup_locked() between rthc_lock() and rthc_unlock().
 *
 * A failure to open the lck-file is tolerated, and the environment continues
 * without an lck-file, in these cases:
 *  - MDBX_EACCESS / MDBX_EPERM while both MDBX_RDONLY and MDBX_EXCLUSIVE are
 *    set;
 *  - MDBX_EACCESS / MDBX_EPERM / MDBX_ENOFILE / MDBX_EROFS in MDBX_RDONLY
 *    mode, when osal_check_fs_rdonly() returns MDBX_SUCCESS, or returns
 *    MDBX_ENOSYS while MDBX_EXCLUSIVE is set.
 * Any other failure is logged and returned; a result of
 * osal_check_fs_rdonly() other than MDBX_ENOSYS replaces the original error.
 *
 * Returns the result of lck_setup_locked(), or the open error. */
__cold int lck_setup(MDBX_env *env, mdbx_mode_t mode) {
  eASSERT(env, env->lazy_fd != INVALID_HANDLE_VALUE);
  eASSERT(env, env->lck_mmap.fd == INVALID_HANDLE_VALUE);

  int rc = osal_openfile(MDBX_OPEN_LCK, env, env->pathname.lck, &env->lck_mmap.fd, mode);
  if (rc != MDBX_SUCCESS) {
    const bool denied = (rc == MDBX_EACCESS || rc == MDBX_EPERM);
    const bool absent_or_rofs = (rc == MDBX_ENOFILE || rc == MDBX_EROFS);
    bool go_lckless = false;

    if (denied && F_ISSET(env->flags, MDBX_RDONLY | MDBX_EXCLUSIVE)) {
      go_lckless = true;
    } else if ((denied || absent_or_rofs) && (env->flags & MDBX_RDONLY)) {
      /* ENSURE the file system is read-only */
      const int rofs_rc = osal_check_fs_rdonly(env->lazy_fd, env->pathname.lck, rc);
      if (rofs_rc == MDBX_SUCCESS ||
          /* ignore ERROR_NOT_SUPPORTED for exclusive mode */
          (rofs_rc == MDBX_ENOSYS && (env->flags & MDBX_EXCLUSIVE)))
        go_lckless = true;
      else if (rofs_rc != MDBX_ENOSYS)
        rc = rofs_rc;
    }

    if (!go_lckless) {
      ERROR("unable to open lck-file %" MDBX_PRIsPATH ", env-flags 0x%X, err %d", env->pathname.lck, env->flags, rc);
      return rc;
    }

    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
    env->lck_mmap.fd = INVALID_HANDLE_VALUE;
    NOTICE("continue %" MDBX_PRIsPATH " within without-lck mode, env-flags 0x%X, lck-error %d", env->pathname.dxb,
           env->flags, rc);
  }

  rthc_lock();
  rc = lck_setup_locked(env);
  rthc_unlock();
  return rc;
}
|
|
|
|
void mincore_clean_cache(const MDBX_env *const env) {
|
|
memset(env->lck->mincore_cache.begin, -1, sizeof(env->lck->mincore_cache.begin));
|
|
}
|