mdbx: fix unexpected SIGBUS is not enough space in a filesystem.

On a modern Linux the allocation of space for a file can be deferred
and/or lazy, rather than when setting its length using `ftruncate()`.
The actual allocation of space occurs when writing to the corresponding
areas of the file, or when reading ones (in this case, the file system
fills these areas with zeros).

The specific behavior depends on the type of file system and the kernel
version, but the main thing is that possibilities currently are, when
setting the file size, just the instantaneous ability to allocate space
is checked, without any booking.

If the file system is running out of space, an `ENOSPC` error may occur
when processing (inside a OS kernel) a page fault when accessing one of
the added pages after the database has been enlarged. In this case, the
OS kernel has no other alternative but to send a `SIGBUS` signal to the
process.

This commit fixes the problem by adding the use of system calls to
explicitly allocate space for a given file size.
Related-to https://github.com/erigontech/erigon/issues/16709

This is a simple improvement, however which is complicated by the need
to take into account the availability of the appropriate system API and
handle non-fatal errors from file systems that do not support the
appropriate operations. Therefore, there is a risk of regressions in
unusual/rare situations, including when hosting databases on network
media.
This commit is contained in:
Леонид Юрьев (Leonid Yuriev)
2025-08-22 11:26:35 +03:00
parent f23a72f59c
commit 2a7f460345
5 changed files with 39 additions and 15 deletions

View File

@@ -479,7 +479,7 @@ __cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *txn, mdbx_fileha
if (meta->geometry.now != meta->geometry.first_unallocated) {
const size_t whole_size = pgno2bytes(env, meta->geometry.now);
if (!dest_is_pipe)
return osal_ftruncate(fd, whole_size);
return osal_fallocate(fd, whole_size);
const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated);
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
@@ -648,7 +648,7 @@ retry_snap_meta:
/* Extend file if required */
if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) {
if (!dest_is_pipe)
rc = osal_ftruncate(fd, whole_size);
rc = osal_fallocate(fd, whole_size);
else {
memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF);
for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) {

View File

@@ -532,7 +532,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit
if (unlikely(err != MDBX_SUCCESS))
return err;
err = osal_ftruncate(env->lazy_fd, env->dxb_mmap.filesize = env->dxb_mmap.current = env->geo_in_bytes.now);
err = osal_fallocate(env->lazy_fd, env->dxb_mmap.filesize = env->dxb_mmap.current = env->geo_in_bytes.now);
if (unlikely(err != MDBX_SUCCESS))
return err;
@@ -682,7 +682,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit
!(env->flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE;
err = osal_mmap(env->flags, &env->dxb_mmap, env->geo_in_bytes.now, env->geo_in_bytes.upper,
(lck_rc && env->stuck_meta < 0) ? MMAP_OPTION_TRUNCATE : 0, env->pathname.dxb);
(lck_rc && env->stuck_meta < 0) ? MMAP_OPTION_SETLENGTH : 0, env->pathname.dxb);
if (unlikely(err != MDBX_SUCCESS))
return err;

View File

@@ -62,9 +62,9 @@ __cold static int lck_setup_locked(MDBX_env *env) {
}
env->max_readers = (maxreaders <= MDBX_READERS_LIMIT) ? (unsigned)maxreaders : (unsigned)MDBX_READERS_LIMIT;
err =
osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, (size_t)size, (size_t)size,
lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE, env->pathname.lck);
err = osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, (size_t)size, (size_t)size,
lck_seize_rc ? MMAP_OPTION_SETLENGTH | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE,
env->pathname.lck);
if (unlikely(err != MDBX_SUCCESS))
return err;

View File

@@ -1594,6 +1594,7 @@ MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd) {
#endif
}
/* truncate file: just set the length of a file */
MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
#if defined(_WIN32) || defined(_WIN64)
if (imports.SetFileInformationByHandle) {
@@ -1613,6 +1614,23 @@ MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
#endif
}
/* extend file: set the length of a file AND ensure the space has been allocated */
MDBX_INTERNAL int osal_fallocate(mdbx_filehandle_t fd, uint64_t length) {
assert(length > 0);
int err = MDBX_RESULT_TRUE;
#if (defined(__linux__) || defined(__gnu_linux__)) && \
((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 10)) || (defined(__ANDROID_API__) && __ANDROID_API__ >= 21))
err = fallocate(fd, 0, 0, length) ? ignore_enosys_and_eremote(errno) : MDBX_SUCCESS;
#elif defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L && !defined(__APPLE__)
err = posix_fallocate(fd, 0, length) ? ignore_enosys_and_eremote(errno) : MDBX_SUCCESS;
#elif defined(__APPLE__)
fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, 0, length, 0};
if (fcntl(fd, F_PREALLOCATE, &store))
err = ignore_enosys_and_eremote(errno);
#endif /* Apple */
return (err == MDBX_RESULT_TRUE) ? osal_ftruncate(fd, length) : err;
}
MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) {
#if defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER li;
@@ -2063,8 +2081,8 @@ MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size, cons
if (unlikely(err != MDBX_SUCCESS))
return err;
if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) {
err = osal_ftruncate(map->fd, size);
if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_SETLENGTH) != 0) {
err = osal_fallocate(map->fd, size);
VERBOSE("ftruncate %zu, err %d", size, err);
if (err != MDBX_SUCCESS)
return err;
@@ -2310,7 +2328,7 @@ retry_file_and_section:
}
if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) {
err = osal_ftruncate(map->fd, size);
err = osal_fallocate(map->fd, size);
if (err == MDBX_SUCCESS)
map->filesize = size;
/* ignore error, because Windows unable shrink file
@@ -2388,10 +2406,15 @@ retry_mapview:;
rc = MDBX_EPERM;
map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
} else {
if (size > map->filesize || (size < map->filesize && (flags & txn_shrink_allowed))) {
if (map->filesize != size) {
if (size > map->filesize) {
rc = osal_fallocate(map->fd, size);
VERBOSE("f%s-%s %zu, err %d", "allocate", "extend", size, rc);
} else if (flags & txn_shrink_allowed) {
rc = osal_ftruncate(map->fd, size);
VERBOSE("ftruncate %zu, err %d", size, rc);
if (rc != MDBX_SUCCESS)
VERBOSE("f%s-%s %zu, err %d", "truncate", "shrink", size, rc);
}
if (unlikely(rc != MDBX_SUCCESS))
return rc;
map->filesize = size;
}

View File

@@ -434,6 +434,7 @@ enum osal_syncmode_bits {
MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd, const enum osal_syncmode_bits mode_bits);
MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL int osal_fallocate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL int osal_filesize(mdbx_filehandle_t fd, uint64_t *length);
@@ -470,7 +471,7 @@ MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname);
MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd);
MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait);
#define MMAP_OPTION_TRUNCATE 1
#define MMAP_OPTION_SETLENGTH 1
#define MMAP_OPTION_SEMAPHORE 2
MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size, const size_t limit, const unsigned options,
const pathchar_t *pathname4logging);