From 2a7f460345edbeb26a51782cbe6af3c55254ae77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9B=D0=B5=D0=BE=D0=BD=D0=B8=D0=B4=20=D0=AE=D1=80=D1=8C?= =?UTF-8?q?=D0=B5=D0=B2=20=28Leonid=20Yuriev=29?= Date: Fri, 22 Aug 2025 11:26:35 +0300 Subject: [PATCH] mdbx: fix unexpected `SIGBUS` if there is not enough space in a filesystem. On modern Linux, the allocation of space for a file can be deferred and/or lazy, rather than performed when its length is set using `ftruncate()`. The actual allocation of space occurs when writing to the corresponding areas of the file, or when reading them (in this case, the file system fills these areas with zeros). The specific behavior depends on the type of file system and the kernel version, but the main point is that, when the file size is set, only the momentary ability to allocate space is checked, without any reservation. If the file system is running out of space, an `ENOSPC` error may occur while processing (inside the OS kernel) a page fault caused by accessing one of the added pages after the database has been enlarged. In this case, the OS kernel has no alternative but to send a `SIGBUS` signal to the process. This commit fixes the problem by adding the use of system calls to explicitly allocate space for a given file size. Related-to https://github.com/erigontech/erigon/issues/16709 This is a simple improvement, which is however complicated by the need to take into account the availability of the appropriate system API and to handle non-fatal errors from file systems that do not support the appropriate operations. Therefore, there is a risk of regressions in unusual/rare situations, including when hosting databases on network media. 
--- src/api-copy.c | 4 ++-- src/dxb.c | 4 ++-- src/lck.c | 6 +++--- src/osal.c | 37 ++++++++++++++++++++++++++++++------- src/osal.h | 3 ++- 5 files changed, 39 insertions(+), 15 deletions(-) diff --git a/src/api-copy.c b/src/api-copy.c index a87c6ea7..d767839f 100644 --- a/src/api-copy.c +++ b/src/api-copy.c @@ -479,7 +479,7 @@ __cold static int copy_with_compacting(MDBX_env *env, MDBX_txn *txn, mdbx_fileha if (meta->geometry.now != meta->geometry.first_unallocated) { const size_t whole_size = pgno2bytes(env, meta->geometry.now); if (!dest_is_pipe) - return osal_ftruncate(fd, whole_size); + return osal_fallocate(fd, whole_size); const size_t used_size = pgno2bytes(env, meta->geometry.first_unallocated); memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); @@ -648,7 +648,7 @@ retry_snap_meta: /* Extend file if required */ if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { if (!dest_is_pipe) - rc = osal_ftruncate(fd, whole_size); + rc = osal_fallocate(fd, whole_size); else { memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) { diff --git a/src/dxb.c b/src/dxb.c index 6f500636..4136a9e4 100644 --- a/src/dxb.c +++ b/src/dxb.c @@ -532,7 +532,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit if (unlikely(err != MDBX_SUCCESS)) return err; - err = osal_ftruncate(env->lazy_fd, env->dxb_mmap.filesize = env->dxb_mmap.current = env->geo_in_bytes.now); + err = osal_fallocate(env->lazy_fd, env->dxb_mmap.filesize = env->dxb_mmap.current = env->geo_in_bytes.now); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -682,7 +682,7 @@ __cold int dxb_setup(MDBX_env *env, const int lck_rc, const mdbx_mode_t mode_bit !(env->flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; err = osal_mmap(env->flags, &env->dxb_mmap, env->geo_in_bytes.now, env->geo_in_bytes.upper, - (lck_rc && env->stuck_meta < 0) ? 
MMAP_OPTION_TRUNCATE : 0, env->pathname.dxb); + (lck_rc && env->stuck_meta < 0) ? MMAP_OPTION_SETLENGTH : 0, env->pathname.dxb); if (unlikely(err != MDBX_SUCCESS)) return err; diff --git a/src/lck.c b/src/lck.c index 73ce13e0..b71a7588 100644 --- a/src/lck.c +++ b/src/lck.c @@ -62,9 +62,9 @@ __cold static int lck_setup_locked(MDBX_env *env) { } env->max_readers = (maxreaders <= MDBX_READERS_LIMIT) ? (unsigned)maxreaders : (unsigned)MDBX_READERS_LIMIT; - err = - osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, (size_t)size, (size_t)size, - lck_seize_rc ? MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE, env->pathname.lck); + err = osal_mmap((env->flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->lck_mmap, (size_t)size, (size_t)size, + lck_seize_rc ? MMAP_OPTION_SETLENGTH | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE, + env->pathname.lck); if (unlikely(err != MDBX_SUCCESS)) return err; diff --git a/src/osal.c b/src/osal.c index b83e2776..b32b74c3 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1594,6 +1594,7 @@ MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd) { #endif } +/* truncate file: just set the length of a file */ MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #if defined(_WIN32) || defined(_WIN64) if (imports.SetFileInformationByHandle) { @@ -1613,6 +1614,23 @@ MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #endif } +/* extend file: set the length of a file AND ensure the space has been allocated */ +MDBX_INTERNAL int osal_fallocate(mdbx_filehandle_t fd, uint64_t length) { + assert(length > 0); + int err = MDBX_RESULT_TRUE; +#if (defined(__linux__) || defined(__gnu_linux__)) && \ + ((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 10)) || (defined(__ANDROID_API__) && __ANDROID_API__ >= 21)) + err = fallocate(fd, 0, 0, length) ? 
ignore_enosys_and_eremote(errno) : MDBX_SUCCESS; +#elif defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L && !defined(__APPLE__) + err = posix_fallocate(fd, 0, length) ? ignore_enosys_and_eremote(errno) : MDBX_SUCCESS; +#elif defined(__APPLE__) + fstore_t store = {F_ALLOCATEALL, F_PEOFPOSMODE, 0, length, 0}; + if (fcntl(fd, F_PREALLOCATE, &store)) + err = ignore_enosys_and_eremote(errno); +#endif /* Apple */ + return (err == MDBX_RESULT_TRUE) ? osal_ftruncate(fd, length) : err; +} + MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER li; @@ -2063,8 +2081,8 @@ MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size, cons if (unlikely(err != MDBX_SUCCESS)) return err; - if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { - err = osal_ftruncate(map->fd, size); + if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_SETLENGTH) != 0) { + err = osal_fallocate(map->fd, size); VERBOSE("ftruncate %zu, err %d", size, err); if (err != MDBX_SUCCESS) return err; @@ -2310,7 +2328,7 @@ retry_file_and_section: } if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { - err = osal_ftruncate(map->fd, size); + err = osal_fallocate(map->fd, size); if (err == MDBX_SUCCESS) map->filesize = size; /* ignore error, because Windows unable shrink file @@ -2388,10 +2406,15 @@ retry_mapview:; rc = MDBX_EPERM; map->current = (map->filesize > limit) ? 
limit : (size_t)map->filesize; } else { - if (size > map->filesize || (size < map->filesize && (flags & txn_shrink_allowed))) { - rc = osal_ftruncate(map->fd, size); - VERBOSE("ftruncate %zu, err %d", size, rc); - if (rc != MDBX_SUCCESS) + if (map->filesize != size) { + if (size > map->filesize) { + rc = osal_fallocate(map->fd, size); + VERBOSE("f%s-%s %zu, err %d", "allocate", "extend", size, rc); + } else if (flags & txn_shrink_allowed) { + rc = osal_ftruncate(map->fd, size); + VERBOSE("f%s-%s %zu, err %d", "truncate", "shrink", size, rc); + } + if (unlikely(rc != MDBX_SUCCESS)) return rc; map->filesize = size; } diff --git a/src/osal.h b/src/osal.h index 7622d432..6202ff47 100644 --- a/src/osal.h +++ b/src/osal.h @@ -434,6 +434,7 @@ enum osal_syncmode_bits { MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd, const enum osal_syncmode_bits mode_bits); MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL int osal_fallocate(mdbx_filehandle_t fd, uint64_t length); MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); MDBX_INTERNAL int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); @@ -470,7 +471,7 @@ MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname); MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd); MDBX_INTERNAL int osal_lockfile(mdbx_filehandle_t fd, bool wait); -#define MMAP_OPTION_TRUNCATE 1 +#define MMAP_OPTION_SETLENGTH 1 #define MMAP_OPTION_SEMAPHORE 2 MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size, const size_t limit, const unsigned options, const pathchar_t *pathname4logging);