mdbx: refine sync-to-disk (lazy/dsync fds).

Change-Id: I4bad81a1a0b5ccbefdc598f58a7d683fa7d8b504
This commit is contained in:
Leonid Yuriev 2019-12-16 17:43:29 +03:00
parent 2db5736554
commit 867c537655
6 changed files with 134 additions and 102 deletions

View File

@ -2961,7 +2961,7 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
memset(mp, 0, bytes);
mp->mp_pgno = pgno;
if ((env->me_flags & MDBX_WRITEMAP) == 0)
mdbx_pwrite(env->me_fd, mp, bytes, pgno2bytes(env, pgno));
mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno));
} else {
struct iovec iov[MDBX_COMMIT_PAGES];
iov[0].iov_len = env->me_psize;
@ -2971,13 +2971,13 @@ static __cold void mdbx_kill_page(MDBX_env *env, MDBX_page *mp, pgno_t pgno,
while (--npages) {
iov[n] = iov[0];
if (++n == MDBX_COMMIT_PAGES) {
mdbx_pwritev(env->me_fd, iov, MDBX_COMMIT_PAGES, iov_off,
mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off,
pgno2bytes(env, MDBX_COMMIT_PAGES));
iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES);
n = 0;
}
}
mdbx_pwritev(env->me_fd, iov, n, iov_off, pgno2bytes(env, n));
mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n));
}
}
@ -3645,7 +3645,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset,
bytes2pgno(env, offset), bytes2pgno(env, offset + length));
#if defined(F_RDAHEAD)
if (unlikely(fcntl(env->me_fd, F_RDAHEAD, enable) == -1))
if (unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1))
return errno;
#endif /* F_RDAHEAD */
@ -3655,7 +3655,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset,
hint.ra_offset = offset;
hint.ra_count = length;
(void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl(
env->me_fd, F_RDADVISE, &hint);
env->me_lazy_fd, F_RDADVISE, &hint);
#endif /* F_RDADVISE */
#if defined(MADV_WILLNEED)
int err = madvise(env->me_map + offset, length, MADV_WILLNEED)
@ -3677,7 +3677,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset,
}
#elif defined(POSIX_FADV_WILLNEED)
int err = ignore_enosys(
posix_fadvise(env->me_fd, offset, length, POSIX_FADV_WILLNEED));
posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_WILLNEED));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
#endif /* MADV_WILLNEED */
@ -3695,7 +3695,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset,
return err;
#elif defined(POSIX_FADV_RANDOM)
int err = ignore_enosys(
posix_fadvise(env->me_fd, offset, length, POSIX_FADV_RANDOM));
posix_fadvise(env->me_lazy_fd, offset, length, POSIX_FADV_RANDOM));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
#endif /* MADV_RANDOM */
@ -3793,8 +3793,9 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
POSIX_MADV_DONTNEED));
#elif defined(POSIX_FADV_DONTNEED)
if (rc == MDBX_RESULT_TRUE)
rc = ignore_enosys(posix_fadvise(
env->me_fd, size_bytes, prev_size - size_bytes, POSIX_FADV_DONTNEED));
rc = ignore_enosys(posix_fadvise(env->me_lazy_fd, size_bytes,
prev_size - size_bytes,
POSIX_FADV_DONTNEED));
#endif /* MADV_DONTNEED */
if (unlikely(MDBX_IS_ERROR(rc)))
goto bailout;
@ -3892,7 +3893,7 @@ static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady,
if (env->me_flags & MDBX_WRITEMAP)
meta->mm_datasync_sign = wipe;
else
return mdbx_pwrite(env->me_fd, &wipe, sizeof(meta->mm_datasync_sign),
return mdbx_pwrite(env->me_lazy_fd, &wipe, sizeof(meta->mm_datasync_sign),
(uint8_t *)&meta->mm_datasync_sign - env->me_map);
}
return MDBX_SUCCESS;
@ -3911,15 +3912,16 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) {
if (env->me_flags & MDBX_WRITEMAP) {
mdbx_flush_incoherent_cpu_writeback();
return mdbx_msync(&env->me_dxb_mmap, 0, pgno2bytes(env, NUM_METAS), false);
return mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
false);
}
#if defined(__linux__) || defined(__gnu_linux__)
if (sync_file_range(env->me_fd, 0, pgno2bytes(env, NUM_METAS),
if (sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS),
SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER))
err = errno;
#else
err = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA);
err = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA);
#endif
if (unlikely(err != MDBX_SUCCESS))
return err;
@ -4654,7 +4656,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) {
/* LY: pre-sync without holding lock to reduce latency for writer(s) */
int err = (flags & MDBX_WRITEMAP)
? mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, false)
: mdbx_filesync(env->me_fd, MDBX_SYNC_DATA);
: mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA);
if (unlikely(err != MDBX_SUCCESS))
return err;
@ -4693,10 +4695,11 @@ fastpath:
if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_NOMETASYNC) != 0) {
const txnid_t head_txnid = mdbx_recent_committed_txnid(env);
if (*env->me_meta_sync_txnid != (uint32_t)head_txnid) {
rc = (flags & MDBX_WRITEMAP)
? mdbx_msync(&env->me_dxb_mmap, 0, pgno2bytes(env, NUM_METAS),
false)
: mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
rc =
(flags & MDBX_WRITEMAP)
? mdbx_msync(&env->me_dxb_mmap, 0,
pgno_align2os_bytes(env, NUM_METAS), false)
: mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (likely(rc == MDBX_SUCCESS))
*env->me_meta_sync_txnid = (uint32_t)head_txnid;
}
@ -6556,7 +6559,7 @@ static int mdbx_flush_iov(MDBX_txn *const txn, struct iovec *iov,
unsigned iov_items, size_t iov_off,
size_t iov_bytes) {
MDBX_env *const env = txn->mt_env;
int rc = mdbx_pwritev(env->me_fd, iov, iov_items, iov_off, iov_bytes);
int rc = mdbx_pwritev(env->me_lazy_fd, iov, iov_items, iov_off, iov_bytes);
if (unlikely(rc != MDBX_SUCCESS)) {
mdbx_error("Write error: %s", mdbx_strerror(rc));
txn->mt_flags |= MDBX_TXN_ERROR;
@ -7163,7 +7166,7 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta,
const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize;
if (used_bytes > *filesize) {
/* A race is possible here with DB-shrinking performed by another process */
int err = mdbx_filesize(env->me_fd, filesize);
int err = mdbx_filesize(env->me_lazy_fd, filesize);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (used_bytes > *filesize) {
@ -7265,7 +7268,7 @@ static int __cold mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta,
static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
uint64_t *filesize,
const int lck_exclusive) {
int rc = mdbx_filesize(env->me_fd, filesize);
int rc = mdbx_filesize(env->me_lazy_fd, filesize);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
@ -7291,7 +7294,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
while (1) {
mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u",
meta_number, offset, MIN_PAGESIZE, retryleft);
int err = mdbx_pread(env->me_fd, buffer, MIN_PAGESIZE, offset);
int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset);
if (err != MDBX_SUCCESS) {
if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 &&
*filesize == 0 && (env->me_flags & MDBX_RDONLY) == 0)
@ -7304,7 +7307,7 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *dest,
}
char again[MIN_PAGESIZE];
err = mdbx_pread(env->me_fd, again, MIN_PAGESIZE, offset);
err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset);
if (err != MDBX_SUCCESS) {
mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err,
mdbx_strerror(err));
@ -7516,21 +7519,25 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
MDBX_meta *const recent_steady_meta = mdbx_meta_steady(env);
if (flags & MDBX_WRITEMAP) {
const size_t usedbytes = pgno_align2os_bytes(env, pending->mm_geo.next);
rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
const size_t begin = pgno2bytes(env, NUM_METAS) & ~(env->me_os_psize - 1);
const size_t end = pgno_align2os_bytes(env, pending->mm_geo.next);
if (end > begin) {
rc = mdbx_msync(&env->me_dxb_mmap, begin, end - begin,
flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
rc = MDBX_RESULT_TRUE /* carry non-steady */;
if ((flags & MDBX_MAPASYNC) == 0) {
if (unlikely(pending->mm_geo.next > recent_steady_meta->mm_geo.now)) {
rc = mdbx_filesync(env->me_fd, MDBX_SYNC_SIZE);
rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_SIZE);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
}
rc = MDBX_RESULT_FALSE /* carry steady */;
}
} else {
rc = mdbx_filesync(env->me_fd,
rc = mdbx_filesync(env->me_lazy_fd,
(pending->mm_geo.next > recent_steady_meta->mm_geo.now)
? MDBX_SYNC_DATA | MDBX_SYNC_SIZE
: MDBX_SYNC_DATA);
@ -7606,7 +7613,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
mdbx_ensure(env, target == head || mdbx_meta_txnid_stable(env, target) <
pending->mm_txnid_a.inconsistent);
if (env->me_flags & MDBX_WRITEMAP) {
if (flags & MDBX_WRITEMAP) {
mdbx_jitter4testing(true);
if (likely(target != head)) {
/* LY: 'invalidate' the meta. */
@ -7649,40 +7656,49 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
target->mm_datasync_sign = pending->mm_datasync_sign;
mdbx_flush_incoherent_cpu_writeback();
mdbx_jitter4testing(true);
if ((flags & MDBX_SAFE_NOSYNC) == 0) {
/* sync meta-pages */
const bool weak = (flags & (MDBX_MAPASYNC | MDBX_NOMETASYNC)) != 0;
rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS),
weak);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
if (!weak) {
#if defined(__APPLE__) && \
MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
rc = likely(fcntl(env->me_lazy_fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS
: errno;
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
#endif /* MacOS */
*env->me_meta_sync_txnid = (uint32_t)pending->mm_txnid_a.inconsistent;
}
}
} else {
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta),
const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE)
? env->me_dsync_fd
: env->me_lazy_fd;
rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta),
(uint8_t *)target - env->me_map);
if (unlikely(rc != MDBX_SUCCESS)) {
undo:
mdbx_debug("%s", "write failed, disk error?");
/* On a failure, the pagecache still contains the new data.
* Try to write some old data back, to prevent it from being used. */
mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta),
mdbx_pwrite(fd, (void *)target, sizeof(MDBX_meta),
(uint8_t *)target - env->me_map);
goto fail;
}
mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize);
}
/* LY: step#3 - sync meta-pages. */
mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
if ((flags & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
if (flags & MDBX_WRITEMAP) {
const size_t offset = (uint8_t *)data_page(head) - env->me_dxb_mmap.dxb;
const size_t paged_offset = offset & ~(env->me_os_psize - 1);
const size_t paged_length = roundup_powerof2(
env->me_psize + offset - paged_offset, env->me_os_psize);
rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length,
flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
} else {
rc = mdbx_filesync(env->me_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (rc != MDBX_SUCCESS)
goto undo;
if ((flags & (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
/* sync meta-pages */
if (fd == env->me_lazy_fd) {
rc = mdbx_filesync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ);
if (rc != MDBX_SUCCESS)
goto undo;
}
*env->me_meta_sync_txnid = (uint32_t)pending->mm_txnid_a.inconsistent;
}
*env->me_meta_sync_txnid = (uint32_t)pending->mm_txnid_a.inconsistent;
}
/* LY: shrink datafile if needed */
@ -7749,7 +7765,8 @@ int __cold mdbx_env_create(MDBX_env **penv) {
env->me_maxreaders = DEFAULT_READERS;
env->me_maxdbs = env->me_numdbs = CORE_DBS;
env->me_fd = INVALID_HANDLE_VALUE;
env->me_lazy_fd = INVALID_HANDLE_VALUE;
env->me_dsync_fd = INVALID_HANDLE_VALUE;
env->me_lfd = INVALID_HANDLE_VALUE;
env->me_pid = mdbx_getpid();
@ -8197,12 +8214,12 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
return MDBX_ENOMEM;
meta = *mdbx_init_metas(env, buffer);
err = mdbx_pwrite(env->me_fd, buffer, env->me_psize * NUM_METAS, 0);
err = mdbx_pwrite(env->me_lazy_fd, buffer, env->me_psize * NUM_METAS, 0);
mdbx_free(buffer);
if (unlikely(err != MDBX_SUCCESS))
return err;
err = mdbx_ftruncate(env->me_fd, filesize_before = env->me_dbgeo.now);
err = mdbx_ftruncate(env->me_lazy_fd, filesize_before = env->me_dbgeo.now);
if (unlikely(err != MDBX_SUCCESS))
return err;
@ -8461,7 +8478,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
MDBX_meta rollback = *head;
mdbx_meta_set_txnid(env, &rollback, undo_txnid);
rollback.mm_datasync_sign = MDBX_DATASIGN_WEAK;
err = mdbx_pwrite(env->me_fd, &rollback, sizeof(MDBX_meta),
err = mdbx_pwrite(env->me_lazy_fd, &rollback, sizeof(MDBX_meta),
(uint8_t *)head - (uint8_t *)env->me_map);
}
if (err) {
@ -8572,7 +8589,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
return err;
#elif defined(POSIX_FADV_DONTNEED)
err = ignore_enosys(posix_fadvise(
env->me_fd, used_aligned2os_bytes,
env->me_lazy_fd, used_aligned2os_bytes,
env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_FADV_DONTNEED));
if (unlikely(MDBX_IS_ERROR(err)))
return err;
@ -8594,7 +8611,7 @@ static int __cold mdbx_setup_dxb(MDBX_env *env, const int lck_rc) {
/* Open and/or initialize the lock region for the environment. */
static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
mode_t mode) {
mdbx_assert(env, env->me_fd != INVALID_HANDLE_VALUE);
mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE);
mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE);
int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode);
@ -8605,7 +8622,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
return err;
/* ensure the file system is read-only */
err = mdbx_check_fs_rdonly(env->me_fd, lck_pathname, err);
err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err);
if (err != MDBX_SUCCESS)
return err;
@ -8918,7 +8935,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags,
if (flags & ~(CHANGEABLE | CHANGELESS))
return MDBX_EINVAL;
if (env->me_fd != INVALID_HANDLE_VALUE ||
if (env->me_lazy_fd != INVALID_HANDLE_VALUE ||
(env->me_flags & MDBX_ENV_ACTIVE) != 0)
return MDBX_EPERM;
@ -9020,10 +9037,18 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags,
rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ
: MDBX_OPEN_DXB_LAZY,
env, dxb_pathname, &env->me_fd, mode);
env, dxb_pathname, &env->me_lazy_fd, mode);
if (rc != MDBX_SUCCESS)
goto bailout;
mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE);
if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) {
rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, dxb_pathname,
&env->me_dsync_fd, 0);
mdbx_ensure(env, (rc != MDBX_SUCCESS) ==
(env->me_dsync_fd == INVALID_HANDLE_VALUE));
}
#if MDBX_LOCKING == MDBX_LOCKING_SYSV
env->me_sysv_ipc.key = ftok(dxb_pathname, 42);
if (env->me_sysv_ipc.key == -1) {
@ -9035,7 +9060,7 @@ int __cold mdbx_env_open(MDBX_env *env, const char *pathname, unsigned flags,
#if !(defined(_WIN32) || defined(_WIN64))
if (mode == 0) {
struct stat st;
if (fstat(env->me_fd, &st)) {
if (fstat(env->me_lazy_fd, &st)) {
rc = errno;
goto bailout;
}
@ -9192,9 +9217,15 @@ static int __cold mdbx_env_close0(MDBX_env *env) {
env->me_valgrind_handle = -1;
#endif
}
if (env->me_fd != INVALID_HANDLE_VALUE) {
(void)mdbx_closefile(env->me_fd);
env->me_fd = INVALID_HANDLE_VALUE;
if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
(void)mdbx_closefile(env->me_dsync_fd);
env->me_dsync_fd = INVALID_HANDLE_VALUE;
}
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
(void)mdbx_closefile(env->me_lazy_fd);
env->me_lazy_fd = INVALID_HANDLE_VALUE;
}
if (env->me_lck)
@ -9258,7 +9289,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) {
rc = (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc;
#else
struct stat st;
if (unlikely(fstat(env->me_fd, &st)))
if (unlikely(fstat(env->me_lazy_fd, &st)))
rc = errno;
else if (st.st_nlink > 0 /* don't sync deleted files */) {
rc = mdbx_env_sync_ex(env, true, true);
@ -14318,7 +14349,7 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
#if defined(__linux__) || defined(__gnu_linux__)
off_t in_offset = offset;
const intptr_t written =
sendfile(fd, env->me_fd, &in_offset, used_size - offset);
sendfile(fd, env->me_lazy_fd, &in_offset, used_size - offset);
if (unlikely(written <= 0)) {
rc = written ? errno : MDBX_ENODATA;
break;
@ -14330,7 +14361,7 @@ static int __cold mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn,
#if __GLIBC_PREREQ(2, 27) && defined(_GNU_SOURCE)
off_t in_offset = offset, out_offset = offset;
ssize_t bytes_copied = copy_file_range(
env->me_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
env->me_lazy_fd, &in_offset, fd, &out_offset, used_size - offset, 0);
if (unlikely(bytes_copied <= 0)) {
rc = bytes_copied ? errno : MDBX_ENODATA;
break;
@ -14560,7 +14591,7 @@ int __cold mdbx_env_get_fd(MDBX_env *env, mdbx_filehandle_t *arg) {
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDBX_EBADSIGN;
*arg = env->me_fd;
*arg = env->me_lazy_fd;
return MDBX_SUCCESS;
}

View File

@ -865,7 +865,8 @@ struct MDBX_env {
size_t me_signature;
mdbx_mmap_t me_dxb_mmap; /* The main data file */
#define me_map me_dxb_mmap.dxb
#define me_fd me_dxb_mmap.fd
#define me_lazy_fd me_dxb_mmap.fd
mdbx_filehandle_t me_dsync_fd;
mdbx_mmap_t me_lck_mmap; /* The lock file */
#define me_lfd me_lck_mmap.fd
#define me_lck me_lck_mmap.lck

View File

@ -221,7 +221,7 @@ MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) {
#endif /* MDBX_LOCKING > MDBX_LOCKING_SYSV */
MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) {
assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
if (unlikely(mdbx_getpid() != env->me_pid))
return MDBX_PANIC;
#if MDBX_USE_OFDLOCKS
@ -233,7 +233,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) {
if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
rc =
lck_op(env->me_fd, op_setlk,
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
if (rc != MDBX_SUCCESS) {
mdbx_error("%s(%s) failed: errcode %u", __func__, "without-lck", rc);
@ -249,7 +249,7 @@ retry_exclusive:
if (rc == MDBX_SUCCESS) {
continue_dxb_exclusive:
rc =
lck_op(env->me_fd, op_setlk,
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
if (rc == MDBX_SUCCESS)
return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
@ -265,7 +265,7 @@ retry_exclusive:
/* Fallback to lck-shared */
}
/* Here could be one of two::
/* Here could be one of two:
* - mdbx_lck_destroy() from another process was holding the lock
* during a destruction.
* - or mdbx_lck_seize() from another process has got the exclusive
@ -317,7 +317,7 @@ retry_exclusive:
/* Lock against another process operating in without-lck or exclusive mode. */
rc =
lck_op(env->me_fd, op_setlk,
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
if (rc != MDBX_SUCCESS) {
mdbx_error("%s(%s) failed: errcode %u", __func__,
@ -337,9 +337,9 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
int rc = MDBX_SUCCESS;
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
rc = lck_op(env->me_fd, op_setlk, F_UNLCK, 0, env->me_pid);
rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, 0, env->me_pid);
if (rc == MDBX_SUCCESS)
rc = lck_op(env->me_fd, op_setlk, F_UNLCK, env->me_pid + 1,
rc = lck_op(env->me_lazy_fd, op_setlk, F_UNLCK, env->me_pid + 1,
OFF_T_MAX - env->me_pid - 1);
}
if (rc == MDBX_SUCCESS)
@ -361,7 +361,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
env->me_lck &&
/* try get exclusive access */
lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
lck_op(env->me_fd, op_setlk,
lck_op(env->me_lazy_fd, op_setlk,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX) == 0) {
@ -392,14 +392,19 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
* locks should be released here explicitly in the proper order. */
/* close dxb and restore lock */
if (env->me_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->me_fd) != 0) && rc == MDBX_SUCCESS)
if (env->me_dsync_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->me_dsync_fd) != 0) && rc == MDBX_SUCCESS)
rc = errno;
env->me_fd = INVALID_HANDLE_VALUE;
env->me_dsync_fd = INVALID_HANDLE_VALUE;
}
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
if (unlikely(close(env->me_lazy_fd) != 0) && rc == MDBX_SUCCESS)
rc = errno;
env->me_lazy_fd = INVALID_HANDLE_VALUE;
if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
/* restore file-lock */
rc = lck_op(
inprocess_neighbor->me_fd, F_SETLKW,
inprocess_neighbor->me_lazy_fd, F_SETLKW,
(inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
(inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
? 0
@ -439,7 +444,7 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env,
int semid = -1;
if (global_uniqueness_flag) {
struct stat st;
if (fstat(env->me_fd, &st))
if (fstat(env->me_lazy_fd, &st))
return errno;
sysv_retry_create:
semid = semget(env->me_sysv_ipc.key, 2,

View File

@ -151,7 +151,7 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
}
if ((env->me_flags & MDBX_EXCLUSIVE) ||
flock(env->me_fd,
flock(env->me_lazy_fd,
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
: (LCK_EXCLUSIVE | LCK_WAITFOR),
LCK_BODY))
@ -162,8 +162,9 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
}
void mdbx_txn_unlock(MDBX_env *env) {
int rc =
(env->me_flags & MDBX_EXCLUSIVE) ? TRUE : funlock(env->me_fd, LCK_BODY);
int rc = (env->me_flags & MDBX_EXCLUSIVE)
? TRUE
: funlock(env->me_lazy_fd, LCK_BODY);
LeaveCriticalSection(&env->me_windowsbug_lock);
if (!rc)
mdbx_panic("%s failed: errcode %u", __func__, GetLastError());
@ -385,24 +386,24 @@ static void lck_unlock(MDBX_env *env) {
SetLastError(ERROR_SUCCESS);
}
if (env->me_fd != INVALID_HANDLE_VALUE) {
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
/* explicitly unlock to avoid latency for other processes (windows kernel
* releases such locks via deferred queues) */
while (funlock(env->me_fd, LCK_BODY))
while (funlock(env->me_lazy_fd, LCK_BODY))
;
rc = GetLastError();
assert(rc == ERROR_NOT_LOCKED);
(void)rc;
SetLastError(ERROR_SUCCESS);
while (funlock(env->me_fd, LCK_META))
while (funlock(env->me_lazy_fd, LCK_META))
;
rc = GetLastError();
assert(rc == ERROR_NOT_LOCKED);
(void)rc;
SetLastError(ERROR_SUCCESS);
while (funlock(env->me_fd, LCK_WHOLE))
while (funlock(env->me_lazy_fd, LCK_WHOLE))
;
rc = GetLastError();
assert(rc == ERROR_NOT_LOCKED);
@ -490,7 +491,7 @@ static int internal_seize_lck(HANDLE lfd) {
MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
int rc;
assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
return MDBX_RESULT_TRUE /* nothing to do, since the files must have been
opened non-shareable */
@ -499,7 +500,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. on read-only filesystem) */
mdbx_jitter4testing(false);
if (!flock(env->me_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
rc = GetLastError();
mdbx_error("%s(%s) failed: errcode %u", __func__, "without-lck", rc);
return rc;
@ -516,7 +517,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
* - we need an exclusive lock to do so;
* - we can't lock meta-pages, otherwise another process could get an error
* while opening the db in a valid (non-conflicting) mode. */
if (!flock(env->me_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
rc = GetLastError();
mdbx_error("%s(%s) failed: errcode %u", __func__,
"lock-against-without-lck", rc);
@ -524,7 +525,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
lck_unlock(env);
} else {
mdbx_jitter4testing(false);
if (!funlock(env->me_fd, LCK_BODY))
if (!funlock(env->me_lazy_fd, LCK_BODY))
mdbx_panic("%s(%s) failed: errcode %u", __func__,
"unlock-against-without-lck", GetLastError());
}
@ -535,7 +536,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
/* Transition from exclusive state (E-?) to used (S-?) */
assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
assert(env->me_lfd != INVALID_HANDLE_VALUE);
#if 1

View File

@ -663,7 +663,6 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose,
}
#endif
return MDBX_SUCCESS;
}
@ -967,11 +966,6 @@ MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset,
#endif /* Linux */
const int mode = async ? MS_ASYNC : MS_SYNC;
int rc = (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno;
#if defined(__APPLE__) && \
MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
if (rc == MDBX_SUCCESS && mode == MS_SYNC)
rc = likely(fcntl(map->fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
#endif /* MacOS */
return rc;
#endif
}

View File

@ -562,7 +562,7 @@ enum mdbx_syncmode_bits {
};
MDBX_INTERNAL_FUNC int mdbx_filesync(mdbx_filehandle_t fd,
enum mdbx_syncmode_bits mode_bits);
const enum mdbx_syncmode_bits mode_bits);
MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length);
MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos);
MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length);