mdbx: rework POSIX-lck and merge with Linux-lck.

Change-Id: Id8fbc81b9a2ad3a3a7499ecf9a012314e1f8062a
This commit is contained in:
Leonid Yuriev 2019-09-02 20:52:29 +03:00
parent 098f8a0d77
commit 874418a301
8 changed files with 384 additions and 763 deletions

View File

@ -12,7 +12,6 @@ src/alloy.c
src/elements/data.c src/elements/data.c
src/elements/internals.h src/elements/internals.h
src/elements/defs.h src/elements/defs.h
src/elements/lck-linux.c
src/elements/lck-posix.c src/elements/lck-posix.c
src/elements/lck-windows.c src/elements/lck-windows.c
src/elements/core.c src/elements/core.c

View File

@ -28,8 +28,6 @@ if(MDBX_ALLOY_MODE)
else() else()
if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(LIBMDBX_OSAL windows) set(LIBMDBX_OSAL windows)
elseif(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(LIBMDBX_OSAL linux)
else() else()
set(LIBMDBX_OSAL posix) set(LIBMDBX_OSAL posix)
endif() endif()

View File

@ -22,9 +22,7 @@
#include "elements/core.c" #include "elements/core.c"
#include "elements/osal.c" #include "elements/osal.c"
#if defined(__linux__) || defined(__gnu_linux__) #if defined(_WIN32) || defined(_WIN64)
#include "elements/lck-linux.c"
#elif defined(_WIN32) || defined(_WIN64)
#include "elements/lck-windows.c" #include "elements/lck-windows.c"
#else #else
#include "elements/lck-posix.c" #include "elements/lck-posix.c"

View File

@ -674,93 +674,80 @@ static uint64_t rrxmrrxmsx_0(uint64_t v) {
return v ^ v >> 28; return v ^ v >> 28;
} }
static int uniq_poke(const mdbx_mmap_t *map, const uint64_t cadabra) { static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) {
int rc;
if (map->lck) {
map->lck->mti_bait_uniqueness = cadabra;
mdbx_flush_noncoherent_cpu_writeback();
rc = MDBX_SUCCESS;
} else {
rc = mdbx_pwrite(map->fd, &cadabra, sizeof(map->lck->mti_bait_uniqueness),
offsetof(MDBX_lockinfo, mti_bait_uniqueness));
}
mdbx_trace("uniq-poke: %s, cadabra 0x016%" PRIx64 ", rc %d",
map->lck ? "mem" : "file", cadabra, rc);
return rc;
}
static int uniq_peek(const mdbx_mmap_t *map, const uint64_t cadabra) {
int rc; int rc;
uint64_t bait; uint64_t bait;
if (map->lck) { if (pending->address) {
mdbx_invalidate_mmap_noncoherent_cache(map->lck, sizeof(*map->lck)); bait = pending->lck->mti_bait_uniqueness;
bait = map->lck->mti_bait_uniqueness;
rc = MDBX_SUCCESS; rc = MDBX_SUCCESS;
} else { } else {
rc = mdbx_pread(map->fd, &bait, sizeof(map->lck->mti_bait_uniqueness), bait = 0 /* hush MSVC warning */;
rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), true);
if (rc == MDBX_SUCCESS)
rc =
mdbx_pread(pending->fd, &bait, sizeof(scan->lck->mti_bait_uniqueness),
offsetof(MDBX_lockinfo, mti_bait_uniqueness)); offsetof(MDBX_lockinfo, mti_bait_uniqueness));
} }
if (likely(rc == MDBX_SUCCESS) && bait == scan->lck->mti_bait_uniqueness)
rc = MDBX_RESULT_TRUE;
if (unlikely(!MDBX_IS_ERROR(rc))) mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d",
rc = (bait == cadabra) ? MDBX_RESULT_TRUE : MDBX_RESULT_FALSE; pending->lck ? "mem" : "file", bait,
mdbx_trace("uniq-peek: %s, cadabra 0x%016" PRIx64 ", bait 0x%016" PRIx64
",%s rc %d",
map->lck ? "mem" : "file", cadabra, bait,
(rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc);
return rc; return rc;
} }
__cold static int uniq_probe(const mdbx_mmap_t *map, const mdbx_pid_t pid, static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan,
MDBX_env **found) { uint64_t *abra) {
if (inprocess_lcklist_head == RTHC_ENVLIST_END) { if (*abra == 0) {
mdbx_info("<< uniq-probe: pid %u, env-list empty, skip probing, rc %d",
(unsigned)pid, MDBX_RESULT_TRUE);
return MDBX_RESULT_TRUE;
}
const mdbx_tid_t tid = mdbx_thread_self(); const mdbx_tid_t tid = mdbx_thread_self();
size_t uit = 0; size_t uit = 0;
memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit)); memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? sizeof(tid) : sizeof(uit));
uint64_t abra = *abra =
rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit);
}
for (unsigned bits = 4; bits; bits >>= 1) {
abra = abra * UINT64_C(6364136223846793005) + 1;
const uint64_t cadabra = const uint64_t cadabra =
rrxmrrxmsx_0(abra + UINT64_C(7680760450171793) * pid) << 20 | rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid())
abra >> 44; << 24 |
*abra >> 40;
scan->lck->mti_bait_uniqueness = cadabra;
mdbx_flush_noncoherent_cpu_writeback();
*abra = *abra * UINT64_C(6364136223846793005) + 1;
return uniq_peek(pending, scan);
}
int err = uniq_poke(map, cadabra); __cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) {
*found = nullptr; *found = nullptr;
for (MDBX_env *env = inprocess_lcklist_head; uint64_t salt = 0;
err == MDBX_SUCCESS && env != RTHC_ENVLIST_END; for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END;
env = env->me_lcklist_next) { scan = scan->me_lcklist_next) {
err = uniq_peek(&env->me_lck_mmap, cadabra); int err = scan->me_lck_mmap.lck->mti_bait_uniqueness
? uniq_peek(pending, &scan->me_lck_mmap)
: uniq_poke(pending, &scan->me_lck_mmap, &salt);
if (err == MDBX_RESULT_TRUE) if (err == MDBX_RESULT_TRUE)
*found = env; err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
if (err == MDBX_RESULT_TRUE) {
(void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), false);
err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
} }
if (err == MDBX_RESULT_TRUE) {
if (unlikely(MDBX_IS_ERROR(err))) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt);
mdbx_verbose("<< uniq-probe: pid %u, uit %zu, failed rc %d", *found = scan;
(unsigned)pid, uit, err); mdbx_info("<< uniq-probe: found %p", *found);
return err;
}
bits += 8 & err;
if (bits == 15) {
mdbx_info("<< uniq-probe: pid %u, uit %zu, found %p", (unsigned)pid, uit,
*found);
return MDBX_RESULT_FALSE; return MDBX_RESULT_FALSE;
} }
if (unlikely(err != MDBX_SUCCESS)) {
mdbx_verbose("<< uniq-probe: failed rc %d", err);
return err;
}
} }
mdbx_info("<< uniq-probe: pid %u, uit %zu, unique", (unsigned)pid, uit); mdbx_info("<< uniq-probe: unique");
return MDBX_RESULT_TRUE; return MDBX_RESULT_TRUE;
} }
static int lcklist_detach_locked(MDBX_env *env) { static int lcklist_detach_locked(MDBX_env *env) {
MDBX_env *dup = nullptr; MDBX_env *inprocess_neighbor = nullptr;
int rc = MDBX_SUCCESS; int rc = MDBX_SUCCESS;
if (env->me_lcklist_next != nullptr) { if (env->me_lcklist_next != nullptr) {
mdbx_ensure(env, env->me_lcklist_next != nullptr); mdbx_ensure(env, env->me_lcklist_next != nullptr);
@ -776,11 +763,11 @@ static int lcklist_detach_locked(MDBX_env *env) {
mdbx_ensure(env, env->me_lcklist_next == nullptr); mdbx_ensure(env, env->me_lcklist_next == nullptr);
} }
rc = uniq_probe(&env->me_lck_mmap, env->me_pid, &dup); rc = uniq_check(&env->me_lck_mmap, &inprocess_neighbor);
if (!dup && env->me_live_reader) if (!inprocess_neighbor && env->me_live_reader)
(void)mdbx_rpid_clear(env); (void)mdbx_rpid_clear(env);
if (!MDBX_IS_ERROR(rc)) if (!MDBX_IS_ERROR(rc))
rc = mdbx_lck_destroy(env, dup); rc = mdbx_lck_destroy(env, inprocess_neighbor);
return rc; return rc;
} }
@ -6731,11 +6718,15 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
return err; return err;
} }
MDBX_env *inprocess_neighbor = nullptr;
if (err == MDBX_RESULT_TRUE) { if (err == MDBX_RESULT_TRUE) {
MDBX_env *unused_lckdup_found; err = uniq_check(&env->me_lck_mmap, &inprocess_neighbor);
err = uniq_probe(&env->me_lck_mmap, env->me_pid, &unused_lckdup_found);
if (MDBX_IS_ERROR(err)) if (MDBX_IS_ERROR(err))
goto bailout; goto bailout;
if (inprocess_neighbor && (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)) {
err = MDBX_BUSY;
goto bailout;
}
} }
const int lck_seize_rc = err; const int lck_seize_rc = err;
@ -6814,6 +6805,7 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname,
if (lck_seize_rc == MDBX_RESULT_TRUE) { if (lck_seize_rc == MDBX_RESULT_TRUE) {
/* LY: exlcusive mode, reset lck */ /* LY: exlcusive mode, reset lck */
memset(env->me_lck, 0, (size_t)size); memset(env->me_lck, 0, (size_t)size);
mdbx_jitter4testing(false);
env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC; env->me_lck->mti_magic_and_version = MDBX_LOCK_MAGIC;
env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT; env->me_lck->mti_os_and_format = MDBX_LOCK_FORMAT;
} else { } else {
@ -6966,15 +6958,9 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags,
MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC;
if (lck_rc == MDBX_RESULT_TRUE) { if (lck_rc == MDBX_RESULT_TRUE) {
env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY);
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { rc = mdbx_lck_downgrade(env);
/* LY: downgrade lock only if exclusive access not requested. mdbx_debug("lck-downgrade-%s: rc %i",
* in case exclusive==1, just leave value as is. */ (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc);
rc = mdbx_lck_downgrade(env, true);
mdbx_debug("lck-downgrade-full: rc %i ", rc);
} else {
rc = mdbx_lck_downgrade(env, false);
mdbx_debug("lck-downgrade-partial: rc %i ", rc);
}
if (rc != MDBX_SUCCESS) if (rc != MDBX_SUCCESS)
goto bailout; goto bailout;
} else { } else {

View File

@ -1,489 +0,0 @@
/*
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
#if !(defined(__linux__) || defined(__gnu_linux__))
#error "This implementation of locking only supports Linux,\
where is no interaction between the types of lock placed\
by flock() and fcntl()."
#endif
#include "./internals.h"
#include <sys/utsname.h>
/* Compile-time selection of robust-mutex support.
 *
 * Some platforms define the EOWNERDEAD error code even though they don't
 * support Robust Mutexes, so the auto-detection below can be overridden:
 * compile with -DMDBX_USE_ROBUST=0 (or =1) to force the choice.
 *
 * When not overridden, MDBX_USE_ROBUST becomes 1 only if EOWNERDEAD is
 * defined, the target is not Android, and either the C library is not glibc,
 * or glibc is at least 2.10, or _POSIX_C_SOURCE >= 200809L. */
#ifndef MDBX_USE_ROBUST
/* Howard Chu: Android currently lacks Robust Mutex support */
#if defined(EOWNERDEAD) && \
!defined(__ANDROID__) /* LY: glibc before 2.10 has a troubles \
with Robust Mutex too. */ \
&& (!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) || \
_POSIX_C_SOURCE >= 200809L)
#define MDBX_USE_ROBUST 1
#else
#define MDBX_USE_ROBUST 0
#endif
#endif /* MDBX_USE_ROBUST */
/*----------------------------------------------------------------------------*/
/* global constructor/destructor */
#ifndef MDBX_ALLOY
/* Version of the running kernel, filled in by mdbx_global_constructor() from
 * the uname() release string: up to four numeric components, each clamped to
 * 255 and packed one per byte starting from the most significant byte.
 * Keeps its initial value when uname() fails. */
uint32_t mdbx_linux_kernel_version;
#endif /* MDBX_ALLOY */
/* Load-time initializer: decodes the kernel release string reported by
 * uname() into mdbx_linux_kernel_version, then calls mdbx_rthc_global_init().
 *
 * Up to four numeric components of the release string are consumed; each is
 * clamped to 255 and added into its own byte, most significant byte first.
 * Non-digit characters between the components are skipped. */
static __cold __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
  struct utsname uts;
  if (uname(&uts) == 0) {
    char *cursor = uts.release;
    for (int field = 0; *cursor && field < 4;) {
      if (*cursor < '0' || *cursor > '9') {
        /* separator or any other non-digit: just step over it */
        ++cursor;
        continue;
      }
      /* strtol() advances the cursor past the digits it consumed */
      long value = strtol(cursor, &cursor, 10);
      if (value > 255)
        value = 255;
      if (value > 0)
        mdbx_linux_kernel_version += value << (24 - field * 8);
      ++field;
    }
  }

  mdbx_rthc_global_init();
}
/* Unload-time finalizer, the counterpart of mdbx_global_constructor():
 * it only forwards to mdbx_rthc_global_dtor(). */
static __cold __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
mdbx_rthc_global_dtor();
}
/*----------------------------------------------------------------------------*/
/* lck */
/* Description of the locking implementation for Linux:
 *
 * The lck-file is mapped into memory; the reader table is organized in it
 * and the shared posix-mutexes (futex) are placed there. These mutexes
 * (see struct MDBX_lockinfo) are used to implement:
 *  - Locking of the reader table for registration,
 *    i.e. the functions mdbx_rdt_lock() and mdbx_rdt_unlock().
 *  - Locking of the DB for write transactions,
 *    i.e. the functions mdbx_txn_lock() and mdbx_txn_unlock().
 *
 * The remaining functionality is implemented separately by file locks:
 *  - The initial seizing of the DB in exclusive/shared mode and the
 *    subsequent switch to the operational mode, i.e. the functions
 *    mdbx_lck_seize() and mdbx_lck_downgrade().
 *  - Checking for the presence of reader processes,
 *    i.e. the functions mdbx_rpid_set(), mdbx_rpid_clear()
 *    and mdbx_rpid_check().
 *
 * Two kinds of file locks are used, flock() and fcntl(F_SETLK),
 * both for the lck-file and for the main DB file:
 *  - To track reader processes, single-byte range locks of the lck-file
 *    are taken via fcntl(F_SETLK). The pid of the reader process is used
 *    as the offset.
 *  - For the initial seizing and for shared/exclusive locking, a combination
 *    of flock() and fcntl(F_SETLK) locks of one byte of the lck-file at
 *    offset zero is used (offset zero is not used by the reader-tracking
 *    mechanism, since the pid of a user process on Linux is always
 *    greater than 0).
 *  - In addition, an flock() lock of the main DB file is used when working
 *    in the modes without an lck-file, both read-only and exclusive.
 *  - The flock() and fcntl(F_SETLK) locks work independently on Linux.
 *    Combining them therefore makes it possible to prevent sharing of the DB
 *    over NFS, which fcntl(F_SETLK) permits, while at the same time guarding
 *    against the non-atomicity of flock() when switching between the
 *    exclusive and atomic [sic; presumably "shared"] locking modes.
 */
/* fcntl() command codes used for all byte-range locking in this file.
 * They stay zero until choice_fcntl() has picked a flavour. */
static int op_setlk, op_setlkw, op_getlk;

/* Picks the fcntl() locking flavour once and stores the matching command
 * codes in op_setlk / op_setlkw / op_getlk.
 *
 * OFD (open-file-description) locks are chosen when the F_OFD_* commands are
 * available at compile time, the running kernel is new enough, and the
 * MDBX_DBG_LEGACY_MULTIOPEN runtime flag is not set. Otherwise the classic
 * process-associated F_SETLK / F_SETLKW / F_GETLK commands are used.
 *
 * Must be called only while all three codes are still unset. */
static void __cold choice_fcntl(void) {
  assert(!op_setlk && !op_setlkw && !op_getlk);
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK)
  /* NOTE(review): with the byte-per-component encoding produced by
   * mdbx_global_constructor(), the test below is also true for 3.15.x with
   * x >= 1, not only for 3.16 and later - confirm which one is intended. */
  if (mdbx_linux_kernel_version >
          0x030f0000 /* OFD locks are available since 3.15, but engages here
                        only for 3.16 and later kernels (LTS) for reliability
                        reasons */
      && (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0) {
    op_setlk = F_OFD_SETLK;
    op_setlkw = F_OFD_SETLKW;
    op_getlk = F_OFD_GETLK;
    return;
  }
#endif /* OFD locks */
  op_setlk = F_SETLK;
  op_setlkw = F_SETLKW;
  op_getlk = F_GETLK;
}
/* Largest lock length used here: the maximum positive value of off_t with the
 * low 16 bits cleared.
 *
 * The mask is deliberately built at 64-bit width. Building it from size_t
 * would yield only a 32-bit mask on targets where size_t is narrower than
 * off_t (32-bit builds with large-file support), silently truncating the
 * result to less than 4 GiB. */
#ifndef OFF_T_MAX
#define OFF_T_MAX                                                              \
  ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(uint64_t)0xffff)
#endif
/* Length argument meaning "lock the whole file". */
#define LCK_WHOLE OFF_T_MAX
/* Performs a single fcntl() byte-range lock operation on `fd`.
 *
 * cmd    - one of the op_setlk / op_setlkw / op_getlk command codes;
 * lck    - lock type to request (F_RDLCK, F_WRLCK or F_UNLCK);
 * offset - start of the range, counted from the beginning of the file;
 * len    - length of the range.
 *
 * Returns 0 on success for set-lock commands. For op_getlk it reports
 * whether the range is held by someone else:
 *   MDBX_RESULT_TRUE  - if pid is live (unable to acquire lock),
 *   MDBX_RESULT_FALSE - if pid is dead (lock could be acquired).
 * On failure the errno value is returned. An interrupted call (EINTR) is
 * retried, except for the blocking op_setlkw command. */
static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
                       off_t len) {
  int err;
  do {
    struct flock request;
    memset(&request, 0, sizeof(request));
    request.l_type = lck;
    request.l_whence = SEEK_SET;
    request.l_start = offset;
    request.l_len = len;

    if (fcntl(fd, cmd, &request) == 0) {
      if (cmd != op_getlk)
        return 0;
      /* the kernel rewrites l_type to F_UNLCK when nothing blocks the lock */
      return (request.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
                                         : MDBX_RESULT_TRUE;
    }
    err = errno;
  } while (err == EINTR && cmd != op_setlkw);
  return err;
}
/* Tries to take exclusive ownership of the lck-file without blocking:
 * first flock(LOCK_EX | LOCK_NB), then a write fcntl lock on byte 0.
 *
 * Returns 0 when both succeed, otherwise the error of the step that failed.
 * If the fcntl step fails and `fallback2shared` is set, the flock() is
 * switched back to LOCK_SH (retrying on EINTR) before that error is returned;
 * without the flag the exclusive flock() is left in place. */
static __inline int mdbx_lck_exclusive(int lfd, bool fallback2shared) {
  assert(lfd != INVALID_HANDLE_VALUE);
  if (flock(lfd, LOCK_EX | LOCK_NB) != 0)
    return errno;

  const int rc = mdbx_lck_op(lfd, op_setlk, F_WRLCK, 0, 1);
  if (rc == 0 || !fallback2shared)
    return rc;

  while (flock(lfd, LOCK_SH) != 0) {
    const int flock_err = errno;
    if (flock_err != EINTR)
      return flock_err;
  }
  return rc;
}
/* Takes shared ownership of the lck-file: a blocking flock(LOCK_SH),
 * retried while it is interrupted by EINTR, followed by a blocking read
 * fcntl lock on byte 0. Returns 0 on success or the error code. */
static __inline int mdbx_lck_shared(int lfd) {
  assert(lfd != INVALID_HANDLE_VALUE);
  for (;;) {
    if (flock(lfd, LOCK_SH) == 0)
      break;
    const int flock_err = errno;
    if (flock_err != EINTR)
      return flock_err;
  }
  return mdbx_lck_op(lfd, op_setlkw, F_RDLCK, 0, 1);
}
/* Downgrades the lock held on the lck-file after initialization.
 * With `complete` set the lock is switched to shared via mdbx_lck_shared();
 * otherwise nothing is changed and MDBX_SUCCESS is returned. */
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env, bool complete) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  if (!complete)
    return MDBX_SUCCESS;
  return mdbx_lck_shared(env->me_lfd);
}
/* Marks this process as a live reader: write-locks (non-blocking) the single
 * byte of the lck-file whose offset equals the process pid. */
MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  assert(env->me_pid > 0);
  const off_t pid_byte = env->me_pid;
  return mdbx_lck_op(env->me_lfd, op_setlk, F_WRLCK, pid_byte, 1);
}
/* Withdraws the live-reader mark: unlocks the single byte of the lck-file
 * whose offset equals the process pid, using the waiting set-lock command. */
MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  assert(env->me_pid > 0);
  const off_t pid_byte = env->me_pid;
  return mdbx_lck_op(env->me_lfd, op_setlkw, F_UNLCK, pid_byte, 1);
}
/* Probes whether the reader with the given pid is alive by asking whether a
 * write lock on that pid's byte of the lck-file could be taken.
 * Returns MDBX_RESULT_TRUE if the byte is locked (reader is live),
 * MDBX_RESULT_FALSE if it is free, or an error code. */
MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) {
  assert(env->me_lfd != INVALID_HANDLE_VALUE);
  assert(pid > 0);
  const off_t pid_byte = pid;
  return mdbx_lck_op(env->me_lfd, op_getlk, F_WRLCK, pid_byte, 1);
}
/*---------------------------------------------------------------------------*/
static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
const int rc);
/* Initializes the two process-shared mutexes (mti_rmutex and mti_wmutex)
 * that live inside the mapped lck-file.
 *
 * env                     - environment whose me_lck mapping holds the mutexes.
 * global_uniqueness_flag  - when equal to MDBX_RESULT_FALSE nothing is done
 *                           and MDBX_SUCCESS is returned; any other value
 *                           triggers (re)initialization.
 *
 * Returns 0 on success or the error code of the first pthread call that
 * failed. The attribute object is destroyed on every path that created it.
 * NOTE(review): if initializing mti_wmutex fails, the already initialized
 * mti_rmutex is not destroyed here. */
MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env,
int global_uniqueness_flag) {
if (global_uniqueness_flag == MDBX_RESULT_FALSE)
return MDBX_SUCCESS;
pthread_mutexattr_t ma;
int rc = pthread_mutexattr_init(&ma);
if (rc)
return rc;
/* the mutexes are placed in memory shared between processes */
rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
if (rc)
goto bailout;
#if MDBX_USE_ROBUST
/* request robust mutexes; old glibc only offers the _np spelling */
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \
!defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L
rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#else
rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#endif
if (rc)
goto bailout;
#endif /* MDBX_USE_ROBUST */
#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU)
/* prefer priority inheritance, fall back to none when it is unsupported */
rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
if (rc == ENOTSUP)
rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
if (rc)
goto bailout;
#endif /* PTHREAD_PRIO_INHERIT */
rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
if (rc)
goto bailout;
rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma);
if (rc)
goto bailout;
rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma);
bailout:
pthread_mutexattr_destroy(&ma);
return rc;
}
/* Releases the locking resources of `env` when it is being closed.
 *
 * env                - environment being closed.
 * inprocess_neighbor - another environment in this same process that has the
 *                      same DB open, or NULL when there is none.
 *
 * Step 1: when there is an lck-file, no in-process neighbor, the lck mapping
 * exists and exclusive access can be obtained, both shared mutexes are
 * destroyed and the first me_os_psize bytes of the mapping are scheduled for
 * write-back.
 *
 * Step 2 (only when classic F_SETLK locks are in use): the lck and dxb
 * descriptors are closed here and, if there is an in-process neighbor, its
 * file locks are taken again on its own descriptors. If that fails the
 * neighbor is flagged with MDBX_FATAL_ERROR and the error is returned.
 *
 * Returns MDBX_SUCCESS in every other case. */
MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor) {
if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor &&
env->me_lck &&
/* try to get exclusive access */ mdbx_lck_exclusive(env->me_lfd, false) ==
0) {
mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
if (rc == 0)
rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
assert(rc == 0);
(void)rc;
msync(env->me_lck, env->me_os_psize, MS_ASYNC);
/* the file locks will be released (by the kernel)
* when me_lfd is closed */
}
if (op_setlk == F_SETLK) {
/* File locks would be released (by the kernel) when the file descriptors
* are closed. But to avoid a false-positive EDEADLK from the kernel,
* the locks should be released here explicitly and in the proper order. */
/* POSIX fcntl() locks have to be restored after the file was closed.
* FIXME: This code should be rethought and retested, since it is
* executed only in really rare cases.
*
* On the other hand, it seems more reasonable to disallow the multi-open
* feature by default, and describe it as "use at your own risk". Currently
* multi-open is required only for libfpta's unit-tests. */
int rc = MDBX_SUCCESS;
/* close lck and restore locks */
if (env->me_lfd != INVALID_HANDLE_VALUE) {
(void)close(env->me_lfd);
env->me_lfd = INVALID_HANDLE_VALUE;
if (inprocess_neighbor) {
/* restore file-locks: the shared lock on byte 0 of the neighbor's
* lck-file, then the neighbor's pid byte */
if (rc == MDBX_SUCCESS)
rc = mdbx_lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
if (rc == MDBX_SUCCESS)
rc = mdbx_rpid_set(inprocess_neighbor);
}
}
/* close dxb and restore lock */
if (env->me_fd != INVALID_HANDLE_VALUE) {
(void)close(env->me_fd);
env->me_fd = INVALID_HANDLE_VALUE;
if (inprocess_neighbor && rc == MDBX_SUCCESS) {
/* restore file-lock: the whole dxb-file when the neighbor works without
* an lck-file, otherwise only the neighbor's pid byte */
rc = mdbx_lck_op(
inprocess_neighbor->me_fd, F_SETLKW,
(inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
(inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE)
? 0
: inprocess_neighbor->me_pid,
(inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) ? OFF_T_MAX
: 1);
}
}
/* a neighbor whose locks could not be restored must not keep working */
if (inprocess_neighbor && rc != MDBX_SUCCESS) {
inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
return rc;
}
}
return MDBX_SUCCESS;
}
/* Locks `mutex`, blocking if necessary. Any non-zero result of
 * pthread_mutex_lock() is handed to mdbx_mutex_failed(), whose result is
 * then returned instead. */
static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) {
  const int err = pthread_mutex_lock(mutex);
  if (unlikely(err != 0))
    return mdbx_mutex_failed(env, mutex, err);
  return err;
}
/* Attempts to lock `mutex` without blocking. A busy mutex is reported as
 * MDBX_BUSY; any other non-zero result of pthread_mutex_trylock() is handed
 * to mdbx_mutex_failed() and that function's result is returned. */
static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) {
  int err = pthread_mutex_trylock(mutex);
  if (err == EBUSY)
    return MDBX_BUSY;
  if (unlikely(err != 0))
    err = mdbx_mutex_failed(env, mutex, err);
  /* keep mapping EBUSY even when it comes back from the failure handler */
  return (err == EBUSY) ? MDBX_BUSY : err;
}
/* Unlocks `mutex`. Any non-zero result of pthread_mutex_unlock() is handed
 * to mdbx_mutex_failed(), whose result is then returned instead. */
static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) {
  const int err = pthread_mutex_unlock(mutex);
  if (unlikely(err != 0))
    return mdbx_mutex_failed(env, mutex, err);
  return err;
}
/* Acquires the reader-table mutex (mti_rmutex) of the mapped lck-file and
 * returns the result of mdbx_robust_lock() unchanged. */
MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) {
  mdbx_trace(">>");
  pthread_mutex_t *const rmutex = &env->me_lck->mti_rmutex;
  const int err = mdbx_robust_lock(env, rmutex);
  mdbx_trace("<< rc %d", err);
  return err;
}
/* Releases the reader-table mutex (mti_rmutex). An unlock result that
 * MDBX_IS_ERROR() classifies as an error is treated as unrecoverable and
 * reported through mdbx_panic(). */
MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) {
  mdbx_trace(">>");
  pthread_mutex_t *const rmutex = &env->me_lck->mti_rmutex;
  const int err = mdbx_robust_unlock(env, rmutex);
  mdbx_trace("<< rc %d", err);
  if (unlikely(MDBX_IS_ERROR(err)))
    mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, err);
}
/* Acquires the writer mutex (env->me_wmutex) for a write transaction.
 * With `dontwait` set a non-blocking attempt is made, otherwise the call
 * blocks. Returns the error code when MDBX_IS_ERROR() says the result is an
 * error, and MDBX_SUCCESS for every other result. */
int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
  mdbx_trace(">>");
  int rc;
  if (dontwait)
    rc = mdbx_robust_trylock(env, env->me_wmutex);
  else
    rc = mdbx_robust_lock(env, env->me_wmutex);
  mdbx_trace("<< rc %d", rc);
  if (MDBX_IS_ERROR(rc))
    return rc;
  return MDBX_SUCCESS;
}
/* Releases the writer mutex (env->me_wmutex). An unlock result that
 * MDBX_IS_ERROR() classifies as an error is treated as unrecoverable and
 * reported through mdbx_panic(). */
void mdbx_txn_unlock(MDBX_env *env) {
  mdbx_trace(">>");
  const int err = mdbx_robust_unlock(env, env->me_wmutex);
  mdbx_trace("<< rc %d", err);
  if (unlikely(MDBX_IS_ERROR(err)))
    mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, err);
}
/* Seizes the lck-file, preferring exclusive access.
 *
 * Returns MDBX_RESULT_TRUE when exclusive access was obtained,
 * MDBX_RESULT_FALSE when only shared access could be obtained because
 * somebody else holds the file, or an error code otherwise. */
static int __cold internal_seize_lck(int lfd) {
  assert(lfd != INVALID_HANDLE_VALUE);

  /* first attempt: go straight for exclusive access */
  int rc = mdbx_lck_exclusive(lfd, false);
  if (rc == 0)
    return MDBX_RESULT_TRUE;

  const bool busy =
      rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK;
  if (busy) {
    /* somebody else is there: settle for shared access ... */
    rc = mdbx_lck_shared(lfd);
    if (rc == 0) {
      /* ... and retry exclusive, keeping shared if that fails */
      rc = mdbx_lck_exclusive(lfd, true);
      if (rc == 0)
        return MDBX_RESULT_TRUE;

      const bool still_busy =
          rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK;
      if (still_busy)
        return MDBX_RESULT_FALSE;
    }
  }

  assert(MDBX_IS_ERROR(rc));
  return rc;
}
/* Seizes the locks for an environment that is being opened.
 *
 * Without an lck-file the whole dxb-file is locked (read lock for
 * MDBX_RDONLY, write lock otherwise) and MDBX_RESULT_TRUE is returned on
 * success. With an lck-file, a writable environment first write-locks its
 * own pid byte in the dxb-file, and then the lck-file itself is seized via
 * internal_seize_lck(), whose result is returned. Lock failures are logged
 * and their error code is returned. */
MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) {
  assert(env->me_fd != INVALID_HANDLE_VALUE);

  /* pick the fcntl() flavour on first use */
  if (unlikely(op_setlk == 0))
    choice_fcntl();

  const bool without_lck = (env->me_lfd == INVALID_HANDLE_VALUE);
  if (without_lck) {
    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
    const short whole_mode = (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK;
    const int err = mdbx_lck_op(env->me_fd, op_setlk, whole_mode, 0, LCK_WHOLE);
    if (err == 0)
      return MDBX_RESULT_TRUE;
    mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", err);
    return err;
  }

  if (!(env->me_flags & MDBX_RDONLY)) {
    /* Check that another process don't operates in without-lck mode. */
    const int err = mdbx_lck_op(env->me_fd, op_setlk, F_WRLCK, env->me_pid, 1);
    if (err != 0) {
      mdbx_error("%s(%s) failed: errcode %u", mdbx_func_,
                 "lock-against-without-lck", err);
      return err;
    }
  }

  return internal_seize_lck(env->me_lfd);
}
/* Handles a non-zero result `err` of a lock/trylock/unlock call on `mutex`.
 *
 * With robust mutexes enabled and err == EOWNERDEAD the previous owner has
 * died and this thread now holds the mutex: the reader table is checked via
 * mdbx_reader_check0(), the mutex is marked consistent again, and
 *  - MDBX_PANIC is returned (and the env flagged fatal) if the dead owner of
 *    the writer mutex was a transaction of this very environment;
 *  - otherwise the result of the recovery is returned, where a successful
 *    reader check is reported as MDBX_RESULT_TRUE and a failure to make the
 *    mutex consistent overrides it with that error code.
 * If the outcome is an error the mutex is unlocked before returning.
 *
 * For any other `err` the failure is logged, the environment is flagged with
 * MDBX_FATAL_ERROR unless the error is EDEADLK, and `err` is returned. */
static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
const int err) {
int rc = err;
#if MDBX_USE_ROBUST
if (err == EOWNERDEAD) {
/* We own the mutex. Clean up after dead previous owner. */
/* rlocked: non-zero when the failed mutex is the reader-table mutex */
int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex);
rc = MDBX_SUCCESS;
if (!rlocked) {
if (unlikely(env->me_txn)) {
/* env is hosed if the dead thread was ours */
env->me_flags |= MDBX_FATAL_ERROR;
env->me_txn = NULL;
rc = MDBX_PANIC;
}
}
mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
(rc ? "this process' env is hosed" : "recovering"));
int check_rc = mdbx_reader_check0(env, rlocked, NULL);
/* a clean reader check is reported to the caller as MDBX_RESULT_TRUE */
check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \
!defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L
int mreco_rc = pthread_mutex_consistent_np(mutex);
#else
int mreco_rc = pthread_mutex_consistent(mutex);
#endif
/* a failure to make the mutex consistent takes precedence */
check_rc = (mreco_rc == 0) ? check_rc : mreco_rc;
if (unlikely(mreco_rc))
mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc));
/* MDBX_PANIC decided above is kept; otherwise report the recovery result */
rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
if (MDBX_IS_ERROR(rc))
pthread_mutex_unlock(mutex);
return rc;
}
#else
(void)mutex;
#endif /* MDBX_USE_ROBUST */
mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err));
if (rc != EDEADLK)
env->me_flags |= MDBX_FATAL_ERROR;
return rc;
}

View File

@ -14,22 +14,63 @@
#include "./internals.h" #include "./internals.h"
/* Some platforms define the EOWNERDEAD error code /* Some platforms define the EOWNERDEAD error code even though they
* even though they don't support Robust Mutexes. * don't support Robust Mutexes. Compile with -DMDBX_USE_ROBUST=0. */
* Compile with -DMDBX_USE_ROBUST=0. */
#ifndef MDBX_USE_ROBUST #ifndef MDBX_USE_ROBUST
#if (defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L) && !defined(__APPLE__) /* Howard Chu: Android currently lacks Robust Mutex support */
#if defined(EOWNERDEAD) && !defined(__ANDROID__) && !defined(__APPLE__) && \
(!defined(__GLIBC__) || \
__GLIBC_PREREQ( \
2, \
10) /* LY: glibc before 2.10 has a troubles with Robust Mutex too. */ \
|| _POSIX_C_SOURCE >= 200809L)
#define MDBX_USE_ROBUST 1 #define MDBX_USE_ROBUST 1
#else #else
#define MDBX_USE_ROBUST 0 #define MDBX_USE_ROBUST 0
#endif #endif
#endif /* MDBX_USE_ROBUST */ #endif /* MDBX_USE_ROBUST */
#ifndef MDBX_USE_OFDLOCKS
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK)
#define MDBX_USE_OFDLOCKS 1
#else
#define MDBX_USE_OFDLOCKS 0
#endif
#endif /* MDBX_USE_OFDLOCKS */
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
/* rthc */ /* global constructor/destructor */
#if defined(__linux__) || defined(__gnu_linux__)
#include <sys/utsname.h>
#ifndef MDBX_ALLOY
uint32_t mdbx_linux_kernel_version;
#endif /* MDBX_ALLOY */
#endif /* Linux */
static __cold __attribute__((__constructor__)) void static __cold __attribute__((__constructor__)) void
mdbx_global_constructor(void) { mdbx_global_constructor(void) {
#if defined(__linux__) || defined(__gnu_linux__)
struct utsname buffer;
if (uname(&buffer) == 0) {
int i = 0;
char *p = buffer.release;
while (*p && i < 4) {
if (*p >= '0' && *p <= '9') {
long number = strtol(p, &p, 10);
if (number > 0) {
if (number > 255)
number = 255;
mdbx_linux_kernel_version += number << (24 - i * 8);
}
++i;
} else {
++p;
}
}
}
#endif /* Linux */
mdbx_rthc_global_init(); mdbx_rthc_global_init();
} }
@ -41,7 +82,7 @@ mdbx_global_destructor(void) {
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
/* lck */ /* lck */
/* Описание реализации блокировок для POSIX: /* Описание реализации блокировок для POSIX & Linux:
* *
* lck-файл отображается в память, в нём организуется таблица читателей и * lck-файл отображается в память, в нём организуется таблица читателей и
* размещаются совместно используемые posix-мьютексы (futex). Посредством * размещаются совместно используемые posix-мьютексы (futex). Посредством
@ -57,7 +98,7 @@ mdbx_global_destructor(void) {
* - Проверка присутствие процессов-читателей, * - Проверка присутствие процессов-читателей,
* т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check().
* *
* Для блокировки файлов Используется только fcntl(F_SETLK), так как: * Для блокировки файлов используется fcntl(F_SETLK), так как:
* - lockf() оперирует только эксклюзивной блокировкой и требует * - lockf() оперирует только эксклюзивной блокировкой и требует
* открытия файла в RW-режиме. * открытия файла в RW-режиме.
* - flock() не гарантирует атомарности при смене блокировок * - flock() не гарантирует атомарности при смене блокировок
@ -67,28 +108,68 @@ mdbx_global_destructor(void) {
* в качестве позиции используется pid процесса-читателя. * в качестве позиции используется pid процесса-читателя.
* - Для первоначального захвата и shared/exclusive выполняется блокировка * - Для первоначального захвата и shared/exclusive выполняется блокировка
* основного файла БД и при успехе lck-файла. * основного файла БД и при успехе lck-файла.
*
* ----------------------------------------------------------------------------
* УДЕРЖИВАЕМЫЕ БЛОКИРОВКИ В ЗАВИСИМОСТИ ОТ РЕЖИМА И СОСТОЯНИЯ
*
* Эксклюзивный режим без lck-файла:
* = заблокирован весь dxb-файл посредством F_RDLCK или F_WRLCK,
* в зависимости от MDBX_RDONLY.
*
* Не-операционный режим на время пере-инициализации и разрушении lck-файла:
* = F_WRLCK блокировка первого байта lck-файла, другие процессы ждут её
* снятия при получении F_RDLCK через F_SETLKW.
* - блокировки dxb-файла могут меняться до снятие эксклюзивной блокировки
* lck-файла:
* + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
* посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
* + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла
* посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
*
* ОПЕРАЦИОННЫЙ режим с lck-файлом:
* = F_RDLCK блокировка первого байта lck-файла, другие процессы не могут
* получить F_WRLCK и таким образом видят что БД используется.
* + F_WRLCK блокировка pid-байта в clk-файле после первой транзакции чтения.
* + для НЕ-эксклюзивного режима блокировка pid-байта в dxb-файле
* посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
* + для ЭКСКЛЮЗИВНОГО режима блокировка pid-байта всего dxb-файла
* посредством F_RDLCK или F_WRLCK, в зависимости от MDBX_RDONLY.
*/ */
#if MDBX_USE_OFDLOCKS
static int op_setlk, op_setlkw, op_getlk;
static void __cold choice_fcntl() {
assert(!op_setlk && !op_setlkw && !op_getlk);
if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0
#if defined(__linux__) || defined(__gnu_linux__)
&& mdbx_linux_kernel_version >
0x030f0000 /* OFD locks are available since 3.15, but engages here
only for 3.16 and larer kernels (LTS) for reliability reasons */
#endif /* linux */
) {
op_setlk = F_OFD_SETLK;
op_setlkw = F_OFD_SETLKW;
op_getlk = F_OFD_GETLK;
return;
}
op_setlk = F_SETLK;
op_setlkw = F_SETLKW;
op_getlk = F_GETLK;
}
#else
#define op_setlk F_SETLK
#define op_setlkw F_SETLKW
#define op_getlk F_GETLK
#endif /* MDBX_USE_OFDLOCKS */
#ifndef OFF_T_MAX #ifndef OFF_T_MAX
#define OFF_T_MAX \ #define OFF_T_MAX \
((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff) ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff)
#endif #endif
#ifndef PID_T_MAX
#define PID_T_MAX INT_MAX
#endif
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) static int lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
#define OP_SETLK F_OFD_SETLK
#define OP_SETLKW F_OFD_SETLKW
#define OP_GETLK F_OFD_GETLK
#else
#define OP_SETLK F_SETLK
#define OP_SETLKW F_SETLKW
#define OP_GETLK F_GETLK
#endif /* OFD locks */
static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
off_t len) { off_t len) {
mdbx_jitter4testing(true);
for (;;) { for (;;) {
struct flock lock_op; struct flock lock_op;
memset(&lock_op, 0, sizeof(lock_op)); memset(&lock_op, 0, sizeof(lock_op));
@ -96,131 +177,212 @@ static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
lock_op.l_whence = SEEK_SET; lock_op.l_whence = SEEK_SET;
lock_op.l_start = offset; lock_op.l_start = offset;
lock_op.l_len = len; lock_op.l_len = len;
if (fcntl(fd, cmd, &lock_op) == 0) { int rc = fcntl(fd, cmd, &lock_op);
if (cmd == OP_GETLK) { mdbx_jitter4testing(true);
if (rc != -1) {
if (cmd == op_getlk) {
/* Checks reader by pid. Returns: /* Checks reader by pid. Returns:
* MDBX_RESULT_TRUE - if pid is live (unable to acquire lock) * MDBX_RESULT_TRUE - if pid is live (unable to acquire lock)
* MDBX_RESULT_FALSE - if pid is dead (lock acquired). */ * MDBX_RESULT_FALSE - if pid is dead (lock acquired). */
return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
: MDBX_RESULT_TRUE; : MDBX_RESULT_TRUE;
} }
return 0; return MDBX_SUCCESS;
} }
int rc = errno; rc = errno;
if (rc != EINTR || cmd == F_SETLKW) if (rc != EINTR || cmd == op_setlkw) {
mdbx_assert(nullptr, MDBX_IS_ERROR(rc));
return rc; return rc;
} }
}
} }
MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); assert(env->me_pid > 0);
return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1); return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1);
} }
MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); assert(env->me_pid > 0);
return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1); return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1);
} }
MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) {
assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(pid > 0 && pid <= PID_T_MAX); assert(pid > 0);
assert(PID_T_MAX < OFF_T_MAX); return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1);
return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1);
} }
/*---------------------------------------------------------------------------*/
MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) { MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) {
assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX); #if MDBX_USE_OFDLOCKS
if (unlikely(op_setlk == 0))
choice_fcntl();
#endif /* MDBX_USE_OFDLOCKS */
int rc;
if (env->me_lfd == INVALID_HANDLE_VALUE) { if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
int rc = mdbx_lck_op(env->me_fd, OP_SETLK, rc =
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, lck_op(env->me_fd, op_setlk,
OFF_T_MAX); (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
if (rc != 0) { if (rc != MDBX_SUCCESS) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc);
mdbx_assert(env, MDBX_IS_ERROR(rc));
return rc; return rc;
} }
return MDBX_RESULT_TRUE; return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
} }
/* try exclusive access */ /* Firstly try to get exclusive locking. */
int rc = mdbx_lck_op(env->me_fd, OP_SETLK, rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, if (rc == MDBX_SUCCESS) {
OFF_T_MAX); continue_dxb_exclusive:
if (rc == 0) { rc =
continue_exclusive: lck_op(env->me_fd, op_setlk,
/* got dxb-exclusive, continue lck-exclusive */ (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX);
rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_WRLCK, 0, OFF_T_MAX); if (rc == MDBX_SUCCESS)
if (rc == 0) { return MDBX_RESULT_TRUE /* Done: return with exclusive locking. */;
/* got both exclusive */
return MDBX_RESULT_TRUE; /* the cause may be a collision with POSIX's file-lock recovery. */
} if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, rc == EDEADLK)) {
"lck-after-dxb-exclusive", rc); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-exclusive", rc);
assert(MDBX_IS_ERROR(rc)); mdbx_assert(env, MDBX_IS_ERROR(rc));
goto bailout; return rc;
} }
if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) { /* Fallback to lck-shared */
rc = mdbx_lck_op(env->me_fd, OP_SETLKW, rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, if (rc != MDBX_SUCCESS) {
env->me_pid, 1); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "fallback-shared",
if (rc == 0) { rc);
/* got dxb-shared, try again dxb-exclusive */ mdbx_assert(env, MDBX_IS_ERROR(rc));
rc = mdbx_lck_op(env->me_fd, OP_SETLK, return rc;
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, }
OFF_T_MAX); /* Done: return with shared locking. */
if (rc == 0)
goto continue_exclusive;
/* continue lck-shared */
rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1);
if (rc == 0) {
/* got both dxb and lck shared lock */
return MDBX_RESULT_FALSE; return MDBX_RESULT_FALSE;
} }
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck-shared", rc);
} else { /* Wait for lck-shared now. */
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-shared", rc); /* Here may be await during transient processes, for instance until another
} * competing process doesn't call lck_downgrade(). */
assert(MDBX_IS_ERROR(rc)); rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1);
if (rc != MDBX_SUCCESS) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "try-shared", rc);
mdbx_assert(env, MDBX_IS_ERROR(rc));
return rc;
} }
bailout: /* Lock against another process operating in without-lck or exclusive mode. */
(void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); rc =
(void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX); lck_op(env->me_fd, op_setlk,
assert(MDBX_IS_ERROR(rc)); (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1);
if (rc != MDBX_SUCCESS) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_,
"lock-against-without-lck", rc);
mdbx_assert(env, MDBX_IS_ERROR(rc));
return rc;
}
/* got shared, retry exclusive */
rc = lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, 1);
if (rc == MDBX_SUCCESS)
goto continue_dxb_exclusive;
if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK ||
rc == EDEADLK)
return MDBX_RESULT_FALSE /* Done: exclusive is unavailable,
but shared locks are alive. */
;
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "try-exclusive", rc);
mdbx_assert(env, MDBX_IS_ERROR(rc));
return rc; return rc;
} }
int mdbx_lck_downgrade(MDBX_env *env, bool complete) { MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE);
int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 1, OFF_T_MAX - 1); int rc = MDBX_SUCCESS;
if (rc == 0) if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1); rc = lck_op(env->me_fd, op_setlk, F_UNLCK, 0, env->me_pid);
if (rc == MDBX_SUCCESS)
rc = lck_op(env->me_fd, op_setlk, F_UNLCK, env->me_pid + 1,
OFF_T_MAX - env->me_pid - 1);
}
if (rc == MDBX_SUCCESS)
rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1);
if (unlikely(rc != 0)) { if (unlikely(rc != 0)) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc); mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc);
goto bailout;
}
if (complete) {
rc = mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
env->me_pid, 1);
if (unlikely(rc != 0)) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb", rc);
goto bailout;
}
}
return MDBX_SUCCESS;
bailout:
(void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
(void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
assert(MDBX_IS_ERROR(rc)); assert(MDBX_IS_ERROR(rc));
}
return rc;
}
/* Tears down the locking state of `env` and closes its dxb/lck descriptors.
 *
 * Steps, in order:
 *  1) If `env` has an lck-file descriptor and a mapped lock region, and there
 *     is no `inprocess_neighbor`, try (without waiting) to take exclusive
 *     locks on both the lck-file and the dxb-file. Only when BOTH locks are
 *     obtained are the shared mutexes destroyed and the lock region
 *     overwritten with the 0x81 filler byte.
 *  2) Close the dxb descriptor, then the lck descriptor. When plain
 *     POSIX fcntl() locks are in use (op_setlk == F_SETLK) and an
 *     `inprocess_neighbor` is given, re-acquire the neighbor's locks after
 *     each close().
 *
 * \param env                 environment being destroyed.
 * \param inprocess_neighbor  another environment in this process whose locks
 *                            are re-acquired after the close() calls, or NULL.
 * \return MDBX_SUCCESS, or the first error encountered (from
 *         pthread_mutex_destroy(), close() or lck_op()). On error with a
 *         non-NULL neighbor, MDBX_FATAL_ERROR is set in the neighbor's flags. */
MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
                                               MDBX_env *inprocess_neighbor) {
  int rc = MDBX_SUCCESS;
  if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor &&
      env->me_lck &&
      /* try get exclusive access: both locks must succeed (lck_op() returns
       * zero on success), otherwise someone else is still using the files
       * and the mutexes must be left intact. */
      lck_op(env->me_lfd, op_setlk, F_WRLCK, 0, OFF_T_MAX) == 0 &&
      lck_op(env->me_fd, op_setlk,
             (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
             OFF_T_MAX) == 0) {
    mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
    rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
    if (rc == 0)
      rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
    mdbx_assert(env, rc == 0);
    if (rc == 0) {
      /* fill the lock region with a recognizable pattern and schedule
       * write-back of its first page */
      memset(env->me_lck, 0x81, sizeof(MDBX_lockinfo));
      msync(env->me_lck, env->me_os_psize, MS_ASYNC);
    }
    mdbx_jitter4testing(false);
  }

  /* 1) POSIX's fcntl() locks (i.e. when op_setlk == F_SETLK) should be restored
   * after file was closed.
   *
   * 2) File locks would be released (by kernel) while the file-descriptors will
   * be closed. But to avoid false-positive EACCES and EDEADLK from the kernel,
   * locks should be released here explicitly with properly order. */

  /* close dxb and restore lock */
  if (env->me_fd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_fd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_fd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore file-lock: the whole range for an exclusive-mode neighbor,
       * otherwise the single byte at offset == neighbor's pid */
      rc = lck_op(
          inprocess_neighbor->me_fd, F_SETLKW,
          (inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE)
              ? 0
              : inprocess_neighbor->me_pid,
          (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) ? OFF_T_MAX : 1);
    }
  }

  /* close lck and restore locks */
  if (env->me_lfd != INVALID_HANDLE_VALUE) {
    if (unlikely(close(env->me_lfd) != 0) && rc == MDBX_SUCCESS)
      rc = errno;
    env->me_lfd = INVALID_HANDLE_VALUE;
    if (op_setlk == F_SETLK && inprocess_neighbor && rc == MDBX_SUCCESS) {
      /* restore file-locks: shared lock on the first byte, plus the
       * reader-pid lock when the neighbor has a live reader */
      rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1);
      if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader)
        rc = mdbx_rpid_set(inprocess_neighbor);
    }
  }

  /* the neighbor can no longer rely on its locks */
  if (inprocess_neighbor && rc != MDBX_SUCCESS)
    inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
  return rc;
}
@ -231,7 +393,7 @@ static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env, MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env,
int global_uniqueness_flag) { int global_uniqueness_flag) {
if (global_uniqueness_flag == MDBX_RESULT_FALSE) if (global_uniqueness_flag != MDBX_RESULT_TRUE)
return MDBX_SUCCESS; return MDBX_SUCCESS;
pthread_mutexattr_t ma; pthread_mutexattr_t ma;
@ -244,7 +406,12 @@ MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env,
goto bailout; goto bailout;
#if MDBX_USE_ROBUST #if MDBX_USE_ROBUST
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \
!defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L
rc = pthread_mutexattr_setrobust_np(&ma, PTHREAD_MUTEX_ROBUST_NP);
#else
rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST); rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
#endif
if (rc) if (rc)
goto bailout; goto bailout;
#endif /* MDBX_USE_ROBUST */ #endif /* MDBX_USE_ROBUST */
@ -271,75 +438,8 @@ bailout:
return rc; return rc;
} }
/* Destroys the shared mutexes when exclusive access can be obtained, then
 * closes the lck and dxb descriptors of `env`, re-acquiring the locks of
 * `inprocess_neighbor` (if given) after each close().
 *
 * Returns MDBX_SUCCESS, or the error from re-acquiring the neighbor's locks;
 * in the latter case MDBX_FATAL_ERROR is also set in the neighbor's flags.
 * Results of close() and of the mutex destruction are not reported. */
MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor) {
/* File locks would be released (by the kernel) when the file descriptors
 * are closed. But to avoid a false-positive EDEADLK from the kernel,
 * locks should be released here explicitly and in the proper order. */
if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor &&
env->me_lck &&
/* try to get exclusive access (non-waiting): dxb first, then lck;
 * both calls must return 0 for the branch to be taken */
mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX) == 0 &&
mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) {
mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
if (rc == 0)
rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
assert(rc == 0);
/* this rc is local to the branch and is deliberately discarded */
(void)rc;
msync(env->me_lck, env->me_os_psize, MS_ASYNC);
}

/* POSIX's fcntl() locks should be restored after the file was closed.
 * FIXME: This code should be rethought and retested, since it is executed
 * only in really rare cases. For instance, this code could wait a long time
 * if another process gets exclusive access immediately after the close().
 *
 * On the other hand, it seems more reasonable to disallow the multi-open
 * feature by default, and describe it as "use at your own risk". Currently
 * multi-open is required only for libfpta's unit-tests. */
int rc = MDBX_SUCCESS;
/* close lck and restore locks */
if (env->me_lfd != INVALID_HANDLE_VALUE) {
(void)close(env->me_lfd);
env->me_lfd = INVALID_HANDLE_VALUE;
if (inprocess_neighbor) {
/* restore file-locks: a waiting shared lock on the first byte of the
 * neighbor's lck-file, then the neighbor's reader-pid lock.
 * Note: rc is still MDBX_SUCCESS at the first check below. */
if (rc == MDBX_SUCCESS)
rc = mdbx_lck_op(inprocess_neighbor->me_lfd, OP_SETLKW, F_RDLCK, 0, 1);
if (rc == MDBX_SUCCESS)
rc = mdbx_rpid_set(inprocess_neighbor);
}
}

/* close dxb and restore lock */
if (env->me_fd != INVALID_HANDLE_VALUE) {
(void)close(env->me_fd);
env->me_fd = INVALID_HANDLE_VALUE;
if (inprocess_neighbor && rc == MDBX_SUCCESS) {
/* restore file-lock (waiting): the whole range when the neighbor has no
 * lck descriptor, otherwise the single byte at offset == neighbor's pid */
rc = mdbx_lck_op(
inprocess_neighbor->me_fd, OP_SETLKW,
(inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
(inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE)
? 0
: inprocess_neighbor->me_pid,
(inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) ? OFF_T_MAX : 1);
}
}

/* a failed restore leaves the neighbor without its locks: flag it as broken */
if (inprocess_neighbor && rc != MDBX_SUCCESS) {
inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
return rc;
}
return MDBX_SUCCESS;
}
static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) { static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) {
mdbx_jitter4testing(true);
int rc = pthread_mutex_lock(mutex); int rc = pthread_mutex_lock(mutex);
if (unlikely(rc != 0)) if (unlikely(rc != 0))
rc = mdbx_mutex_failed(env, mutex, rc); rc = mdbx_mutex_failed(env, mutex, rc);
@ -347,6 +447,7 @@ static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) {
} }
static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) { static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) {
mdbx_jitter4testing(true);
int rc = pthread_mutex_trylock(mutex); int rc = pthread_mutex_trylock(mutex);
if (unlikely(rc != 0 && rc != EBUSY)) if (unlikely(rc != 0 && rc != EBUSY))
rc = mdbx_mutex_failed(env, mutex, rc); rc = mdbx_mutex_failed(env, mutex, rc);
@ -355,6 +456,7 @@ static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) {
static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) { static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) {
int rc = pthread_mutex_unlock(mutex); int rc = pthread_mutex_unlock(mutex);
mdbx_jitter4testing(true);
if (unlikely(rc != 0)) if (unlikely(rc != 0))
rc = mdbx_mutex_failed(env, mutex, rc); rc = mdbx_mutex_failed(env, mutex, rc);
return rc; return rc;
@ -414,7 +516,12 @@ static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
int check_rc = mdbx_reader_check0(env, rlocked, NULL); int check_rc = mdbx_reader_check0(env, rlocked, NULL);
check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2, 12) && \
!defined(pthread_mutex_consistent) && _POSIX_C_SOURCE < 200809L
int mreco_rc = pthread_mutex_consistent_np(mutex);
#else
int mreco_rc = pthread_mutex_consistent(mutex); int mreco_rc = pthread_mutex_consistent(mutex);
#endif
check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; check_rc = (mreco_rc == 0) ? check_rc : mreco_rc;
if (unlikely(mreco_rc)) if (unlikely(mreco_rc))

View File

@ -341,17 +341,32 @@ mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) {
/* global `initial` lock for lockfile initialization, /* global `initial` lock for lockfile initialization,
* exclusive/shared locking first cacheline */ * exclusive/shared locking first cacheline */
/* FIXME: locking schema/algo description. /* Brief description of the locking schema/algorithm:
?-? = free * - Windows does not support upgrading or downgrading for file locking.
S-? = used * - Therefore upgrading/downgrading is emulated by shared and exclusive
E-? = exclusive-read * locking of upper and lower halves.
?-S * - In other words, we have FSM with possible 9 states,
?-E = middle * i.e. free/shared/exclusive x free/shared/exclusive == 9.
S-S * Only 6 states of the FSM are used, 2 of which are transitive.
S-E = locked *
E-S * The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked
E-E = exclusive-write * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible,
*/ * or to the "used" (and returns MDBX_RESULT_FALSE).
*
* The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write"
* state to the "used" (i.e. shared) state.
*
* States:
* ?-? = free, i.e. unlocked
* S-? = used, i.e. shared lock
* E-? = exclusive-read, i.e. operational exclusive
* ?-S
* ?-E = middle (transitive state)
* S-S
* S-E = locked (transitive state)
* E-S
* E-E = exclusive-write, i.e. exclusive due (re)initialization
*/
static void lck_unlock(MDBX_env *env) { static void lck_unlock(MDBX_env *env) {
int rc; int rc;
@ -414,8 +429,8 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
} }
/* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE) /* Seize state as 'exclusive-write' (E-E and returns MDBX_RESULT_TRUE)
* or as 'used' (S-? and returns MDBX_RESULT_FALSE), otherwise returns an error * or as 'used' (S-? and returns MDBX_RESULT_FALSE).
*/ * Otherwise returns an error. */
static int internal_seize_lck(HANDLE lfd) { static int internal_seize_lck(HANDLE lfd) {
int rc; int rc;
assert(lfd != INVALID_HANDLE_VALUE); assert(lfd != INVALID_HANDLE_VALUE);
@ -511,23 +526,25 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) {
return rc; return rc;
} }
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env, bool complete) { MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) {
/* Transite from exclusive state (E-?) to used (S-?) */ /* Transite from exclusive state (E-?) to used (S-?) */
assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE);
#if 1
if (env->me_flags & MDBX_EXCLUSIVE) if (env->me_flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
; ;
#else
/* 1) must be at E-E (exclusive-write) */ /* 1) must be at E-E (exclusive-write) */
if (!complete) { if (env->me_flags & MDBX_EXCLUSIVE) {
/* transite from E-E to E_? (exclusive-read) */ /* transite from E-E to E_? (exclusive-read) */
if (!funlock(env->me_lfd, LCK_UPPER)) if (!funlock(env->me_lfd, LCK_UPPER))
mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_, mdbx_panic("%s(%s) failed: errcode %u", mdbx_func_,
"E-E(exclusive-write) >> E-?(exclusive-read)", GetLastError()); "E-E(exclusive-write) >> E-?(exclusive-read)", GetLastError());
return MDBX_SUCCESS /* 2) now at E-? (exclusive-read), done */; return MDBX_SUCCESS /* 2) now at E-? (exclusive-read), done */;
} }
#endif
/* 3) now at E-E (exclusive-write), transite to ?_E (middle) */ /* 3) now at E-E (exclusive-write), transite to ?_E (middle) */
if (!funlock(env->me_lfd, LCK_LOWER)) if (!funlock(env->me_lfd, LCK_LOWER))

View File

@ -698,12 +698,17 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env,
MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env);
/// \brief Снижает уровень первоначальной захваченной блокировки до /// \brief Снижает уровень первоначальной захваченной блокировки до
/// операционного уровня определяемого аргументом. /// операционного уровня определяемого аргументом. Смысл функции в возврате
/// \param /// в операционный режим:
/// complete = TRUE - понижение до разделяемой блокировки. /// - разблокирование других процессов ожидающих доступа, т.е если
/// complete = FALSE - понижение до эксклюзивной операционной блокировки. /// (env->me_flags & MDBX_EXCLUSIVE) != 0, то другие процессы должны узнать
/// о невозможности доступа, а не ждать его.
/// - снятия блокировок мешающих работе с файлом (актуально для Windows).
/// (env->me_flags & MDBX_EXCLUSIVE) == 0 - понижение до разделяемой
/// блокировки. (env->me_flags & MDBX_EXCLUSIVE) != 0 - понижение до
/// эксклюзивной операционной блокировки.
/// \return Код ошибки или 0 в случае успеха. /// \return Код ошибки или 0 в случае успеха.
MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env, bool complete); MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env);
/// \brief Блокирует lck-файл и/или таблицу читателей для (де)регистрации. /// \brief Блокирует lck-файл и/или таблицу читателей для (де)регистрации.
/// \return Код ошибки или 0 в случае успеха. /// \return Код ошибки или 0 в случае успеха.