libmdbx/src/elements/lck-posix.c

437 lines
15 KiB
C
Raw Normal View History

/*
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
* and other libmdbx authors: please see AUTHORS file.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted only as authorized by the OpenLDAP
* Public License.
*
* A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>.
*/
#include "./internals.h"
/* Some platforms define the EOWNERDEAD error code
* even though they don't support Robust Mutexes.
* Compile with -DMDBX_USE_ROBUST=0. */
#ifndef MDBX_USE_ROBUST
2019-08-13 02:07:10 +03:00
#if (defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L) && !defined(__APPLE__)
#define MDBX_USE_ROBUST 1
#else
#define MDBX_USE_ROBUST 0
#endif
#endif /* MDBX_USE_ROBUST */
/*----------------------------------------------------------------------------*/
/* rthc */
2019-08-25 03:05:58 +03:00
static __cold __attribute__((__constructor__)) void
mdbx_global_constructor(void) {
mdbx_rthc_global_init();
}
2019-08-25 03:05:58 +03:00
static __cold __attribute__((__destructor__)) void
mdbx_global_destructor(void) {
mdbx_rthc_global_dtor();
}
/*----------------------------------------------------------------------------*/
/* lck */
/* Описание реализации блокировок для POSIX:
*
* lck-файл отображается в память, в нём организуется таблица читателей и
* размещаются совместно используемые posix-мьютексы (futex). Посредством
* этих мьютексов (см struct MDBX_lockinfo) реализуются:
* - Блокировка таблицы читателей для регистрации,
* т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock().
* - Блокировка БД для пишущих транзакций,
* т.е. функции mdbx_txn_lock() и mdbx_txn_unlock().
*
* Остальной функционал реализуется отдельно посредством файловых блокировок:
* - Первоначальный захват БД в режиме exclusive/shared и последующий перевод
* в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade().
* - Проверка присутствие процессов-читателей,
* т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check().
*
* Для блокировки файлов Используется только fcntl(F_SETLK), так как:
* - lockf() оперирует только эксклюзивной блокировкой и требует
* открытия файла в RW-режиме.
* - flock() не гарантирует атомарности при смене блокировок
* и оперирует только всем файлом целиком.
* - Для контроля процессов-читателей используются однобайтовые
* range-блокировки lck-файла посредством fcntl(F_SETLK). При этом
* в качестве позиции используется pid процесса-читателя.
* - Для первоначального захвата и shared/exclusive выполняется блокировка
* основного файла БД и при успехе lck-файла.
*/
#ifndef OFF_T_MAX
#define OFF_T_MAX \
((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(size_t)0xffff)
#endif
#ifndef PID_T_MAX
#define PID_T_MAX INT_MAX
#endif
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK)
#define OP_SETLK F_OFD_SETLK
#define OP_SETLKW F_OFD_SETLKW
#define OP_GETLK F_OFD_GETLK
#else
#define OP_SETLK F_SETLK
#define OP_SETLKW F_SETLKW
#define OP_GETLK F_GETLK
#endif /* OFD locks */
static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
off_t len) {
for (;;) {
struct flock lock_op;
memset(&lock_op, 0, sizeof(lock_op));
lock_op.l_type = lck;
lock_op.l_whence = SEEK_SET;
lock_op.l_start = offset;
lock_op.l_len = len;
if (fcntl(fd, cmd, &lock_op) == 0) {
if (cmd == OP_GETLK) {
/* Checks reader by pid. Returns:
* MDBX_RESULT_TRUE - if pid is live (unable to acquire lock)
* MDBX_RESULT_FALSE - if pid is dead (lock acquired). */
return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
: MDBX_RESULT_TRUE;
}
return 0;
}
int rc = errno;
if (rc != EINTR || cmd == F_SETLKW)
return rc;
}
}
MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1);
}
MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1);
}
MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
assert(pid > 0 && pid <= PID_T_MAX);
assert(PID_T_MAX < OFF_T_MAX);
return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1);
}
MDBX_INTERNAL_FUNC int __cold mdbx_lck_seize(MDBX_env *env) {
assert(env->me_fd != INVALID_HANDLE_VALUE);
assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
int rc = mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX);
if (rc != 0) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc);
return rc;
}
return MDBX_RESULT_TRUE;
}
/* try exclusive access */
int rc = mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX);
if (rc == 0) {
2019-08-25 15:56:43 +03:00
continue_exclusive:
/* got dxb-exclusive, continue lck-exclusive */
rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_WRLCK, 0, OFF_T_MAX);
if (rc == 0) {
/* got both exclusive */
return MDBX_RESULT_TRUE;
}
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_,
"lck-after-dxb-exclusive", rc);
assert(MDBX_IS_ERROR(rc));
goto bailout;
}
if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) {
rc = mdbx_lck_op(env->me_fd, OP_SETLKW,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
env->me_pid, 1);
if (rc == 0) {
2019-08-25 15:56:43 +03:00
/* got dxb-shared, try again dxb-exclusive */
rc = mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX);
if (rc == 0)
goto continue_exclusive;
/* continue lck-shared */
2019-08-11 19:13:29 +03:00
rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1);
if (rc == 0) {
/* got both dxb and lck shared lock */
return MDBX_RESULT_FALSE;
}
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck-shared", rc);
} else {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb-shared", rc);
}
assert(MDBX_IS_ERROR(rc));
}
bailout:
(void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
(void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
assert(MDBX_IS_ERROR(rc));
return rc;
}
int mdbx_lck_downgrade(MDBX_env *env, bool complete) {
assert(env->me_lfd != INVALID_HANDLE_VALUE);
2019-08-11 19:13:29 +03:00
int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 1, OFF_T_MAX - 1);
if (rc == 0)
rc = mdbx_lck_op(env->me_lfd, OP_SETLKW, F_RDLCK, 0, 1);
if (unlikely(rc != 0)) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "lck", rc);
goto bailout;
}
if (complete) {
rc = mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
env->me_pid, 1);
if (unlikely(rc != 0)) {
mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "dxb", rc);
goto bailout;
}
}
return MDBX_SUCCESS;
bailout:
(void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
(void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
assert(MDBX_IS_ERROR(rc));
return rc;
}
/*---------------------------------------------------------------------------*/
static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
const int rc);
MDBX_INTERNAL_FUNC int __cold mdbx_lck_init(MDBX_env *env,
int global_uniqueness_flag) {
if (global_uniqueness_flag == MDBX_RESULT_FALSE)
return MDBX_SUCCESS;
pthread_mutexattr_t ma;
int rc = pthread_mutexattr_init(&ma);
if (rc)
return rc;
rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
if (rc)
goto bailout;
#if MDBX_USE_ROBUST
rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
if (rc)
goto bailout;
#endif /* MDBX_USE_ROBUST */
#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU)
rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
if (rc == ENOTSUP)
rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
if (rc)
goto bailout;
#endif /* PTHREAD_PRIO_INHERIT */
rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
if (rc)
goto bailout;
rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma);
if (rc)
goto bailout;
rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma);
bailout:
pthread_mutexattr_destroy(&ma);
return rc;
}
MDBX_INTERNAL_FUNC int __cold mdbx_lck_destroy(MDBX_env *env,
MDBX_env *inprocess_neighbor) {
2019-08-11 19:13:29 +03:00
/* File locks would be released (by kernel) while the file-descriptors
* will be closed. But to avoid false-positive EDEADLK from the kernel,
* locks should be released here explicitly with properly order. */
if (env->me_lfd != INVALID_HANDLE_VALUE && !inprocess_neighbor &&
env->me_lck &&
/* try get exclusive access */
mdbx_lck_op(env->me_fd, OP_SETLK,
(env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
OFF_T_MAX) == 0 &&
mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) {
mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
if (rc == 0)
rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
assert(rc == 0);
(void)rc;
msync(env->me_lck, env->me_os_psize, MS_ASYNC);
}
/* POSIX's fcntl() locks should be restored after file was closed.
* FIXME: This code should be rethinked and retested, since it will executed
* in really rare cases. For instance, this code could wait a lot, if other
* process get exclusive access immediately after the close().
*
* On the other hand, seems more reasonable to disallow multi-open feature
* by default, and describe it as "use at your own risk". Currently
* multi-open required only for libfpta's unit-tests. */
int rc = MDBX_SUCCESS;
/* close clk and restore locks */
if (env->me_lfd != INVALID_HANDLE_VALUE) {
(void)close(env->me_lfd);
env->me_lfd = INVALID_HANDLE_VALUE;
if (inprocess_neighbor) {
/* restore file-locks */
if (rc == MDBX_SUCCESS)
rc = mdbx_lck_op(inprocess_neighbor->me_lfd, OP_SETLKW, F_RDLCK, 0, 1);
if (rc == MDBX_SUCCESS)
rc = mdbx_rpid_set(inprocess_neighbor);
}
}
/* close dxb and restore lock */
if (env->me_fd != INVALID_HANDLE_VALUE) {
(void)close(env->me_fd);
env->me_fd = INVALID_HANDLE_VALUE;
if (inprocess_neighbor && rc == MDBX_SUCCESS) {
/* restore file-lock */
rc = mdbx_lck_op(
inprocess_neighbor->me_fd, OP_SETLKW,
(inprocess_neighbor->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
(inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE)
? 0
: inprocess_neighbor->me_pid,
(inprocess_neighbor->me_lfd == INVALID_HANDLE_VALUE) ? OFF_T_MAX : 1);
}
}
if (inprocess_neighbor && rc != MDBX_SUCCESS) {
inprocess_neighbor->me_flags |= MDBX_FATAL_ERROR;
return rc;
}
return MDBX_SUCCESS;
}
static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) {
int rc = pthread_mutex_lock(mutex);
if (unlikely(rc != 0))
rc = mdbx_mutex_failed(env, mutex, rc);
return rc;
}
static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) {
int rc = pthread_mutex_trylock(mutex);
if (unlikely(rc != 0 && rc != EBUSY))
rc = mdbx_mutex_failed(env, mutex, rc);
return (rc != EBUSY) ? rc : MDBX_BUSY;
}
static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) {
int rc = pthread_mutex_unlock(mutex);
if (unlikely(rc != 0))
rc = mdbx_mutex_failed(env, mutex, rc);
return rc;
}
MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) {
mdbx_trace(">>");
int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex);
mdbx_trace("<< rc %d", rc);
return rc;
}
MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) {
mdbx_trace(">>");
int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex);
mdbx_trace("<< rc %d", rc);
if (unlikely(MDBX_IS_ERROR(rc)))
mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
}
int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
mdbx_trace(">>");
int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex)
: mdbx_robust_lock(env, env->me_wmutex);
mdbx_trace("<< rc %d", rc);
return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS;
}
void mdbx_txn_unlock(MDBX_env *env) {
mdbx_trace(">>");
int rc = mdbx_robust_unlock(env, env->me_wmutex);
mdbx_trace("<< rc %d", rc);
if (unlikely(MDBX_IS_ERROR(rc)))
mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
}
static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
const int err) {
int rc = err;
#if MDBX_USE_ROBUST
if (err == EOWNERDEAD) {
/* We own the mutex. Clean up after dead previous owner. */
int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex);
rc = MDBX_SUCCESS;
if (!rlocked) {
if (unlikely(env->me_txn)) {
/* env is hosed if the dead thread was ours */
env->me_flags |= MDBX_FATAL_ERROR;
env->me_txn = NULL;
rc = MDBX_PANIC;
}
}
mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
(rc ? "this process' env is hosed" : "recovering"));
int check_rc = mdbx_reader_check0(env, rlocked, NULL);
check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
int mreco_rc = pthread_mutex_consistent(mutex);
check_rc = (mreco_rc == 0) ? check_rc : mreco_rc;
if (unlikely(mreco_rc))
mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc));
rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
if (MDBX_IS_ERROR(rc))
pthread_mutex_unlock(mutex);
return rc;
}
#else
(void)mutex;
#endif /* MDBX_USE_ROBUST */
mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err));
if (rc != EDEADLK)
env->me_flags |= MDBX_FATAL_ERROR;
return rc;
}