From c6369e68a374374aa277e9c2580ac8700dccd043 Mon Sep 17 00:00:00 2001
From: Leonid Yuriev
Date: Sat, 10 Aug 2019 15:38:09 +0300
Subject: [PATCH] mdbx: draft support for non-Linux POSIX-platforms (FreeBSD, MacOS, etc).

Change-Id: Iaee2dc31b134fe92fc67508d011835a60f3723e6
---
 Makefile        |  25 +++-
 libmdbx.files   |   1 +
 src/lck-posix.c | 366 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 390 insertions(+), 2 deletions(-)
 create mode 100644 src/lck-posix.c

diff --git a/Makefile b/Makefile
index d0ebf870..d5e628b6 100644
--- a/Makefile
+++ b/Makefile
@@ -50,10 +50,31 @@ TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk
 MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1
 SHELL := /bin/bash
 
-CORE_SRC := $(filter-out src/lck-windows.c, $(wildcard src/*.c))
+ifdef MSVC
+  LCK_IMPL := windows
+  TEST_OSAL := windows
+else
+  define uname2lck
+    case "`uname -s 2>/dev/null`" in
+      Linux) echo linux;;
+      CYGWIN*|MINGW32*|MSYS*) echo windows;;
+      *) echo posix;;
+    esac
+  endef
+  define uname2osal
+    case "`uname -s 2>/dev/null`" in
+      CYGWIN*|MINGW32*|MSYS*) echo windows;;
+      *) echo unix;;
+    esac
+  endef
+  LCK_IMPL := $(shell $(uname2lck))
+  TEST_OSAL := $(shell $(uname2osal))
+endif
+
+CORE_SRC := src/lck-$(LCK_IMPL).c $(filter-out $(wildcard src/lck-*.c), $(wildcard src/*.c))
 CORE_INC := $(wildcard src/*.h)
 CORE_OBJ := $(patsubst %.c,%.o,$(CORE_SRC))
-TEST_SRC := $(filter-out test/osal-windows.cc, $(wildcard test/*.cc))
+TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc))
 TEST_INC := $(wildcard test/*.h)
 TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC))
 
diff --git a/libmdbx.files b/libmdbx.files
index e0d851ef..cdea405b 100644
--- a/libmdbx.files
+++ b/libmdbx.files
@@ -9,6 +9,7 @@ mdbx.h
 src/bits.h
 src/defs.h
 src/lck-linux.c
+src/lck-posix.c
 src/lck-windows.c
 src/mdbx.c
 src/osal.c
diff --git a/src/lck-posix.c b/src/lck-posix.c
new file mode 100644
index 00000000..8440c3fd
--- /dev/null
+++ b/src/lck-posix.c
@@ -0,0 +1,413 @@
+/*
+ * Copyright 2015-2019 Leonid Yuriev
+ * and other libmdbx authors: please see AUTHORS file.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted only as authorized by the OpenLDAP
+ * Public License.
+ *
+ * A copy of this license is available in the file LICENSE in the
+ * top-level directory of the distribution or, alternatively, at
+ * .
+ */
+
+#include "./bits.h"
+
+/* Some platforms define the EOWNERDEAD error code
+ * even though they don't support Robust Mutexes.
+ * Compile with -DMDBX_USE_ROBUST=0. */
+#ifndef MDBX_USE_ROBUST
+#if defined(EOWNERDEAD) || _POSIX_C_SOURCE >= 200809L
+#define MDBX_USE_ROBUST 1
+#else
+#define MDBX_USE_ROBUST 0
+#endif
+#endif /* MDBX_USE_ROBUST */
+
+/*----------------------------------------------------------------------------*/
+/* rthc */
+
+static __cold __attribute__((constructor)) void mdbx_global_constructor(void) {
+  mdbx_rthc_global_init();
+}
+
+static __cold __attribute__((destructor)) void mdbx_global_destructor(void) {
+  mdbx_rthc_global_dtor();
+}
+
+/*----------------------------------------------------------------------------*/
+/* lck */
+
+/* Description of the locking implementation for POSIX:
+ *
+ * The lck-file is mapped into memory; the reader table is organized in it
+ * and the shared posix-mutexes (futex) are placed there. By means of
+ * these mutexes (see struct MDBX_lockinfo) the following is implemented:
+ *  - Locking of the reader table for registration,
+ *    i.e. the functions mdbx_rdt_lock() and mdbx_rdt_unlock().
+ *  - Locking of the DB for write transactions,
+ *    i.e. the functions mdbx_txn_lock() and mdbx_txn_unlock().
+ *
+ * The remaining functionality is implemented separately via file locks:
+ *  - The initial seizing of the DB in exclusive/shared mode and the later
+ *    switch to operational mode, mdbx_lck_seize() and mdbx_lck_downgrade().
+ *  - Checking for the presence of reader processes,
+ *    i.e. mdbx_rpid_set(), mdbx_rpid_clear() and mdbx_rpid_check().
+ *
+ * Only fcntl(F_SETLK) is used for file locking, because:
+ *  - lockf() handles only exclusive locks and requires
+ *    the file to be opened in RW mode.
+ *  - flock() does not guarantee atomicity when changing locks
+ *    and operates only on the whole file.
+ *  - To track reader processes, single-byte range locks of the lck-file
+ *    are used via fcntl(F_SETLK). The pid of the reader process
+ *    serves as the position.
+ *  - For the initial seize and shared/exclusive, the main DB file
+ *    is locked and then, on success, the lck-file.
+ */
+
+/* The largest lock length: the maximal off_t with its low 16 bits cleared.
+ * The mask is built as off_t rather than size_t, since a 32-bit size_t
+ * mask would be zero-extended and so would wipe the upper half
+ * of a 64-bit off_t. */
+#ifndef OFF_T_MAX
+#define OFF_T_MAX \
+  ((sizeof(off_t) > 4 ? INT64_MAX : INT32_MAX) & ~(off_t)0xffff)
+#endif
+#ifndef PID_T_MAX
+#define PID_T_MAX INT_MAX
+#endif
+
+/* Use the F_OFD_* lock commands when all three are defined,
+ * otherwise fall back to the classic F_SETLK/F_SETLKW/F_GETLK. */
+#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK)
+#define OP_SETLK F_OFD_SETLK
+#define OP_SETLKW F_OFD_SETLKW
+#define OP_GETLK F_OFD_GETLK
+#else
+#define OP_SETLK F_SETLK
+#define OP_SETLKW F_SETLKW
+#define OP_GETLK F_GETLK
+#endif /* OFD locks */
+
+/* Performs a single fcntl() record-lock command on [offset, offset + len),
+ * retrying while the call is interrupted (EINTR).
+ *
+ * Returns 0 on success or the errno value of the failed fcntl(). For
+ * OP_GETLK it instead returns MDBX_RESULT_FALSE when no conflicting lock
+ * was reported (l_type == F_UNLCK) and MDBX_RESULT_TRUE otherwise. */
+static int mdbx_lck_op(mdbx_filehandle_t fd, int cmd, short lck, off_t offset,
+                       off_t len) {
+  for (;;) {
+    struct flock lock_op;
+    memset(&lock_op, 0, sizeof(lock_op));
+    lock_op.l_type = lck;
+    lock_op.l_whence = SEEK_SET;
+    lock_op.l_start = offset;
+    lock_op.l_len = len;
+    if (fcntl(fd, cmd, &lock_op) == 0) {
+      if (cmd == OP_GETLK) {
+        /* Checks reader by pid. Returns:
+         *   MDBX_RESULT_TRUE   - if pid is live (unable to acquire lock)
+         *   MDBX_RESULT_FALSE  - if pid is dead (lock acquired). */
+        return (lock_op.l_type == F_UNLCK) ? MDBX_RESULT_FALSE
+                                           : MDBX_RESULT_TRUE;
+      }
+      return 0;
+    }
+    int rc = errno;
+    if (rc != EINTR)
+      return rc;
+  }
+}
+
+/* Marks this process as a live reader by write-locking the single byte
+ * of the lck-file at offset env->me_pid (non-blocking). */
+int mdbx_rpid_set(MDBX_env *env) {
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+  assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
+  return mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, env->me_pid, 1);
+}
+
+/* Releases the one-byte lock at offset env->me_pid of the lck-file. */
+int mdbx_rpid_clear(MDBX_env *env) {
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+  assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
+  return mdbx_lck_op(env->me_lfd, OP_SETLKW, F_UNLCK, env->me_pid, 1);
+}
+
+/* Probes (OP_GETLK) the one-byte lock at offset `pid` of the lck-file.
+ * Returns MDBX_RESULT_TRUE if a conflicting lock is reported,
+ * MDBX_RESULT_FALSE if not, or an errno value on failure. */
+int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) {
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+  assert(pid > 0 && pid <= PID_T_MAX);
+  assert(PID_T_MAX < OFF_T_MAX);
+  return mdbx_lck_op(env->me_lfd, OP_GETLK, F_WRLCK, pid, 1);
+}
+
+/* Tries to seize the DB by means of file locks.
+ *
+ * Returns MDBX_RESULT_TRUE when the whole-range locks were obtained (on the
+ * dxb-file and, if there is one, on the lck-file), MDBX_RESULT_FALSE when
+ * only the one-byte lock at offset env->me_pid of the dxb-file was taken,
+ * otherwise an error code (the bailout path unlocks both files first). */
+int __cold mdbx_lck_seize(MDBX_env *env) {
+  assert(env->me_fd != INVALID_HANDLE_VALUE);
+  assert(env->me_pid > 0 && env->me_pid <= PID_T_MAX);
+
+  if (env->me_lfd == INVALID_HANDLE_VALUE) {
+    /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */
+    int rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+                         (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+                         OFF_T_MAX);
+    if (rc != 0) {
+      mdbx_error("%s(%s) failed: errcode %d", mdbx_func_, "without-lck", rc);
+      return rc;
+    }
+    return MDBX_RESULT_TRUE;
+  }
+
+  /* try exclusive access */
+  int rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+                       (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+                       OFF_T_MAX);
+  if (rc == 0) {
+    /* got dxb-exclusive, try lck-exclusive */
+    rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX);
+    if (rc == 0) {
+      /* got both exclusive */
+      return MDBX_RESULT_TRUE;
+    }
+    mdbx_error("%s(%s) failed: errcode %d", mdbx_func_,
+               "lck-after-dxb-exclusive", rc);
+    assert(MDBX_IS_ERROR(rc));
+    goto bailout;
+  }
+
+  if (rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK) {
+    rc = mdbx_lck_op(env->me_fd, OP_SETLKW,
+                     (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
+                     env->me_pid, 1);
+    if (rc == 0) {
+      /* got dxb-shared-rw */
+      return MDBX_RESULT_FALSE;
+    }
+    mdbx_error("%s(%s) failed: errcode %d", mdbx_func_,
+               "lock-against-without-lck", rc);
+    assert(MDBX_IS_ERROR(rc));
+  }
+
+bailout:
+  (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+  (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+  assert(MDBX_IS_ERROR(rc));
+  return rc;
+}
+
+/* Drops the whole-range lock on the lck-file and, when `complete` is true,
+ * takes the one-byte lock at offset env->me_pid of the dxb-file.
+ * Returns MDBX_SUCCESS, or the error code after unlocking both files. */
+int mdbx_lck_downgrade(MDBX_env *env, bool complete) {
+  assert(env->me_lfd != INVALID_HANDLE_VALUE);
+  int rc = mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+  if (unlikely(rc != 0)) {
+    mdbx_error("%s(%s) failed: errcode %d", mdbx_func_, "lck", rc);
+    goto bailout;
+  }
+  if (complete) {
+    rc = mdbx_lck_op(env->me_fd, OP_SETLK,
+                     (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK,
+                     env->me_pid, 1);
+    if (unlikely(rc != 0)) {
+      mdbx_error("%s(%s) failed: errcode %d", mdbx_func_, "dxb", rc);
+      goto bailout;
+    }
+  }
+  return MDBX_SUCCESS;
+
+bailout:
+  (void)mdbx_lck_op(env->me_lfd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+  (void)mdbx_lck_op(env->me_fd, OP_SETLK, F_UNLCK, 0, OFF_T_MAX);
+  assert(MDBX_IS_ERROR(rc));
+  return rc;
+}
+
+/*---------------------------------------------------------------------------*/
+
+static int mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
+                             const int err);
+
+/* Initializes both mutexes inside env->me_lck as process-shared,
+ * error-checking mutexes (robust and priority-inheriting where enabled).
+ * Returns 0 or the error code of the failed pthread call. */
+int __cold mdbx_lck_init(MDBX_env *env) {
+  pthread_mutexattr_t ma;
+  int rc = pthread_mutexattr_init(&ma);
+  if (rc)
+    return rc;
+
+  rc = pthread_mutexattr_setpshared(&ma, PTHREAD_PROCESS_SHARED);
+  if (rc)
+    goto bailout;
+
+#if MDBX_USE_ROBUST
+  rc = pthread_mutexattr_setrobust(&ma, PTHREAD_MUTEX_ROBUST);
+  if (rc)
+    goto bailout;
+#endif /* MDBX_USE_ROBUST */
+
+#if _POSIX_C_SOURCE >= 199506L && !defined(MDBX_SAFE4QEMU)
+  rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
+  if (rc == ENOTSUP)
+    rc = pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_NONE);
+  if (rc)
+    goto bailout;
+#endif /* PTHREAD_PRIO_INHERIT */
+
+  rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
+  if (rc)
+    goto bailout;
+
+  rc = pthread_mutex_init(&env->me_lck->mti_rmutex, &ma);
+  if (rc)
+    goto bailout;
+  rc = pthread_mutex_init(&env->me_lck->mti_wmutex, &ma);
+
+bailout:
+  pthread_mutexattr_destroy(&ma);
+  return rc;
+}
+
+/* Destroys the shared mutexes, but only if whole-range locks on both
+ * the dxb-file and the lck-file could be taken without waiting. */
+void __cold mdbx_lck_destroy(MDBX_env *env) {
+  if (env->me_lfd != INVALID_HANDLE_VALUE && env->me_lck) {
+    /* try get exclusive access */
+    if (mdbx_lck_op(env->me_fd, OP_SETLK,
+                    (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0,
+                    OFF_T_MAX) == 0 &&
+        mdbx_lck_op(env->me_lfd, OP_SETLK, F_WRLCK, 0, OFF_T_MAX) == 0) {
+      mdbx_info("%s: got exclusive, drown mutexes", mdbx_func_);
+      int rc = pthread_mutex_destroy(&env->me_lck->mti_rmutex);
+      if (rc == 0)
+        rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex);
+      assert(rc == 0);
+      (void)rc;
+      /* file locks would be released (by kernel)
+       * while the me_lfd will be closed */
+    }
+  }
+}
+
+/* pthread_mutex_lock() with failures routed through mdbx_mutex_failed(). */
+static int mdbx_robust_lock(MDBX_env *env, pthread_mutex_t *mutex) {
+  int rc = pthread_mutex_lock(mutex);
+  if (unlikely(rc != 0))
+    rc = mdbx_mutex_failed(env, mutex, rc);
+  return rc;
+}
+
+/* pthread_mutex_trylock(); EBUSY is reported as MDBX_BUSY, any other
+ * failure is routed through mdbx_mutex_failed(). */
+static int mdbx_robust_trylock(MDBX_env *env, pthread_mutex_t *mutex) {
+  int rc = pthread_mutex_trylock(mutex);
+  if (unlikely(rc != 0 && rc != EBUSY))
+    rc = mdbx_mutex_failed(env, mutex, rc);
+  return (rc != EBUSY) ? rc : MDBX_BUSY;
+}
+
+/* pthread_mutex_unlock() with failures routed through mdbx_mutex_failed(). */
+static int mdbx_robust_unlock(MDBX_env *env, pthread_mutex_t *mutex) {
+  int rc = pthread_mutex_unlock(mutex);
+  if (unlikely(rc != 0))
+    rc = mdbx_mutex_failed(env, mutex, rc);
+  return rc;
+}
+
+/* Locks the reader-table mutex (mti_rmutex). */
+int mdbx_rdt_lock(MDBX_env *env) {
+  mdbx_trace(">>");
+  int rc = mdbx_robust_lock(env, &env->me_lck->mti_rmutex);
+  mdbx_trace("<< rc %d", rc);
+  return rc;
+}
+
+/* Unlocks the reader-table mutex; panics on an error result. */
+void mdbx_rdt_unlock(MDBX_env *env) {
+  mdbx_trace(">>");
+  int rc = mdbx_robust_unlock(env, &env->me_lck->mti_rmutex);
+  mdbx_trace("<< rc %d", rc);
+  if (unlikely(MDBX_IS_ERROR(rc)))
+    mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
+}
+
+/* Locks the writer mutex (env->me_wmutex), without waiting if `dontwait`.
+ * Non-error results are collapsed to MDBX_SUCCESS. */
+int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
+  mdbx_trace(">>");
+  int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex)
+                    : mdbx_robust_lock(env, env->me_wmutex);
+  mdbx_trace("<< rc %d", rc);
+  return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS;
+}
+
+/* Unlocks the writer mutex; panics on an error result. */
+void mdbx_txn_unlock(MDBX_env *env) {
+  mdbx_trace(">>");
+  int rc = mdbx_robust_unlock(env, env->me_wmutex);
+  mdbx_trace("<< rc %d", rc);
+  if (unlikely(MDBX_IS_ERROR(rc)))
+    mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc);
+}
+
+/* Handles a failed mutex operation.
+ *
+ * With robust mutexes, EOWNERDEAD triggers recovery: the reader table is
+ * checked via mdbx_reader_check0() and the mutex is marked consistent; the
+ * mutex is released again if the overall result is an error. Any other
+ * failure is logged and, unless it is EDEADLK, flags the env as fatal. */
+static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex,
+                                    const int err) {
+  int rc = err;
+#if MDBX_USE_ROBUST
+  if (err == EOWNERDEAD) {
+    /* We own the mutex. Clean up after dead previous owner. */
+
+    int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex);
+    rc = MDBX_SUCCESS;
+    if (!rlocked) {
+      if (unlikely(env->me_txn)) {
+        /* env is hosed if the dead thread was ours */
+        env->me_flags |= MDBX_FATAL_ERROR;
+        env->me_txn = NULL;
+        rc = MDBX_PANIC;
+      }
+    }
+    mdbx_notice("%cmutex owner died, %s", (rlocked ? 'r' : 'w'),
+                (rc ? "this process' env is hosed" : "recovering"));
+
+    int check_rc = mdbx_reader_check0(env, rlocked, NULL);
+    check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc;
+
+    int mreco_rc = pthread_mutex_consistent(mutex);
+    check_rc = (mreco_rc == 0) ? check_rc : mreco_rc;
+
+    if (unlikely(mreco_rc))
+      mdbx_error("mutex recovery failed, %s", mdbx_strerror(mreco_rc));
+
+    rc = (rc == MDBX_SUCCESS) ? check_rc : rc;
+    if (MDBX_IS_ERROR(rc))
+      pthread_mutex_unlock(mutex);
+    return rc;
+  }
+#else
+  (void)mutex;
+#endif /* MDBX_USE_ROBUST */
+
+  mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err));
+  if (rc != EDEADLK)
+    env->me_flags |= MDBX_FATAL_ERROR;
+  return rc;
+}