From d90e6187f7e69290239a84d764db323866b9c36d Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 30 Jun 2018 11:58:57 +0300 Subject: [PATCH] mdbx: support exclusive mode without lck-file. Change-Id: I1beef09d62965d0d777f579a8aa6f15c478eebd2 --- mdbx.h | 3 + src/bits.h | 22 +++--- src/lck-posix.c | 23 ++++--- src/mdbx.c | 173 +++++++++++++++++++++++++++++------------------- 4 files changed, 137 insertions(+), 84 deletions(-) diff --git a/mdbx.h b/mdbx.h index f18f6cef..f7c9ff59 100644 --- a/mdbx.h +++ b/mdbx.h @@ -100,6 +100,7 @@ typedef DWORD mdbx_tid_t; #define MDBX_EIO ERROR_WRITE_FAULT #define MDBX_EPERM ERROR_INVALID_FUNCTION #define MDBX_EINTR ERROR_CANCELLED +#define MDBX_ENOFILE ERROR_FILE_NOT_FOUND #else @@ -120,6 +121,8 @@ typedef pthread_t mdbx_tid_t; #define MDBX_EIO EIO #define MDBX_EPERM EPERM #define MDBX_EINTR EINTR +#define MDBX_ENOFILE ENOENT + #endif #ifdef _MSC_VER diff --git a/src/bits.h b/src/bits.h index 599dee6f..d43f4e99 100644 --- a/src/bits.h +++ b/src/bits.h @@ -411,7 +411,7 @@ typedef struct MDBX_lockinfo { volatile uint32_t mti_envmode; #ifdef MDBX_OSAL_LOCK - /* Mutex protecting write access to this table. */ + /* Mutex protecting write-txn. */ union { MDBX_OSAL_LOCK mti_wmutex; uint8_t pad_mti_wmutex[MDBX_OSAL_LOCK_SIZE % sizeof(size_t)]; @@ -734,14 +734,17 @@ struct MDBX_env { /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */ unsigned me_close_readers; mdbx_fastmutex_t me_dbi_lock; - MDBX_dbi me_numdbs; /* number of DBs opened */ - MDBX_dbi me_maxdbs; /* size of the DB table */ - mdbx_pid_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_path; /* path to the DB files */ - void *me_pbuf; /* scratch area for DUPSORT put() */ - MDBX_txn *me_txn; /* current write transaction */ - MDBX_txn *me_txn0; /* prealloc'd write transaction */ + MDBX_dbi me_numdbs; /* number of DBs opened */ + MDBX_dbi me_maxdbs; /* size of the DB table */ + mdbx_pid_t me_pid; /* process ID of this env */ + mdbx_thread_key_t me_txkey; /* thread-key for readers */ + char *me_path; /* path to the DB files */ + void *me_pbuf; /* scratch area for DUPSORT put() */ + MDBX_txn *me_txn; /* current write transaction */ + MDBX_txn *me_txn0; /* prealloc'd write transaction */ +#ifdef MDBX_OSAL_LOCK + MDBX_OSAL_LOCK *me_wmutex; /* write-txn mutex */ +#endif MDBX_dbx *me_dbxs; /* array of static DB info */ uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */ @@ -786,6 +789,7 @@ struct MDBX_env { /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; #else + mdbx_fastmutex_t me_lckless_wmutex; mdbx_fastmutex_t me_remap_guard; #endif }; diff --git a/src/lck-posix.c b/src/lck-posix.c index 532505e8..a2928b10 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -86,16 +86,19 @@ static __inline int mdbx_lck_shared(int lfd) { } int mdbx_lck_downgrade(MDBX_env *env, bool complete) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); return complete ? mdbx_lck_shared(env->me_lfd) : MDBX_SUCCESS; } int mdbx_lck_upgrade(MDBX_env *env) { return mdbx_lck_exclusive(env->me_lfd); } int mdbx_rpid_set(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); return mdbx_lck_op(env->me_lfd, F_SETLK, F_WRLCK, env->me_pid, 1); } int mdbx_rpid_clear(MDBX_env *env) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); return mdbx_lck_op(env->me_lfd, F_SETLKW, F_UNLCK, env->me_pid, 1); } @@ -106,6 +109,7 @@ int mdbx_rpid_clear(MDBX_env *env) { * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. */ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid) { + assert(env->me_lfd != INVALID_HANDLE_VALUE); int rc = mdbx_lck_op(env->me_lfd, F_GETLK, F_WRLCK, pid, 1); if (rc == 0) return MDBX_RESULT_FALSE; @@ -166,7 +170,8 @@ void mdbx_lck_destroy(MDBX_env *env) { rc = pthread_mutex_destroy(&env->me_lck->mti_wmutex); assert(rc == 0); (void)rc; - /* lock would be released (by kernel) while the me_lfd will be closed */ + /* file locks would be released (by kernel) + * while the me_lfd will be closed */ } } } @@ -209,15 +214,15 @@ void mdbx_rdt_unlock(MDBX_env *env) { int mdbx_txn_lock(MDBX_env *env, bool dontwait) { mdbx_trace(">>"); - int rc = dontwait ? mdbx_robust_trylock(env, &env->me_lck->mti_wmutex) - : mdbx_robust_lock(env, &env->me_lck->mti_wmutex); + int rc = dontwait ? mdbx_robust_trylock(env, env->me_wmutex) + : mdbx_robust_lock(env, env->me_wmutex); mdbx_trace("<< rc %d", rc); return MDBX_IS_ERROR(rc) ? rc : MDBX_SUCCESS; } void mdbx_txn_unlock(MDBX_env *env) { mdbx_trace(">>"); - int rc = mdbx_robust_unlock(env, &env->me_lck->mti_wmutex); + int rc = mdbx_robust_unlock(env, env->me_wmutex); mdbx_trace("<< rc %d", rc); if (unlikely(MDBX_IS_ERROR(rc))) mdbx_panic("%s() failed: errcode %d\n", mdbx_func_, rc); @@ -253,13 +258,15 @@ int mdbx_lck_seize(MDBX_env *env) { assert(env->me_fd != INVALID_HANDLE_VALUE); if (env->me_lfd == INVALID_HANDLE_VALUE) { - /* LY: without-lck mode (e.g. on read-only filesystem) */ - int rc = mdbx_lck_op(env->me_fd, F_SETLK, F_RDLCK, 0, LCK_WHOLE); + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ + int rc = mdbx_lck_op(env->me_fd, F_SETLK, + (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, + LCK_WHOLE); if (rc != 0) { mdbx_error("%s(%s) failed: errcode %u", mdbx_func_, "without-lck", rc); return rc; } - return MDBX_RESULT_FALSE; + return MDBX_RESULT_TRUE; } if ((env->me_flags & MDBX_RDONLY) == 0) { @@ -285,7 +292,7 @@ static int __cold mdbx_mutex_failed(MDBX_env *env, pthread_mutex_t *mutex, if (rc == EOWNERDEAD) { /* We own the mutex. Clean up after dead previous owner. */ - int rlocked = (mutex == &env->me_lck->mti_rmutex); + int rlocked = (env->me_lck && mutex == &env->me_lck->mti_rmutex); rc = MDBX_SUCCESS; if (!rlocked) { if (unlikely(env->me_txn)) { diff --git a/src/mdbx.c b/src/mdbx.c index 98cb4fee..71fcd503 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1931,13 +1931,16 @@ static const char *mdbx_durable_str(const MDBX_meta *const meta) { static txnid_t mdbx_find_oldest(MDBX_txn *txn) { mdbx_tassert(txn, (txn->mt_flags & MDBX_RDONLY) == 0); const MDBX_env *env = txn->mt_env; - MDBX_lockinfo *const lck = env->me_lck; - const txnid_t edge = mdbx_reclaiming_detent(env); mdbx_tassert(txn, edge <= txn->mt_txnid - 1); + + MDBX_lockinfo *const lck = env->me_lck; + if (unlikely(env->me_lck == NULL /* exclusive mode */)) + return edge; + const txnid_t last_oldest = lck->mti_oldest; mdbx_tassert(txn, edge >= last_oldest); - if (last_oldest == edge) + if (likely(last_oldest == edge)) return edge; const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); @@ -2734,9 +2737,6 @@ static int mdbx_env_sync_ex(MDBX_env *env, int force, int nonblock) { if (unlikely(flags & (MDBX_RDONLY | MDBX_FATAL_ERROR))) return MDBX_EACCESS; - if (unlikely(!env->me_lck)) - return MDBX_PANIC; - const bool outside_txn = (!env->me_txn0 || env->me_txn0->mt_owner != mdbx_thread_self()); @@ -4274,7 +4274,9 @@ int mdbx_txn_commit(MDBX_txn *txn) { } if (unlikely(rc != MDBX_SUCCESS)) goto fail; - env->me_lck->mti_readers_refresh_flag = false; + + if (likely(env->me_lck)) + env->me_lck->mti_readers_refresh_flag = false; end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: @@ -4926,6 +4928,12 @@ int __cold mdbx_env_create(MDBX_env **penv) { mdbx_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } + rc = mdbx_fastmutex_init(&env->me_lckless_wmutex); + if (unlikely(rc != MDBX_SUCCESS)) { + mdbx_fastmutex_destroy(&env->me_remap_guard); + mdbx_fastmutex_destroy(&env->me_dbi_lock); + goto bailout; + } #endif /* Windows */ VALGRIND_CREATE_MEMPOOL(env, 0, 0); @@ -5589,17 +5597,30 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, assert(env->me_fd != INVALID_HANDLE_VALUE); assert(env->me_lfd == INVALID_HANDLE_VALUE); - int err = mdbx_openfile(lck_pathname, O_RDWR | O_CREAT, mode, &env->me_lfd, + const int open_flags = + (env->me_flags & MDBX_EXCLUSIVE) ? O_RDWR : O_RDWR | O_CREAT; + int err = mdbx_openfile(lck_pathname, open_flags, mode, &env->me_lfd, (env->me_flags & MDBX_EXCLUSIVE) ? true : false); if (err != MDBX_SUCCESS) { - if (err != MDBX_EROFS || (env->me_flags & MDBX_RDONLY) == 0) + if (!(err == MDBX_ENOFILE && (env->me_flags & MDBX_EXCLUSIVE)) && + !(err == MDBX_EROFS && (env->me_flags & MDBX_RDONLY))) return err; - /* LY: without-lck mode (e.g. on read-only filesystem) */ + + /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ env->me_lfd = INVALID_HANDLE_VALUE; + const int rc = mdbx_lck_seize(env); + if (MDBX_IS_ERROR(rc)) + return rc; + env->me_oldest = &env->me_oldest_stub; env->me_maxreaders = UINT_MAX; - mdbx_debug("lck-setup: %s ", "lockless mode (readonly)"); - return MDBX_SUCCESS; +#ifdef MDBX_OSAL_LOCK + env->me_wmutex = &env->me_lckless_wmutex; +#endif + mdbx_debug("lck-setup:%s%s%s", " lck-less", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + return rc; } /* Try to get exclusive lock. If we succeed, then @@ -5608,8 +5629,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (MDBX_IS_ERROR(rc)) return rc; - mdbx_debug("lck-setup: %s ", - (rc == MDBX_RESULT_TRUE) ? "exclusive" : "shared"); + mdbx_debug("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); uint64_t size; err = mdbx_filesize(env->me_lfd, &size); @@ -5699,6 +5721,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, mdbx_assert(env, !MDBX_IS_ERROR(rc)); env->me_oldest = &env->me_lck->mti_oldest; +#ifdef MDBX_OSAL_LOCK + env->me_wmutex = &env->me_lck->mti_wmutex; +#endif return rc; } @@ -5804,44 +5829,46 @@ int __cold mdbx_env_open(MDBX_env *env, const char *path, unsigned flags, } mdbx_debug("opened dbenv %p", (void *)env); - const unsigned mode_flags = - MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; - if (lck_rc == MDBX_RESULT_TRUE) { - env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); - if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { - /* LY: downgrade lock only if exclusive access not requested. - * in case exclusive==1, just leave value as is. */ - rc = mdbx_lck_downgrade(env, true); - mdbx_debug("lck-downgrade-full: rc %i ", rc); - } else { - rc = mdbx_lck_downgrade(env, false); - mdbx_debug("lck-downgrade-partial: rc %i ", rc); - } - if (rc != MDBX_SUCCESS) - goto bailout; - } else { - if ((env->me_flags & MDBX_RDONLY) == 0) { - while (env->me_lck->mti_envmode == MDBX_RDONLY) { - if (mdbx_atomic_compare_and_swap32(&env->me_lck->mti_envmode, - MDBX_RDONLY, - env->me_flags & mode_flags)) - break; - /* TODO: yield/relax cpu */ + if (env->me_lck) { + const unsigned mode_flags = + MDBX_WRITEMAP | MDBX_NOSYNC | MDBX_NOMETASYNC | MDBX_MAPASYNC; + if (lck_rc == MDBX_RESULT_TRUE) { + env->me_lck->mti_envmode = env->me_flags & (mode_flags | MDBX_RDONLY); + if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { + /* LY: downgrade lock only if exclusive access not requested. + * in case exclusive==1, just leave value as is. */ + rc = mdbx_lck_downgrade(env, true); + mdbx_debug("lck-downgrade-full: rc %i ", rc); + } else { + rc = mdbx_lck_downgrade(env, false); + mdbx_debug("lck-downgrade-partial: rc %i ", rc); } - if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { - mdbx_error("current mode/flags incompatible with requested"); - rc = MDBX_INCOMPATIBLE; + if (rc != MDBX_SUCCESS) goto bailout; + } else { + if ((env->me_flags & MDBX_RDONLY) == 0) { + while (env->me_lck->mti_envmode == MDBX_RDONLY) { + if (mdbx_atomic_compare_and_swap32(&env->me_lck->mti_envmode, + MDBX_RDONLY, + env->me_flags & mode_flags)) + break; + /* TODO: yield/relax cpu */ + } + if ((env->me_lck->mti_envmode ^ env->me_flags) & mode_flags) { + mdbx_error("current mode/flags incompatible with requested"); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } } } - } - if (env->me_lck && (env->me_flags & MDBX_NOTLS) == 0) { - rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_lck->mti_readers[0], - &env->me_lck->mti_readers[env->me_maxreaders]); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - env->me_flags |= MDBX_ENV_TXKEY; + if ((env->me_flags & MDBX_NOTLS) == 0) { + rc = mdbx_rthc_alloc(&env->me_txkey, &env->me_lck->mti_readers[0], + &env->me_lck->mti_readers[env->me_maxreaders]); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags |= MDBX_ENV_TXKEY; + } } if ((flags & MDBX_RDONLY) == 0) { @@ -5952,7 +5979,7 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDBX_EBADSIGN; - if (env->me_lck && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0) { + if ((env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0) { if (env->me_txn0 && env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self()) return MDBX_BUSY; @@ -5987,6 +6014,8 @@ int __cold mdbx_env_close_ex(MDBX_env *env, int dont_sync) { /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ DeleteCriticalSection(&env->me_windowsbug_lock); #else + mdbx_ensure(env, + mdbx_fastmutex_destroy(&env->me_lckless_wmutex) == MDBX_SUCCESS); mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ @@ -11241,28 +11270,31 @@ int __cold mdbx_reader_list(MDBX_env *env, MDBX_msg_func *func, void *ctx) { return MDBX_EBADSIGN; const MDBX_lockinfo *const lck = env->me_lck; - const unsigned snap_nreaders = lck->mti_numreaders; - for (unsigned i = 0; i < snap_nreaders; i++) { - if (lck->mti_readers[i].mr_pid) { - const txnid_t txnid = lck->mti_readers[i].mr_txnid; - if (txnid == ~(txnid_t)0) - snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " -\n", - (uintptr_t)lck->mti_readers[i].mr_pid, - (uintptr_t)lck->mti_readers[i].mr_tid); - else - snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " %" PRIaTXN "\n", - (uintptr_t)lck->mti_readers[i].mr_pid, - (uintptr_t)lck->mti_readers[i].mr_tid, txnid); + if (likely(lck)) { + const unsigned snap_nreaders = lck->mti_numreaders; + for (unsigned i = 0; i < snap_nreaders; i++) { + if (lck->mti_readers[i].mr_pid) { + const txnid_t txnid = lck->mti_readers[i].mr_txnid; + if (txnid == ~(txnid_t)0) + snprintf(buf, sizeof(buf), "%10" PRIuPTR " %" PRIxPTR " -\n", + (uintptr_t)lck->mti_readers[i].mr_pid, + (uintptr_t)lck->mti_readers[i].mr_tid); + else + snprintf(buf, sizeof(buf), + "%10" PRIuPTR " %" PRIxPTR " %" PRIaTXN "\n", + (uintptr_t)lck->mti_readers[i].mr_pid, + (uintptr_t)lck->mti_readers[i].mr_tid, txnid); - if (first) { - first = 0; - rc = func(" pid thread txnid\n", ctx); + if (first) { + first = 0; + rc = func(" pid thread txnid\n", ctx); + if (rc < 0) + break; + } + rc = func(buf, ctx); if (rc < 0) break; } - rc = func(buf, ctx); - if (rc < 0) - break; } } if (first) @@ -11327,6 +11359,13 @@ int __cold mdbx_reader_check0(MDBX_env *env, int rdt_locked, int *dead) { } MDBX_lockinfo *const lck = env->me_lck; + if (unlikely(lck == NULL)) { + /* exclusive mode */ + if (dead) + *dead = 0; + return MDBX_SUCCESS; + } + const unsigned snap_nreaders = lck->mti_numreaders; mdbx_pid_t *pids = alloca((snap_nreaders + 1) * sizeof(mdbx_pid_t)); pids[0] = 0; @@ -11441,7 +11480,7 @@ static txnid_t __cold mdbx_oomkick(MDBX_env *env, const txnid_t laggard) { mdbx_assert(env, oldest < env->me_txn0->mt_txnid); mdbx_assert(env, oldest >= laggard); mdbx_assert(env, oldest >= env->me_oldest[0]); - if (oldest == laggard) + if (oldest == laggard || unlikely(env->me_lck == NULL /* exclusive mode */)) return oldest; if (MDBX_IS_ERROR(mdbx_reader_check0(env, false, NULL)))