From bf81e86d939d164ef23f1e80f5153f84b42e133b Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 22 Nov 2015 13:23:43 +0300 Subject: [PATCH] mdbx: prevents use the env after a fork(). Change-Id: I9c86500ac008a7be7eb16aeff2610fa5cfa22b84 --- mdb.c | 128 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/mdb.c b/mdb.c index 741c7cb3..179c550b 100644 --- a/mdb.c +++ b/mdb.c @@ -1033,7 +1033,7 @@ enum { #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static void mdb_txn_end(MDB_txn *txn, unsigned mode); +static int mdb_txn_end(MDB_txn *txn, unsigned mode); static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_search_root(MDB_cursor *mc, @@ -2700,6 +2700,11 @@ mdb_txn_renew0(MDB_txn *txn) uint16_t x; int rc, new_notls = 0; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + if ((flags &= MDB_TXN_RDONLY) != 0) { struct MDB_rthc *rthc = NULL; MDB_reader *r = NULL; @@ -2889,6 +2894,11 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **ret) if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) return MDB_VERSION_MISMATCH; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + flags &= MDB_TXN_BEGIN_FLAGS; flags |= env->me_flags & MDB_WRITEMAP; @@ -3064,12 +3074,17 @@ mdbx_txn_straggler(MDB_txn *txn, int *percent) * @param[in] txn the transaction handle to end * @param[in] mode why and how to end the transaction */ -static void +static int mdb_txn_end(MDB_txn *txn, unsigned mode) { MDB_env *env = txn->mt_env; static const char *const names[] = MDB_END_NAMES; + if (unlikely(txn->mt_env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + /* Export or close DBI handles opened in this txn */ mdb_dbis_update(txn, mode & MDB_END_UPDATE); @@ -3139,6 +3154,8 @@ mdb_txn_end(MDB_txn *txn, unsigned mode) txn->mt_signature = 0; free(txn); } + + return MDB_SUCCESS; } int @@ -3154,8 +3171,7 @@ mdb_txn_reset(MDB_txn *txn) if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) return EINVAL; - mdb_txn_end(txn, MDB_END_RESET); - return MDB_SUCCESS; + return mdb_txn_end(txn, MDB_END_RESET); } int @@ -3170,8 +3186,7 @@ mdb_txn_abort(MDB_txn *txn) if (txn->mt_child) mdb_txn_abort(txn->mt_child); - mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); - return MDB_SUCCESS; + return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); } static int @@ -3649,6 +3664,11 @@ mdb_txn_commit(MDB_txn *txn) if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) return MDB_VERSION_MISMATCH; + if (unlikely(txn->mt_env->me_pid != getpid())) { + txn->mt_env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + /* mdb_txn_end() mode for a commit which writes nothing */ end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; @@ -3864,8 +3884,7 @@ mdb_txn_commit(MDB_txn *txn) end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; done: - mdb_txn_end(txn, end_mode); - return MDB_SUCCESS; + return mdb_txn_end(txn, end_mode); fail: mdb_txn_abort(txn); @@ -4205,14 +4224,16 @@ mdb_env_map(MDB_env *env, void *addr) return errno; } - if (flags & MDB_NORDAHEAD) { + unsigned madvise_flags = MADV_DONTFORK; + if (flags & MDB_NORDAHEAD) /* Turn off readahead. It's harmful when the DB is larger than RAM. */ -#ifdef MADV_RANDOM - madvise(env->me_map, env->me_mapsize, MADV_RANDOM); -#elif defined(POSIX_MADV_RANDOM) - posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM); -#endif /* MADV_RANDOM & POSIX_MADV_RANDOM */ - } + madvise_flags |= MADV_RANDOM; + if (madvise(env->me_map, env->me_mapsize, madvise_flags)) + return errno; + +#ifdef MADV_DONTDUMP + madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP); +#endif /* Can happen because the address argument to mmap() is just a * hint. mmap() can pick another, e.g. if the range is in use. @@ -4438,7 +4459,7 @@ mdb_env_reader_destr(void *ptr) mdb_ensure(NULL, pthread_mutex_lock(&mdb_rthc_lock) == 0); reader = rthc->rc_reader; - if (reader) { + if (reader && reader->mr_pid == getpid()) { mdb_ensure(NULL, reader->mr_rthc == rthc); rthc->rc_reader = NULL; reader->mr_rthc = NULL; @@ -4611,7 +4632,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { return MDB_SUCCESS; } - goto fail_errno; + return rc; } /* Lose record locks when exec*() */ @@ -4621,20 +4642,20 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) if (!(env->me_flags & MDB_NOTLS)) { rc = pthread_key_create(&env->me_txkey, mdb_env_reader_destr); if (rc) - goto fail; + return rc; env->me_flags |= MDB_ENV_TXKEY; } /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - if ((rc = mdb_env_excl_lock(env, excl))) goto fail; + if ((rc = mdb_env_excl_lock(env, excl))) return rc; size = lseek(env->me_lfd, 0, SEEK_END); - if (size == -1) goto fail_errno; + if (size == -1) return errno; rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); if (size < rsize && *excl > 0) { - if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; + if (ftruncate(env->me_lfd, rsize) != 0) return errno; } else { rsize = size; size = rsize - sizeof(MDB_txninfo); @@ -4643,9 +4664,16 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0); if (m == MAP_FAILED) - goto fail_errno; + return errno; env->me_txns = m; + if (madvise(env->me_txns, rsize, MADV_DONTFORK | MADV_WILLNEED)) + return errno; + +#ifdef MADV_DODUMP + madvise(env->me_txns, rsize, MADV_DODUMP); +#endif + if (*excl > 0) { pthread_mutexattr_t mattr; @@ -4656,7 +4684,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) #endif /* MDB_USE_ROBUST */ || (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr)) || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) - goto fail; + return rc; pthread_mutexattr_destroy(&mattr); env->me_txns->mti_magic = MDB_MAGIC; @@ -4666,27 +4694,19 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl) } else { if (env->me_txns->mti_magic != MDB_MAGIC) { mdb_debug("lock region has invalid magic"); - rc = MDB_INVALID; - goto fail; + return MDB_INVALID; } if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { mdb_debug("lock region has format+version 0x%x, expected 0x%x", env->me_txns->mti_format, MDB_LOCK_FORMAT); - rc = MDB_VERSION_MISMATCH; - goto fail; + return MDB_VERSION_MISMATCH; } rc = errno; - if (rc && rc != EACCES && rc != EAGAIN) { - goto fail; - } + if (rc && rc != EACCES && rc != EAGAIN) + return rc; } return MDB_SUCCESS; - -fail_errno: - rc = errno; -fail: - return rc; } /** The name of the lock file in the DB environment */ @@ -4909,22 +4929,25 @@ mdb_env_close0(MDB_env *env) * data owned by this process (me_close_readers and * our readers), and clear each reader atomically. */ - mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0); - for (i = env->me_close_readers; --i >= 0; ) { - MDB_reader *reader = &env->me_txns->mti_readers[i]; - if (reader->mr_pid == pid) { - struct MDB_rthc *rthc = reader->mr_rthc; - if (rthc) { - mdb_ensure(env, rthc->rc_reader == reader); - rthc->rc_reader = NULL; - reader->mr_rthc = NULL; - free(rthc); + if (pid == getpid()) { + mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0); + for (i = env->me_close_readers; --i >= 0; ) { + MDB_reader *reader = &env->me_txns->mti_readers[i]; + if (reader->mr_pid == pid) { + struct MDB_rthc *rthc = reader->mr_rthc; + if (rthc) { + mdb_ensure(env, rthc->rc_reader == reader); + rthc->rc_reader = NULL; + reader->mr_rthc = NULL; + free(rthc); + } + reader->mr_pid = 0; } - reader->mr_pid = 0; } + mdb_coherent_barrier(); + mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0); } - mdb_coherent_barrier(); - mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0); + munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); env->me_txns = NULL; @@ -9171,7 +9194,9 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd) return rc; /* We must start the actual read txn after blocking writers */ - mdb_txn_end(txn, MDB_END_RESET_TMP); + rc = mdb_txn_end(txn, MDB_END_RESET_TMP); + if (rc) + return rc; /* Temporarily block writers until we snapshot the meta pages */ wmutex = MDB_MUTEX(env, w); @@ -10016,6 +10041,11 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead) pid_t *pids, pid; int rc = MDB_SUCCESS, count = 0; + if (unlikely(env->me_pid != getpid())) { + env->me_flags |= MDB_FATAL_ERROR; + return MDB_PANIC; + } + rdrs = env->me_txns->mti_numreaders; pids = malloc((rdrs+1) * sizeof(pid_t)); if (!pids)