mdbx: prevents use the env after a fork().

Change-Id: I9c86500ac008a7be7eb16aeff2610fa5cfa22b84
This commit is contained in:
Leo Yuriev 2015-11-22 13:23:43 +03:00
parent 7e2000ef7a
commit bf81e86d93

128
mdb.c
View File

@ -1033,7 +1033,7 @@ enum {
#define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */
#define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */
#define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */
static void mdb_txn_end(MDB_txn *txn, unsigned mode); static int mdb_txn_end(MDB_txn *txn, unsigned mode);
static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl);
static int mdb_page_search_root(MDB_cursor *mc, static int mdb_page_search_root(MDB_cursor *mc,
@ -2700,6 +2700,11 @@ mdb_txn_renew0(MDB_txn *txn)
uint16_t x; uint16_t x;
int rc, new_notls = 0; int rc, new_notls = 0;
if (unlikely(env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
if ((flags &= MDB_TXN_RDONLY) != 0) { if ((flags &= MDB_TXN_RDONLY) != 0) {
struct MDB_rthc *rthc = NULL; struct MDB_rthc *rthc = NULL;
MDB_reader *r = NULL; MDB_reader *r = NULL;
@ -2889,6 +2894,11 @@ mdb_txn_begin(MDB_env *env, MDB_txn *parent, unsigned flags, MDB_txn **ret)
if (unlikely(env->me_signature != MDBX_ME_SIGNATURE)) if (unlikely(env->me_signature != MDBX_ME_SIGNATURE))
return MDB_VERSION_MISMATCH; return MDB_VERSION_MISMATCH;
if (unlikely(env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
flags &= MDB_TXN_BEGIN_FLAGS; flags &= MDB_TXN_BEGIN_FLAGS;
flags |= env->me_flags & MDB_WRITEMAP; flags |= env->me_flags & MDB_WRITEMAP;
@ -3064,12 +3074,17 @@ mdbx_txn_straggler(MDB_txn *txn, int *percent)
* @param[in] txn the transaction handle to end * @param[in] txn the transaction handle to end
* @param[in] mode why and how to end the transaction * @param[in] mode why and how to end the transaction
*/ */
static void static int
mdb_txn_end(MDB_txn *txn, unsigned mode) mdb_txn_end(MDB_txn *txn, unsigned mode)
{ {
MDB_env *env = txn->mt_env; MDB_env *env = txn->mt_env;
static const char *const names[] = MDB_END_NAMES; static const char *const names[] = MDB_END_NAMES;
if (unlikely(txn->mt_env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
/* Export or close DBI handles opened in this txn */ /* Export or close DBI handles opened in this txn */
mdb_dbis_update(txn, mode & MDB_END_UPDATE); mdb_dbis_update(txn, mode & MDB_END_UPDATE);
@ -3139,6 +3154,8 @@ mdb_txn_end(MDB_txn *txn, unsigned mode)
txn->mt_signature = 0; txn->mt_signature = 0;
free(txn); free(txn);
} }
return MDB_SUCCESS;
} }
int int
@ -3154,8 +3171,7 @@ mdb_txn_reset(MDB_txn *txn)
if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY))) if (unlikely(!(txn->mt_flags & MDB_TXN_RDONLY)))
return EINVAL; return EINVAL;
mdb_txn_end(txn, MDB_END_RESET); return mdb_txn_end(txn, MDB_END_RESET);
return MDB_SUCCESS;
} }
int int
@ -3170,8 +3186,7 @@ mdb_txn_abort(MDB_txn *txn)
if (txn->mt_child) if (txn->mt_child)
mdb_txn_abort(txn->mt_child); mdb_txn_abort(txn->mt_child);
mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE); return mdb_txn_end(txn, MDB_END_ABORT|MDB_END_SLOT|MDB_END_FREE);
return MDB_SUCCESS;
} }
static int static int
@ -3649,6 +3664,11 @@ mdb_txn_commit(MDB_txn *txn)
if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) if(unlikely(txn->mt_signature != MDBX_MT_SIGNATURE))
return MDB_VERSION_MISMATCH; return MDB_VERSION_MISMATCH;
if (unlikely(txn->mt_env->me_pid != getpid())) {
txn->mt_env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
/* mdb_txn_end() mode for a commit which writes nothing */ /* mdb_txn_end() mode for a commit which writes nothing */
end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE; end_mode = MDB_END_EMPTY_COMMIT|MDB_END_UPDATE|MDB_END_SLOT|MDB_END_FREE;
@ -3864,8 +3884,7 @@ mdb_txn_commit(MDB_txn *txn)
end_mode = MDB_END_COMMITTED|MDB_END_UPDATE; end_mode = MDB_END_COMMITTED|MDB_END_UPDATE;
done: done:
mdb_txn_end(txn, end_mode); return mdb_txn_end(txn, end_mode);
return MDB_SUCCESS;
fail: fail:
mdb_txn_abort(txn); mdb_txn_abort(txn);
@ -4205,14 +4224,16 @@ mdb_env_map(MDB_env *env, void *addr)
return errno; return errno;
} }
if (flags & MDB_NORDAHEAD) { unsigned madvise_flags = MADV_DONTFORK;
if (flags & MDB_NORDAHEAD)
/* Turn off readahead. It's harmful when the DB is larger than RAM. */ /* Turn off readahead. It's harmful when the DB is larger than RAM. */
#ifdef MADV_RANDOM madvise_flags |= MADV_RANDOM;
madvise(env->me_map, env->me_mapsize, MADV_RANDOM); if (madvise(env->me_map, env->me_mapsize, madvise_flags))
#elif defined(POSIX_MADV_RANDOM) return errno;
posix_madvise(env->me_map, env->me_mapsize, POSIX_MADV_RANDOM);
#endif /* MADV_RANDOM & POSIX_MADV_RANDOM */ #ifdef MADV_DONTDUMP
} madvise(env->me_map, env->me_mapsize, MADV_DONTDUMP);
#endif
/* Can happen because the address argument to mmap() is just a /* Can happen because the address argument to mmap() is just a
* hint. mmap() can pick another, e.g. if the range is in use. * hint. mmap() can pick another, e.g. if the range is in use.
@ -4438,7 +4459,7 @@ mdb_env_reader_destr(void *ptr)
mdb_ensure(NULL, pthread_mutex_lock(&mdb_rthc_lock) == 0); mdb_ensure(NULL, pthread_mutex_lock(&mdb_rthc_lock) == 0);
reader = rthc->rc_reader; reader = rthc->rc_reader;
if (reader) { if (reader && reader->mr_pid == getpid()) {
mdb_ensure(NULL, reader->mr_rthc == rthc); mdb_ensure(NULL, reader->mr_rthc == rthc);
rthc->rc_reader = NULL; rthc->rc_reader = NULL;
reader->mr_rthc = NULL; reader->mr_rthc = NULL;
@ -4611,7 +4632,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
if (rc == EROFS && (env->me_flags & MDB_RDONLY)) { if (rc == EROFS && (env->me_flags & MDB_RDONLY)) {
return MDB_SUCCESS; return MDB_SUCCESS;
} }
goto fail_errno; return rc;
} }
/* Lose record locks when exec*() */ /* Lose record locks when exec*() */
@ -4621,20 +4642,20 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
if (!(env->me_flags & MDB_NOTLS)) { if (!(env->me_flags & MDB_NOTLS)) {
rc = pthread_key_create(&env->me_txkey, mdb_env_reader_destr); rc = pthread_key_create(&env->me_txkey, mdb_env_reader_destr);
if (rc) if (rc)
goto fail; return rc;
env->me_flags |= MDB_ENV_TXKEY; env->me_flags |= MDB_ENV_TXKEY;
} }
/* Try to get exclusive lock. If we succeed, then /* Try to get exclusive lock. If we succeed, then
* nobody is using the lock region and we should initialize it. * nobody is using the lock region and we should initialize it.
*/ */
if ((rc = mdb_env_excl_lock(env, excl))) goto fail; if ((rc = mdb_env_excl_lock(env, excl))) return rc;
size = lseek(env->me_lfd, 0, SEEK_END); size = lseek(env->me_lfd, 0, SEEK_END);
if (size == -1) goto fail_errno; if (size == -1) return errno;
rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo); rsize = (env->me_maxreaders-1) * sizeof(MDB_reader) + sizeof(MDB_txninfo);
if (size < rsize && *excl > 0) { if (size < rsize && *excl > 0) {
if (ftruncate(env->me_lfd, rsize) != 0) goto fail_errno; if (ftruncate(env->me_lfd, rsize) != 0) return errno;
} else { } else {
rsize = size; rsize = size;
size = rsize - sizeof(MDB_txninfo); size = rsize - sizeof(MDB_txninfo);
@ -4643,9 +4664,16 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0); m = mmap(NULL, rsize, PROT_READ|PROT_WRITE, MAP_SHARED, env->me_lfd, 0);
if (m == MAP_FAILED) if (m == MAP_FAILED)
goto fail_errno; return errno;
env->me_txns = m; env->me_txns = m;
if (madvise(env->me_txns, rsize, MADV_DONTFORK | MADV_WILLNEED))
return errno;
#ifdef MADV_DODUMP
madvise(env->me_txns, rsize, MADV_DODUMP);
#endif
if (*excl > 0) { if (*excl > 0) {
pthread_mutexattr_t mattr; pthread_mutexattr_t mattr;
@ -4656,7 +4684,7 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
#endif /* MDB_USE_ROBUST */ #endif /* MDB_USE_ROBUST */
|| (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr)) || (rc = pthread_mutex_init(&env->me_txns->mti_rmutex, &mattr))
|| (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr))) || (rc = pthread_mutex_init(&env->me_txns->mti_wmutex, &mattr)))
goto fail; return rc;
pthread_mutexattr_destroy(&mattr); pthread_mutexattr_destroy(&mattr);
env->me_txns->mti_magic = MDB_MAGIC; env->me_txns->mti_magic = MDB_MAGIC;
@ -4666,27 +4694,19 @@ mdb_env_setup_locks(MDB_env *env, char *lpath, int mode, int *excl)
} else { } else {
if (env->me_txns->mti_magic != MDB_MAGIC) { if (env->me_txns->mti_magic != MDB_MAGIC) {
mdb_debug("lock region has invalid magic"); mdb_debug("lock region has invalid magic");
rc = MDB_INVALID; return MDB_INVALID;
goto fail;
} }
if (env->me_txns->mti_format != MDB_LOCK_FORMAT) { if (env->me_txns->mti_format != MDB_LOCK_FORMAT) {
mdb_debug("lock region has format+version 0x%x, expected 0x%x", mdb_debug("lock region has format+version 0x%x, expected 0x%x",
env->me_txns->mti_format, MDB_LOCK_FORMAT); env->me_txns->mti_format, MDB_LOCK_FORMAT);
rc = MDB_VERSION_MISMATCH; return MDB_VERSION_MISMATCH;
goto fail;
} }
rc = errno; rc = errno;
if (rc && rc != EACCES && rc != EAGAIN) { if (rc && rc != EACCES && rc != EAGAIN)
goto fail; return rc;
}
} }
return MDB_SUCCESS; return MDB_SUCCESS;
fail_errno:
rc = errno;
fail:
return rc;
} }
/** The name of the lock file in the DB environment */ /** The name of the lock file in the DB environment */
@ -4909,22 +4929,25 @@ mdb_env_close0(MDB_env *env)
* data owned by this process (me_close_readers and * data owned by this process (me_close_readers and
* our readers), and clear each reader atomically. * our readers), and clear each reader atomically.
*/ */
mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0); if (pid == getpid()) {
for (i = env->me_close_readers; --i >= 0; ) { mdb_ensure(env, pthread_mutex_lock(&mdb_rthc_lock) == 0);
MDB_reader *reader = &env->me_txns->mti_readers[i]; for (i = env->me_close_readers; --i >= 0; ) {
if (reader->mr_pid == pid) { MDB_reader *reader = &env->me_txns->mti_readers[i];
struct MDB_rthc *rthc = reader->mr_rthc; if (reader->mr_pid == pid) {
if (rthc) { struct MDB_rthc *rthc = reader->mr_rthc;
mdb_ensure(env, rthc->rc_reader == reader); if (rthc) {
rthc->rc_reader = NULL; mdb_ensure(env, rthc->rc_reader == reader);
reader->mr_rthc = NULL; rthc->rc_reader = NULL;
free(rthc); reader->mr_rthc = NULL;
free(rthc);
}
reader->mr_pid = 0;
} }
reader->mr_pid = 0;
} }
mdb_coherent_barrier();
mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0);
} }
mdb_coherent_barrier();
mdb_ensure(env, pthread_mutex_unlock(&mdb_rthc_lock) == 0);
munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo)); munmap((void *)env->me_txns, (env->me_maxreaders-1)*sizeof(MDB_reader)+sizeof(MDB_txninfo));
env->me_txns = NULL; env->me_txns = NULL;
@ -9171,7 +9194,9 @@ mdb_env_copyfd0(MDB_env *env, HANDLE fd)
return rc; return rc;
/* We must start the actual read txn after blocking writers */ /* We must start the actual read txn after blocking writers */
mdb_txn_end(txn, MDB_END_RESET_TMP); rc = mdb_txn_end(txn, MDB_END_RESET_TMP);
if (rc)
return rc;
/* Temporarily block writers until we snapshot the meta pages */ /* Temporarily block writers until we snapshot the meta pages */
wmutex = MDB_MUTEX(env, w); wmutex = MDB_MUTEX(env, w);
@ -10016,6 +10041,11 @@ mdb_reader_check0(MDB_env *env, int rlocked, int *dead)
pid_t *pids, pid; pid_t *pids, pid;
int rc = MDB_SUCCESS, count = 0; int rc = MDB_SUCCESS, count = 0;
if (unlikely(env->me_pid != getpid())) {
env->me_flags |= MDB_FATAL_ERROR;
return MDB_PANIC;
}
rdrs = env->me_txns->mti_numreaders; rdrs = env->me_txns->mti_numreaders;
pids = malloc((rdrs+1) * sizeof(pid_t)); pids = malloc((rdrs+1) * sizeof(pid_t));
if (!pids) if (!pids)