mdbx: partial fix for recursive SRW-lock with MDBX_NOTLS on Windows.

Here are some changes to avoid recursive acquisition of SRW-lock,
which is still in use:
 - Read transactions don't acquire the shared SRW-lock with `MDBX_NOTLS.
 - Memory-mapping of DB is always kept while DB opened,
   therefore following limitations are:
 - DB file can't be shrinked while it used,
   including auto-shrink due to auto-compactification with corresponding geometry settings.
 - The upper limit of DB size can't be changed while DB is used.
 - The DB can grow within the upper size limit defined while opening by a first process,
   but this does not work under Wine since there is no `NtExtendSection()` function.

Partially fix https://github.com/erthink/libmdbx/issues/203
This commit is contained in:
Leonid Yuriev 2021-06-10 02:57:32 +03:00
parent 18bc28bea2
commit d7c06b1337
5 changed files with 125 additions and 95 deletions

View File

@ -1,34 +1,34 @@
N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE |
--|---------|-----------|--------------|----------|-----------|------------|---------|----------|
0 |0000 0001|ALLOC_CACHE|TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH
1 |0000 0002|ALLOC_GC |TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF
2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW
3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META
4 |0000 0010| |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD
5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2
6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP
7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | |
8 |0000 0100| | | | | | |
9 |0000 0200| | | | | | |
10|0000 0400| | | | | | |
11|0000 0800| | | | | | |
12|0000 1000| | | | | | |
13|0000 2000| | | | | | |P_SPILLED
14|0000 4000|NOSUBDIR | | | | | |P_LOOSE
15|0000 8000| | |DB_VALID |NOSPILL | | |P_FROZEN
16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE |
17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND |
18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP
19|0008 0000|WRITEMAP |<= | |MULTIPLE
20|0010 0000|UTTERLY | |
21|0020 0000|NOTLS |<= |
22|0040 0000|EXCLUSIVE | |
23|0080 0000|NORDAHEAD | |
24|0100 0000|NOMEMINIT |TXN_PREPARE |
25|0200 0000|COALESCE | |
26|0400 0000|LIFORECLAIM| |
27|0800 0000|PAGEPERTURB| |
28|1000 0000|ENV_TXKEY |TXN_TRY |
29|2000 0000|ENV_ACTIVE | |
30|4000 0000|ACCEDE |SHRINK_ALLOWED|DB_ACCEDE
31|8000 0000|FATAL_ERROR| |
N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | MRESIZE |
--|---------|-----------|--------------|----------|-----------|------------|---------|----------|---------|
0 |0000 0001|ALLOC_CACHE|TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | |
1 |0000 0002|ALLOC_GC |TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | |
2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| |
3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | |
4 |0000 0010| |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | |
5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | |
6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | |
7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | |
8 |0000 0100| _MAY_MOVE | | | | | | | <= |
9 |0000 0200| _MAY_UNMAP| | | | | | | <= |
10|0000 0400| | | | | | | | |
11|0000 0800| | | | | | | | |
12|0000 1000| | | | | | | | |
13|0000 2000| | | | | | |P_SPILLED | |
14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | |
15|0000 8000| | |DB_VALID |NOSPILL | | |P_FROZEN | |
16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | |
17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND | | <= |
18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP | | | | |
19|0008 0000|WRITEMAP |<= | |MULTIPLE | | | | <= |
20|0010 0000|UTTERLY | | | | | | | <= |
21|0020 0000|NOTLS |<= | | | | | | |
22|0040 0000|EXCLUSIVE | | | | | | | |
23|0080 0000|NORDAHEAD | | | | | | | |
24|0100 0000|NOMEMINIT |TXN_PREPARE | | | | | | |
25|0200 0000|COALESCE | | | | | | | |
26|0400 0000|LIFORECLAIM| | | | | | | |
27|0800 0000|PAGEPERTURB| | | | | | | |
28|1000 0000|ENV_TXKEY |TXN_TRY | | | | | | |
29|2000 0000|ENV_ACTIVE | | | | | | | |
30|4000 0000|ACCEDE |SHRINK_ALLOWED|DB_ACCEDE | | | | | |
31|8000 0000|FATAL_ERROR| | | | | | | |

View File

@ -5888,16 +5888,6 @@ static __cold int mdbx_set_readahead(MDBX_env *env, const pgno_t edge,
static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
const pgno_t size_pgno,
const pgno_t limit_pgno, const bool implicit) {
if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) {
#if MDBX_ENABLE_PGOP_STAT
safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1);
#endif /* MDBX_ENABLE_PGOP_STAT */
int err = mdbx_msync(&env->me_dxb_mmap, 0,
pgno_align2os_bytes(env, used_pgno), MDBX_SYNC_NONE);
if (unlikely(err != MDBX_SUCCESS))
return err;
}
const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno);
const size_t size_bytes = pgno_align2os_bytes(env, size_pgno);
const size_t prev_size = env->me_dxb_mmap.current;
@ -5915,6 +5905,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno);
mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno);
unsigned mresize_flags =
env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC);
#if defined(_WIN32) || defined(_WIN64)
/* Acquire guard in exclusive mode for:
* - to avoid collision between read and write txns around env->me_dbgeo;
@ -5928,28 +5920,30 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
size_bytes == env->me_dxb_mmap.filesize)
goto bailout;
/* 1) Windows allows only extending a read-write section, but not a
* corresponding mapped view. Therefore in other cases we must suspend
* the local threads for safe remap.
* 2) At least on Windows 10 1803 the entire mapped section is unavailable
* for short time during NtExtendSection() or VirtualAlloc() execution.
* 3) Under Wine runtime environment on Linux a section extending is not
* supported. Therefore thread suspending is always required.
*
* THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */
array_onstack.limit = ARRAY_LENGTH(array_onstack.handles);
array_onstack.count = 0;
suspended = &array_onstack;
rc = mdbx_suspend_threads_before_remap(env, &suspended);
if (rc != MDBX_SUCCESS) {
mdbx_error("failed suspend-for-remap: errcode %d", rc);
goto bailout;
if ((env->me_flags & MDBX_NOTLS) == 0) {
/* 1) Windows allows only extending a read-write section, but not a
* corresponding mapped view. Therefore in other cases we must suspend
* the local threads for safe remap.
* 2) At least on Windows 10 1803 the entire mapped section is unavailable
* for short time during NtExtendSection() or VirtualAlloc() execution.
* 3) Under Wine runtime environment on Linux a section extending is not
* supported.
*
* THEREFORE LOCAL THREADS SUSPENDING IS ALWAYS REQUIRED! */
array_onstack.limit = ARRAY_LENGTH(array_onstack.handles);
array_onstack.count = 0;
suspended = &array_onstack;
rc = mdbx_suspend_threads_before_remap(env, &suspended);
if (rc != MDBX_SUCCESS) {
mdbx_error("failed suspend-for-remap: errcode %d", rc);
goto bailout;
}
mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP
: MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE;
}
const bool mapping_can_be_moved = !implicit;
#else /* Windows */
#else /* Windows */
/* Acquire guard to avoid collision between read and write txns
* around env->me_dbgeo */
bool mapping_can_be_moved = false;
int rc = mdbx_fastmutex_acquire(&env->me_remap_guard);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
@ -5958,7 +5952,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
goto bailout;
MDBX_lockinfo *const lck = env->me_lck_mmap.lck;
if (limit_bytes != env->me_dxb_mmap.limit && lck && !implicit) {
if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) &&
lck && !implicit) {
int err = mdbx_rdt_lock(env) /* lock readers table until remap done */;
if (unlikely(MDBX_IS_ERROR(err))) {
rc = err;
@ -5968,21 +5963,31 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
/* looking for readers from this process */
const unsigned snap_nreaders =
atomic_load32(&lck->mti_numreaders, mo_AcquireRelease);
mapping_can_be_moved = true;
mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP
: MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE;
for (unsigned i = 0; i < snap_nreaders; ++i) {
if (lck->mti_readers[i].mr_pid.weak == env->me_pid &&
lck->mti_readers[i].mr_tid.weak != mdbx_thread_self()) {
/* the base address of the mapping can't be changed since
* the other reader thread from this process exists. */
mdbx_rdt_unlock(env);
mapping_can_be_moved = false;
mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE);
break;
}
}
}
#endif /* ! Windows */
if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) {
#if MDBX_ENABLE_PGOP_STAT
safe64_inc(&env->me_lck->mti_pgop_stat.wops, 1);
#endif /* MDBX_ENABLE_PGOP_STAT */
rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno),
MDBX_SYNC_NONE);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
}
#if MDBX_ENABLE_MADVISE
if (size_bytes < prev_size) {
mdbx_notice("resize-MADV_%s %u..%u",
@ -6020,8 +6025,7 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno,
}
#endif /* MDBX_ENABLE_MADVISE */
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes,
mapping_can_be_moved);
rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes);
#if MDBX_ENABLE_MADVISE
if (rc == MDBX_SUCCESS) {
@ -6056,7 +6060,7 @@ bailout:
}
#endif /* MDBX_USE_VALGRIND */
} else {
if (rc != MDBX_UNABLE_EXTEND_MAPSIZE) {
if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_RESULT_TRUE) {
mdbx_error("failed resize datafile/mapping: "
"present %" PRIuPTR " -> %" PRIuPTR ", "
"limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d",
@ -6084,7 +6088,8 @@ bailout:
mdbx_free(suspended);
}
#else
if (env->me_lck_mmap.lck && mapping_can_be_moved)
if (env->me_lck_mmap.lck &&
(mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0)
mdbx_rdt_unlock(env);
int err = mdbx_fastmutex_release(&env->me_remap_guard);
#endif /* Windows */
@ -7629,12 +7634,13 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) {
}
if (txn->mt_flags & MDBX_TXN_RDONLY) {
#if defined(_WIN32) || defined(_WIN64)
if ((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) ||
(mdbx_RunningUnderWine() &&
/* under Wine acquisition of remap_guard is always required,
* since Wine don't support section extending,
* i.e. in both cases unmap+map are required. */
size < env->me_dbgeo.upper && env->me_dbgeo.grow)) {
if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) ||
(mdbx_RunningUnderWine() &&
/* under Wine acquisition of remap_guard is always required,
* since Wine don't support section extending,
* i.e. in both cases unmap+map are required. */
size < env->me_dbgeo.upper && env->me_dbgeo.grow)) &&
/* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) {
txn->mt_flags |= MDBX_SHRINK_ALLOWED;
mdbx_srwlock_AcquireShared(&env->me_remap_guard);
}

View File

@ -260,6 +260,7 @@ static int suspend_and_append(mdbx_handle_array_t **array,
MDBX_INTERNAL_FUNC int
mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0);
const uintptr_t CurrentTid = GetCurrentThreadId();
int rc;
if (env->me_lck_mmap.lck) {
@ -277,12 +278,6 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) {
if (reader->mr_tid.weak == CurrentTid ||
reader->mr_tid.weak == WriteTxnOwner)
goto skip_lck;
if (env->me_flags & MDBX_NOTLS) {
/* Skip duplicates in no-tls mode */
for (const MDBX_reader *scan = reader; --scan >= begin;)
if (scan->mr_tid.weak == reader->mr_tid.weak)
goto skip_lck;
}
rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak);
if (rc != MDBX_SUCCESS) {

View File

@ -1535,8 +1535,8 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) {
return MDBX_SUCCESS;
}
MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size,
size_t limit, const bool may_move) {
MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map,
size_t size, size_t limit) {
assert(size <= limit);
#if defined(_WIN32) || defined(_WIN64)
assert(size != map->current || limit != map->limit || size < map->filesize);
@ -1580,6 +1580,9 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size,
* - change size of mapped view;
* - extend read-only mapping;
* Therefore we should unmap/map entire section. */
if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0)
return MDBX_RESULT_TRUE;
status = NtUnmapViewOfSection(GetCurrentProcess(), map->address);
if (!NT_SUCCESS(status))
return ntstatus2errcode(status);
@ -1615,7 +1618,7 @@ retry_file_and_section:
if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
goto bailout_ntstatus /* no way to recovery */;
if (may_move)
if (flags & MDBX_MRESIZE_MAY_MOVE)
/* the base address could be changed */
map->address = NULL;
}
@ -1673,7 +1676,7 @@ retry_mapview:;
if (!NT_SUCCESS(status)) {
if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 &&
map->address && may_move) {
map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) {
/* try remap at another base address */
map->address = NULL;
goto retry_mapview;
@ -1684,7 +1687,7 @@ retry_mapview:;
if (map->address && (size != map->current || limit != map->limit)) {
/* try remap with previously size and limit,
* but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */
rc = MDBX_UNABLE_EXTEND_MAPSIZE;
rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE;
size = map->current;
ReservedSize = limit = map->limit;
goto retry_file_and_section;
@ -1708,7 +1711,8 @@ retry_mapview:;
if (flags & MDBX_RDONLY) {
map->current = (filesize > limit) ? limit : (size_t)filesize;
if (map->current != size)
rc = MDBX_UNABLE_EXTEND_MAPSIZE;
rc =
(size > map->current) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_RESULT_TRUE;
} else if (filesize != size) {
rc = mdbx_ftruncate(map->fd, size);
if (rc != MDBX_SUCCESS)
@ -1731,7 +1735,8 @@ retry_mapview:;
uint8_t *ptr = MAP_FAILED;
#if defined(MREMAP_MAYMOVE)
ptr = mremap(map->address, map->limit, limit, may_move ? MREMAP_MAYMOVE : 0);
ptr = mremap(map->address, map->limit, limit,
(flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : 0);
if (ptr == MAP_FAILED) {
const int err = errno;
switch (err) {
@ -1780,7 +1785,7 @@ retry_mapview:;
if (ptr == MAP_FAILED) {
/* unmap and map again whole region */
if (!may_move) {
if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) {
/* TODO: Perhaps here it is worth to implement suspend/resume threads
* and perform unmap/map as like for Windows. */
return MDBX_UNABLE_EXTEND_MAPSIZE;
@ -1789,9 +1794,31 @@ retry_mapview:;
if (unlikely(munmap(map->address, map->limit)))
return errno;
ptr = mmap(map->address, limit, mmap_prot, mmap_flags, map->fd, 0);
ptr = mmap(map->address, limit, mmap_prot,
(flags & MDBX_MRESIZE_MAY_MOVE)
? mmap_flags
: mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
: MAP_FIXED),
map->fd, 0);
if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED,
map->fd, 0);
if (unlikely(ptr == MAP_FAILED)) {
ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags, map->fd, 0);
/* try to restore prev mapping */
ptr = mmap(map->address, map->limit, mmap_prot,
(flags & MDBX_MRESIZE_MAY_MOVE)
? mmap_flags
: mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
: MAP_FIXED),
map->fd, 0);
if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED,
map->fd, 0);
if (unlikely(ptr == MAP_FAILED)) {
VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current);
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic

View File

@ -695,8 +695,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map,
const size_t must, const size_t limit,
const unsigned options);
MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map);
MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current,
size_t wanna, const bool may_move);
#define MDBX_MRESIZE_MAY_MOVE 0x00000100
#define MDBX_MRESIZE_MAY_UNMAP 0x00000200
MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map,
size_t size, size_t limit);
#if defined(_WIN32) || defined(_WIN64)
typedef struct {
unsigned limit, count;