From 3351c1f869b466b4ae1adc8a5afc554da3d3038c Mon Sep 17 00:00:00 2001 From: Leonid Yuriev Date: Mon, 6 Jul 2020 16:23:52 +0300 Subject: [PATCH] mdbx: implements remapping of the database file when it is possible. Change-Id: Ida15ba1f396a33b2c6063e680dff612f39a9608f --- ChangeLog.md | 2 ++ src/core.c | 46 ++++++++++++++++++++++++++------ src/osal.c | 75 +++++++++++++++++++++++++++++++++++++++++++--------- src/osal.h | 2 +- 4 files changed, 103 insertions(+), 22 deletions(-) diff --git a/ChangeLog.md b/ChangeLog.md index afe452a8..23c2cf82 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -13,6 +13,8 @@ v0.8.2 2020-07-??:   - Refined mode bits while auto-creating LCK-file. - Avoids unnecessary database file re-mapping in case geometry changed by another process(es). From the user's point of view, the MDBX_UNABLE_EXTEND_MAPSIZE error will now be returned less frequently and only when using the DB in the current process really requires it to be reopened. + - On-the-fly remapping of the database file was implemented. + Now remapping with a change of address is performed automatically if there are no dependent readers in the current process. v0.8.1 2020-06-12:   - Minor change versioning. The last number in the version now means the number of commits since last release/tag. 
diff --git a/src/core.c b/src/core.c index 2eaf9dc0..14608262 100644 --- a/src/core.c +++ b/src/core.c @@ -4658,7 +4658,7 @@ static int __cold mdbx_set_readahead(MDBX_env *env, const size_t offset, static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, const pgno_t size_pgno, - const pgno_t limit_pgno) { + const pgno_t limit_pgno, const bool implicit) { if ((env->me_flags & MDBX_WRITEMAP) && *env->me_unsynced_pages) { int err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), true); @@ -4711,16 +4711,40 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, mdbx_error("failed suspend-for-remap: errcode %d", rc); goto bailout; } -#else + const bool mapping_can_be_moved = !implicit; +#else /* Windows */ /* Acquire guard to avoid collision between read and write txns * around env->me_dbgeo */ + bool mapping_can_be_moved = false; int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (limit_bytes == env->me_dxb_mmap.limit && size_bytes == env->me_dxb_mmap.current) goto bailout; -#endif /* Windows */ + + if (limit_bytes != env->me_dxb_mmap.limit && env->me_lck && !implicit) { + rc = mdbx_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + /* looking for readers from this process */ + MDBX_lockinfo *const lck = env->me_lck; + const unsigned snap_nreaders = lck->mti_numreaders; + mapping_can_be_moved = true; + for (unsigned i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid == env->me_pid && + lck->mti_readers[i].mr_tid != mdbx_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + mdbx_rdt_unlock(env); + mapping_can_be_moved = false; + break; + } + } + } + +#endif /* ! 
Windows */ const size_t prev_size = env->me_dxb_mmap.current; if (size_bytes < prev_size) { @@ -4758,7 +4782,8 @@ static __cold int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, *env->me_discarded_tail = size_pgno; } - rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, size_bytes, limit_bytes, + mapping_can_be_moved); if (rc == MDBX_SUCCESS && (env->me_flags & MDBX_NORDAHEAD) == 0) { const int readahead = mdbx_is_readahead_reasonable(size_bytes, 0); if (readahead == MDBX_RESULT_FALSE) @@ -4829,6 +4854,8 @@ bailout: mdbx_free(suspended); } #else + if (env->me_lck && mapping_can_be_moved) + mdbx_rdt_unlock(env); int err = mdbx_fastmutex_release(&env->me_remap_guard); #endif /* Windows */ if (err != MDBX_SUCCESS) { @@ -4849,7 +4876,8 @@ static __cold int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, ? limit_pgno : /* The actual mapsize may be less since the geo.upper may be changed by other process. So, avoids remapping until it necessary. */ - mapped_pgno); + mapped_pgno, + true); } static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, @@ -6115,8 +6143,9 @@ static int mdbx_txn_renew0(MDBX_txn *txn, unsigned flags) { rc = MDBX_UNABLE_EXTEND_MAPSIZE; goto bailout; } - rc = mdbx_mapresize_implicit(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper); + rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, + (txn->mt_flags & MDBX_RDONLY) ? 
true : false); if (rc != MDBX_SUCCESS) goto bailout; } @@ -9192,7 +9221,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { - rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper); + rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper, + false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; mdbx_assert(env, (head == nullptr) == inside_txn); diff --git a/src/osal.c b/src/osal.c index b790029e..5059cefe 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1403,7 +1403,7 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { } MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, - size_t limit) { + size_t limit, const bool may_move) { assert(size <= limit); #if defined(_WIN32) || defined(_WIN64) assert(size != map->current || limit != map->limit || size < map->filesize); @@ -1482,9 +1482,9 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t size, if (status != /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018) goto bailout_ntstatus /* no way to recovery */; - /* assume we can change base address if mapping size changed or prev address - * couldn't be used */ - map->address = NULL; + if (may_move) + /* the base address could be changed */ + map->address = NULL; } retry_file_and_section: @@ -1541,7 +1541,7 @@ retry_mapview:; if (!NT_SUCCESS(status)) { if (status == /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->address) { + map->address && may_move) { /* try remap at another base address */ map->address = NULL; goto retry_mapview; @@ -1565,6 +1565,7 @@ retry_mapview:; map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; + #else uint64_t filesize = 0; @@ -1585,7 +1586,8 @@ retry_mapview:; if (limit != map->limit) { #if defined(MREMAP_MAYMOVE) - void *ptr = mremap(map->address, map->limit, limit, 0); + void *ptr = + mremap(map->address, map->limit, limit, may_move ? 
MREMAP_MAYMOVE : 0); if (ptr == MAP_FAILED) { rc = errno; switch (rc) { @@ -1596,7 +1598,59 @@ retry_mapview:; } return rc; } - map->address = ptr; +#else + if (!may_move) + /* TODO: Perhaps here it is worth to implement suspend/resume threads + * and perform unmap/map as like for Windows. */ + return MDBX_UNABLE_EXTEND_MAPSIZE; + + if (unlikely(munmap(map->address, map->limit))) + return errno; + + unsigned mmap_flags = + MAP_CONCEAL | MAP_SHARED | MAP_FILE | + (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0); +#ifdef MAP_FIXED + if (!may_move) + mmap_flags |= MAP_FIXED; +#endif + + void *ptr = + mmap(map->address, limit, + (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, + mmap_flags, map->fd, 0); + if (unlikely(ptr == MAP_FAILED)) { + ptr = mmap(map->address, map->limit, + (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, + mmap_flags, map->fd, 0); + if (unlikely(ptr == MAP_FAILED)) { + VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmaping. + * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + */ + ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); + map->limit = 0; + map->current = 0; + map->address = nullptr; + return errno; + } + return MDBX_UNABLE_EXTEND_MAPSIZE; + } +#endif /* !MREMAP_MAYMOVE */ + + if (map->address != ptr) { + VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + /* Unpoisoning is required for ASAN to avoid false-positive diagnostic + * when this memory will re-used by malloc or another mmaping. 
+ * See https://github.com/erthink/libmdbx/pull/93#issuecomment-613687203 + */ + ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); + + VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); + ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); + map->address = ptr; + } map->limit = limit; #ifdef MADV_DONTFORK @@ -1607,14 +1661,9 @@ retry_mapview:; #ifdef MADV_NOHUGEPAGE (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ - -#else /* MREMAP_MAYMOVE */ - /* TODO: Perhaps here it is worth to implement suspend/resume threads - * and perform unmap/map as like for Windows. */ - rc = MDBX_UNABLE_EXTEND_MAPSIZE; -#endif /* !MREMAP_MAYMOVE */ } #endif + return rc; } diff --git a/src/osal.h b/src/osal.h index 85e84ebd..8d134ba2 100644 --- a/src/osal.h +++ b/src/osal.h @@ -623,7 +623,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, const unsigned options); MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); MDBX_INTERNAL_FUNC int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, - size_t wanna); + size_t wanna, const bool may_move); #if defined(_WIN32) || defined(_WIN64) typedef struct { unsigned limit, count;