diff --git a/src/bits.h b/src/bits.h index dcdafb0a..3c35c8e3 100644 --- a/src/bits.h +++ b/src/bits.h @@ -659,12 +659,13 @@ typedef struct MDBX_pgstate { struct MDBX_env { #define MDBX_ME_SIGNATURE UINT32_C(0x9A899641) size_t me_signature; - mdbx_filehandle_t me_fd; /* The main data file */ - mdbx_filehandle_t me_lfd; /* The lock file */ -#ifdef MDBX_OSAL_SECTION - MDBX_OSAL_SECTION me_dxb_section; - MDBX_OSAL_SECTION me_lck_section; -#endif + mdbx_mmap_t me_dxb_mmap; /* The main data file */ + mdbx_mmap_t me_lck_mmap; /* The lock file */ +#define me_map me_dxb_mmap.dxb +#define me_lck me_lck_mmap.lck +#define me_fd me_dxb_mmap.fd +#define me_lfd me_lck_mmap.fd + /* Failed to update the meta page. Probably an I/O error. */ #define MDBX_FATAL_ERROR UINT32_C(0x80000000) /* Some fields are initialized. */ @@ -684,8 +685,6 @@ struct MDBX_env { mdbx_pid_t me_pid; /* process ID of this env */ mdbx_thread_key_t me_txkey; /* thread-key for readers */ char *me_path; /* path to the DB files */ - char *me_map; /* the memory map of the data file */ - MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn; /* current write transaction */ MDBX_txn *me_txn0; /* prealloc'd write transaction */ diff --git a/src/mdbx.c b/src/mdbx.c index e07655a3..2705602e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -1852,14 +1852,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp, "), %" PRIuPTR " bytes", growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes); - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = - mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, growth_bytes); + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + growth_bytes); if (rc == MDBX_SUCCESS) { txn->mt_end_pgno = growth_pgno; env->me_dbgeo.now = growth_bytes; @@ -2146,7 +2140,8 @@ int mdbx_env_sync(MDBX_env *env, int force) { /* LY: pre-sync without holding lock to reduce latency for writer(s) */ int rc = (flags & MDBX_WRITEMAP) - ? mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC) + ? mdbx_msync(&env->me_dxb_mmap, 0, used_size, + flags & MDBX_MAPASYNC) : mdbx_filesync(env->me_fd, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -3783,7 +3778,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); MDBX_meta *const steady = mdbx_meta_steady(env); if (flags & MDBX_WRITEMAP) { - rc = mdbx_msync(env->me_map, usedbytes, flags & MDBX_MAPASYNC); + rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) goto fail; if ((flags & MDBX_MAPASYNC) == 0) { @@ -3885,7 +3880,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1)); mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2)); - const size_t offset = (char *)target - env->me_map; mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); mdbx_ensure(env, target == head || @@ -3932,16 +3926,18 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, mdbx_jitter4testing(true); } else { pending->mm_magic_and_version = MDBX_DATA_MAGIC; - rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset); + rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), + (uint8_t *)target - env->me_map); if (unlikely(rc != MDBX_SUCCESS)) { undo: mdbx_debug("write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. */ - mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), offset); + mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), + (uint8_t *)target - env->me_map); goto fail; } - mdbx_invalidate_cache(env->me_map + offset, sizeof(MDBX_meta)); + mdbx_invalidate_cache(target, sizeof(MDBX_meta)); } /* LY: step#3 - sync meta-pages. */ @@ -3949,8 +3945,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) { mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { - char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1)); - rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC); + const size_t offset = (uint8_t *)container_of(head, MDBX_page, mp_meta) - + env->me_dxb_mmap.dxb; + const size_t paged_offset = offset & ~(env->me_os_psize - 1); + const size_t paged_length = mdbx_roundup2( + env->me_psize + offset - paged_offset, env->me_os_psize); + rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length, + flags & MDBX_MAPASYNC); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } else { @@ -3965,13 +3966,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, #else /* LY: shrink datafile if needed */ if (shrink_pgno_delta) { - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, shrink_bytes); + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + shrink_bytes); if (rc == MDBX_SUCCESS) env->me_dbgeo.now = shrink_bytes; else if (rc != MDBX_RESULT_TRUE) @@ -4082,19 +4078,13 @@ bailout: } static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { - mdbx_mmap_param_t mmap; - mmap.fd = env->me_fd; - int rc = mdbx_mmap(env->me_flags, &mmap, env->me_dbgeo.now, env->me_mapsize); + int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + env->me_mapsize); if (unlikely(rc != MDBX_SUCCESS)) { env->me_map = NULL; return rc; } - env->me_map = mmap.address; -#ifdef MDBX_OSAL_SECTION - env->me_dxb_section = mmap.section; -#endif - #ifdef MADV_DONTFORK if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK)) return errno; @@ -4130,7 +4120,7 @@ static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) { /* Lock meta pages to avoid unexpected write, * before the data pages would be synchronized. */ if (env->me_flags & MDBX_WRITEMAP) { - rc = mdbx_mlock(&mmap, pgno2bytes(env, NUM_METAS)); + rc = mdbx_mlock(&env->me_dxb_mmap, pgno2bytes(env, NUM_METAS)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -4351,17 +4341,11 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, const size_t size = mdbx_roundup2(pgno2bytes(env, meta.mm_geo.upper), env->me_os_psize); - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = mdbx_mremap(env->me_flags, &mmap, env->me_mapsize, size); + rc = mdbx_mremap(env->me_flags, &env->me_dxb_mmap, env->me_mapsize, + size); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; env->me_mapsize = size; - env->me_map = mmap.address; #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = @@ -4372,13 +4356,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower, const size_t size = mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize); - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - rc = mdbx_mresize(env->me_flags, &mmap, + rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, pgno2bytes(env, head->mm_geo.now), size); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -4736,15 +4714,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) { } env->me_maxreaders = (unsigned)maxreaders; - mdbx_mmap_param_t mmap; - mmap.fd = env->me_lfd; - err = mdbx_mmap(MDBX_WRITEMAP, &mmap, (size_t)size, (size_t)size); + err = mdbx_mmap(MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size); if (unlikely(err != MDBX_SUCCESS)) return err; - env->me_lck = mmap.address; -#ifdef MDBX_OSAL_SECTION - env->me_lck_section = mmap.section; -#endif #ifdef MADV_DODUMP (void)madvise(env->me_lck, size, MADV_DODUMP); @@ -5011,13 +4983,7 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_map) { - mdbx_mmap_param_t mmap; - mmap.address = env->me_map; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_dxb_section; -#endif - mmap.fd = env->me_fd; - mdbx_munmap(&mmap, env->me_mapsize); + mdbx_munmap(&env->me_dxb_mmap, env->me_mapsize); #ifdef USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; @@ -5029,14 +4995,9 @@ static void __cold mdbx_env_close0(MDBX_env *env) { } if (env->me_lck) { - mdbx_mmap_param_t mmap; - mmap.address = env->me_lck; -#ifdef MDBX_OSAL_SECTION - mmap.section = env->me_lck_section; -#endif - mmap.fd = env->me_lfd; - mdbx_munmap(&mmap, (env->me_maxreaders - 1) * sizeof(MDBX_reader) + - sizeof(MDBX_lockinfo)); + mdbx_munmap(&env->me_lck_mmap, + (env->me_maxreaders - 1) * sizeof(MDBX_reader) + + sizeof(MDBX_lockinfo)); env->me_lck = nullptr; } env->me_pid = 0; @@ -11027,9 +10988,9 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * Тем не менее, однозначно страница "не грязная" (не будет переписана * во время транзакции) если адрес находится внутри mmap-диапазона * и в заголовке страницы нет флажка P_DIRTY. */ - if (env->me_map < (char *)page) { + if (env->me_map < (uint8_t *)page) { const size_t used_size = pgno2bytes(env, txn->mt_next_pgno); - if ((char *)page < env->me_map + used_size) { + if ((uint8_t *)page < env->me_map + used_size) { /* страница внутри диапазона, смотрим на флажки */ return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP)) ? MDBX_RESULT_TRUE @@ -11040,7 +11001,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { * ошибка, к которой не возможно прийти без каких-то больших нарушений. * Поэтому не проверяем этот случай кроме как assert-ом, на то что * страница вне mmap-диаппазона. */ - mdbx_tassert(txn, (char *)page >= env->me_map + env->me_mapsize); + mdbx_tassert(txn, (uint8_t *)page >= env->me_map + env->me_mapsize); } /* Страница вне используемого mmap-диапазона, т.е. либо в функцию был diff --git a/src/osal.c b/src/osal.c index ae28d191..80728283 100644 --- a/src/osal.c +++ b/src/osal.c @@ -732,18 +732,19 @@ int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ -int mdbx_msync(void *addr, size_t length, int async) { +int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) { + uint8_t *ptr = (uint8_t *)map->address + offset; #if defined(_WIN32) || defined(_WIN64) - if (async) + if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd))) return MDBX_SUCCESS; - return FlushViewOfFile(addr, length) ? MDBX_SUCCESS : GetLastError(); + return GetLastError(); #else const int mode = async ? MS_ASYNC : MS_SYNC; - return (msync(addr, length, mode) == 0) ? MDBX_SUCCESS : errno; + return (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno; #endif } -int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) { +int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) { #if defined(_WIN32) || defined(_WIN64) NTSTATUS rc = NtCreateSection( &map->section, @@ -790,7 +791,7 @@ int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) { #endif } -int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) { +int mdbx_munmap(mdbx_mmap_t *map, size_t length) { #if defined(_WIN32) || defined(_WIN64) (void)length; if (map->section) @@ -802,7 +803,7 @@ int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) { #endif } -int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) { +int mdbx_mlock(mdbx_mmap_t *map, size_t length) { #if defined(_WIN32) || defined(_WIN64) return VirtualLock(map->address, length) ? MDBX_SUCCESS : GetLastError(); #else @@ -810,8 +811,7 @@ int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) { #endif } -int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, - size_t wanna) { +int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna) { #if defined(_WIN32) || defined(_WIN64) if (wanna > current) { /* growth */ @@ -831,7 +831,7 @@ int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, #endif } -int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit, +int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit, size_t new_limit) { #if defined(_WIN32) || defined(_WIN64) (void)flags; diff --git a/src/osal.h b/src/osal.h index 731c86b5..413e7d9b 100644 --- a/src/osal.h +++ b/src/osal.h @@ -420,8 +420,6 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count); -int mdbx_msync(void *addr, size_t length, int async); - int mdbx_thread_create(mdbx_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); @@ -440,19 +438,24 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode, int mdbx_closefile(mdbx_filehandle_t fd); typedef struct mdbx_mmap_param { - void *address; + union { + void *address; + uint8_t *dxb; + struct MDBX_lockinfo *lck; + }; + mdbx_filehandle_t fd; #ifdef MDBX_OSAL_SECTION MDBX_OSAL_SECTION section; #endif - mdbx_filehandle_t fd; -} mdbx_mmap_param_t; -int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit); -int mdbx_munmap(mdbx_mmap_param_t *map, size_t length); -int mdbx_mlock(mdbx_mmap_param_t *map, size_t length); -int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current, - size_t wanna); -int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit, +} mdbx_mmap_t; + +int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit); +int mdbx_munmap(mdbx_mmap_t *map, size_t length); +int mdbx_mlock(mdbx_mmap_t *map, size_t length); +int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna); +int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit, size_t new_limit); +int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async); static __inline mdbx_pid_t mdbx_getpid(void) { #if defined(_WIN32) || defined(_WIN64)