mdbx: rework usage of mdbx_mmap_t and mdbx_msync().

This commit is contained in:
Leo Yuriev 2017-07-11 14:10:24 +03:00
parent 70d54f6f2c
commit 455de97d36
4 changed files with 64 additions and 101 deletions

View File

@ -659,12 +659,13 @@ typedef struct MDBX_pgstate {
struct MDBX_env {
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
size_t me_signature;
mdbx_filehandle_t me_fd; /* The main data file */
mdbx_filehandle_t me_lfd; /* The lock file */
#ifdef MDBX_OSAL_SECTION
MDBX_OSAL_SECTION me_dxb_section;
MDBX_OSAL_SECTION me_lck_section;
#endif
mdbx_mmap_t me_dxb_mmap; /* The main data file */
mdbx_mmap_t me_lck_mmap; /* The lock file */
#define me_map me_dxb_mmap.dxb
#define me_lck me_lck_mmap.lck
#define me_fd me_dxb_mmap.fd
#define me_lfd me_lck_mmap.fd
/* Failed to update the meta page. Probably an I/O error. */
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
/* Some fields are initialized. */
@ -684,8 +685,6 @@ struct MDBX_env {
mdbx_pid_t me_pid; /* process ID of this env */
mdbx_thread_key_t me_txkey; /* thread-key for readers */
char *me_path; /* path to the DB files */
char *me_map; /* the memory map of the data file */
MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */
void *me_pbuf; /* scratch area for DUPSORT put() */
MDBX_txn *me_txn; /* current write transaction */
MDBX_txn *me_txn0; /* prealloc'd write transaction */

View File

@ -1852,14 +1852,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
"), %" PRIuPTR " bytes",
growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes);
mdbx_mmap_param_t mmap;
mmap.address = env->me_map;
#ifdef MDBX_OSAL_SECTION
mmap.section = env->me_dxb_section;
#endif
mmap.fd = env->me_fd;
rc =
mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, growth_bytes);
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
growth_bytes);
if (rc == MDBX_SUCCESS) {
txn->mt_end_pgno = growth_pgno;
env->me_dbgeo.now = growth_bytes;
@ -2146,7 +2140,8 @@ int mdbx_env_sync(MDBX_env *env, int force) {
/* LY: pre-sync without holding lock to reduce latency for writer(s) */
int rc = (flags & MDBX_WRITEMAP)
? mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC)
? mdbx_msync(&env->me_dxb_mmap, 0, used_size,
flags & MDBX_MAPASYNC)
: mdbx_filesync(env->me_fd, false);
if (unlikely(rc != MDBX_SUCCESS))
return rc;
@ -3783,7 +3778,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
MDBX_meta *const steady = mdbx_meta_steady(env);
if (flags & MDBX_WRITEMAP) {
rc = mdbx_msync(env->me_map, usedbytes, flags & MDBX_MAPASYNC);
rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
if ((flags & MDBX_MAPASYNC) == 0) {
@ -3885,7 +3880,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1));
mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2));
const size_t offset = (char *)target - env->me_map;
mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
mdbx_ensure(env,
target == head ||
@ -3932,16 +3926,18 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
mdbx_jitter4testing(true);
} else {
pending->mm_magic_and_version = MDBX_DATA_MAGIC;
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset);
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta),
(uint8_t *)target - env->me_map);
if (unlikely(rc != MDBX_SUCCESS)) {
undo:
mdbx_debug("write failed, disk error?");
/* On a failure, the pagecache still contains the new data.
* Try to write some old data back, to prevent it from being used. */
mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), offset);
mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta),
(uint8_t *)target - env->me_map);
goto fail;
}
mdbx_invalidate_cache(env->me_map + offset, sizeof(MDBX_meta));
mdbx_invalidate_cache(target, sizeof(MDBX_meta));
}
/* LY: step#3 - sync meta-pages. */
@ -3949,8 +3945,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) {
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
if (flags & MDBX_WRITEMAP) {
char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1));
rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC);
const size_t offset = (uint8_t *)container_of(head, MDBX_page, mp_meta) -
env->me_dxb_mmap.dxb;
const size_t paged_offset = offset & ~(env->me_os_psize - 1);
const size_t paged_length = mdbx_roundup2(
env->me_psize + offset - paged_offset, env->me_os_psize);
rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length,
flags & MDBX_MAPASYNC);
if (unlikely(rc != MDBX_SUCCESS))
goto fail;
} else {
@ -3965,13 +3966,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
#else
/* LY: shrink datafile if needed */
if (shrink_pgno_delta) {
mdbx_mmap_param_t mmap;
mmap.address = env->me_map;
#ifdef MDBX_OSAL_SECTION
mmap.section = env->me_dxb_section;
#endif
mmap.fd = env->me_fd;
rc = mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, shrink_bytes);
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
shrink_bytes);
if (rc == MDBX_SUCCESS)
env->me_dbgeo.now = shrink_bytes;
else if (rc != MDBX_RESULT_TRUE)
@ -4082,19 +4078,13 @@ bailout:
}
static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) {
mdbx_mmap_param_t mmap;
mmap.fd = env->me_fd;
int rc = mdbx_mmap(env->me_flags, &mmap, env->me_dbgeo.now, env->me_mapsize);
int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
env->me_mapsize);
if (unlikely(rc != MDBX_SUCCESS)) {
env->me_map = NULL;
return rc;
}
env->me_map = mmap.address;
#ifdef MDBX_OSAL_SECTION
env->me_dxb_section = mmap.section;
#endif
#ifdef MADV_DONTFORK
if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK))
return errno;
@ -4130,7 +4120,7 @@ static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) {
/* Lock meta pages to avoid unexpected write,
* before the data pages would be synchronized. */
if (env->me_flags & MDBX_WRITEMAP) {
rc = mdbx_mlock(&mmap, pgno2bytes(env, NUM_METAS));
rc = mdbx_mlock(&env->me_dxb_mmap, pgno2bytes(env, NUM_METAS));
if (unlikely(rc != MDBX_SUCCESS))
return rc;
}
@ -4351,17 +4341,11 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower,
const size_t size =
mdbx_roundup2(pgno2bytes(env, meta.mm_geo.upper), env->me_os_psize);
mdbx_mmap_param_t mmap;
mmap.address = env->me_map;
#ifdef MDBX_OSAL_SECTION
mmap.section = env->me_dxb_section;
#endif
mmap.fd = env->me_fd;
rc = mdbx_mremap(env->me_flags, &mmap, env->me_mapsize, size);
rc = mdbx_mremap(env->me_flags, &env->me_dxb_mmap, env->me_mapsize,
size);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
env->me_mapsize = size;
env->me_map = mmap.address;
#ifdef USE_VALGRIND
VALGRIND_DISCARD(env->me_valgrind_handle);
env->me_valgrind_handle =
@ -4372,13 +4356,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower,
const size_t size =
mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize);
mdbx_mmap_param_t mmap;
mmap.address = env->me_map;
#ifdef MDBX_OSAL_SECTION
mmap.section = env->me_dxb_section;
#endif
mmap.fd = env->me_fd;
rc = mdbx_mresize(env->me_flags, &mmap,
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap,
pgno2bytes(env, head->mm_geo.now), size);
if (unlikely(rc != MDBX_SUCCESS))
goto bailout;
@ -4736,15 +4714,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) {
}
env->me_maxreaders = (unsigned)maxreaders;
mdbx_mmap_param_t mmap;
mmap.fd = env->me_lfd;
err = mdbx_mmap(MDBX_WRITEMAP, &mmap, (size_t)size, (size_t)size);
err = mdbx_mmap(MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size);
if (unlikely(err != MDBX_SUCCESS))
return err;
env->me_lck = mmap.address;
#ifdef MDBX_OSAL_SECTION
env->me_lck_section = mmap.section;
#endif
#ifdef MADV_DODUMP
(void)madvise(env->me_lck, size, MADV_DODUMP);
@ -5011,13 +4983,7 @@ static void __cold mdbx_env_close0(MDBX_env *env) {
}
if (env->me_map) {
mdbx_mmap_param_t mmap;
mmap.address = env->me_map;
#ifdef MDBX_OSAL_SECTION
mmap.section = env->me_dxb_section;
#endif
mmap.fd = env->me_fd;
mdbx_munmap(&mmap, env->me_mapsize);
mdbx_munmap(&env->me_dxb_mmap, env->me_mapsize);
#ifdef USE_VALGRIND
VALGRIND_DISCARD(env->me_valgrind_handle);
env->me_valgrind_handle = -1;
@ -5029,14 +4995,9 @@ static void __cold mdbx_env_close0(MDBX_env *env) {
}
if (env->me_lck) {
mdbx_mmap_param_t mmap;
mmap.address = env->me_lck;
#ifdef MDBX_OSAL_SECTION
mmap.section = env->me_lck_section;
#endif
mmap.fd = env->me_lfd;
mdbx_munmap(&mmap, (env->me_maxreaders - 1) * sizeof(MDBX_reader) +
sizeof(MDBX_lockinfo));
mdbx_munmap(&env->me_lck_mmap,
(env->me_maxreaders - 1) * sizeof(MDBX_reader) +
sizeof(MDBX_lockinfo));
env->me_lck = nullptr;
}
env->me_pid = 0;
@ -11027,9 +10988,9 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
* Тем не менее, однозначно страница "не грязная" (не будет переписана
* во время транзакции) если адрес находится внутри mmap-диапазона
* и в заголовке страницы нет флажка P_DIRTY. */
if (env->me_map < (char *)page) {
if (env->me_map < (uint8_t *)page) {
const size_t used_size = pgno2bytes(env, txn->mt_next_pgno);
if ((char *)page < env->me_map + used_size) {
if ((uint8_t *)page < env->me_map + used_size) {
/* страница внутри диапазона, смотрим на флажки */
return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP))
? MDBX_RESULT_TRUE
@ -11040,7 +11001,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
* ошибка, к которой невозможно прийти без каких-то больших нарушений.
* Поэтому не проверяем этот случай кроме как assert-ом, на то что
* страница вне mmap-диапазона. */
mdbx_tassert(txn, (char *)page >= env->me_map + env->me_mapsize);
mdbx_tassert(txn, (uint8_t *)page >= env->me_map + env->me_mapsize);
}
/* Страница вне используемого mmap-диапазона, т.е. либо в функцию был

View File

@ -732,18 +732,19 @@ int mdbx_thread_join(mdbx_thread_t thread) {
/*----------------------------------------------------------------------------*/
int mdbx_msync(void *addr, size_t length, int async) {
int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) {
uint8_t *ptr = (uint8_t *)map->address + offset;
#if defined(_WIN32) || defined(_WIN64)
if (async)
if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd)))
return MDBX_SUCCESS;
return FlushViewOfFile(addr, length) ? MDBX_SUCCESS : GetLastError();
return GetLastError();
#else
const int mode = async ? MS_ASYNC : MS_SYNC;
return (msync(addr, length, mode) == 0) ? MDBX_SUCCESS : errno;
return (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno;
#endif
}
int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) {
int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) {
#if defined(_WIN32) || defined(_WIN64)
NTSTATUS rc = NtCreateSection(
&map->section,
@ -790,7 +791,7 @@ int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) {
#endif
}
int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) {
int mdbx_munmap(mdbx_mmap_t *map, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
(void)length;
if (map->section)
@ -802,7 +803,7 @@ int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) {
#endif
}
int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) {
int mdbx_mlock(mdbx_mmap_t *map, size_t length) {
#if defined(_WIN32) || defined(_WIN64)
return VirtualLock(map->address, length) ? MDBX_SUCCESS : GetLastError();
#else
@ -810,8 +811,7 @@ int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) {
#endif
}
int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current,
size_t wanna) {
int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna) {
#if defined(_WIN32) || defined(_WIN64)
if (wanna > current) {
/* growth */
@ -831,7 +831,7 @@ int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current,
#endif
}
int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit,
int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit,
size_t new_limit) {
#if defined(_WIN32) || defined(_WIN64)
(void)flags;

View File

@ -420,8 +420,6 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count,
uint64_t offset);
int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count);
int mdbx_msync(void *addr, size_t length, int async);
int mdbx_thread_create(mdbx_thread_t *thread,
THREAD_RESULT(THREAD_CALL *start_routine)(void *),
void *arg);
@ -440,19 +438,24 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode,
int mdbx_closefile(mdbx_filehandle_t fd);
typedef struct mdbx_mmap_param {
void *address;
union {
void *address;
uint8_t *dxb;
struct MDBX_lockinfo *lck;
};
mdbx_filehandle_t fd;
#ifdef MDBX_OSAL_SECTION
MDBX_OSAL_SECTION section;
#endif
mdbx_filehandle_t fd;
} mdbx_mmap_param_t;
int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit);
int mdbx_munmap(mdbx_mmap_param_t *map, size_t length);
int mdbx_mlock(mdbx_mmap_param_t *map, size_t length);
int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current,
size_t wanna);
int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit,
} mdbx_mmap_t;
int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit);
int mdbx_munmap(mdbx_mmap_t *map, size_t length);
int mdbx_mlock(mdbx_mmap_t *map, size_t length);
int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna);
int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit,
size_t new_limit);
int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async);
static __inline mdbx_pid_t mdbx_getpid(void) {
#if defined(_WIN32) || defined(_WIN64)