mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-20 05:28:21 +08:00
mdbx: rework using of mdbx_mmap_t and mdbx_msync().
This commit is contained in:
parent
70d54f6f2c
commit
455de97d36
15
src/bits.h
15
src/bits.h
@ -659,12 +659,13 @@ typedef struct MDBX_pgstate {
|
||||
struct MDBX_env {
|
||||
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
|
||||
size_t me_signature;
|
||||
mdbx_filehandle_t me_fd; /* The main data file */
|
||||
mdbx_filehandle_t me_lfd; /* The lock file */
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
MDBX_OSAL_SECTION me_dxb_section;
|
||||
MDBX_OSAL_SECTION me_lck_section;
|
||||
#endif
|
||||
mdbx_mmap_t me_dxb_mmap; /* The main data file */
|
||||
mdbx_mmap_t me_lck_mmap; /* The lock file */
|
||||
#define me_map me_dxb_mmap.dxb
|
||||
#define me_lck me_lck_mmap.lck
|
||||
#define me_fd me_dxb_mmap.fd
|
||||
#define me_lfd me_lck_mmap.fd
|
||||
|
||||
/* Failed to update the meta page. Probably an I/O error. */
|
||||
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
|
||||
/* Some fields are initialized. */
|
||||
@ -684,8 +685,6 @@ struct MDBX_env {
|
||||
mdbx_pid_t me_pid; /* process ID of this env */
|
||||
mdbx_thread_key_t me_txkey; /* thread-key for readers */
|
||||
char *me_path; /* path to the DB files */
|
||||
char *me_map; /* the memory map of the data file */
|
||||
MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */
|
||||
void *me_pbuf; /* scratch area for DUPSORT put() */
|
||||
MDBX_txn *me_txn; /* current write transaction */
|
||||
MDBX_txn *me_txn0; /* prealloc'd write transaction */
|
||||
|
105
src/mdbx.c
105
src/mdbx.c
@ -1852,14 +1852,8 @@ static int mdbx_page_alloc(MDBX_cursor *mc, unsigned num, MDBX_page **mp,
|
||||
"), %" PRIuPTR " bytes",
|
||||
growth_pgno, growth_pgno - txn->mt_end_pgno, growth_bytes);
|
||||
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.address = env->me_map;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
mmap.section = env->me_dxb_section;
|
||||
#endif
|
||||
mmap.fd = env->me_fd;
|
||||
rc =
|
||||
mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, growth_bytes);
|
||||
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
|
||||
growth_bytes);
|
||||
if (rc == MDBX_SUCCESS) {
|
||||
txn->mt_end_pgno = growth_pgno;
|
||||
env->me_dbgeo.now = growth_bytes;
|
||||
@ -2146,7 +2140,8 @@ int mdbx_env_sync(MDBX_env *env, int force) {
|
||||
|
||||
/* LY: pre-sync without holding lock to reduce latency for writer(s) */
|
||||
int rc = (flags & MDBX_WRITEMAP)
|
||||
? mdbx_msync(env->me_map, used_size, flags & MDBX_MAPASYNC)
|
||||
? mdbx_msync(&env->me_dxb_mmap, 0, used_size,
|
||||
flags & MDBX_MAPASYNC)
|
||||
: mdbx_filesync(env->me_fd, false);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
@ -3783,7 +3778,7 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
||||
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
|
||||
MDBX_meta *const steady = mdbx_meta_steady(env);
|
||||
if (flags & MDBX_WRITEMAP) {
|
||||
rc = mdbx_msync(env->me_map, usedbytes, flags & MDBX_MAPASYNC);
|
||||
rc = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, flags & MDBX_MAPASYNC);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
if ((flags & MDBX_MAPASYNC) == 0) {
|
||||
@ -3885,7 +3880,6 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
||||
mdbx_assert(env, !mdbx_meta_eq(env, pending, meta1));
|
||||
mdbx_assert(env, !mdbx_meta_eq(env, pending, meta2));
|
||||
|
||||
const size_t offset = (char *)target - env->me_map;
|
||||
mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0);
|
||||
mdbx_ensure(env,
|
||||
target == head ||
|
||||
@ -3932,16 +3926,18 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
||||
mdbx_jitter4testing(true);
|
||||
} else {
|
||||
pending->mm_magic_and_version = MDBX_DATA_MAGIC;
|
||||
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta), offset);
|
||||
rc = mdbx_pwrite(env->me_fd, pending, sizeof(MDBX_meta),
|
||||
(uint8_t *)target - env->me_map);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
undo:
|
||||
mdbx_debug("write failed, disk error?");
|
||||
/* On a failure, the pagecache still contains the new data.
|
||||
* Try write some old data back, to prevent it from being used. */
|
||||
mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta), offset);
|
||||
mdbx_pwrite(env->me_fd, (void *)target, sizeof(MDBX_meta),
|
||||
(uint8_t *)target - env->me_map);
|
||||
goto fail;
|
||||
}
|
||||
mdbx_invalidate_cache(env->me_map + offset, sizeof(MDBX_meta));
|
||||
mdbx_invalidate_cache(target, sizeof(MDBX_meta));
|
||||
}
|
||||
|
||||
/* LY: step#3 - sync meta-pages. */
|
||||
@ -3949,8 +3945,13 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
||||
if ((flags & (MDBX_NOSYNC | MDBX_NOMETASYNC)) == 0) {
|
||||
mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0);
|
||||
if (flags & MDBX_WRITEMAP) {
|
||||
char *ptr = env->me_map + (offset & ~(env->me_os_psize - 1));
|
||||
rc = mdbx_msync(ptr, env->me_os_psize, flags & MDBX_MAPASYNC);
|
||||
const size_t offset = (uint8_t *)container_of(head, MDBX_page, mp_meta) -
|
||||
env->me_dxb_mmap.dxb;
|
||||
const size_t paged_offset = offset & ~(env->me_os_psize - 1);
|
||||
const size_t paged_length = mdbx_roundup2(
|
||||
env->me_psize + offset - paged_offset, env->me_os_psize);
|
||||
rc = mdbx_msync(&env->me_dxb_mmap, paged_offset, paged_length,
|
||||
flags & MDBX_MAPASYNC);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto fail;
|
||||
} else {
|
||||
@ -3965,13 +3966,8 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags,
|
||||
#else
|
||||
/* LY: shrink datafile if needed */
|
||||
if (shrink_pgno_delta) {
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.address = env->me_map;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
mmap.section = env->me_dxb_section;
|
||||
#endif
|
||||
mmap.fd = env->me_fd;
|
||||
rc = mdbx_mresize(env->me_flags, &mmap, env->me_dbgeo.now, shrink_bytes);
|
||||
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
|
||||
shrink_bytes);
|
||||
if (rc == MDBX_SUCCESS)
|
||||
env->me_dbgeo.now = shrink_bytes;
|
||||
else if (rc != MDBX_RESULT_TRUE)
|
||||
@ -4082,19 +4078,13 @@ bailout:
|
||||
}
|
||||
|
||||
static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) {
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.fd = env->me_fd;
|
||||
int rc = mdbx_mmap(env->me_flags, &mmap, env->me_dbgeo.now, env->me_mapsize);
|
||||
int rc = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now,
|
||||
env->me_mapsize);
|
||||
if (unlikely(rc != MDBX_SUCCESS)) {
|
||||
env->me_map = NULL;
|
||||
return rc;
|
||||
}
|
||||
|
||||
env->me_map = mmap.address;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
env->me_dxb_section = mmap.section;
|
||||
#endif
|
||||
|
||||
#ifdef MADV_DONTFORK
|
||||
if (madvise(env->me_map, env->me_mapsize, MADV_DONTFORK))
|
||||
return errno;
|
||||
@ -4130,7 +4120,7 @@ static int __cold mdbx_env_map(MDBX_env *env, size_t usedsize) {
|
||||
/* Lock meta pages to avoid unexpected write,
|
||||
* before the data pages would be synchronized. */
|
||||
if (env->me_flags & MDBX_WRITEMAP) {
|
||||
rc = mdbx_mlock(&mmap, pgno2bytes(env, NUM_METAS));
|
||||
rc = mdbx_mlock(&env->me_dxb_mmap, pgno2bytes(env, NUM_METAS));
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
return rc;
|
||||
}
|
||||
@ -4351,17 +4341,11 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower,
|
||||
const size_t size =
|
||||
mdbx_roundup2(pgno2bytes(env, meta.mm_geo.upper), env->me_os_psize);
|
||||
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.address = env->me_map;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
mmap.section = env->me_dxb_section;
|
||||
#endif
|
||||
mmap.fd = env->me_fd;
|
||||
rc = mdbx_mremap(env->me_flags, &mmap, env->me_mapsize, size);
|
||||
rc = mdbx_mremap(env->me_flags, &env->me_dxb_mmap, env->me_mapsize,
|
||||
size);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
env->me_mapsize = size;
|
||||
env->me_map = mmap.address;
|
||||
#ifdef USE_VALGRIND
|
||||
VALGRIND_DISCARD(env->me_valgrind_handle);
|
||||
env->me_valgrind_handle =
|
||||
@ -4372,13 +4356,7 @@ LIBMDBX_API int mdbx_env_set_geometry(MDBX_env *env, ssize_t size_lower,
|
||||
const size_t size =
|
||||
mdbx_roundup2(pgno2bytes(env, meta.mm_geo.now), env->me_os_psize);
|
||||
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.address = env->me_map;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
mmap.section = env->me_dxb_section;
|
||||
#endif
|
||||
mmap.fd = env->me_fd;
|
||||
rc = mdbx_mresize(env->me_flags, &mmap,
|
||||
rc = mdbx_mresize(env->me_flags, &env->me_dxb_mmap,
|
||||
pgno2bytes(env, head->mm_geo.now), size);
|
||||
if (unlikely(rc != MDBX_SUCCESS))
|
||||
goto bailout;
|
||||
@ -4736,15 +4714,9 @@ static int __cold mdbx_setup_lck(MDBX_env *env, char *lck_pathname, int mode) {
|
||||
}
|
||||
env->me_maxreaders = (unsigned)maxreaders;
|
||||
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.fd = env->me_lfd;
|
||||
err = mdbx_mmap(MDBX_WRITEMAP, &mmap, (size_t)size, (size_t)size);
|
||||
err = mdbx_mmap(MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size);
|
||||
if (unlikely(err != MDBX_SUCCESS))
|
||||
return err;
|
||||
env->me_lck = mmap.address;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
env->me_lck_section = mmap.section;
|
||||
#endif
|
||||
|
||||
#ifdef MADV_DODUMP
|
||||
(void)madvise(env->me_lck, size, MADV_DODUMP);
|
||||
@ -5011,13 +4983,7 @@ static void __cold mdbx_env_close0(MDBX_env *env) {
|
||||
}
|
||||
|
||||
if (env->me_map) {
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.address = env->me_map;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
mmap.section = env->me_dxb_section;
|
||||
#endif
|
||||
mmap.fd = env->me_fd;
|
||||
mdbx_munmap(&mmap, env->me_mapsize);
|
||||
mdbx_munmap(&env->me_dxb_mmap, env->me_mapsize);
|
||||
#ifdef USE_VALGRIND
|
||||
VALGRIND_DISCARD(env->me_valgrind_handle);
|
||||
env->me_valgrind_handle = -1;
|
||||
@ -5029,14 +4995,9 @@ static void __cold mdbx_env_close0(MDBX_env *env) {
|
||||
}
|
||||
|
||||
if (env->me_lck) {
|
||||
mdbx_mmap_param_t mmap;
|
||||
mmap.address = env->me_lck;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
mmap.section = env->me_lck_section;
|
||||
#endif
|
||||
mmap.fd = env->me_lfd;
|
||||
mdbx_munmap(&mmap, (env->me_maxreaders - 1) * sizeof(MDBX_reader) +
|
||||
sizeof(MDBX_lockinfo));
|
||||
mdbx_munmap(&env->me_lck_mmap,
|
||||
(env->me_maxreaders - 1) * sizeof(MDBX_reader) +
|
||||
sizeof(MDBX_lockinfo));
|
||||
env->me_lck = nullptr;
|
||||
}
|
||||
env->me_pid = 0;
|
||||
@ -11027,9 +10988,9 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
|
||||
* Тем не менее, однозначно страница "не грязная" (не будет переписана
|
||||
* во время транзакции) если адрес находится внутри mmap-диапазона
|
||||
* и в заголовке страницы нет флажка P_DIRTY. */
|
||||
if (env->me_map < (char *)page) {
|
||||
if (env->me_map < (uint8_t *)page) {
|
||||
const size_t used_size = pgno2bytes(env, txn->mt_next_pgno);
|
||||
if ((char *)page < env->me_map + used_size) {
|
||||
if ((uint8_t *)page < env->me_map + used_size) {
|
||||
/* страница внутри диапазона, смотрим на флажки */
|
||||
return (page->mp_flags & (P_DIRTY | P_LOOSE | P_KEEP))
|
||||
? MDBX_RESULT_TRUE
|
||||
@ -11040,7 +11001,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) {
|
||||
* ошибка, к которой не возможно прийти без каких-то больших нарушений.
|
||||
* Поэтому не проверяем этот случай кроме как assert-ом, на то что
|
||||
* страница вне mmap-диаппазона. */
|
||||
mdbx_tassert(txn, (char *)page >= env->me_map + env->me_mapsize);
|
||||
mdbx_tassert(txn, (uint8_t *)page >= env->me_map + env->me_mapsize);
|
||||
}
|
||||
|
||||
/* Страница вне используемого mmap-диапазона, т.е. либо в функцию был
|
||||
|
20
src/osal.c
20
src/osal.c
@ -732,18 +732,19 @@ int mdbx_thread_join(mdbx_thread_t thread) {
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
|
||||
int mdbx_msync(void *addr, size_t length, int async) {
|
||||
int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async) {
|
||||
uint8_t *ptr = (uint8_t *)map->address + offset;
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (async)
|
||||
if (FlushViewOfFile(ptr, length) && (async || FlushFileBuffers(map->fd)))
|
||||
return MDBX_SUCCESS;
|
||||
return FlushViewOfFile(addr, length) ? MDBX_SUCCESS : GetLastError();
|
||||
return GetLastError();
|
||||
#else
|
||||
const int mode = async ? MS_ASYNC : MS_SYNC;
|
||||
return (msync(addr, length, mode) == 0) ? MDBX_SUCCESS : errno;
|
||||
return (msync(ptr, length, mode) == 0) ? MDBX_SUCCESS : errno;
|
||||
#endif
|
||||
}
|
||||
|
||||
int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) {
|
||||
int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
NTSTATUS rc = NtCreateSection(
|
||||
&map->section,
|
||||
@ -790,7 +791,7 @@ int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) {
|
||||
int mdbx_munmap(mdbx_mmap_t *map, size_t length) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
(void)length;
|
||||
if (map->section)
|
||||
@ -802,7 +803,7 @@ int mdbx_munmap(mdbx_mmap_param_t *map, size_t length) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) {
|
||||
int mdbx_mlock(mdbx_mmap_t *map, size_t length) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
return VirtualLock(map->address, length) ? MDBX_SUCCESS : GetLastError();
|
||||
#else
|
||||
@ -810,8 +811,7 @@ int mdbx_mlock(mdbx_mmap_param_t *map, size_t length) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current,
|
||||
size_t wanna) {
|
||||
int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
if (wanna > current) {
|
||||
/* growth */
|
||||
@ -831,7 +831,7 @@ int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current,
|
||||
#endif
|
||||
}
|
||||
|
||||
int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit,
|
||||
int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit,
|
||||
size_t new_limit) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
(void)flags;
|
||||
|
25
src/osal.h
25
src/osal.h
@ -420,8 +420,6 @@ int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count,
|
||||
uint64_t offset);
|
||||
int mdbx_write(mdbx_filehandle_t fd, const void *buf, size_t count);
|
||||
|
||||
int mdbx_msync(void *addr, size_t length, int async);
|
||||
|
||||
int mdbx_thread_create(mdbx_thread_t *thread,
|
||||
THREAD_RESULT(THREAD_CALL *start_routine)(void *),
|
||||
void *arg);
|
||||
@ -440,19 +438,24 @@ int mdbx_openfile(const char *pathname, int flags, mode_t mode,
|
||||
int mdbx_closefile(mdbx_filehandle_t fd);
|
||||
|
||||
typedef struct mdbx_mmap_param {
|
||||
void *address;
|
||||
union {
|
||||
void *address;
|
||||
uint8_t *dxb;
|
||||
struct MDBX_lockinfo *lck;
|
||||
};
|
||||
mdbx_filehandle_t fd;
|
||||
#ifdef MDBX_OSAL_SECTION
|
||||
MDBX_OSAL_SECTION section;
|
||||
#endif
|
||||
mdbx_filehandle_t fd;
|
||||
} mdbx_mmap_param_t;
|
||||
int mdbx_mmap(int flags, mdbx_mmap_param_t *map, size_t length, size_t limit);
|
||||
int mdbx_munmap(mdbx_mmap_param_t *map, size_t length);
|
||||
int mdbx_mlock(mdbx_mmap_param_t *map, size_t length);
|
||||
int mdbx_mresize(int flags, mdbx_mmap_param_t *map, size_t current,
|
||||
size_t wanna);
|
||||
int mdbx_mremap(int flags, mdbx_mmap_param_t *map, size_t old_limit,
|
||||
} mdbx_mmap_t;
|
||||
|
||||
int mdbx_mmap(int flags, mdbx_mmap_t *map, size_t length, size_t limit);
|
||||
int mdbx_munmap(mdbx_mmap_t *map, size_t length);
|
||||
int mdbx_mlock(mdbx_mmap_t *map, size_t length);
|
||||
int mdbx_mresize(int flags, mdbx_mmap_t *map, size_t current, size_t wanna);
|
||||
int mdbx_mremap(int flags, mdbx_mmap_t *map, size_t old_limit,
|
||||
size_t new_limit);
|
||||
int mdbx_msync(mdbx_mmap_t *map, size_t offset, size_t length, int async);
|
||||
|
||||
static __inline mdbx_pid_t mdbx_getpid(void) {
|
||||
#if defined(_WIN32) || defined(_WIN64)
|
||||
|
Loading…
x
Reference in New Issue
Block a user