mdbx: поддержка асинхронного ввода-вывода для Windows и подготовка к io_ring (объединённые коммиты и исправления).

This commit is contained in:
Леонид Юрьев (Leonid Yuriev)
2022-09-25 12:47:31 +03:00
parent 9f64e2a10c
commit 474391c83c
9 changed files with 1323 additions and 399 deletions

View File

@@ -112,32 +112,73 @@ static
#define LCK_WAITFOR 0
#define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY
static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset,
size_t bytes) {
static int flock_with_event(HANDLE fd, HANDLE event, DWORD flags,
uint64_t offset, size_t bytes) {
TRACE("lock>>: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64
" >>",
fd, event, flags, offset, bytes);
OVERLAPPED ov;
ov.hEvent = 0;
ov.Internal = 0;
ov.InternalHigh = 0;
ov.hEvent = event;
ov.Offset = (DWORD)offset;
ov.OffsetHigh = HIGH_DWORD(offset);
return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov);
if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) {
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64
", bytes %" PRId64 " << %s",
fd, event, flags, offset, bytes, "done");
return MDBX_SUCCESS;
}
DWORD rc = GetLastError();
if (rc == ERROR_IO_PENDING) {
if (event) {
if (GetOverlappedResult(fd, &ov, &rc, true)) {
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64
", bytes %" PRId64 " << %s",
fd, event, flags, offset, bytes, "overlapped-done");
return MDBX_SUCCESS;
}
rc = GetLastError();
} else
CancelIo(fd);
}
TRACE("lock<<: fd %p, event %p, flags 0x%x offset %" PRId64 ", bytes %" PRId64
" << err %d",
fd, event, flags, offset, bytes, rc);
return (int)rc;
}
static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset,
size_t bytes) {
static __inline int flock(HANDLE fd, DWORD flags, uint64_t offset,
size_t bytes) {
return flock_with_event(fd, 0, flags, offset, bytes);
}
static __inline int flock_data(const MDBX_env *env, DWORD flags,
uint64_t offset, size_t bytes) {
return flock_with_event(env->me_fd4data, env->me_data_lock_event, flags,
offset, bytes);
}
static int funlock(mdbx_filehandle_t fd, uint64_t offset, size_t bytes) {
TRACE("unlock: fd %p, offset %" PRId64 ", bytes %" PRId64, fd, offset, bytes);
return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes,
HIGH_DWORD(bytes));
HIGH_DWORD(bytes))
? MDBX_SUCCESS
: (int)GetLastError();
}
/*----------------------------------------------------------------------------*/
/* global `write` lock for write-txt processing,
* exclusive locking both meta-pages) */
#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1))
#define LCK_META_OFFSET 0
#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS)
#define LCK_BODY_OFFSET LCK_META_LEN
#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET)
#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN
#define LCK_WHOLE 0, LCK_MAXLEN
#ifdef _WIN64
#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000)
#else
#define DXB_MAXLEN UINT32_C(0x7ff00000)
#endif
#define DXB_BODY (env->me_psize * NUM_METAS), DXB_MAXLEN
#define DXB_WHOLE 0, DXB_MAXLEN
int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
if (dontwait) {
@@ -155,24 +196,27 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) {
}
}
if ((env->me_flags & MDBX_EXCLUSIVE) ||
flock(env->me_lazy_fd,
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
: (LCK_EXCLUSIVE | LCK_WAITFOR),
LCK_BODY))
if (env->me_flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS;
int rc = (int)GetLastError();
int rc = flock_with_event(env->me_fd4data, env->me_data_lock_event,
dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT)
: (LCK_EXCLUSIVE | LCK_WAITFOR),
DXB_BODY);
if (rc == MDBX_SUCCESS)
return rc;
LeaveCriticalSection(&env->me_windowsbug_lock);
return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY;
}
void mdbx_txn_unlock(MDBX_env *env) {
int rc = (env->me_flags & MDBX_EXCLUSIVE)
? TRUE
: funlock(env->me_lazy_fd, LCK_BODY);
if ((env->me_flags & MDBX_EXCLUSIVE) == 0) {
int err = funlock(env->me_fd4data, DXB_BODY);
if (err != MDBX_SUCCESS)
mdbx_panic("%s failed: err %u", __func__, err);
}
LeaveCriticalSection(&env->me_windowsbug_lock);
if (!rc)
mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
}
/*----------------------------------------------------------------------------*/
@@ -193,32 +237,32 @@ MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) {
/* transition from S-? (used) to S-E (locked),
* e.g. exclusive lock upper-part */
if ((env->me_flags & MDBX_EXCLUSIVE) ||
flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER))
if (env->me_flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS;
int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
if (rc == MDBX_SUCCESS)
return MDBX_SUCCESS;
int rc = (int)GetLastError();
osal_srwlock_ReleaseShared(&env->me_remap_guard);
return rc;
}
MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) {
if (env->me_lfd != INVALID_HANDLE_VALUE) {
if (env->me_lfd != INVALID_HANDLE_VALUE &&
(env->me_flags & MDBX_EXCLUSIVE) == 0) {
/* transition from S-E (locked) to S-? (used), e.g. unlock upper-part */
if ((env->me_flags & MDBX_EXCLUSIVE) == 0 &&
!funlock(env->me_lfd, LCK_UPPER))
mdbx_panic("%s failed: err %u", __func__, (int)GetLastError());
int err = funlock(env->me_lfd, LCK_UPPER);
if (err != MDBX_SUCCESS)
mdbx_panic("%s failed: err %u", __func__, err);
}
osal_srwlock_ReleaseShared(&env->me_remap_guard);
}
MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) {
return flock(fd,
wait ? LCK_EXCLUSIVE | LCK_WAITFOR
: LCK_EXCLUSIVE | LCK_DONTWAIT,
0, LCK_MAXLEN)
? MDBX_SUCCESS
: (int)GetLastError();
return flock(
fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0,
DXB_MAXLEN);
}
static int suspend_and_append(mdbx_handle_array_t **array,
@@ -386,40 +430,36 @@ static void lck_unlock(MDBX_env *env) {
if (env->me_lfd != INVALID_HANDLE_VALUE) {
/* double `unlock` for robustly remove overlapped shared/exclusive locks */
while (funlock(env->me_lfd, LCK_LOWER))
;
err = (int)GetLastError();
do
err = funlock(env->me_lfd, LCK_LOWER);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(void)err;
SetLastError(ERROR_SUCCESS);
while (funlock(env->me_lfd, LCK_UPPER))
;
err = (int)GetLastError();
do
err = funlock(env->me_lfd, LCK_UPPER);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(void)err;
SetLastError(ERROR_SUCCESS);
}
if (env->me_lazy_fd != INVALID_HANDLE_VALUE) {
if (env->me_fd4data != INVALID_HANDLE_VALUE) {
/* explicitly unlock to avoid latency for other processes (windows kernel
* releases such locks via deferred queues) */
while (funlock(env->me_lazy_fd, LCK_BODY))
;
err = (int)GetLastError();
do
err = funlock(env->me_fd4data, DXB_BODY);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(void)err;
SetLastError(ERROR_SUCCESS);
while (funlock(env->me_lazy_fd, LCK_WHOLE))
;
err = (int)GetLastError();
do
err = funlock(env->me_fd4data, DXB_WHOLE);
while (err == MDBX_SUCCESS);
assert(err == ERROR_NOT_LOCKED ||
(mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION));
(void)err;
SetLastError(ERROR_SUCCESS);
}
}
@@ -428,56 +468,55 @@ static void lck_unlock(MDBX_env *env) {
* or as 'used' (S-? and returns MDBX_RESULT_FALSE).
* Otherwise returns an error. */
static int internal_seize_lck(HANDLE lfd) {
int rc;
assert(lfd != INVALID_HANDLE_VALUE);
/* 1) now on ?-? (free), get ?-E (middle) */
jitter4testing(false);
if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) {
rc = (int)GetLastError() /* 2) something went wrong, give up */;
int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER);
if (rc != MDBX_SUCCESS) {
/* 2) something went wrong, give up */;
ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc);
return rc;
}
/* 3) now on ?-E (middle), try E-E (exclusive-write) */
jitter4testing(false);
if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER))
rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
if (rc == MDBX_SUCCESS)
return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */;
/* 5) still on ?-E (middle) */
rc = (int)GetLastError();
jitter4testing(false);
if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) {
/* 6) something went wrong, give up */
if (!funlock(lfd, LCK_UPPER))
rc = funlock(lfd, LCK_UPPER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)",
(int)GetLastError());
rc);
return rc;
}
/* 7) still on ?-E (middle), try S-E (locked) */
jitter4testing(false);
rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE
: (int)GetLastError();
rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
jitter4testing(false);
if (rc != MDBX_RESULT_FALSE)
if (rc != MDBX_SUCCESS)
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
/* 8) now on S-E (locked) or still on ?-E (middle),
* transition to S-? (used) or ?-? (free) */
if (!funlock(lfd, LCK_UPPER))
int err = funlock(lfd, LCK_UPPER);
if (err != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__,
"X-E(locked/middle) >> X-?(used/free)", (int)GetLastError());
"X-E(locked/middle) >> X-?(used/free)", err);
/* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */
return rc;
}
MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
int rc;
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
assert(env->me_fd4data != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
return MDBX_RESULT_TRUE /* nope since files were must be opened
non-shareable */
@@ -486,15 +525,13 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
if (env->me_lfd == INVALID_HANDLE_VALUE) {
/* LY: without-lck mode (e.g. on read-only filesystem) */
jitter4testing(false);
if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) {
rc = (int)GetLastError();
int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE);
if (rc != MDBX_SUCCESS)
ERROR("%s, err %u", "without-lck", rc);
return rc;
}
return MDBX_RESULT_FALSE;
return rc;
}
rc = internal_seize_lck(env->me_lfd);
int rc = internal_seize_lck(env->me_lfd);
jitter4testing(false);
if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) {
/* Check that another process don't operates in without-lck mode.
@@ -503,17 +540,18 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
* - we need an exclusive lock for do so;
* - we can't lock meta-pages, otherwise other process could get an error
* while opening db in valid (non-conflict) mode. */
if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) {
rc = (int)GetLastError();
ERROR("%s, err %u", "lock-against-without-lck", rc);
int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE);
if (err != MDBX_SUCCESS) {
ERROR("%s, err %u", "lock-against-without-lck", err);
jitter4testing(false);
lck_unlock(env);
} else {
jitter4testing(false);
if (!funlock(env->me_lazy_fd, LCK_BODY))
mdbx_panic("%s(%s) failed: err %u", __func__,
"unlock-against-without-lck", (int)GetLastError());
return err;
}
jitter4testing(false);
err = funlock(env->me_fd4data, DXB_WHOLE);
if (err != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__,
"unlock-against-without-lck", err);
}
return rc;
@@ -521,28 +559,31 @@ MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) {
MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) {
/* Transite from exclusive-write state (E-E) to used (S-?) */
assert(env->me_lazy_fd != INVALID_HANDLE_VALUE);
assert(env->me_fd4data != INVALID_HANDLE_VALUE);
assert(env->me_lfd != INVALID_HANDLE_VALUE);
if (env->me_flags & MDBX_EXCLUSIVE)
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
;
/* 1) now at E-E (exclusive-write), transition to ?_E (middle) */
if (!funlock(env->me_lfd, LCK_LOWER))
int rc = funlock(env->me_lfd, LCK_LOWER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__,
"E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError());
"E-E(exclusive-write) >> ?-E(middle)", rc);
/* 2) now at ?-E (middle), transition to S-E (locked) */
if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) {
int rc = (int)GetLastError() /* 3) something went wrong, give up */;
rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER);
if (rc != MDBX_SUCCESS) {
/* 3) something went wrong, give up */;
ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc);
return rc;
}
/* 4) got S-E (locked), continue transition to S-? (used) */
if (!funlock(env->me_lfd, LCK_UPPER))
rc = funlock(env->me_lfd, LCK_UPPER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)",
(int)GetLastError());
rc);
return MDBX_SUCCESS /* 5) now at S-? (used), done */;
}
@@ -555,24 +596,26 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) {
return MDBX_SUCCESS /* nope since files were must be opened non-shareable */
;
int rc;
/* 1) now on S-? (used), try S-E (locked) */
jitter4testing(false);
if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) {
rc = (int)GetLastError() /* 2) something went wrong, give up */;
int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER);
if (rc != MDBX_SUCCESS) {
/* 2) something went wrong, give up */;
VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc);
return rc;
}
/* 3) now on S-E (locked), transition to ?-E (middle) */
if (!funlock(env->me_lfd, LCK_LOWER))
rc = funlock(env->me_lfd, LCK_LOWER);
if (rc != MDBX_SUCCESS)
mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)",
(int)GetLastError());
rc);
/* 4) now on ?-E (middle), try E-E (exclusive-write) */
jitter4testing(false);
if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) {
rc = (int)GetLastError() /* 5) something went wrong, give up */;
rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER);
if (rc != MDBX_SUCCESS) {
/* 5) something went wrong, give up */;
VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc);
return rc;
}
@@ -586,6 +629,23 @@ MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env,
(void)env;
(void)inprocess_neighbor;
(void)global_uniqueness_flag;
if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) {
HANDLE token = INVALID_HANDLE_VALUE;
TOKEN_PRIVILEGES privileges;
privileges.PrivilegeCount = 1;
privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES,
&token) ||
!LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME,
&privileges.Privileges[0].Luid) ||
!AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges),
nullptr, nullptr) ||
GetLastError() != ERROR_SUCCESS)
mdbx_SetFileIoOverlappedRange = NULL;
if (token != INVALID_HANDLE_VALUE)
CloseHandle(token);
}
return MDBX_SUCCESS;
}
@@ -752,6 +812,7 @@ MDBX_NtFsControlFile mdbx_NtFsControlFile;
MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory;
MDBX_GetTickCount64 mdbx_GetTickCount64;
MDBX_RegGetValueA mdbx_RegGetValueA;
MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange;
#endif /* xMDBX_ALLOY */
#if __GNUC_PREREQ(8, 0)
@@ -783,6 +844,7 @@ static void mdbx_winnt_import(void) {
GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW);
GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW);
GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory);
GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange);
}
const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll");