libmdbx/src/osal.c

3681 lines
118 KiB
C
Raw Normal View History

/// \copyright SPDX-License-Identifier: Apache-2.0
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
///
/// https://en.wikipedia.org/wiki/Operating_system_abstraction_layer
#include "internals.h"
#if defined(_WIN32) || defined(_WIN64)
2017-09-05 20:58:03 +03:00
#include <psapi.h>
#include <winioctl.h>
#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG)
#include <crtdbg.h>
#endif
static int waitstatus2errcode(DWORD result) {
switch (result) {
case WAIT_OBJECT_0:
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
case WAIT_FAILED:
return (int)GetLastError();
case WAIT_ABANDONED:
return ERROR_ABANDONED_WAIT_0;
case WAIT_IO_COMPLETION:
return ERROR_USER_APC;
case WAIT_TIMEOUT:
return ERROR_TIMEOUT;
default:
return ERROR_UNHANDLED_ERROR;
}
}
2017-06-06 21:20:16 +03:00
/* Map a result from an NTAPI call to WIN32 error code. */
static int ntstatus2errcode(NTSTATUS status) {
DWORD dummy;
OVERLAPPED ov;
memset(&ov, 0, sizeof(ov));
ov.Internal = status;
/* Zap: '_Param_(1)' could be '0' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387);
return GetOverlappedResult(nullptr, &ov, &dummy, FALSE) ? MDBX_SUCCESS
: (int)GetLastError();
2017-06-06 21:20:16 +03:00
}
/* We use native NT APIs to setup the memory map, so that we can
* let the DB file grow incrementally instead of always preallocating
* the full size. These APIs are defined in <wdm.h> and <ntifs.h>
* but those headers are meant for driver-level development and
* conflict with the regular user-level headers, so we explicitly
* declare them here. Using these APIs also means we must link to
* ntdll.dll, which is not linked by default in user code. */
extern NTSTATUS NTAPI NtCreateSection(
OUT PHANDLE SectionHandle, IN ACCESS_MASK DesiredAccess,
IN OPTIONAL POBJECT_ATTRIBUTES ObjectAttributes,
IN OPTIONAL PLARGE_INTEGER MaximumSize, IN ULONG SectionPageProtection,
IN ULONG AllocationAttributes, IN OPTIONAL HANDLE FileHandle);
typedef struct _SECTION_BASIC_INFORMATION {
ULONG Unknown;
ULONG SectionAttributes;
LARGE_INTEGER SectionSize;
} SECTION_BASIC_INFORMATION, *PSECTION_BASIC_INFORMATION;
extern NTSTATUS NTAPI NtMapViewOfSection(
IN HANDLE SectionHandle, IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress,
IN ULONG_PTR ZeroBits, IN SIZE_T CommitSize,
IN OUT OPTIONAL PLARGE_INTEGER SectionOffset, IN OUT PSIZE_T ViewSize,
IN SECTION_INHERIT InheritDisposition, IN ULONG AllocationType,
IN ULONG Win32Protect);
extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle,
IN OPTIONAL PVOID BaseAddress);
/* Zap: Inconsistent annotation for 'NtClose'... */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251)
extern NTSTATUS NTAPI NtClose(HANDLE Handle);
2017-06-06 21:20:16 +03:00
extern NTSTATUS NTAPI NtAllocateVirtualMemory(
IN HANDLE ProcessHandle, IN OUT PVOID *BaseAddress, IN ULONG_PTR ZeroBits,
IN OUT PSIZE_T RegionSize, IN ULONG AllocationType, IN ULONG Protect);
extern NTSTATUS NTAPI NtFreeVirtualMemory(IN HANDLE ProcessHandle,
IN PVOID *BaseAddress,
IN OUT PSIZE_T RegionSize,
IN ULONG FreeType);
2017-06-06 21:20:16 +03:00
#ifndef WOF_CURRENT_VERSION
typedef struct _WOF_EXTERNAL_INFO {
DWORD Version;
DWORD Provider;
} WOF_EXTERNAL_INFO, *PWOF_EXTERNAL_INFO;
#endif /* WOF_CURRENT_VERSION */
#ifndef WIM_PROVIDER_CURRENT_VERSION
#define WIM_PROVIDER_HASH_SIZE 20
typedef struct _WIM_PROVIDER_EXTERNAL_INFO {
DWORD Version;
DWORD Flags;
LARGE_INTEGER DataSourceId;
BYTE ResourceHash[WIM_PROVIDER_HASH_SIZE];
} WIM_PROVIDER_EXTERNAL_INFO, *PWIM_PROVIDER_EXTERNAL_INFO;
#endif /* WIM_PROVIDER_CURRENT_VERSION */
#ifndef FILE_PROVIDER_CURRENT_VERSION
typedef struct _FILE_PROVIDER_EXTERNAL_INFO_V1 {
ULONG Version;
ULONG Algorithm;
ULONG Flags;
} FILE_PROVIDER_EXTERNAL_INFO_V1, *PFILE_PROVIDER_EXTERNAL_INFO_V1;
#endif /* FILE_PROVIDER_CURRENT_VERSION */
#ifndef STATUS_OBJECT_NOT_EXTERNALLY_BACKED
#define STATUS_OBJECT_NOT_EXTERNALLY_BACKED ((NTSTATUS)0xC000046DL)
#endif
#ifndef STATUS_INVALID_DEVICE_REQUEST
#define STATUS_INVALID_DEVICE_REQUEST ((NTSTATUS)0xC0000010L)
#endif
#ifndef STATUS_NOT_SUPPORTED
#define STATUS_NOT_SUPPORTED ((NTSTATUS)0xC00000BBL)
#endif
#ifndef FILE_DEVICE_FILE_SYSTEM
#define FILE_DEVICE_FILE_SYSTEM 0x00000009
#endif
#ifndef FSCTL_GET_EXTERNAL_BACKING
#define FSCTL_GET_EXTERNAL_BACKING \
CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 196, METHOD_BUFFERED, FILE_ANY_ACCESS)
#endif
#ifndef ERROR_NOT_CAPABLE
#define ERROR_NOT_CAPABLE 775L
#endif
#endif /* _WIN32 || _WIN64 */
/*----------------------------------------------------------------------------*/
#if defined(__ANDROID_API__)
__extern_C void __assert2(const char *file, int line, const char *function,
const char *msg) __noreturn;
#define __assert_fail(assertion, file, line, function) \
__assert2(file, line, function, assertion)
#elif defined(__UCLIBC__)
MDBX_NORETURN __extern_C void __assert(const char *, const char *, unsigned,
const char *)
#ifdef __THROW
__THROW
#else
__nothrow
#endif /* __THROW */
;
#define __assert_fail(assertion, file, line, function) \
__assert(assertion, file, line, function)
#elif _POSIX_C_SOURCE > 200212 && \
/* workaround for avoid musl libc wrong prototype */ ( \
defined(__GLIBC__) || defined(__GNU_LIBRARY__))
/* Prototype should match libc runtime. ISO POSIX (2003) & LSB 1.x-3.x */
MDBX_NORETURN __extern_C void __assert_fail(const char *assertion,
const char *file, unsigned line,
const char *function)
2019-08-25 03:05:58 +03:00
#ifdef __THROW
__THROW
#else
__nothrow
#endif /* __THROW */
;
2019-08-25 03:05:58 +03:00
2019-08-13 02:07:10 +03:00
#elif defined(__APPLE__) || defined(__MACH__)
2019-08-25 03:05:58 +03:00
__extern_C void __assert_rtn(const char *function, const char *file, int line,
const char *assertion) /* __nothrow */
#ifdef __dead2
__dead2
#else
MDBX_NORETURN
2019-08-25 03:05:58 +03:00
#endif /* __dead2 */
#ifdef __disable_tail_calls
__disable_tail_calls
#endif /* __disable_tail_calls */
;
2019-08-13 02:07:10 +03:00
#define __assert_fail(assertion, file, line, function) \
__assert_rtn(function, file, line, assertion)
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__)
MDBX_NORETURN __extern_C void __assert_c99(const char *assection,
const char *file, int line,
const char *function);
#define __assert_fail(assertion, file, line, function) \
__assert_c99(assertion, file, line, function)
#elif defined(__OpenBSD__)
__extern_C __dead void __assert2(const char *file, int line,
const char *function,
const char *assertion) /* __nothrow */;
#define __assert_fail(assertion, file, line, function) \
__assert2(file, line, function, assertion)
#elif defined(__NetBSD__)
__extern_C __dead void __assert13(const char *file, int line,
const char *function,
const char *assertion) /* __nothrow */;
#define __assert_fail(assertion, file, line, function) \
__assert13(file, line, function, assertion)
#elif defined(__FreeBSD__) || defined(__BSD__) || defined(__bsdi__) || \
2019-08-13 02:07:10 +03:00
defined(__DragonFly__)
2019-08-25 03:05:58 +03:00
__extern_C void __assert(const char *function, const char *file, int line,
const char *assertion) /* __nothrow */
#ifdef __dead2
__dead2
#else
MDBX_NORETURN
2019-08-25 03:05:58 +03:00
#endif /* __dead2 */
#ifdef __disable_tail_calls
__disable_tail_calls
#endif /* __disable_tail_calls */
;
#define __assert_fail(assertion, file, line, function) \
__assert(function, file, line, assertion)
#endif /* __assert_fail */
__cold void mdbx_assert_fail(const MDBX_env *env, const char *msg,
const char *func, unsigned line) {
2017-05-24 01:42:10 +03:00
#if MDBX_DEBUG
if (env && env->assert_func)
env->assert_func(env, msg, func, line);
#else
(void)env;
assert_fail(msg, func, line);
}
MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func,
unsigned line) {
2017-05-24 01:42:10 +03:00
#endif /* MDBX_DEBUG */
if (globals.logger.ptr)
debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg);
else {
#if defined(_WIN32) || defined(_WIN64)
char *message = nullptr;
const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u",
2018-10-21 15:42:26 +03:00
msg, func ? func : "unknown", line);
if (num < 1 || !message)
message = "<troubles with assertion-message preparation>";
OutputDebugStringA(message);
#else
__assert_fail(msg, "mdbx", line, func);
#endif
}
while (1) {
#if defined(_WIN32) || defined(_WIN64)
#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG)
_CrtDbgReport(_CRT_ASSERT, func ? func : "unknown", line, "libmdbx",
"assertion failed: %s", msg);
#else
if (IsDebuggerPresent())
DebugBreak();
#endif
FatalExit(STATUS_ASSERTION_FAILURE);
#else
abort();
#endif
}
}
__cold void mdbx_panic(const char *fmt, ...) {
va_list ap;
va_start(ap, fmt);
char *message = nullptr;
const int num = osal_vasprintf(&message, fmt, ap);
va_end(ap);
const char *const const_message =
unlikely(num < 1 || !message)
? "<troubles with panic-message preparation>"
: message;
if (globals.logger.ptr)
debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message);
while (1) {
#if defined(_WIN32) || defined(_WIN64)
#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG)
_CrtDbgReport(_CRT_ASSERT, "mdbx.c", 0, "libmdbx", "panic: %s",
const_message);
#else
OutputDebugStringA("\r\nMDBX-PANIC: ");
OutputDebugStringA(const_message);
if (IsDebuggerPresent())
DebugBreak();
#endif
FatalExit(ERROR_UNHANDLED_ERROR);
#else
__assert_fail(const_message, "mdbx", 0, "panic");
abort();
#endif
}
}
/*----------------------------------------------------------------------------*/
#ifndef osal_vasprintf
MDBX_INTERNAL int osal_vasprintf(char **strp, const char *fmt, va_list ap) {
2018-10-05 15:41:27 +03:00
va_list ones;
va_copy(ones, ap);
const int needed = vsnprintf(nullptr, 0, fmt, ones);
va_end(ones);
if (unlikely(needed < 0 || needed >= INT_MAX)) {
*strp = nullptr;
return needed;
}
*strp = osal_malloc(needed + (size_t)1);
if (unlikely(*strp == nullptr)) {
2018-10-05 15:41:27 +03:00
#if defined(_WIN32) || defined(_WIN64)
2017-04-24 19:37:01 +03:00
SetLastError(MDBX_ENOMEM);
2018-10-05 15:41:27 +03:00
#else
errno = MDBX_ENOMEM;
#endif
2017-04-24 19:37:01 +03:00
return -1;
}
const int actual = vsnprintf(*strp, needed + (size_t)1, fmt, ap);
assert(actual == needed);
if (unlikely(actual < 0)) {
osal_free(*strp);
*strp = nullptr;
}
return actual;
}
#endif /* osal_vasprintf */
2018-10-05 15:41:27 +03:00
#ifndef osal_asprintf
MDBX_INTERNAL int osal_asprintf(char **strp, const char *fmt, ...) {
2018-10-05 15:41:27 +03:00
va_list ap;
va_start(ap, fmt);
const int rc = osal_vasprintf(strp, fmt, ap);
2018-10-05 15:41:27 +03:00
va_end(ap);
return rc;
}
#endif /* osal_asprintf */
#ifndef osal_memalign_alloc
MDBX_INTERNAL int osal_memalign_alloc(size_t alignment, size_t bytes,
void **result) {
2020-02-02 20:49:51 +03:00
assert(is_powerof2(alignment) && alignment >= sizeof(void *));
#if defined(_WIN32) || defined(_WIN64)
(void)alignment;
*result =
VirtualAlloc(nullptr, bytes, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
2017-05-24 01:42:10 +03:00
return *result ? MDBX_SUCCESS : MDBX_ENOMEM /* ERROR_OUTOFMEMORY */;
#elif defined(_ISOC11_SOURCE)
*result = aligned_alloc(alignment, ceil_powerof2(bytes, alignment));
2017-05-24 01:42:10 +03:00
return *result ? MDBX_SUCCESS : errno;
#elif _POSIX_VERSION >= 200112L && \
(!defined(__ANDROID_API__) || __ANDROID_API__ >= 17)
*result = nullptr;
return posix_memalign(result, alignment, bytes);
#elif __GLIBC_PREREQ(2, 16) || __STDC_VERSION__ >= 201112L
*result = memalign(alignment, bytes);
return *result ? MDBX_SUCCESS : errno;
#else
#error FIXME
#endif
}
#endif /* osal_memalign_alloc */
#ifndef osal_memalign_free
MDBX_INTERNAL void osal_memalign_free(void *ptr) {
#if defined(_WIN32) || defined(_WIN64)
VirtualFree(ptr, 0, MEM_RELEASE);
#else
osal_free(ptr);
#endif
}
#endif /* osal_memalign_free */
#ifndef osal_strdup
char *osal_strdup(const char *str) {
if (!str)
return nullptr;
size_t bytes = strlen(str) + 1;
char *dup = osal_malloc(bytes);
if (dup)
memcpy(dup, str, bytes);
return dup;
}
#endif /* osal_strdup */
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int osal_condpair_init(osal_condpair_t *condpair) {
int rc;
memset(condpair, 0, sizeof(osal_condpair_t));
#if defined(_WIN32) || defined(_WIN64)
if (!(condpair->mutex = CreateMutexW(nullptr, FALSE, nullptr))) {
rc = (int)GetLastError();
goto bailout_mutex;
}
if (!(condpair->event[0] = CreateEventW(nullptr, FALSE, FALSE, nullptr))) {
rc = (int)GetLastError();
goto bailout_event;
}
if ((condpair->event[1] = CreateEventW(nullptr, FALSE, FALSE, nullptr)))
return MDBX_SUCCESS;
rc = (int)GetLastError();
(void)CloseHandle(condpair->event[0]);
bailout_event:
(void)CloseHandle(condpair->mutex);
#else
rc = pthread_mutex_init(&condpair->mutex, nullptr);
if (unlikely(rc != 0))
goto bailout_mutex;
rc = pthread_cond_init(&condpair->cond[0], nullptr);
if (unlikely(rc != 0))
goto bailout_cond;
rc = pthread_cond_init(&condpair->cond[1], nullptr);
if (likely(rc == 0))
return MDBX_SUCCESS;
(void)pthread_cond_destroy(&condpair->cond[0]);
bailout_cond:
(void)pthread_mutex_destroy(&condpair->mutex);
#endif
bailout_mutex:
memset(condpair, 0, sizeof(osal_condpair_t));
return rc;
}
MDBX_INTERNAL int osal_condpair_destroy(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError();
rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError();
rc = CloseHandle(condpair->event[1]) ? rc : (int)GetLastError();
#else
int err, rc = pthread_mutex_destroy(&condpair->mutex);
rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc;
rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc;
#endif
memset(condpair, 0, sizeof(osal_condpair_t));
return rc;
}
MDBX_INTERNAL int osal_condpair_lock(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
DWORD code = WaitForSingleObject(condpair->mutex, INFINITE);
return waitstatus2errcode(code);
#else
return osal_pthread_mutex_lock(&condpair->mutex);
#endif
}
MDBX_INTERNAL int osal_condpair_unlock(osal_condpair_t *condpair) {
#if defined(_WIN32) || defined(_WIN64)
return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError();
#else
return pthread_mutex_unlock(&condpair->mutex);
#endif
}
MDBX_INTERNAL int osal_condpair_signal(osal_condpair_t *condpair, bool part) {
#if defined(_WIN32) || defined(_WIN64)
return SetEvent(condpair->event[part]) ? MDBX_SUCCESS : (int)GetLastError();
#else
return pthread_cond_signal(&condpair->cond[part]);
#endif
}
MDBX_INTERNAL int osal_condpair_wait(osal_condpair_t *condpair, bool part) {
#if defined(_WIN32) || defined(_WIN64)
DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part],
INFINITE, FALSE);
if (code == WAIT_OBJECT_0) {
code = WaitForSingleObject(condpair->mutex, INFINITE);
if (code == WAIT_OBJECT_0)
return MDBX_SUCCESS;
}
return waitstatus2errcode(code);
#else
return pthread_cond_wait(&condpair->cond[part], &condpair->mutex);
#endif
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int osal_fastmutex_init(osal_fastmutex_t *fastmutex) {
2017-05-23 21:54:06 +03:00
#if defined(_WIN32) || defined(_WIN64)
InitializeCriticalSection(fastmutex);
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
#elif MDBX_DEBUG
pthread_mutexattr_t ma;
int rc = pthread_mutexattr_init(&ma);
if (likely(!rc)) {
rc = pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_ERRORCHECK);
if (likely(!rc) || rc == ENOTSUP)
rc = pthread_mutex_init(fastmutex, &ma);
pthread_mutexattr_destroy(&ma);
}
return rc;
2017-05-23 21:54:06 +03:00
#else
return pthread_mutex_init(fastmutex, nullptr);
2017-05-23 21:54:06 +03:00
#endif
}
MDBX_INTERNAL int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) {
2017-05-23 21:54:06 +03:00
#if defined(_WIN32) || defined(_WIN64)
DeleteCriticalSection(fastmutex);
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
2017-05-23 21:54:06 +03:00
#else
return pthread_mutex_destroy(fastmutex);
#endif
}
MDBX_INTERNAL int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) {
2017-05-23 21:54:06 +03:00
#if defined(_WIN32) || defined(_WIN64)
__try {
EnterCriticalSection(fastmutex);
} __except (
(GetExceptionCode() ==
0xC0000194 /* STATUS_POSSIBLE_DEADLOCK / EXCEPTION_POSSIBLE_DEADLOCK */)
? EXCEPTION_EXECUTE_HANDLER
: EXCEPTION_CONTINUE_SEARCH) {
return MDBX_EDEADLK;
}
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
2017-05-23 21:54:06 +03:00
#else
return osal_pthread_mutex_lock(fastmutex);
2017-05-23 21:54:06 +03:00
#endif
}
MDBX_INTERNAL int osal_fastmutex_release(osal_fastmutex_t *fastmutex) {
2017-05-23 21:54:06 +03:00
#if defined(_WIN32) || defined(_WIN64)
LeaveCriticalSection(fastmutex);
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
2017-05-23 21:54:06 +03:00
#else
return pthread_mutex_unlock(fastmutex);
#endif
}
/*----------------------------------------------------------------------------*/
#if defined(_WIN32) || defined(_WIN64)
MDBX_INTERNAL int osal_mb2w(const char *const src, wchar_t **const pdst) {
const size_t dst_wlen = MultiByteToWideChar(
CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, -1, nullptr, 0);
wchar_t *dst = *pdst;
int rc = ERROR_INVALID_NAME;
if (unlikely(dst_wlen < 2 || dst_wlen > /* MAX_PATH */ INT16_MAX))
goto bailout;
dst = osal_realloc(dst, dst_wlen * sizeof(wchar_t));
rc = MDBX_ENOMEM;
if (unlikely(!dst))
goto bailout;
*pdst = dst;
if (likely(dst_wlen == (size_t)MultiByteToWideChar(CP_THREAD_ACP,
MB_ERR_INVALID_CHARS, src,
-1, dst, (int)dst_wlen)))
return MDBX_SUCCESS;
rc = ERROR_INVALID_NAME;
bailout:
if (*pdst) {
osal_free(*pdst);
*pdst = nullptr;
}
return rc;
}
#endif /* Windows */
/*----------------------------------------------------------------------------*/
#if defined(_WIN32) || defined(_WIN64)
#define ior_alignment_mask (ior->pagesize - 1)
#define ior_WriteFile_flag 1
#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element))
static void ior_put_event(osal_ioring_t *ior, HANDLE event) {
assert(event && event != INVALID_HANDLE_VALUE && event != ior);
assert(ior->event_stack < ior->allocated);
ior->event_pool[ior->event_stack] = event;
ior->event_stack += 1;
}
static HANDLE ior_get_event(osal_ioring_t *ior) {
assert(ior->event_stack <= ior->allocated);
if (ior->event_stack > 0) {
ior->event_stack -= 1;
assert(ior->event_pool[ior->event_stack] != 0);
return ior->event_pool[ior->event_stack];
}
return CreateEventW(nullptr, true, false, nullptr);
}
static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) {
osal_ioring_t *ior = ov->hEvent;
ov->Internal = err;
ov->InternalHigh = bytes;
if (++ior->async_completed >= ior->async_waiting)
SetEvent(ior->async_done);
}
#elif MDBX_HAVE_PWRITEV
#if defined(_SC_IOV_MAX)
static size_t osal_iov_max;
#define OSAL_IOV_MAX osal_iov_max
#else
#define OSAL_IOV_MAX IOV_MAX
#endif
#else
#undef OSAL_IOV_MAX
#endif /* OSAL_IOV_MAX */
MDBX_INTERNAL int osal_ioring_create(osal_ioring_t *ior
#if defined(_WIN32) || defined(_WIN64)
,
bool enable_direct,
mdbx_filehandle_t overlapped_fd
#endif /* Windows */
) {
memset(ior, 0, sizeof(osal_ioring_t));
#if defined(_WIN32) || defined(_WIN64)
ior->overlapped_fd = overlapped_fd;
ior->direct = enable_direct && overlapped_fd;
ior->pagesize = globals.sys_pagesize;
ior->pagesize_ln2 = globals.sys_pagesize_ln2;
ior->async_done = ior_get_event(ior);
if (!ior->async_done)
return GetLastError();
#endif /* !Windows */
#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX)
assert(osal_iov_max > 0);
#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */
ior->boundary = ptr_disp(ior->pool, ior->allocated);
return MDBX_SUCCESS;
}
static inline size_t ior_offset(const ior_item_t *item) {
#if defined(_WIN32) || defined(_WIN64)
return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset))
? (uint64_t)item->ov.OffsetHigh << 32
: 0);
#else
return item->offset;
#endif /* !Windows */
}
static inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) {
#if defined(ior_sgv_element)
assert(sgvcnt > 0);
return (ior_item_t *)ptr_disp(item, sizeof(ior_item_t) -
sizeof(ior_sgv_element) +
sizeof(ior_sgv_element) * sgvcnt);
#else
assert(sgvcnt == 1);
(void)sgvcnt;
return item + 1;
#endif
}
MDBX_INTERNAL int osal_ioring_add(osal_ioring_t *ior, const size_t offset,
void *data, const size_t bytes) {
assert(bytes && data);
assert(bytes % MDBX_MIN_PAGESIZE == 0 && bytes <= MAX_WRITE);
assert(offset % MDBX_MIN_PAGESIZE == 0 &&
offset + (uint64_t)bytes <= MAX_MAPSIZE);
#if defined(_WIN32) || defined(_WIN64)
const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2);
const bool use_gather =
ior->direct && ior->overlapped_fd && ior->slots_left >= segments;
#endif /* Windows */
ior_item_t *item = ior->pool;
if (likely(ior->last)) {
item = ior->last;
if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) &&
likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) {
#if defined(_WIN32) || defined(_WIN64)
if (use_gather &&
((bytes | (uintptr_t)data | ior->last_bytes |
(uintptr_t)(uint64_t)item->sgv[0].Buffer) &
ior_alignment_mask) == 0 &&
ior->last_sgvcnt + (size_t)segments < OSAL_IOV_MAX) {
assert(ior->overlapped_fd);
assert((item->single.iov_len & ior_WriteFile_flag) == 0);
assert(item->sgv[ior->last_sgvcnt].Buffer == 0);
ior->last_bytes += bytes;
size_t i = 0;
do {
item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data);
data = ptr_disp(data, ior->pagesize);
} while (++i < segments);
ior->slots_left -= segments;
item->sgv[ior->last_sgvcnt += segments].Buffer = 0;
assert((item->single.iov_len & ior_WriteFile_flag) == 0);
return MDBX_SUCCESS;
}
const void *end = ptr_disp(item->single.iov_base,
item->single.iov_len - ior_WriteFile_flag);
if (unlikely(end == data)) {
assert((item->single.iov_len & ior_WriteFile_flag) != 0);
item->single.iov_len += bytes;
return MDBX_SUCCESS;
}
#elif MDBX_HAVE_PWRITEV
assert((int)item->sgvcnt > 0);
const void *end = ptr_disp(item->sgv[item->sgvcnt - 1].iov_base,
item->sgv[item->sgvcnt - 1].iov_len);
if (unlikely(end == data)) {
item->sgv[item->sgvcnt - 1].iov_len += bytes;
ior->last_bytes += bytes;
return MDBX_SUCCESS;
}
if (likely(item->sgvcnt < OSAL_IOV_MAX)) {
if (unlikely(ior->slots_left < 1))
return MDBX_RESULT_TRUE;
item->sgv[item->sgvcnt].iov_base = data;
item->sgv[item->sgvcnt].iov_len = bytes;
ior->last_bytes += bytes;
item->sgvcnt += 1;
ior->slots_left -= 1;
return MDBX_SUCCESS;
}
#else
const void *end = ptr_disp(item->single.iov_base, item->single.iov_len);
if (unlikely(end == data)) {
item->single.iov_len += bytes;
return MDBX_SUCCESS;
}
#endif
}
item = ior_next(item, ior_last_sgvcnt(ior, item));
}
if (unlikely(ior->slots_left < 1))
return MDBX_RESULT_TRUE;
unsigned slots_used = 1;
#if defined(_WIN32) || defined(_WIN64)
item->ov.Internal = item->ov.InternalHigh = 0;
item->ov.Offset = (DWORD)offset;
item->ov.OffsetHigh = HIGH_DWORD(offset);
item->ov.hEvent = 0;
if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 ||
segments > OSAL_IOV_MAX) {
/* WriteFile() */
item->single.iov_base = data;
item->single.iov_len = bytes + ior_WriteFile_flag;
assert((item->single.iov_len & ior_WriteFile_flag) != 0);
} else {
/* WriteFileGather() */
assert(ior->overlapped_fd);
item->sgv[0].Buffer = PtrToPtr64(data);
for (size_t i = 1; i < segments; ++i) {
data = ptr_disp(data, ior->pagesize);
item->sgv[i].Buffer = PtrToPtr64(data);
}
item->sgv[slots_used = segments].Buffer = 0;
assert((item->single.iov_len & ior_WriteFile_flag) == 0);
}
ior->last_bytes = bytes;
ior_last_sgvcnt(ior, item) = slots_used;
#elif MDBX_HAVE_PWRITEV
item->offset = offset;
item->sgv[0].iov_base = data;
item->sgv[0].iov_len = bytes;
ior->last_bytes = bytes;
ior_last_sgvcnt(ior, item) = slots_used;
#else
item->offset = offset;
item->single.iov_base = data;
item->single.iov_len = bytes;
#endif /* !Windows */
ior->slots_left -= slots_used;
ior->last = item;
return MDBX_SUCCESS;
}
MDBX_INTERNAL void osal_ioring_walk(osal_ioring_t *ior, iov_ctx_t *ctx,
void (*callback)(iov_ctx_t *ctx,
size_t offset, void *data,
size_t bytes)) {
for (ior_item_t *item = ior->pool; item <= ior->last;) {
#if defined(_WIN32) || defined(_WIN64)
size_t offset = ior_offset(item);
char *data = item->single.iov_base;
size_t bytes = item->single.iov_len - ior_WriteFile_flag;
size_t i = 1;
if (bytes & ior_WriteFile_flag) {
data = Ptr64ToPtr(item->sgv[0].Buffer);
bytes = ior->pagesize;
/* Zap: Reading invalid data from 'item->sgv' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385);
while (item->sgv[i].Buffer) {
if (data + ior->pagesize != item->sgv[i].Buffer) {
callback(ctx, offset, data, bytes);
offset += bytes;
data = Ptr64ToPtr(item->sgv[i].Buffer);
bytes = 0;
}
bytes += ior->pagesize;
++i;
}
}
assert(bytes < MAX_WRITE);
callback(ctx, offset, data, bytes);
#elif MDBX_HAVE_PWRITEV
assert(item->sgvcnt > 0);
size_t offset = item->offset;
size_t i = 0;
do {
callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len);
offset += item->sgv[i].iov_len;
} while (++i != item->sgvcnt);
#else
const size_t i = 1;
callback(ctx, item->offset, item->single.iov_base, item->single.iov_len);
#endif
item = ior_next(item, i);
}
}
MDBX_INTERNAL osal_ioring_write_result_t
osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) {
osal_ioring_write_result_t r = {MDBX_SUCCESS, 0};
#if defined(_WIN32) || defined(_WIN64)
HANDLE *const end_wait_for =
ior->event_pool + ior->allocated +
/* был выделен один дополнительный элемент для async_done */ 1;
HANDLE *wait_for = end_wait_for;
LONG async_started = 0;
for (ior_item_t *item = ior->pool; item <= ior->last;) {
item->ov.Internal = STATUS_PENDING;
size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag;
r.wops += 1;
if (bytes & ior_WriteFile_flag) {
assert(ior->overlapped_fd && fd == ior->overlapped_fd);
bytes = ior->pagesize;
/* Zap: Reading invalid data from 'item->sgv' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385);
while (item->sgv[i].Buffer) {
bytes += ior->pagesize;
++i;
}
assert(bytes < MAX_WRITE);
item->ov.hEvent = ior_get_event(ior);
if (unlikely(!item->ov.hEvent)) {
bailout_geterr:
r.err = GetLastError();
bailout_rc:
assert(r.err != MDBX_SUCCESS);
CancelIo(fd);
return r;
}
if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) {
assert(item->ov.Internal == 0 &&
WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0);
ior_put_event(ior, item->ov.hEvent);
item->ov.hEvent = 0;
} else {
r.err = (int)GetLastError();
if (unlikely(r.err != ERROR_IO_PENDING)) {
void *data = Ptr64ToPtr(item->sgv[0].Buffer);
ERROR("%s: fd %p, item %p (%zu), addr %p pgno %u, bytes %zu,"
" offset %" PRId64 ", err %d",
"WriteFileGather", fd, __Wpedantic_format_voidptr(item),
item - ior->pool, data, ((page_t *)data)->pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
goto bailout_rc;
}
assert(wait_for > ior->event_pool + ior->event_stack);
*--wait_for = item->ov.hEvent;
}
} else if (fd == ior->overlapped_fd) {
assert(bytes < MAX_WRITE);
retry:
item->ov.hEvent = ior;
if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov,
ior_wocr)) {
async_started += 1;
} else {
r.err = (int)GetLastError();
switch (r.err) {
default:
ERROR("%s: fd %p, item %p (%zu), addr %p pgno %u, bytes %zu,"
" offset %" PRId64 ", err %d",
"WriteFileEx", fd, __Wpedantic_format_voidptr(item),
item - ior->pool, item->single.iov_base,
((page_t *)item->single.iov_base)->pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
goto bailout_rc;
case ERROR_NOT_FOUND:
case ERROR_USER_MAPPED_FILE:
case ERROR_LOCK_VIOLATION:
WARNING("%s: fd %p, item %p (%zu), addr %p pgno %u, bytes %zu,"
" offset %" PRId64 ", err %d",
"WriteFileEx", fd, __Wpedantic_format_voidptr(item),
item - ior->pool, item->single.iov_base,
((page_t *)item->single.iov_base)->pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
r.err);
SleepEx(0, true);
goto retry;
case ERROR_INVALID_USER_BUFFER:
case ERROR_NOT_ENOUGH_MEMORY:
if (SleepEx(0, true) == WAIT_IO_COMPLETION)
goto retry;
goto bailout_rc;
case ERROR_IO_PENDING:
async_started += 1;
}
}
} else {
assert(bytes < MAX_WRITE);
DWORD written = 0;
if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written,
&item->ov)) {
r.err = (int)GetLastError();
ERROR("%s: fd %p, item %p (%zu), addr %p pgno %u, bytes %zu,"
" offset %" PRId64 ", err %d",
"WriteFile", fd, __Wpedantic_format_voidptr(item),
item - ior->pool, item->single.iov_base,
((page_t *)item->single.iov_base)->pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), r.err);
goto bailout_rc;
} else if (unlikely(written != bytes)) {
r.err = ERROR_WRITE_FAULT;
goto bailout_rc;
}
}
item = ior_next(item, i);
}
assert(ior->async_waiting > ior->async_completed &&
ior->async_waiting == INT_MAX);
ior->async_waiting = async_started;
if (async_started > ior->async_completed && end_wait_for == wait_for) {
assert(wait_for > ior->event_pool + ior->event_stack);
*--wait_for = ior->async_done;
}
const size_t pending_count = end_wait_for - wait_for;
if (pending_count) {
/* Ждем до MAXIMUM_WAIT_OBJECTS (64) последних хендлов, а после избирательно
* ждем посредством GetOverlappedResult(), если какие-то более ранние
* элементы еще не завершены. В целом, так получается меньше системных
* вызовов, т.е. меньше накладных расходов. Однако, не факт что эта экономия
* не будет перекрыта неэффективностью реализации
* WaitForMultipleObjectsEx(), но тогда это проблемы на стороне M$. */
DWORD madness;
do
madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS)
? (DWORD)pending_count
: MAXIMUM_WAIT_OBJECTS,
wait_for, true,
/* сутки */ 86400000ul, true);
while (madness == WAIT_IO_COMPLETION);
STATIC_ASSERT(WAIT_OBJECT_0 == 0);
if (/* madness >= WAIT_OBJECT_0 && */
madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS)
r.err = MDBX_SUCCESS;
else if (madness >= WAIT_ABANDONED_0 &&
madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) {
r.err = ERROR_ABANDONED_WAIT_0;
goto bailout_rc;
} else if (madness == WAIT_TIMEOUT) {
r.err = WAIT_TIMEOUT;
goto bailout_rc;
} else {
r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM;
goto bailout_rc;
}
assert(ior->async_waiting == ior->async_completed);
for (ior_item_t *item = ior->pool; item <= ior->last;) {
size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag;
void *data = item->single.iov_base;
if (bytes & ior_WriteFile_flag) {
data = Ptr64ToPtr(item->sgv[0].Buffer);
bytes = ior->pagesize;
/* Zap: Reading invalid data from 'item->sgv' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385);
while (item->sgv[i].Buffer) {
bytes += ior->pagesize;
++i;
}
if (!HasOverlappedIoCompleted(&item->ov)) {
DWORD written = 0;
if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) {
ERROR("%s: item %p (%zu), addr %p pgno %u, bytes %zu,"
" offset %" PRId64 ", err %d",
"GetOverlappedResult", __Wpedantic_format_voidptr(item),
item - ior->pool, data, ((page_t *)data)->pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
(int)GetLastError());
goto bailout_geterr;
}
assert(MDBX_SUCCESS == item->ov.Internal);
assert(written == item->ov.InternalHigh);
}
} else {
assert(HasOverlappedIoCompleted(&item->ov));
}
assert(item->ov.Internal != ERROR_IO_PENDING);
if (unlikely(item->ov.Internal != MDBX_SUCCESS)) {
DWORD written = 0;
r.err = (int)item->ov.Internal;
if ((r.err & 0x80000000) &&
GetOverlappedResult(nullptr, &item->ov, &written, true))
r.err = (int)GetLastError();
ERROR("%s: item %p (%zu), addr %p pgno %u, bytes %zu,"
" offset %" PRId64 ", err %d",
"Result", __Wpedantic_format_voidptr(item), item - ior->pool,
data, ((page_t *)data)->pgno, bytes,
item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32),
(int)GetLastError());
goto bailout_rc;
}
if (unlikely(item->ov.InternalHigh != bytes)) {
r.err = ERROR_WRITE_FAULT;
goto bailout_rc;
}
item = ior_next(item, i);
}
assert(ior->async_waiting == ior->async_completed);
} else {
assert(r.err == MDBX_SUCCESS);
}
assert(ior->async_waiting == ior->async_completed);
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
for (ior_item_t *item = ior->pool; item <= ior->last;) {
#if MDBX_HAVE_PWRITEV
assert(item->sgvcnt > 0);
if (item->sgvcnt == 1)
r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len,
item->offset);
else
r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset);
// TODO: io_uring_prep_write(sqe, fd, ...);
item = ior_next(item, item->sgvcnt);
#else
r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len,
item->offset);
item = ior_next(item, 1);
#endif
r.wops += 1;
if (unlikely(r.err != MDBX_SUCCESS))
break;
}
// TODO: io_uring_submit(&ring)
// TODO: err = io_uring_wait_cqe(&ring, &cqe);
// TODO: io_uring_cqe_seen(&ring, cqe);
#endif /* !Windows */
return r;
}
MDBX_INTERNAL void osal_ioring_reset(osal_ioring_t *ior) {
#if defined(_WIN32) || defined(_WIN64)
if (ior->last) {
for (ior_item_t *item = ior->pool; item <= ior->last;) {
if (!HasOverlappedIoCompleted(&item->ov)) {
assert(ior->overlapped_fd);
CancelIoEx(ior->overlapped_fd, &item->ov);
}
if (item->ov.hEvent && item->ov.hEvent != ior)
ior_put_event(ior, item->ov.hEvent);
size_t i = 1;
if ((item->single.iov_len & ior_WriteFile_flag) == 0) {
/* Zap: Reading invalid data from 'item->sgv' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385);
while (item->sgv[i].Buffer)
++i;
}
item = ior_next(item, i);
}
}
ior->async_waiting = INT_MAX;
ior->async_completed = 0;
ResetEvent(ior->async_done);
#endif /* !Windows */
ior->slots_left = ior->allocated;
ior->last = nullptr;
}
static void ior_cleanup(osal_ioring_t *ior, const size_t since) {
osal_ioring_reset(ior);
#if defined(_WIN32) || defined(_WIN64)
for (size_t i = since; i < ior->event_stack; ++i) {
/* Zap: Using uninitialized memory '**ior.event_pool' */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001);
CloseHandle(ior->event_pool[i]);
}
ior->event_stack = 0;
#else
(void)since;
#endif /* Windows */
}
MDBX_INTERNAL int osal_ioring_resize(osal_ioring_t *ior, size_t items) {
assert(items > 0 && items < INT_MAX / sizeof(ior_item_t));
#if defined(_WIN32) || defined(_WIN64)
if (ior->state & IOR_STATE_LOCKED)
return MDBX_SUCCESS;
const bool useSetFileIoOverlappedRange =
ior->overlapped_fd && imports.SetFileIoOverlappedRange && items > 42;
const size_t ceiling =
useSetFileIoOverlappedRange
? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 65536 : 65536 * 4)
: 1024;
const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling);
items = bytes / sizeof(ior_item_t);
#endif /* Windows */
if (items != ior->allocated) {
assert(items >= osal_ioring_used(ior));
if (items < ior->allocated)
ior_cleanup(ior, items);
#if defined(_WIN32) || defined(_WIN64)
void *ptr = osal_realloc(
ior->event_pool,
(items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE));
if (unlikely(!ptr))
return MDBX_ENOMEM;
ior->event_pool = ptr;
int err = osal_memalign_alloc(ceiling, bytes, &ptr);
if (unlikely(err != MDBX_SUCCESS))
return err;
if (ior->pool) {
memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t));
osal_memalign_free(ior->pool);
}
#else
void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items);
if (unlikely(!ptr))
return MDBX_ENOMEM;
#endif
ior->pool = ptr;
if (items > ior->allocated)
memset(ior->pool + ior->allocated, 0,
sizeof(ior_item_t) * (items - ior->allocated));
ior->allocated = (unsigned)items;
ior->boundary = ptr_disp(ior->pool, ior->allocated);
#if defined(_WIN32) || defined(_WIN64)
if (useSetFileIoOverlappedRange) {
if (imports.SetFileIoOverlappedRange(ior->overlapped_fd, ptr,
(ULONG)bytes))
ior->state += IOR_STATE_LOCKED;
else
return GetLastError();
}
#endif /* Windows */
}
return MDBX_SUCCESS;
}
MDBX_INTERNAL void osal_ioring_destroy(osal_ioring_t *ior) {
if (ior->allocated)
ior_cleanup(ior, 0);
#if defined(_WIN32) || defined(_WIN64)
osal_memalign_free(ior->pool);
osal_free(ior->event_pool);
CloseHandle(ior->async_done);
if (ior->overlapped_fd)
CloseHandle(ior->overlapped_fd);
#else
osal_free(ior->pool);
#endif
memset(ior, 0, sizeof(osal_ioring_t));
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int osal_removefile(const pathchar_t *pathname) {
#if defined(_WIN32) || defined(_WIN64)
return DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
#else
return unlink(pathname) ? errno : MDBX_SUCCESS;
#endif
}
#if !(defined(_WIN32) || defined(_WIN64))
static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); }
#endif /*! Windows */
MDBX_INTERNAL int osal_removedirectory(const pathchar_t *pathname) {
#if defined(_WIN32) || defined(_WIN64)
return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError();
#else
return rmdir(pathname) ? errno : MDBX_SUCCESS;
#endif
}
MDBX_INTERNAL int osal_fileexists(const pathchar_t *pathname) {
#if defined(_WIN32) || defined(_WIN64)
if (GetFileAttributesW(pathname) != INVALID_FILE_ATTRIBUTES)
return MDBX_RESULT_TRUE;
int err = GetLastError();
return (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND)
? MDBX_RESULT_FALSE
: err;
#else
if (access(pathname, F_OK) == 0)
return MDBX_RESULT_TRUE;
int err = errno;
return (err == ENOENT || err == ENOTDIR) ? MDBX_RESULT_FALSE : err;
#endif
}
MDBX_INTERNAL pathchar_t *osal_fileext(const pathchar_t *pathname, size_t len) {
const pathchar_t *ext = nullptr;
for (size_t i = 0; i < len && pathname[i]; i++)
if (pathname[i] == '.')
ext = pathname + i;
else if (osal_isdirsep(pathname[i]))
ext = nullptr;
return (pathchar_t *)ext;
}
MDBX_INTERNAL bool osal_pathequal(const pathchar_t *l, const pathchar_t *r,
size_t len) {
#if defined(_WIN32) || defined(_WIN64)
for (size_t i = 0; i < len; ++i) {
pathchar_t a = l[i];
pathchar_t b = r[i];
a = (a == '\\') ? '/' : a;
b = (b == '\\') ? '/' : b;
if (a != b)
return false;
}
return true;
#else
return memcmp(l, r, len * sizeof(pathchar_t)) == 0;
#endif
}
MDBX_INTERNAL int osal_openfile(const enum osal_openfile_purpose purpose,
const MDBX_env *env, const pathchar_t *pathname,
mdbx_filehandle_t *fd,
mdbx_mode_t unix_mode_bits) {
*fd = INVALID_HANDLE_VALUE;
#if defined(_WIN32) || defined(_WIN64)
DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING;
DWORD FlagsAndAttributes =
FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED;
DWORD DesiredAccess = FILE_READ_ATTRIBUTES;
DWORD ShareMode =
(env->flags & MDBX_EXCLUSIVE) ? 0 : (FILE_SHARE_READ | FILE_SHARE_WRITE);
switch (purpose) {
default:
return ERROR_INVALID_PARAMETER;
case MDBX_OPEN_LCK:
CreationDisposition = OPEN_ALWAYS;
DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
FlagsAndAttributes |= FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_TEMPORARY;
break;
case MDBX_OPEN_DXB_READ:
CreationDisposition = OPEN_EXISTING;
DesiredAccess |= GENERIC_READ;
ShareMode |= FILE_SHARE_READ;
break;
case MDBX_OPEN_DXB_LAZY:
DesiredAccess |= GENERIC_READ | GENERIC_WRITE;
break;
case MDBX_OPEN_DXB_OVERLAPPED_DIRECT:
FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING;
/* fall through */
__fallthrough;
case MDBX_OPEN_DXB_OVERLAPPED:
FlagsAndAttributes |= FILE_FLAG_OVERLAPPED;
/* fall through */
__fallthrough;
case MDBX_OPEN_DXB_DSYNC:
CreationDisposition = OPEN_EXISTING;
DesiredAccess |= GENERIC_WRITE | GENERIC_READ;
FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH;
break;
case MDBX_OPEN_COPY:
CreationDisposition = CREATE_NEW;
ShareMode = 0;
DesiredAccess |= GENERIC_WRITE;
if (env->ps >= globals.sys_pagesize)
FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING;
break;
case MDBX_OPEN_DELETE:
CreationDisposition = OPEN_EXISTING;
ShareMode |= FILE_SHARE_DELETE;
DesiredAccess =
FILE_READ_ATTRIBUTES | FILE_WRITE_ATTRIBUTES | DELETE | SYNCHRONIZE;
break;
}
*fd = CreateFileW(pathname, DesiredAccess, ShareMode, nullptr,
CreationDisposition, FlagsAndAttributes, nullptr);
if (*fd == INVALID_HANDLE_VALUE) {
int err = (int)GetLastError();
if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) {
if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES &&
GetLastError() == ERROR_FILE_NOT_FOUND)
err = ERROR_FILE_NOT_FOUND;
}
return err;
}
BY_HANDLE_FILE_INFORMATION info;
if (!GetFileInformationByHandle(*fd, &info)) {
int err = (int)GetLastError();
CloseHandle(*fd);
*fd = INVALID_HANDLE_VALUE;
return err;
}
const DWORD AttributesDiff =
(info.dwFileAttributes ^ FlagsAndAttributes) &
(FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED |
FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED);
if (AttributesDiff)
(void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff);
#else
int flags = unix_mode_bits ? O_CREAT : 0;
switch (purpose) {
default:
return EINVAL;
case MDBX_OPEN_LCK:
flags |= O_RDWR;
break;
case MDBX_OPEN_DXB_READ:
flags = O_RDONLY;
break;
case MDBX_OPEN_DXB_LAZY:
flags |= O_RDWR;
break;
case MDBX_OPEN_COPY:
flags = O_CREAT | O_WRONLY | O_EXCL;
break;
case MDBX_OPEN_DXB_DSYNC:
flags |= O_WRONLY;
#if defined(O_DSYNC)
flags |= O_DSYNC;
#elif defined(O_SYNC)
flags |= O_SYNC;
#elif defined(O_FSYNC)
flags |= O_FSYNC;
#endif
break;
case MDBX_OPEN_DELETE:
flags = O_RDWR;
break;
}
const bool direct_nocache_for_copy =
env->ps >= globals.sys_pagesize && purpose == MDBX_OPEN_COPY;
if (direct_nocache_for_copy) {
#if defined(O_DIRECT)
flags |= O_DIRECT;
#endif /* O_DIRECT */
#if defined(O_NOCACHE)
flags |= O_NOCACHE;
#endif /* O_NOCACHE */
}
#ifdef O_CLOEXEC
flags |= O_CLOEXEC;
#endif /* O_CLOEXEC */
/* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */
#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1;
static const char dev_null[] = "/dev/null";
if (!is_valid_fd(STDIN_FILENO)) {
WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN",
STDIN_FILENO, dev_null);
stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY);
}
if (!is_valid_fd(STDOUT_FILENO)) {
WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT",
STDOUT_FILENO, dev_null);
stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY);
}
if (!is_valid_fd(STDERR_FILENO)) {
WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR",
STDERR_FILENO, dev_null);
stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY);
}
#else
#error "Unexpected or unsupported UNIX or POSIX system"
#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
*fd = open(pathname, flags, unix_mode_bits);
#if defined(O_DIRECT)
if (*fd < 0 && (flags & O_DIRECT) &&
(errno == EINVAL || errno == EAFNOSUPPORT)) {
flags &= ~(O_DIRECT | O_EXCL);
*fd = open(pathname, flags, unix_mode_bits);
}
#endif /* O_DIRECT */
if (*fd < 0 && errno == EACCES && purpose == MDBX_OPEN_LCK) {
struct stat unused;
if (stat(pathname, &unused) == 0 || errno != ENOENT)
errno = EACCES /* restore errno if file exists */;
}
/* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */
#if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2
if (*fd == STDIN_FILENO) {
WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN",
STDIN_FILENO);
assert(stub_fd0 == -1);
*fd = dup(stub_fd0 = *fd);
}
if (*fd == STDOUT_FILENO) {
WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT",
STDOUT_FILENO);
assert(stub_fd1 == -1);
*fd = dup(stub_fd1 = *fd);
}
if (*fd == STDERR_FILENO) {
WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR",
STDERR_FILENO);
assert(stub_fd2 == -1);
*fd = dup(stub_fd2 = *fd);
}
if (stub_fd0 != -1)
close(stub_fd0);
if (stub_fd1 != -1)
close(stub_fd1);
if (stub_fd2 != -1)
close(stub_fd2);
if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) {
ERROR("Rejecting the use of a FD in the range "
"STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption",
STDIN_FILENO, STDERR_FILENO);
close(*fd);
return EBADF;
}
#else
#error "Unexpected or unsupported UNIX or POSIX system"
#endif /* STDIN_FILENO == 0 && STDERR_FILENO == 2 */
if (*fd < 0)
return errno;
#if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
const int fd_flags = fcntl(*fd, F_GETFD);
if (fd_flags != -1)
(void)fcntl(*fd, F_SETFD, fd_flags | FD_CLOEXEC);
#endif /* FD_CLOEXEC && !O_CLOEXEC */
if (direct_nocache_for_copy) {
#if defined(F_NOCACHE) && !defined(O_NOCACHE)
2019-08-13 02:07:10 +03:00
(void)fcntl(*fd, F_NOCACHE, 1);
#endif /* F_NOCACHE */
}
#endif
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_closefile(mdbx_filehandle_t fd) {
#if defined(_WIN32) || defined(_WIN64)
return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError();
#else
assert(fd > STDERR_FILENO);
2017-05-24 01:42:10 +03:00
return (close(fd) == 0) ? MDBX_SUCCESS : errno;
#endif
}
MDBX_INTERNAL int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes,
uint64_t offset) {
if (bytes > MAX_WRITE)
return MDBX_EINVAL;
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
ov.hEvent = 0;
ov.Offset = (DWORD)offset;
ov.OffsetHigh = HIGH_DWORD(offset);
2017-04-24 15:51:21 +03:00
DWORD read = 0;
if (unlikely(!ReadFile(fd, buf, (DWORD)bytes, &read, &ov))) {
int rc = (int)GetLastError();
2017-05-24 01:42:10 +03:00
return (rc == MDBX_SUCCESS) ? /* paranoia */ ERROR_READ_FAULT : rc;
}
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
intptr_t read = pread(fd, buf, bytes, offset);
2017-04-25 19:58:00 +03:00
if (read < 0) {
int rc = errno;
2017-05-24 01:42:10 +03:00
return (rc == MDBX_SUCCESS) ? /* paranoia */ MDBX_EIO : rc;
2017-04-25 19:58:00 +03:00
}
#endif
2017-05-24 01:42:10 +03:00
return (bytes == (size_t)read) ? MDBX_SUCCESS : MDBX_ENODATA;
}
MDBX_INTERNAL int osal_pwrite(mdbx_filehandle_t fd, const void *buf,
size_t bytes, uint64_t offset) {
while (true) {
#if defined(_WIN32) || defined(_WIN64)
OVERLAPPED ov;
ov.hEvent = 0;
ov.Offset = (DWORD)offset;
ov.OffsetHigh = HIGH_DWORD(offset);
DWORD written;
if (unlikely(!WriteFile(
fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
&written, &ov)))
return (int)GetLastError();
if (likely(bytes == written))
return MDBX_SUCCESS;
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
const intptr_t written =
pwrite(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE, offset);
if (likely(bytes == (size_t)written))
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
if (written < 0) {
const int rc = errno;
if (rc != EINTR)
return rc;
continue;
}
#endif
bytes -= written;
offset += written;
buf = ptr_disp(buf, written);
}
}
MDBX_INTERNAL int osal_write(mdbx_filehandle_t fd, const void *buf,
size_t bytes) {
while (true) {
#if defined(_WIN32) || defined(_WIN64)
DWORD written;
if (unlikely(!WriteFile(
fd, buf, likely(bytes <= MAX_WRITE) ? (DWORD)bytes : MAX_WRITE,
&written, nullptr)))
return (int)GetLastError();
if (likely(bytes == written))
return MDBX_SUCCESS;
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
const intptr_t written =
write(fd, buf, likely(bytes <= MAX_WRITE) ? bytes : MAX_WRITE);
if (likely(bytes == (size_t)written))
return MDBX_SUCCESS;
if (written < 0) {
const int rc = errno;
if (rc != EINTR)
return rc;
continue;
}
#endif
bytes -= written;
buf = ptr_disp(buf, written);
}
}
int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, size_t sgvcnt,
uint64_t offset) {
size_t expected = 0;
for (size_t i = 0; i < sgvcnt; ++i)
expected += iov[i].iov_len;
#if !MDBX_HAVE_PWRITEV
size_t written = 0;
for (size_t i = 0; i < sgvcnt; ++i) {
int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset);
2017-05-24 01:42:10 +03:00
if (unlikely(rc != MDBX_SUCCESS))
return rc;
written += iov[i].iov_len;
offset += iov[i].iov_len;
}
return (expected == written) ? MDBX_SUCCESS
: MDBX_EIO /* ERROR_WRITE_FAULT */;
#else
int rc;
intptr_t written;
do {
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
written = pwritev(fd, iov, sgvcnt, offset);
if (likely(expected == (size_t)written))
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
rc = errno;
} while (rc == EINTR);
2017-05-22 19:59:16 +03:00
return (written < 0) ? rc : MDBX_EIO /* Use which error code? */;
#endif
}
MDBX_INTERNAL int osal_fsync(mdbx_filehandle_t fd,
enum osal_syncmode_bits mode_bits) {
#if defined(_WIN32) || defined(_WIN64)
if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd))
return (int)GetLastError();
return MDBX_SUCCESS;
#else
2019-08-20 00:17:28 +03:00
#if defined(__APPLE__) && \
MDBX_OSX_SPEED_INSTEADOF_DURABILITY == MDBX_OSX_WANNA_DURABILITY
2019-08-20 00:17:28 +03:00
if (mode_bits & MDBX_SYNC_IODQ)
return likely(fcntl(fd, F_FULLFSYNC) != -1) ? MDBX_SUCCESS : errno;
#endif /* MacOS */
/* LY: This approach is always safe and without appreciable performance
* degradation, even on a kernel with fdatasync's bug.
*
* For more info about of a corresponding fdatasync() bug
* see http://www.spinics.net/lists/linux-ext4/msg33714.html */
while (1) {
switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) {
case MDBX_SYNC_NONE:
case MDBX_SYNC_KICK:
return MDBX_SUCCESS /* nothing to do */;
#if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0
case MDBX_SYNC_DATA:
if (likely(fdatasync(fd) == 0))
return MDBX_SUCCESS;
break /* error */;
#if defined(__linux__) || defined(__gnu_linux__)
case MDBX_SYNC_SIZE:
assert(globals.linux_kernel_version >= 0x03060000);
2022-09-18 13:21:38 +03:00
return MDBX_SUCCESS;
#endif /* Linux */
#endif /* _POSIX_SYNCHRONIZED_IO > 0 */
default:
if (likely(fsync(fd) == 0))
return MDBX_SUCCESS;
}
int rc = errno;
if (rc != EINTR)
return rc;
}
#endif
}
int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) {
#if defined(_WIN32) || defined(_WIN64)
BY_HANDLE_FILE_INFORMATION info;
if (!GetFileInformationByHandle(fd, &info))
return (int)GetLastError();
2017-05-25 09:26:03 +03:00
*length = info.nFileSizeLow | (uint64_t)info.nFileSizeHigh << 32;
#else
struct stat st;
STATIC_ASSERT_MSG(sizeof(off_t) <= sizeof(uint64_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
if (fstat(fd, &st))
return errno;
*length = st.st_size;
#endif
2017-05-24 01:42:10 +03:00
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_is_pipe(mdbx_filehandle_t fd) {
#if defined(_WIN32) || defined(_WIN64)
switch (GetFileType(fd)) {
case FILE_TYPE_DISK:
return MDBX_RESULT_FALSE;
case FILE_TYPE_CHAR:
case FILE_TYPE_PIPE:
return MDBX_RESULT_TRUE;
default:
return (int)GetLastError();
}
#else
struct stat info;
if (fstat(fd, &info))
return errno;
switch (info.st_mode & S_IFMT) {
case S_IFBLK:
case S_IFREG:
return MDBX_RESULT_FALSE;
case S_IFCHR:
case S_IFIFO:
case S_IFSOCK:
return MDBX_RESULT_TRUE;
case S_IFDIR:
case S_IFLNK:
default:
return MDBX_INCOMPATIBLE;
}
#endif
}
MDBX_INTERNAL int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) {
#if defined(_WIN32) || defined(_WIN64)
if (imports.SetFileInformationByHandle) {
FILE_END_OF_FILE_INFO EndOfFileInfo;
EndOfFileInfo.EndOfFile.QuadPart = length;
return imports.SetFileInformationByHandle(fd, FileEndOfFileInfo,
&EndOfFileInfo,
sizeof(FILE_END_OF_FILE_INFO))
? MDBX_SUCCESS
: (int)GetLastError();
} else {
LARGE_INTEGER li;
li.QuadPart = length;
return (SetFilePointerEx(fd, li, nullptr, FILE_BEGIN) && SetEndOfFile(fd))
? MDBX_SUCCESS
: (int)GetLastError();
}
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
2017-05-24 01:42:10 +03:00
return ftruncate(fd, length) == 0 ? MDBX_SUCCESS : errno;
#endif
}
MDBX_INTERNAL int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) {
#if defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER li;
li.QuadPart = pos;
return SetFilePointerEx(fd, li, nullptr, FILE_BEGIN) ? MDBX_SUCCESS
: (int)GetLastError();
#else
STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t),
"libmdbx requires 64-bit file I/O on 64-bit systems");
return (lseek(fd, pos, SEEK_SET) < 0) ? errno : MDBX_SUCCESS;
#endif
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int
osal_thread_create(osal_thread_t *thread,
THREAD_RESULT(THREAD_CALL *start_routine)(void *),
void *arg) {
#if defined(_WIN32) || defined(_WIN64)
*thread = CreateThread(nullptr, 0, start_routine, arg, 0, nullptr);
return *thread ? MDBX_SUCCESS : (int)GetLastError();
#else
return pthread_create(thread, nullptr, start_routine, arg);
#endif
}
MDBX_INTERNAL int osal_thread_join(osal_thread_t thread) {
#if defined(_WIN32) || defined(_WIN64)
DWORD code = WaitForSingleObject(thread, INFINITE);
return waitstatus2errcode(code);
#else
void *unused_retval = &unused_retval;
return pthread_join(thread, &unused_retval);
#endif
}
/*----------------------------------------------------------------------------*/
MDBX_INTERNAL int osal_msync(const osal_mmap_t *map, size_t offset,
size_t length, enum osal_syncmode_bits mode_bits) {
if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE)
return MDBX_SUCCESS;
void *ptr = ptr_disp(map->base, offset);
#if defined(_WIN32) || defined(_WIN64)
if (!FlushViewOfFile(ptr, length))
return (int)GetLastError();
if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) &&
!FlushFileBuffers(map->fd))
return (int)GetLastError();
#else
#if defined(__linux__) || defined(__gnu_linux__)
2022-09-18 13:21:38 +03:00
/* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly
* tracks dirty pages and flushes ones as necessary. */
//
// However, this behavior may be changed in custom kernels,
// so just leave such optimization to the libc discretion.
// NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases.
//
// assert(mdbx.linux_kernel_version > 0x02061300);
// if (mode_bits <= MDBX_SYNC_KICK)
// return MDBX_SUCCESS;
#endif /* Linux */
if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC))
return errno;
if ((mode_bits & MDBX_SYNC_SIZE) && fsync(map->fd))
return errno;
#endif
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_check_fs_rdonly(mdbx_filehandle_t handle,
const pathchar_t *pathname, int err) {
#if defined(_WIN32) || defined(_WIN64)
(void)pathname;
(void)err;
if (!imports.GetVolumeInformationByHandleW)
return MDBX_ENOSYS;
DWORD unused, flags;
if (!imports.GetVolumeInformationByHandleW(handle, nullptr, 0, nullptr,
&unused, &flags, nullptr, 0))
return (int)GetLastError();
if ((flags & FILE_READ_ONLY_VOLUME) == 0)
return MDBX_EACCESS;
#else
struct statvfs info;
if (err != MDBX_ENOFILE) {
if (statvfs(pathname, &info) == 0)
return (info.f_flag & ST_RDONLY) ? MDBX_SUCCESS : err;
if (errno != MDBX_ENOFILE)
return errno;
}
if (fstatvfs(handle, &info))
return errno;
if ((info.f_flag & ST_RDONLY) == 0)
return (err == MDBX_ENOFILE) ? MDBX_EACCESS : err;
#endif /* !Windows */
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_check_fs_incore(mdbx_filehandle_t handle) {
#if defined(_WIN32) || defined(_WIN64)
(void)handle;
#else
struct statfs statfs_info;
if (fstatfs(handle, &statfs_info))
return errno;
#if defined(__OpenBSD__)
const unsigned type = 0;
#else
const unsigned type = statfs_info.f_type;
#endif
switch (type) {
case 0x28cd3d45 /* CRAMFS_MAGIC */:
case 0x858458f6 /* RAMFS_MAGIC */:
case 0x01021994 /* TMPFS_MAGIC */:
case 0x73717368 /* SQUASHFS_MAGIC */:
case 0x7275 /* ROMFS_MAGIC */:
return MDBX_RESULT_TRUE;
}
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \
defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \
defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN)
const char *const name = statfs_info.f_fstypename;
const size_t name_len = sizeof(statfs_info.f_fstypename);
#else
const char *const name = "";
const size_t name_len = 0;
#endif
if (name_len) {
if (strncasecmp("tmpfs", name, 6) == 0 ||
strncasecmp("mfs", name, 4) == 0 ||
strncasecmp("ramfs", name, 6) == 0 ||
strncasecmp("romfs", name, 6) == 0)
return MDBX_RESULT_TRUE;
}
#endif /* !Windows */
return MDBX_RESULT_FALSE;
}
static int osal_check_fs_local(mdbx_filehandle_t handle, int flags) {
#if defined(_WIN32) || defined(_WIN64)
if (globals.running_under_Wine && !(flags & MDBX_EXCLUSIVE))
return ERROR_NOT_CAPABLE /* workaround for Wine */;
if (GetFileType(handle) != FILE_TYPE_DISK)
return ERROR_FILE_OFFLINE;
if (imports.GetFileInformationByHandleEx) {
FILE_REMOTE_PROTOCOL_INFO RemoteProtocolInfo;
if (imports.GetFileInformationByHandleEx(handle, FileRemoteProtocolInfo,
&RemoteProtocolInfo,
sizeof(RemoteProtocolInfo))) {
if ((RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_OFFLINE) &&
!(flags & MDBX_RDONLY))
return ERROR_FILE_OFFLINE;
if (!(RemoteProtocolInfo.Flags & REMOTE_PROTOCOL_INFO_FLAG_LOOPBACK) &&
!(flags & MDBX_EXCLUSIVE))
return ERROR_REMOTE_STORAGE_MEDIA_ERROR;
}
}
if (imports.NtFsControlFile) {
NTSTATUS rc;
struct {
WOF_EXTERNAL_INFO wof_info;
union {
WIM_PROVIDER_EXTERNAL_INFO wim_info;
FILE_PROVIDER_EXTERNAL_INFO_V1 file_info;
};
size_t reserved_for_microsoft_madness[42];
} GetExternalBacking_OutputBuffer;
IO_STATUS_BLOCK StatusBlock;
rc = imports.NtFsControlFile(handle, nullptr, nullptr, nullptr,
&StatusBlock, FSCTL_GET_EXTERNAL_BACKING,
nullptr, 0, &GetExternalBacking_OutputBuffer,
sizeof(GetExternalBacking_OutputBuffer));
if (NT_SUCCESS(rc)) {
if (!(flags & MDBX_EXCLUSIVE))
return ERROR_REMOTE_STORAGE_MEDIA_ERROR;
} else if (rc != STATUS_OBJECT_NOT_EXTERNALLY_BACKED &&
rc != STATUS_INVALID_DEVICE_REQUEST &&
rc != STATUS_NOT_SUPPORTED)
return ntstatus2errcode(rc);
}
if (imports.GetVolumeInformationByHandleW &&
imports.GetFinalPathNameByHandleW) {
WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX);
if (!PathBuffer)
return MDBX_ENOMEM;
int rc = MDBX_SUCCESS;
DWORD VolumeSerialNumber, FileSystemFlags;
if (!imports.GetVolumeInformationByHandleW(handle, PathBuffer, INT16_MAX,
&VolumeSerialNumber, nullptr,
&FileSystemFlags, nullptr, 0)) {
rc = (int)GetLastError();
goto bailout;
}
if ((flags & MDBX_RDONLY) == 0) {
if (FileSystemFlags &
(FILE_SEQUENTIAL_WRITE_ONCE | FILE_READ_ONLY_VOLUME |
FILE_VOLUME_IS_COMPRESSED)) {
rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
goto bailout;
}
}
if (imports.GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX,
FILE_NAME_NORMALIZED |
VOLUME_NAME_NT)) {
if (_wcsnicmp(PathBuffer, L"\\Device\\Mup\\", 12) == 0) {
if (!(flags & MDBX_EXCLUSIVE)) {
rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
goto bailout;
}
}
}
if (F_ISSET(flags, MDBX_RDONLY | MDBX_EXCLUSIVE) &&
(FileSystemFlags & FILE_READ_ONLY_VOLUME)) {
/* without-LCK (exclusive readonly) mode for DB on a read-only volume */
goto bailout;
}
if (imports.GetFinalPathNameByHandleW(handle, PathBuffer, INT16_MAX,
FILE_NAME_NORMALIZED |
VOLUME_NAME_DOS)) {
UINT DriveType = GetDriveTypeW(PathBuffer);
if (DriveType == DRIVE_NO_ROOT_DIR &&
_wcsnicmp(PathBuffer, L"\\\\?\\", 4) == 0 &&
_wcsnicmp(PathBuffer + 5, L":\\", 2) == 0) {
PathBuffer[7] = 0;
DriveType = GetDriveTypeW(PathBuffer + 4);
}
switch (DriveType) {
case DRIVE_CDROM:
if (flags & MDBX_RDONLY)
break;
// fall through
case DRIVE_UNKNOWN:
case DRIVE_NO_ROOT_DIR:
case DRIVE_REMOTE:
default:
if (!(flags & MDBX_EXCLUSIVE))
rc = ERROR_REMOTE_STORAGE_MEDIA_ERROR;
// fall through
case DRIVE_REMOVABLE:
case DRIVE_FIXED:
case DRIVE_RAMDISK:
break;
}
}
bailout:
osal_free(PathBuffer);
return rc;
}
#else
struct statvfs statvfs_info;
if (fstatvfs(handle, &statvfs_info))
return errno;
#if defined(ST_LOCAL) || defined(ST_EXPORTED)
const unsigned long st_flags = statvfs_info.f_flag;
#endif /* ST_LOCAL || ST_EXPORTED */
#if defined(__NetBSD__)
const unsigned type = 0;
const char *const name = statvfs_info.f_fstypename;
const size_t name_len = VFS_NAMELEN;
#elif defined(_AIX) || defined(__OS400__)
const char *const name = statvfs_info.f_basetype;
const size_t name_len = sizeof(statvfs_info.f_basetype);
struct stat st;
if (fstat(handle, &st))
return errno;
const unsigned type = st.st_vfstype;
if ((st.st_flag & FS_REMOTE) != 0 && !(flags & MDBX_EXCLUSIVE))
return MDBX_EREMOTE;
#elif defined(FSTYPSZ) || defined(_FSTYPSZ)
const unsigned type = 0;
const char *const name = statvfs_info.f_basetype;
const size_t name_len = sizeof(statvfs_info.f_basetype);
#elif defined(__sun) || defined(__SVR4) || defined(__svr4__) || \
defined(ST_FSTYPSZ) || defined(_ST_FSTYPSZ)
const unsigned type = 0;
struct stat st;
if (fstat(handle, &st))
return errno;
const char *const name = st.st_fstype;
const size_t name_len = strlen(name);
#else
struct statfs statfs_info;
if (fstatfs(handle, &statfs_info))
return errno;
#if defined(__OpenBSD__)
const unsigned type = 0;
#else
const unsigned type = statfs_info.f_type;
#endif
#if defined(MNT_LOCAL) || defined(MNT_EXPORTED)
const unsigned long mnt_flags = statfs_info.f_flags;
#endif /* MNT_LOCAL || MNT_EXPORTED */
#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \
defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \
defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN)
const char *const name = statfs_info.f_fstypename;
const size_t name_len = sizeof(statfs_info.f_fstypename);
#elif defined(__ANDROID_API__) && __ANDROID_API__ < 21
const char *const name = "";
const unsigned name_len = 0;
#else
const char *name = "";
unsigned name_len = 0;
struct stat st;
if (fstat(handle, &st))
return errno;
char pathbuf[PATH_MAX];
FILE *mounted = nullptr;
#if defined(__linux__) || defined(__gnu_linux__)
mounted = setmntent("/proc/mounts", "r");
#endif /* Linux */
if (!mounted)
mounted = setmntent("/etc/mtab", "r");
if (mounted) {
const struct mntent *ent;
#if defined(_BSD_SOURCE) || defined(_SVID_SOURCE) || defined(__BIONIC__) || \
(defined(_DEFAULT_SOURCE) && __GLIBC_PREREQ(2, 19))
struct mntent entbuf;
const bool should_copy = false;
while (nullptr !=
(ent = getmntent_r(mounted, &entbuf, pathbuf, sizeof(pathbuf))))
#else
const bool should_copy = true;
while (nullptr != (ent = getmntent(mounted)))
#endif
{
struct stat mnt;
if (!stat(ent->mnt_dir, &mnt) && mnt.st_dev == st.st_dev) {
if (should_copy) {
name =
strncpy(pathbuf, ent->mnt_fsname, name_len = sizeof(pathbuf) - 1);
pathbuf[name_len] = 0;
} else {
name = ent->mnt_fsname;
name_len = strlen(name);
}
break;
}
}
endmntent(mounted);
}
#endif /* !xBSD && !Android/Bionic */
#endif
if (name_len) {
if (((name_len > 2 && strncasecmp("nfs", name, 3) == 0) ||
strncasecmp("cifs", name, name_len) == 0 ||
strncasecmp("ncpfs", name, name_len) == 0 ||
strncasecmp("smbfs", name, name_len) == 0 ||
strcasecmp("9P" /* WSL2 */, name) == 0 ||
((name_len > 3 && strncasecmp("fuse", name, 4) == 0) &&
strncasecmp("fuseblk", name, name_len) != 0)) &&
!(flags & MDBX_EXCLUSIVE))
return MDBX_EREMOTE;
if (strcasecmp("ftp", name) == 0 || strcasecmp("http", name) == 0 ||
strcasecmp("sshfs", name) == 0)
return MDBX_EREMOTE;
}
#ifdef ST_LOCAL
if ((st_flags & ST_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
return MDBX_EREMOTE;
#elif defined(MNT_LOCAL)
if ((mnt_flags & MNT_LOCAL) == 0 && !(flags & MDBX_EXCLUSIVE))
return MDBX_EREMOTE;
#endif /* ST/MNT_LOCAL */
#ifdef ST_EXPORTED
if ((st_flags & ST_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
return MDBX_EREMOTE;
#elif defined(MNT_EXPORTED)
if ((mnt_flags & MNT_EXPORTED) != 0 && !(flags & MDBX_RDONLY))
return MDBX_EREMOTE;
#endif /* ST/MNT_EXPORTED */
switch (type) {
case 0xFF534D42 /* CIFS_MAGIC_NUMBER */:
case 0x6969 /* NFS_SUPER_MAGIC */:
case 0x564c /* NCP_SUPER_MAGIC */:
case 0x517B /* SMB_SUPER_MAGIC */:
#if defined(__digital__) || defined(__osf__) || defined(__osf)
case 0x0E /* Tru64 NFS */:
#endif
#ifdef ST_FST_NFS
case ST_FST_NFS:
#endif
if ((flags & MDBX_EXCLUSIVE) == 0)
return MDBX_EREMOTE;
case 0:
default:
break;
}
#endif /* Unix */
return MDBX_SUCCESS;
}
static int check_mmap_limit(const size_t limit) {
const bool should_check =
#if defined(__SANITIZE_ADDRESS__)
true;
#else
RUNNING_ON_VALGRIND;
#endif /* __SANITIZE_ADDRESS__ */
if (should_check) {
intptr_t pagesize, total_ram_pages, avail_ram_pages;
int err =
mdbx_get_sysraminfo(&pagesize, &total_ram_pages, &avail_ram_pages);
if (unlikely(err != MDBX_SUCCESS))
return err;
const int log2page = log2n_powerof2(pagesize);
if ((limit >> (log2page + 7)) > (size_t)total_ram_pages ||
(limit >> (log2page + 6)) > (size_t)avail_ram_pages) {
ERROR("%s (%zu pages) is too large for available (%zu pages) or total "
"(%zu pages) system RAM",
"database upper size limit", limit >> log2page, avail_ram_pages,
total_ram_pages);
return MDBX_TOO_LARGE;
}
}
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_mmap(const int flags, osal_mmap_t *map, size_t size,
const size_t limit, const unsigned options) {
assert(size <= limit);
map->limit = 0;
map->current = 0;
map->base = nullptr;
map->filesize = 0;
#if defined(_WIN32) || defined(_WIN64)
map->section = nullptr;
#endif /* Windows */
int err = osal_check_fs_local(map->fd, flags);
if (unlikely(err != MDBX_SUCCESS))
return err;
err = check_mmap_limit(limit);
if (unlikely(err != MDBX_SUCCESS))
return err;
if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) {
err = osal_ftruncate(map->fd, size);
VERBOSE("ftruncate %zu, err %d", size, err);
if (err != MDBX_SUCCESS)
return err;
map->filesize = size;
#if !(defined(_WIN32) || defined(_WIN64))
map->current = size;
#endif /* !Windows */
} else {
err = osal_filesize(map->fd, &map->filesize);
VERBOSE("filesize %" PRIu64 ", err %d", map->filesize, err);
if (err != MDBX_SUCCESS)
return err;
#if defined(_WIN32) || defined(_WIN64)
if (map->filesize < size) {
WARNING("file size (%zu) less than requested for mapping (%zu)",
(size_t)map->filesize, size);
size = (size_t)map->filesize;
}
#else
map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
#endif /* !Windows */
}
#if defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER SectionSize;
SectionSize.QuadPart = size;
err = NtCreateSection(&map->section,
/* DesiredAccess */
(flags & MDBX_WRITEMAP)
? SECTION_QUERY | SECTION_MAP_READ |
SECTION_EXTEND_SIZE | SECTION_MAP_WRITE
: SECTION_QUERY | SECTION_MAP_READ |
SECTION_EXTEND_SIZE,
/* ObjectAttributes */ nullptr,
/* MaximumSize (InitialSize) */ &SectionSize,
/* SectionPageProtection */
(flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE,
/* AllocationAttributes */ SEC_RESERVE, map->fd);
if (!NT_SUCCESS(err))
return ntstatus2errcode(err);
SIZE_T ViewSize = (flags & MDBX_RDONLY) ? 0
: globals.running_under_Wine ? size
: limit;
err = NtMapViewOfSection(
map->section, GetCurrentProcess(), &map->base,
/* ZeroBits */ 0,
/* CommitSize */ 0,
/* SectionOffset */ nullptr, &ViewSize,
/* InheritDisposition */ ViewUnmap,
/* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE,
/* Win32Protect */
(flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY);
if (!NT_SUCCESS(err)) {
NtClose(map->section);
map->section = 0;
map->base = nullptr;
return ntstatus2errcode(err);
}
assert(map->base != MAP_FAILED);
map->current = (size_t)SectionSize.QuadPart;
map->limit = ViewSize;
#else /* Windows */
#ifndef MAP_TRYFIXED
#define MAP_TRYFIXED 0
#endif
#ifndef MAP_HASSEMAPHORE
#define MAP_HASSEMAPHORE 0
#endif
#ifndef MAP_CONCEAL
#define MAP_CONCEAL 0
#endif
#ifndef MAP_NOSYNC
#define MAP_NOSYNC 0
#endif
#ifndef MAP_FIXED_NOREPLACE
#define MAP_FIXED_NOREPLACE 0
#endif
#ifndef MAP_NORESERVE
#define MAP_NORESERVE 0
#endif
map->base = mmap(nullptr, limit,
(flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ,
MAP_SHARED | MAP_FILE | MAP_NORESERVE |
(F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) |
((options & MMAP_OPTION_SEMAPHORE)
? MAP_HASSEMAPHORE | MAP_NOSYNC
: MAP_CONCEAL),
map->fd, 0);
if (unlikely(map->base == MAP_FAILED)) {
map->limit = 0;
map->current = 0;
map->base = nullptr;
assert(errno != 0);
return errno;
}
map->limit = limit;
#if MDBX_ENABLE_MADVISE
#ifdef MADV_DONTFORK
if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0))
return errno;
#endif /* MADV_DONTFORK */
#ifdef MADV_NOHUGEPAGE
(void)madvise(map->base, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */
#endif /* MDBX_ENABLE_MADVISE */
#endif /* ! Windows */
VALGRIND_MAKE_MEM_DEFINED(map->base, map->current);
MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->current);
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_munmap(osal_mmap_t *map) {
VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current);
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic
* when this memory will re-used by malloc or another mmapping.
* See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */
MDBX_ASAN_UNPOISON_MEMORY_REGION(
map->base, (map->filesize && map->filesize < map->limit) ? map->filesize
: map->limit);
#if defined(_WIN32) || defined(_WIN64)
if (map->section)
NtClose(map->section);
NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->base);
if (!NT_SUCCESS(rc))
ntstatus2errcode(rc);
#else
if (unlikely(munmap(map->base, map->limit))) {
assert(errno != 0);
return errno;
}
#endif /* ! Windows */
map->limit = 0;
map->current = 0;
map->base = nullptr;
return MDBX_SUCCESS;
}
MDBX_INTERNAL int osal_mresize(const int flags, osal_mmap_t *map, size_t size,
size_t limit) {
int rc = osal_filesize(map->fd, &map->filesize);
VERBOSE("flags 0x%x, size %zu, limit %zu, filesize %" PRIu64, flags, size,
limit, map->filesize);
assert(size <= limit);
if (rc != MDBX_SUCCESS) {
map->filesize = 0;
return rc;
}
#if defined(_WIN32) || defined(_WIN64)
assert(size != map->current || limit != map->limit || size < map->filesize);
2018-01-09 15:30:36 +03:00
NTSTATUS status;
LARGE_INTEGER SectionSize;
int err;
2018-01-09 15:30:36 +03:00
if (limit == map->limit && size > map->current) {
if ((flags & MDBX_RDONLY) && map->filesize >= size) {
map->current = size;
return MDBX_SUCCESS;
} else if (!(flags & MDBX_RDONLY) &&
/* workaround for Wine */ imports.NtExtendSection) {
/* growth rw-section */
SectionSize.QuadPart = size;
status = imports.NtExtendSection(map->section, &SectionSize);
if (!NT_SUCCESS(status))
return ntstatus2errcode(status);
map->current = size;
if (map->filesize < size)
map->filesize = size;
return MDBX_SUCCESS;
}
}
if (limit > map->limit) {
err = check_mmap_limit(limit);
if (unlikely(err != MDBX_SUCCESS))
return err;
/* check ability of address space for growth before unmap */
PVOID BaseAddress = (PBYTE)map->base + map->limit;
SIZE_T RegionSize = limit - map->limit;
status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0,
&RegionSize, MEM_RESERVE, PAGE_NOACCESS);
if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
return MDBX_UNABLE_EXTEND_MAPSIZE;
if (!NT_SUCCESS(status))
return ntstatus2errcode(status);
status = NtFreeVirtualMemory(GetCurrentProcess(), &BaseAddress, &RegionSize,
MEM_RELEASE);
if (!NT_SUCCESS(status))
return ntstatus2errcode(status);
}
/* Windows unable:
* - shrink a mapped file;
* - change size of mapped view;
* - extend read-only mapping;
* Therefore we should unmap/map entire section. */
if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) {
if (size <= map->current && limit == map->limit)
return MDBX_SUCCESS;
return MDBX_EPERM;
}
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic
* when this memory will re-used by malloc or another mmapping.
* See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */
MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit);
status = NtUnmapViewOfSection(GetCurrentProcess(), map->base);
2018-01-09 15:30:36 +03:00
if (!NT_SUCCESS(status))
return ntstatus2errcode(status);
status = NtClose(map->section);
map->section = nullptr;
PVOID ReservedAddress = nullptr;
SIZE_T ReservedSize = limit;
2018-01-09 15:30:36 +03:00
if (!NT_SUCCESS(status)) {
bailout_ntstatus:
err = ntstatus2errcode(status);
map->base = nullptr;
map->current = map->limit = 0;
if (ReservedAddress) {
ReservedSize = 0;
status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress,
&ReservedSize, MEM_RELEASE);
assert(NT_SUCCESS(status));
(void)status;
}
2018-01-09 15:30:36 +03:00
return err;
}
retry_file_and_section:
/* resizing of the file may take a while,
* therefore we reserve address space to avoid occupy it by other threads */
ReservedAddress = map->base;
status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0,
&ReservedSize, MEM_RESERVE, PAGE_NOACCESS);
if (!NT_SUCCESS(status)) {
ReservedAddress = nullptr;
if (status != (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018)
goto bailout_ntstatus /* no way to recovery */;
if (flags & MDBX_MRESIZE_MAY_MOVE)
/* the base address could be changed */
map->base = nullptr;
}
if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) {
err = osal_ftruncate(map->fd, size);
2018-01-09 15:30:36 +03:00
if (err == MDBX_SUCCESS)
map->filesize = size;
/* ignore error, because Windows unable shrink file
* that already mapped (by another process) */
}
SectionSize.QuadPart = size;
2018-01-09 15:30:36 +03:00
status = NtCreateSection(
&map->section,
/* DesiredAccess */
(flags & MDBX_WRITEMAP)
? SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE |
SECTION_MAP_WRITE
: SECTION_QUERY | SECTION_MAP_READ | SECTION_EXTEND_SIZE,
/* ObjectAttributes */ nullptr,
/* MaximumSize (InitialSize) */ &SectionSize,
/* SectionPageProtection */
(flags & MDBX_RDONLY) ? PAGE_READONLY : PAGE_READWRITE,
/* AllocationAttributes */ SEC_RESERVE, map->fd);
2018-01-09 15:30:36 +03:00
if (!NT_SUCCESS(status))
goto bailout_ntstatus;
if (ReservedAddress) {
/* release reserved address space */
ReservedSize = 0;
status = NtFreeVirtualMemory(GetCurrentProcess(), &ReservedAddress,
&ReservedSize, MEM_RELEASE);
ReservedAddress = nullptr;
if (!NT_SUCCESS(status))
goto bailout_ntstatus;
}
2018-01-09 15:30:36 +03:00
retry_mapview:;
SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit;
2018-01-09 15:30:36 +03:00
status = NtMapViewOfSection(
map->section, GetCurrentProcess(), &map->base,
/* ZeroBits */ 0,
/* CommitSize */ 0,
/* SectionOffset */ nullptr, &ViewSize,
/* InheritDisposition */ ViewUnmap,
/* AllocationType */ (flags & MDBX_RDONLY) ? 0 : MEM_RESERVE,
/* Win32Protect */
(flags & MDBX_WRITEMAP) ? PAGE_READWRITE : PAGE_READONLY);
2018-01-09 15:30:36 +03:00
if (!NT_SUCCESS(status)) {
if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 &&
map->base && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) {
/* try remap at another base address */
map->base = nullptr;
2018-01-09 15:30:36 +03:00
goto retry_mapview;
}
NtClose(map->section);
map->section = nullptr;
2018-01-09 15:30:36 +03:00
if (map->base && (size != map->current || limit != map->limit)) {
2018-01-09 15:30:36 +03:00
/* try remap with previously size and limit,
* but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */
rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM;
2018-01-09 15:30:36 +03:00
size = map->current;
ReservedSize = limit = map->limit;
2018-01-09 15:30:36 +03:00
goto retry_file_and_section;
}
/* no way to recovery */
goto bailout_ntstatus;
}
assert(map->base != MAP_FAILED);
map->current = (size_t)SectionSize.QuadPart;
map->limit = ViewSize;
#else /* Windows */
if (flags & MDBX_RDONLY) {
if (size > map->filesize)
rc = MDBX_UNABLE_EXTEND_MAPSIZE;
else if (size < map->filesize && map->filesize > limit)
rc = MDBX_EPERM;
map->current = (map->filesize > limit) ? limit : (size_t)map->filesize;
} else {
if (size > map->filesize ||
(size < map->filesize && (flags & txn_shrink_allowed))) {
rc = osal_ftruncate(map->fd, size);
VERBOSE("ftruncate %zu, err %d", size, rc);
if (rc != MDBX_SUCCESS)
return rc;
map->filesize = size;
}
if (map->current > size) {
/* Clearing asan's bitmask for the region which released in shrinking,
* since:
* - after the shrinking we will get an exception when accessing
* this region and (therefore) do not need the help of ASAN.
* - this allows us to clear the mask only within the file size
* when closing the mapping. */
MDBX_ASAN_UNPOISON_MEMORY_REGION(
ptr_disp(map->base, size),
((map->current < map->limit) ? map->current : map->limit) - size);
}
map->current = (size < map->limit) ? size : map->limit;
}
if (limit == map->limit)
return rc;
if (limit < map->limit) {
/* unmap an excess at end of mapping. */
2021-08-16 23:45:56 +03:00
// coverity[offset_free : FALSE]
if (unlikely(munmap(ptr_disp(map->base, limit), map->limit - limit))) {
assert(errno != 0);
return errno;
}
map->limit = limit;
return rc;
}
int err = check_mmap_limit(limit);
if (unlikely(err != MDBX_SUCCESS))
return err;
assert(limit > map->limit);
void *ptr = MAP_FAILED;
#if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE)
ptr = mremap(map->base, map->limit, limit,
#if defined(MREMAP_MAYMOVE)
(flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE :
#endif /* MREMAP_MAYMOVE */
0);
if (ptr == MAP_FAILED) {
err = errno;
assert(err != 0);
switch (err) {
default:
return err;
case 0 /* paranoia */:
case EAGAIN:
case ENOMEM:
return MDBX_UNABLE_EXTEND_MAPSIZE;
case EFAULT /* MADV_DODUMP / MADV_DONTDUMP are mixed for mmap-range */:
break;
}
}
#endif /* Linux & _GNU_SOURCE */
const unsigned mmap_flags =
MAP_CONCEAL | MAP_SHARED | MAP_FILE | MAP_NORESERVE |
(F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0);
const unsigned mmap_prot =
(flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ;
if (ptr == MAP_FAILED) {
/* Try to mmap additional space beyond the end of mapping. */
ptr = mmap(ptr_disp(map->base, map->limit), limit - map->limit, mmap_prot,
mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit);
if (ptr == ptr_disp(map->base, map->limit))
/* успешно прилепили отображение в конец */
ptr = map->base;
else if (ptr != MAP_FAILED) {
/* the desired address is busy, unmap unsuitable one */
if (unlikely(munmap(ptr, limit - map->limit))) {
assert(errno != 0);
return errno;
}
ptr = MAP_FAILED;
} else {
err = errno;
assert(err != 0);
switch (err) {
default:
return err;
case 0 /* paranoia */:
case EAGAIN:
case ENOMEM:
return MDBX_UNABLE_EXTEND_MAPSIZE;
case EEXIST: /* address busy */
case EINVAL: /* kernel don't support MAP_FIXED_NOREPLACE */
break;
}
}
}
if (ptr == MAP_FAILED) {
/* unmap and map again whole region */
if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) {
/* TODO: Perhaps here it is worth to implement suspend/resume threads
* and perform unmap/map as like for Windows. */
return MDBX_UNABLE_EXTEND_MAPSIZE;
}
if (unlikely(munmap(map->base, map->limit))) {
assert(errno != 0);
return errno;
}
2021-08-16 23:45:56 +03:00
// coverity[pass_freed_arg : FALSE]
ptr = mmap(map->base, limit, mmap_prot,
(flags & MDBX_MRESIZE_MAY_MOVE)
? mmap_flags
: mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
: MAP_FIXED),
map->fd, 0);
if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
2021-08-16 23:45:56 +03:00
// coverity[pass_freed_arg : FALSE]
ptr =
mmap(map->base, limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0);
if (unlikely(ptr == MAP_FAILED)) {
/* try to restore prev mapping */
2021-08-16 23:45:56 +03:00
// coverity[pass_freed_arg : FALSE]
ptr = mmap(map->base, map->limit, mmap_prot,
(flags & MDBX_MRESIZE_MAY_MOVE)
? mmap_flags
: mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE
: MAP_FIXED),
map->fd, 0);
if (MAP_FIXED_NOREPLACE != 0 && MAP_FIXED_NOREPLACE != MAP_FIXED &&
unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) &&
errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL)
2021-08-16 23:45:56 +03:00
// coverity[pass_freed_arg : FALSE]
ptr = mmap(map->base, map->limit, mmap_prot, mmap_flags | MAP_FIXED,
map->fd, 0);
if (unlikely(ptr == MAP_FAILED)) {
VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current);
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic
* when this memory will re-used by malloc or another mmapping.
* See
* https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203
*/
MDBX_ASAN_UNPOISON_MEMORY_REGION(
map->base, (map->current < map->limit) ? map->current : map->limit);
map->limit = 0;
map->current = 0;
map->base = nullptr;
assert(errno != 0);
return errno;
}
rc = MDBX_UNABLE_EXTEND_MAPSIZE;
limit = map->limit;
}
}
assert(ptr && ptr != MAP_FAILED);
if (map->base != ptr) {
VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current);
/* Unpoisoning is required for ASAN to avoid false-positive diagnostic
* when this memory will re-used by malloc or another mmapping.
* See
* https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203
*/
MDBX_ASAN_UNPOISON_MEMORY_REGION(
map->base, (map->current < map->limit) ? map->current : map->limit);
VALGRIND_MAKE_MEM_DEFINED(ptr, map->current);
MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current);
map->base = ptr;
}
map->limit = limit;
map->current = size;
#if MDBX_ENABLE_MADVISE
#ifdef MADV_DONTFORK
if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) {
assert(errno != 0);
return errno;
}
#endif /* MADV_DONTFORK */
#ifdef MADV_NOHUGEPAGE
(void)madvise(map->base, map->limit, MADV_NOHUGEPAGE);
#endif /* MADV_NOHUGEPAGE */
#endif /* MDBX_ENABLE_MADVISE */
#endif /* POSIX / Windows */
/* Zap: Redundant code */
MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287);
assert(rc != MDBX_SUCCESS ||
(map->base != nullptr && map->base != MAP_FAILED &&
map->current == size && map->limit == limit &&
map->filesize >= size));
return rc;
}
/*----------------------------------------------------------------------------*/
__cold MDBX_INTERNAL void osal_jitter(bool tiny) {
for (;;) {
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \
defined(__x86_64__)
unsigned salt = 5296013u * (unsigned)__rdtsc();
salt ^= salt >> 11;
salt *= 25810541u;
#elif (defined(_WIN32) || defined(_WIN64)) && MDBX_WITHOUT_MSVC_CRT
static ULONG state;
const unsigned salt = (unsigned)RtlRandomEx(&state);
#else
const unsigned salt = rand();
#endif
const int coin = salt % (tiny ? 29u : 43u);
if (coin < 43 / 3)
break;
#if defined(_WIN32) || defined(_WIN64)
if (coin < 43 * 2 / 3)
SwitchToThread();
else {
static HANDLE timer;
if (!timer)
timer = CreateWaitableTimer(NULL, TRUE, NULL);
LARGE_INTEGER ft;
ft.QuadPart =
coin * (int64_t)-10; // Convert to 100 nanosecond interval,
// negative value indicates relative time.
SetWaitableTimer(timer, &ft, 0, NULL, NULL, 0);
WaitForSingleObject(timer, INFINITE);
// CloseHandle(timer);
break;
}
#else
sched_yield();
if (coin > 43 * 2 / 3)
usleep(coin);
#endif
}
}
/*----------------------------------------------------------------------------*/
#if defined(_WIN32) || defined(_WIN64)
static LARGE_INTEGER performance_frequency;
#elif defined(__APPLE__) || defined(__MACH__)
#include <mach/mach_time.h>
static uint64_t ratio_16dot16_to_monotine;
#elif defined(__linux__) || defined(__gnu_linux__)
static clockid_t posix_clockid;
__cold static clockid_t choice_monoclock(void) {
struct timespec probe;
#if defined(CLOCK_BOOTTIME)
if (clock_gettime(CLOCK_BOOTTIME, &probe) == 0)
return CLOCK_BOOTTIME;
#elif defined(CLOCK_MONOTONIC_RAW)
if (clock_gettime(CLOCK_MONOTONIC_RAW, &probe) == 0)
return CLOCK_MONOTONIC_RAW;
#elif defined(CLOCK_MONOTONIC_COARSE)
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &probe) == 0)
return CLOCK_MONOTONIC_COARSE;
#endif
return CLOCK_MONOTONIC;
}
#elif defined(CLOCK_MONOTONIC)
#define posix_clockid CLOCK_MONOTONIC
#else
#define posix_clockid CLOCK_REALTIME
#endif
MDBX_INTERNAL uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) {
#if defined(_WIN32) || defined(_WIN64)
const uint64_t ratio = performance_frequency.QuadPart;
#elif defined(__APPLE__) || defined(__MACH__)
const uint64_t ratio = ratio_16dot16_to_monotine;
#else
const uint64_t ratio = UINT64_C(1000000000);
#endif
const uint64_t ret = (ratio * seconds_16dot16 + 32768) >> 16;
return likely(ret || seconds_16dot16 == 0) ? ret : /* fix underflow */ 1;
}
static uint64_t monotime_limit;
MDBX_INTERNAL uint32_t osal_monotime_to_16dot16(uint64_t monotime) {
if (unlikely(monotime > monotime_limit))
return UINT32_MAX;
const uint32_t ret =
#if defined(_WIN32) || defined(_WIN64)
(uint32_t)((monotime << 16) / performance_frequency.QuadPart);
#elif defined(__APPLE__) || defined(__MACH__)
(uint32_t)((monotime << 16) / ratio_16dot16_to_monotine);
#else
(uint32_t)((monotime << 7) / 1953125);
#endif
mdbx: защита от нуля только общей задержки в метриках транзакции. Ранее, при конвертации очень коротких интервалов в формат фиксированной точки 16-точка-16, всегда выполнялось замещение нуля единицей. Т.е. если интервал был не нулевым, но меньше 15.259 микросекунд (1/65536 секунды), то вместо 0 возвращалось 1. Это приводило к тому, что сумма длительности отдельных стадий нередко была больше чем общее время фиксации транзакции. Проблема усугублялась, если получаемые значения аккумулировались по серии транзакций. Теперь такая защита от нуля выполняется только для общего времени, но не для отдельных стадий. Было: latency(ms): preparation=72.69 gc=72.69 write=73.04 sync=141.40 ending=72.69 whole=142.14 Аккумулированная сумма длительности этапов ВТРОЕ(!) больше общей длительности. Стало: latency(ms): preparation=0.00 gc=0.02 write=0.79 sync=67.98 ending=0.00 whole=140.81 Аккумулированная сумма длительности этапов меньше общей длительности, так как для каждой транзакции общая длительность возвращается не менее 15.259 микросекунд.
2022-10-07 14:53:35 +03:00
return ret;
}
MDBX_INTERNAL uint64_t osal_monotime(void) {
#if defined(_WIN32) || defined(_WIN64)
LARGE_INTEGER counter;
if (QueryPerformanceCounter(&counter))
return counter.QuadPart;
#elif defined(__APPLE__) || defined(__MACH__)
return mach_absolute_time();
#else
struct timespec ts;
if (likely(clock_gettime(posix_clockid, &ts) == 0))
return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec;
#endif
return 0;
}
MDBX_INTERNAL uint64_t osal_cputime(size_t *optional_page_faults) {
#if defined(_WIN32) || defined(_WIN64)
if (optional_page_faults) {
PROCESS_MEMORY_COUNTERS pmc;
*optional_page_faults =
GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc))
? pmc.PageFaultCount
: 0;
}
FILETIME unused, usermode;
if (GetThreadTimes(GetCurrentThread(),
/* CreationTime */ &unused,
/* ExitTime */ &unused,
/* KernelTime */ &unused,
/* UserTime */ &usermode)) {
/* one second = 10_000_000 * 100ns = 78125 * (1 << 7) * 100ns;
* result = (h * f / 10_000_000) << 32) + l * f / 10_000_000 =
* = ((h * f) >> 7) / 78125) << 32) + ((l * f) >> 7) / 78125;
* 1) {h, l} *= f;
* 2) {h, l} >>= 7;
* 3) result = ((h / 78125) << 32) + l / 78125; */
uint64_t l = usermode.dwLowDateTime * performance_frequency.QuadPart;
uint64_t h = usermode.dwHighDateTime * performance_frequency.QuadPart;
l = h << (64 - 7) | l >> 7;
h = h >> 7;
return ((h / 78125) << 32) + l / 78125;
}
#elif defined(RUSAGE_THREAD) || defined(RUSAGE_LWP)
#ifndef RUSAGE_THREAD
#define RUSAGE_THREAD RUSAGE_LWP /* Solaris */
#endif
struct rusage usage;
if (getrusage(RUSAGE_THREAD, &usage) == 0) {
if (optional_page_faults)
*optional_page_faults = usage.ru_majflt;
return usage.ru_utime.tv_sec * UINT64_C(1000000000) +
usage.ru_utime.tv_usec * 1000u;
}
if (optional_page_faults)
*optional_page_faults = 0;
#elif defined(CLOCK_THREAD_CPUTIME_ID)
if (optional_page_faults)
*optional_page_faults = 0;
struct timespec ts;
if (likely(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0))
return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec;
#else
/* FIXME */
if (optional_page_faults)
*optional_page_faults = 0;
#endif
return 0;
}
/*----------------------------------------------------------------------------*/
static void bootid_shake(bin128_t *p) {
/* Bob Jenkins's PRNG: https://burtleburtle.net/bob/rand/smallprng.html */
const uint32_t e = p->a - (p->b << 23 | p->b >> 9);
p->a = p->b ^ (p->c << 16 | p->c >> 16);
p->b = p->c + (p->d << 11 | p->d >> 21);
p->c = p->d + e;
p->d = e + p->a;
}
2022-08-07 12:10:36 +03:00
__cold static void bootid_collect(bin128_t *p, const void *s, size_t n) {
p->y += UINT64_C(64526882297375213);
bootid_shake(p);
for (size_t i = 0; i < n; ++i) {
bootid_shake(p);
p->y ^= UINT64_C(48797879452804441) * ((const uint8_t *)s)[i];
bootid_shake(p);
p->y += 14621231;
}
bootid_shake(p);
/* minor non-linear tomfoolery */
const unsigned z = p->x % 61 + 1;
p->y = p->y << z | p->y >> (64 - z);
bootid_shake(p);
bootid_shake(p);
const unsigned q = p->x % 59 + 1;
p->y = p->y << q | p->y >> (64 - q);
bootid_shake(p);
bootid_shake(p);
bootid_shake(p);
}
static size_t hamming_weight(size_t v) {
const size_t m1 = (size_t)UINT64_C(0x5555555555555555);
const size_t m2 = (size_t)UINT64_C(0x3333333333333333);
const size_t m4 = (size_t)UINT64_C(0x0f0f0f0f0f0f0f0f);
const size_t h01 = (size_t)UINT64_C(0x0101010101010101);
v -= (v >> 1) & m1;
v = (v & m2) + ((v >> 2) & m2);
v = (v + (v >> 4)) & m4;
return (v * h01) >> (sizeof(v) * 8 - 8);
}
static inline size_t hw64(uint64_t v) {
size_t r = hamming_weight((size_t)v);
if (sizeof(v) > sizeof(r))
r += hamming_weight((size_t)(v >> sizeof(r) * 4 >> sizeof(r) * 4));
return r;
}
static bool check_uuid(bin128_t uuid) {
size_t hw = hw64(uuid.x) + hw64(uuid.y) + hw64(uuid.x ^ uuid.y);
return (hw >> 6) == 1;
}
#if defined(_WIN32) || defined(_WIN64)
__cold static uint64_t windows_systemtime_ms() {
FILETIME ft;
GetSystemTimeAsFileTime(&ft);
return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul;
}
__cold static uint64_t windows_bootime(void) {
unsigned confirmed = 0;
uint64_t boottime = 0;
uint64_t up0 = imports.GetTickCount64();
uint64_t st0 = windows_systemtime_ms();
for (uint64_t fuse = st0; up0 && st0 < fuse + 1000 * 1000u / 42;) {
YieldProcessor();
const uint64_t up1 = imports.GetTickCount64();
const uint64_t st1 = windows_systemtime_ms();
if (st1 > fuse && st1 == st0 && up1 == up0) {
uint64_t diff = st1 - up1;
if (boottime == diff) {
if (++confirmed > 4)
return boottime;
} else {
confirmed = 0;
boottime = diff;
}
fuse = st1;
Sleep(1);
}
st0 = st1;
up0 = up1;
}
return 0;
}
__cold static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey,
LPCSTR lpValue, PVOID pvData,
LPDWORD pcbData) {
LSTATUS rc;
if (!imports.RegGetValueA) {
/* an old Windows 2000/XP */
HKEY hSubKey;
rc = RegOpenKeyA(hKey, lpSubKey, &hSubKey);
if (rc == ERROR_SUCCESS) {
rc =
RegQueryValueExA(hSubKey, lpValue, nullptr, nullptr, pvData, pcbData);
RegCloseKey(hSubKey);
}
return rc;
}
rc = imports.RegGetValueA(hKey, lpSubKey, lpValue, RRF_RT_ANY, nullptr,
pvData, pcbData);
if (rc != ERROR_FILE_NOT_FOUND)
return rc;
rc = imports.RegGetValueA(hKey, lpSubKey, lpValue,
RRF_RT_ANY | 0x00010000 /* RRF_SUBKEY_WOW6464KEY */,
nullptr, pvData, pcbData);
if (rc != ERROR_FILE_NOT_FOUND)
return rc;
return imports.RegGetValueA(hKey, lpSubKey, lpValue,
RRF_RT_ANY |
0x00020000 /* RRF_SUBKEY_WOW6432KEY */,
nullptr, pvData, pcbData);
}
#endif
__cold MDBX_MAYBE_UNUSED static bool
bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) {
if (n > 31) {
unsigned bits = 0;
for (unsigned i = 0; i < n; ++i) /* try parse an UUID in text form */ {
uint8_t c = ((const uint8_t *)p)[i];
if (c >= '0' && c <= '9')
c -= '0';
else if (c >= 'a' && c <= 'f')
c -= 'a' - 10;
else if (c >= 'A' && c <= 'F')
c -= 'A' - 10;
else
continue;
assert(c <= 15);
c ^= s->y >> 60;
s->y = s->y << 4 | s->x >> 60;
s->x = s->x << 4 | c;
bits += 4;
}
if (bits > 42 * 3)
/* UUID parsed successfully */
return true;
}
if (n > 15) /* is enough handle it as a binary? */ {
if (n == sizeof(bin128_t)) {
bin128_t aligned;
memcpy(&aligned, p, sizeof(bin128_t));
s->x += aligned.x;
s->y += aligned.y;
} else
bootid_collect(s, p, n);
return check_uuid(*s);
}
if (n)
bootid_collect(s, p, n);
return false;
}
#if defined(__linux__) || defined(__gnu_linux__)
__cold static bool proc_read_uuid(const char *path, bin128_t *target) {
const int fd = open(path, O_RDONLY | O_NOFOLLOW);
if (fd != -1) {
struct statfs fs;
char buf[42];
const ssize_t len =
(fstatfs(fd, &fs) == 0 && fs.f_type == /* procfs */ 0x9FA0)
? read(fd, buf, sizeof(buf))
: -1;
const int err = close(fd);
assert(err == 0);
(void)err;
if (len > 0)
return bootid_parse_uuid(target, buf, len);
}
return false;
}
#endif /* Linux */
__cold static bin128_t osal_bootid(void) {
bin128_t uuid = {{0, 0}};
bool got_machineid = false, got_boottime = false, got_bootseq = false;
#if defined(__linux__) || defined(__gnu_linux__)
if (proc_read_uuid("/proc/sys/kernel/random/boot_id", &uuid))
return uuid;
#endif /* Linux */
#if defined(__APPLE__) || defined(__MACH__)
{
char buf[42];
size_t len = sizeof(buf);
if (!sysctlbyname("kern.bootsessionuuid", buf, &len, nullptr, 0) &&
bootid_parse_uuid(&uuid, buf, len))
return uuid;
#if defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \
__MAC_OS_X_VERSION_MIN_REQUIRED > 1050
uuid_t hostuuid;
struct timespec wait = {0, 1000000000u / 42};
if (!gethostuuid(hostuuid, &wait))
got_machineid = bootid_parse_uuid(&uuid, hostuuid, sizeof(hostuuid));
#endif /* > 10.5 */
struct timeval boottime;
len = sizeof(boottime);
if (!sysctlbyname("kern.boottime", &boottime, &len, nullptr, 0) &&
len == sizeof(boottime) && boottime.tv_sec)
got_boottime = true;
}
#endif /* Apple/Darwin */
#if defined(_WIN32) || defined(_WIN64)
{
union buf {
DWORD BootId;
DWORD BaseTime;
SYSTEM_TIMEOFDAY_INFORMATION SysTimeOfDayInfo;
struct {
LARGE_INTEGER BootTime;
LARGE_INTEGER CurrentTime;
LARGE_INTEGER TimeZoneBias;
ULONG TimeZoneId;
ULONG Reserved;
ULONGLONG BootTimeBias;
ULONGLONG SleepTimeBias;
} SysTimeOfDayInfoHacked;
wchar_t MachineGuid[42];
char DigitalProductId[248];
} buf;
static const char HKLM_MicrosoftCryptography[] =
"SOFTWARE\\Microsoft\\Cryptography";
DWORD len = sizeof(buf);
/* Windows is madness and must die */
if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_MicrosoftCryptography,
"MachineGuid", &buf.MachineGuid,
&len) == ERROR_SUCCESS &&
len < sizeof(buf))
got_machineid = bootid_parse_uuid(&uuid, &buf.MachineGuid, len);
if (!got_machineid) {
/* again, Windows is madness */
static const char HKLM_WindowsNT[] =
"SOFTWARE\\Microsoft\\Windows NT\\CurrentVersion";
static const char HKLM_WindowsNT_DPK[] =
"SOFTWARE\\Microsoft\\Windows "
"NT\\CurrentVersion\\DefaultProductKey";
static const char HKLM_WindowsNT_DPK2[] =
"SOFTWARE\\Microsoft\\Windows "
"NT\\CurrentVersion\\DefaultProductKey2";
len = sizeof(buf);
if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT,
"DigitalProductId", &buf.DigitalProductId,
&len) == ERROR_SUCCESS &&
len > 42 && len < sizeof(buf)) {
bootid_collect(&uuid, &buf.DigitalProductId, len);
got_machineid = true;
}
len = sizeof(buf);
if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK,
"DigitalProductId", &buf.DigitalProductId,
&len) == ERROR_SUCCESS &&
len > 42 && len < sizeof(buf)) {
bootid_collect(&uuid, &buf.DigitalProductId, len);
got_machineid = true;
}
len = sizeof(buf);
if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_WindowsNT_DPK2,
"DigitalProductId", &buf.DigitalProductId,
&len) == ERROR_SUCCESS &&
len > 42 && len < sizeof(buf)) {
bootid_collect(&uuid, &buf.DigitalProductId, len);
got_machineid = true;
}
}
static const char HKLM_PrefetcherParams[] =
"SYSTEM\\CurrentControlSet\\Control\\Session Manager\\Memory "
"Management\\PrefetchParameters";
len = sizeof(buf);
if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BootId",
&buf.BootId, &len) == ERROR_SUCCESS &&
len > 1 && len < sizeof(buf)) {
bootid_collect(&uuid, &buf.BootId, len);
got_bootseq = true;
}
len = sizeof(buf);
if (mdbx_RegGetValue(HKEY_LOCAL_MACHINE, HKLM_PrefetcherParams, "BaseTime",
&buf.BaseTime, &len) == ERROR_SUCCESS &&
len >= sizeof(buf.BaseTime) && buf.BaseTime) {
bootid_collect(&uuid, &buf.BaseTime, len);
got_boottime = true;
}
/* BootTime from SYSTEM_TIMEOFDAY_INFORMATION */
NTSTATUS status = NtQuerySystemInformation(
0x03 /* SystemTmeOfDayInformation */, &buf.SysTimeOfDayInfo,
sizeof(buf.SysTimeOfDayInfo), &len);
if (NT_SUCCESS(status) &&
len >= offsetof(union buf, SysTimeOfDayInfoHacked.BootTimeBias) +
sizeof(buf.SysTimeOfDayInfoHacked.BootTimeBias) &&
buf.SysTimeOfDayInfoHacked.BootTime.QuadPart) {
const uint64_t UnbiasedBootTime =
buf.SysTimeOfDayInfoHacked.BootTime.QuadPart -
buf.SysTimeOfDayInfoHacked.BootTimeBias;
if (UnbiasedBootTime) {
bootid_collect(&uuid, &UnbiasedBootTime, sizeof(UnbiasedBootTime));
got_boottime = true;
}
}
if (!got_boottime) {
uint64_t boottime = windows_bootime();
if (boottime) {
bootid_collect(&uuid, &boottime, sizeof(boottime));
got_boottime = true;
}
}
}
#endif /* Windows */
#if defined(CTL_HW) && defined(HW_UUID)
if (!got_machineid) {
static const int mib[] = {CTL_HW, HW_UUID};
char buf[42];
size_t len = sizeof(buf);
if (sysctl(
#ifdef SYSCTL_LEGACY_NONCONST_MIB
(int *)
#endif
mib,
ARRAY_LENGTH(mib), &buf, &len, nullptr, 0) == 0)
got_machineid = bootid_parse_uuid(&uuid, buf, len);
}
#endif /* CTL_HW && HW_UUID */
#if defined(CTL_KERN) && defined(KERN_HOSTUUID)
if (!got_machineid) {
static const int mib[] = {CTL_KERN, KERN_HOSTUUID};
char buf[42];
size_t len = sizeof(buf);
if (sysctl(
#ifdef SYSCTL_LEGACY_NONCONST_MIB
(int *)
#endif
mib,
ARRAY_LENGTH(mib), &buf, &len, nullptr, 0) == 0)
got_machineid = bootid_parse_uuid(&uuid, buf, len);
}
#endif /* CTL_KERN && KERN_HOSTUUID */
#if defined(__NetBSD__)
if (!got_machineid) {
char buf[42];
size_t len = sizeof(buf);
if (sysctlbyname("machdep.dmi.system-uuid", buf, &len, nullptr, 0) == 0)
got_machineid = bootid_parse_uuid(&uuid, buf, len);
}
#endif /* __NetBSD__ */
#if _XOPEN_SOURCE_EXTENDED
if (!got_machineid) {
const int hostid = gethostid();
if (hostid > 0) {
bootid_collect(&uuid, &hostid, sizeof(hostid));
got_machineid = true;
}
}
#endif /* _XOPEN_SOURCE_EXTENDED */
if (!got_machineid) {
lack:
uuid.x = uuid.y = 0;
return uuid;
}
/*--------------------------------------------------------------------------*/
#if defined(CTL_KERN) && defined(KERN_BOOTTIME)
if (!got_boottime) {
static const int mib[] = {CTL_KERN, KERN_BOOTTIME};
struct timeval boottime;
size_t len = sizeof(boottime);
if (sysctl(
#ifdef SYSCTL_LEGACY_NONCONST_MIB
(int *)
#endif
mib,
ARRAY_LENGTH(mib), &boottime, &len, nullptr, 0) == 0 &&
len == sizeof(boottime) && boottime.tv_sec) {
bootid_collect(&uuid, &boottime, len);
got_boottime = true;
}
}
#endif /* CTL_KERN && KERN_BOOTTIME */
#if defined(__sun) || defined(__SVR4) || defined(__svr4__)
if (!got_boottime) {
kstat_ctl_t *kc = kstat_open();
if (kc) {
kstat_t *kp = kstat_lookup(kc, "unix", 0, "system_misc");
if (kp && kstat_read(kc, kp, 0) != -1) {
kstat_named_t *kn = (kstat_named_t *)kstat_data_lookup(kp, "boot_time");
if (kn) {
switch (kn->data_type) {
case KSTAT_DATA_INT32:
case KSTAT_DATA_UINT32:
bootid_collect(&uuid, &kn->value, sizeof(int32_t));
got_boottime = true;
case KSTAT_DATA_INT64:
case KSTAT_DATA_UINT64:
bootid_collect(&uuid, &kn->value, sizeof(int64_t));
got_boottime = true;
}
}
}
kstat_close(kc);
}
}
#endif /* SunOS / Solaris */
#if _XOPEN_SOURCE_EXTENDED && defined(BOOT_TIME)
if (!got_boottime) {
setutxent();
const struct utmpx id = {.ut_type = BOOT_TIME};
const struct utmpx *entry = getutxid(&id);
if (entry) {
bootid_collect(&uuid, entry, sizeof(*entry));
got_boottime = true;
while (unlikely((entry = getutxid(&id)) != nullptr)) {
/* have multiple reboot records, assuming we can distinguish next
* bootsession even if RTC is wrong or absent */
bootid_collect(&uuid, entry, sizeof(*entry));
got_bootseq = true;
}
}
endutxent();
}
#endif /* _XOPEN_SOURCE_EXTENDED && BOOT_TIME */
if (!got_bootseq) {
if (!got_boottime || !MDBX_TRUST_RTC)
goto lack;
#if defined(_WIN32) || defined(_WIN64)
FILETIME now;
GetSystemTimeAsFileTime(&now);
if (0x1CCCCCC > now.dwHighDateTime)
#else
struct timespec mono, real;
if (clock_gettime(CLOCK_MONOTONIC, &mono) ||
clock_gettime(CLOCK_REALTIME, &real) ||
/* wrong time, RTC is mad or absent */
1555555555l > real.tv_sec ||
/* seems no adjustment by RTC/NTP, i.e. a fake time */
real.tv_sec < mono.tv_sec || 1234567890l > real.tv_sec - mono.tv_sec ||
(real.tv_sec - mono.tv_sec) % 900u == 0)
#endif
goto lack;
}
return uuid;
}
__cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages,
intptr_t *avail_pages) {
if (!page_size && !total_pages && !avail_pages)
return LOG_IFERR(MDBX_EINVAL);
if (total_pages)
*total_pages = -1;
if (avail_pages)
*avail_pages = -1;
const intptr_t pagesize = globals.sys_pagesize;
if (page_size)
*page_size = pagesize;
if (unlikely(pagesize < MDBX_MIN_PAGESIZE || !is_powerof2(pagesize)))
return LOG_IFERR(MDBX_INCOMPATIBLE);
MDBX_MAYBE_UNUSED const int log2page = log2n_powerof2(pagesize);
assert(pagesize == (INT64_C(1) << log2page));
(void)log2page;
#if defined(_WIN32) || defined(_WIN64)
MEMORYSTATUSEX info;
memset(&info, 0, sizeof(info));
info.dwLength = sizeof(info);
if (!GlobalMemoryStatusEx(&info))
return LOG_IFERR((int)GetLastError());
#endif
if (total_pages) {
#if defined(_WIN32) || defined(_WIN64)
const intptr_t total_ram_pages = (intptr_t)(info.ullTotalPhys >> log2page);
#elif defined(_SC_PHYS_PAGES)
const intptr_t total_ram_pages = sysconf(_SC_PHYS_PAGES);
if (total_ram_pages == -1)
return LOG_IFERR(errno);
#elif defined(_SC_AIX_REALMEM)
const intptr_t total_ram_Kb = sysconf(_SC_AIX_REALMEM);
if (total_ram_Kb == -1)
return LOG_IFERR(errno);
const intptr_t total_ram_pages = (total_ram_Kb << 10) >> log2page;
#elif defined(HW_USERMEM) || defined(HW_PHYSMEM64) || defined(HW_MEMSIZE) || \
defined(HW_PHYSMEM)
size_t ram, len = sizeof(ram);
static const int mib[] = {CTL_HW,
#if defined(HW_USERMEM)
HW_USERMEM
#elif defined(HW_PHYSMEM64)
HW_PHYSMEM64
#elif defined(HW_MEMSIZE)
HW_MEMSIZE
#else
HW_PHYSMEM
#endif
};
if (sysctl(
#ifdef SYSCTL_LEGACY_NONCONST_MIB
(int *)
#endif
mib,
ARRAY_LENGTH(mib), &ram, &len, nullptr, 0) != 0)
return LOG_IFERR(errno);
if (len != sizeof(ram))
return LOG_IFERR(MDBX_ENOSYS);
const intptr_t total_ram_pages = (intptr_t)(ram >> log2page);
#else
#error "FIXME: Get User-accessible or physical RAM"
#endif
*total_pages = total_ram_pages;
if (total_ram_pages < 1)
return LOG_IFERR(MDBX_ENOSYS);
}
if (avail_pages) {
#if defined(_WIN32) || defined(_WIN64)
const intptr_t avail_ram_pages = (intptr_t)(info.ullAvailPhys >> log2page);
#elif defined(_SC_AVPHYS_PAGES)
const intptr_t avail_ram_pages = sysconf(_SC_AVPHYS_PAGES);
if (avail_ram_pages == -1)
return LOG_IFERR(errno);
#elif defined(__MACH__)
mach_msg_type_number_t count = HOST_VM_INFO_COUNT;
vm_statistics_data_t vmstat;
mach_port_t mport = mach_host_self();
kern_return_t kerr = host_statistics(mach_host_self(), HOST_VM_INFO,
(host_info_t)&vmstat, &count);
mach_port_deallocate(mach_task_self(), mport);
if (unlikely(kerr != KERN_SUCCESS))
return LOG_IFERR(MDBX_ENOSYS);
const intptr_t avail_ram_pages = vmstat.free_count;
#elif defined(VM_TOTAL) || defined(VM_METER)
struct vmtotal info;
size_t len = sizeof(info);
static const int mib[] = {CTL_VM,
#if defined(VM_TOTAL)
VM_TOTAL
#elif defined(VM_METER)
VM_METER
#endif
};
if (sysctl(
#ifdef SYSCTL_LEGACY_NONCONST_MIB
(int *)
#endif
mib,
ARRAY_LENGTH(mib), &info, &len, nullptr, 0) != 0)
return LOG_IFERR(errno);
if (len != sizeof(info))
return LOG_IFERR(MDBX_ENOSYS);
const intptr_t avail_ram_pages = info.t_free;
#else
#error "FIXME: Get Available RAM"
#endif
*avail_pages = avail_ram_pages;
if (avail_ram_pages < 1)
return LOG_IFERR(MDBX_ENOSYS);
}
return MDBX_SUCCESS;
}
/*----------------------------------------------------------------------------*/
#ifdef __FreeBSD__
#include <sys/uuid.h>
#endif /* FreeBSD */
#ifdef __IPHONE_OS_VERSION_MIN_REQUIRED
#include <CommonCrypto/CommonRandom.h>
#elif __GLIBC_PREREQ(2, 25) || defined(__FreeBSD__) || defined(__NetBSD__) || \
defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \
defined(__APPLE__) || __has_include(<sys/random.h>)
#include <sys/random.h>
#endif /* sys/random.h */
#if defined(_WIN32) || defined(_WIN64)
#include <wincrypt.h>
#endif /* Windows */
MDBX_INTERNAL bin128_t osal_guid(const MDBX_env *env) {
struct {
uint64_t begin, end, cputime;
uintptr_t thread, pid;
const void *x, *y;
bin128_t (*z)(const MDBX_env *env);
} salt;
salt.begin = osal_monotime();
bin128_t uuid = {{0, 0}};
#if defined(__linux__) || defined(__gnu_linux__)
if (proc_read_uuid("/proc/sys/kernel/random/uuid", &uuid) && check_uuid(uuid))
return uuid;
#endif /* Linux */
#ifdef __FreeBSD__
STATIC_ASSERT(sizeof(uuid) == sizeof(struct uuid));
if (uuidgen((struct uuid *)&uuid, 1) == 0 && check_uuid(uuid))
return uuid;
#endif /* FreeBSD */
#if defined(_WIN32) || defined(_WIN64)
if (imports.CoCreateGuid && imports.CoCreateGuid(&uuid) == 0 &&
check_uuid(uuid))
return uuid;
HCRYPTPROV hCryptProv = 0;
if (CryptAcquireContextW(&hCryptProv, nullptr, nullptr, PROV_RSA_FULL,
CRYPT_VERIFYCONTEXT | CRYPT_SILENT)) {
const BOOL ok =
CryptGenRandom(hCryptProv, sizeof(uuid), (unsigned char *)&uuid);
CryptReleaseContext(hCryptProv, 0);
if (ok && check_uuid(uuid))
return uuid;
}
#elif defined(__IPHONE_OS_VERSION_MIN_REQUIRED) && defined(__IPHONE_8_0)
#if __IPHONE_OS_VERSION_MIN_REQUIRED >= __IPHONE_8_0
if (CCRandomGenerateBytes(&uuid, sizeof(uuid)) == kCCSuccess &&
check_uuid(uuid))
return uuid;
#endif /* iOS >= 8.x */
#else
const int fd = open("/dev/urandom", O_RDONLY);
if (fd != -1) {
const ssize_t len = read(fd, &uuid, sizeof(uuid));
const int err = close(fd);
assert(err == 0);
(void)err;
if (len == sizeof(uuid) && check_uuid(uuid))
return uuid;
}
#if (__GLIBC_PREREQ(2, 25) || defined(__FreeBSD__) || defined(__NetBSD__) || \
defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__)) && \
!defined(__APPLE__) && !defined(__ANDROID_API__)
if (getrandom(&uuid, sizeof(uuid), 0) == sizeof(uuid) && check_uuid(uuid))
return uuid;
#elif defined(__OpenBSD__) || (defined(__sun) && defined(__SVR4)) || \
(defined(__MAC_OS_X_VERSION_MIN_REQUIRED) && \
__MAC_OS_X_VERSION_MIN_REQUIRED >= 101200)
if (getentropy(&uuid, sizeof(uuid)) == 0 && check_uuid(uuid))
return uuid;
#endif /* getrandom() / getentropy() */
#endif /* !Windows */
uuid = globals.bootid;
bootid_collect(&uuid, env, sizeof(*env));
salt.thread = osal_thread_self();
salt.pid = osal_getpid();
salt.x = &salt;
salt.y = env;
salt.z = &osal_guid;
do {
salt.cputime = osal_cputime(nullptr);
salt.end = osal_monotime();
bootid_collect(&uuid, &salt, sizeof(salt));
} while (!check_uuid(uuid));
return uuid;
}
/*--------------------------------------------------------------------------*/
void osal_ctor(void) {
#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX)
osal_iov_max = sysconf(_SC_IOV_MAX);
if (RUNNING_ON_VALGRIND && osal_iov_max > 64)
/* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */
osal_iov_max = 64;
#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */
#if defined(_WIN32) || defined(_WIN64)
SYSTEM_INFO si;
GetSystemInfo(&si);
globals.sys_pagesize = si.dwPageSize;
globals.sys_allocation_granularity = si.dwAllocationGranularity;
#else
globals.sys_pagesize = sysconf(_SC_PAGE_SIZE);
globals.sys_allocation_granularity = (MDBX_WORDBITS > 32) ? 65536 : 4096;
globals.sys_allocation_granularity =
(globals.sys_allocation_granularity > globals.sys_pagesize)
? globals.sys_allocation_granularity
: globals.sys_pagesize;
#endif
assert(globals.sys_pagesize > 0 &&
(globals.sys_pagesize & (globals.sys_pagesize - 1)) == 0);
assert(globals.sys_allocation_granularity >= globals.sys_pagesize &&
globals.sys_allocation_granularity % globals.sys_pagesize == 0);
globals.sys_pagesize_ln2 = log2n_powerof2(globals.sys_pagesize);
#if defined(__linux__) || defined(__gnu_linux__)
posix_clockid = choice_monoclock();
#endif
#if defined(_WIN32) || defined(_WIN64)
QueryPerformanceFrequency(&performance_frequency);
#elif defined(__APPLE__) || defined(__MACH__)
mach_timebase_info_data_t ti;
mach_timebase_info(&ti);
ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer;
#endif
monotime_limit = osal_16dot16_to_monotime(UINT32_MAX - 1);
uint32_t proba = UINT32_MAX;
while (true) {
unsigned time_conversion_checkup =
osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba));
unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba;
unsigned one_less = (proba > 0) ? proba - 1 : proba;
ENSURE(nullptr, time_conversion_checkup >= one_less &&
time_conversion_checkup <= one_more);
if (proba == 0)
break;
proba >>= 1;
}
globals.bootid = osal_bootid();
}
void osal_dtor(void) {}