From 924e81ed92e651bd5b4bd452920704c2be6884a4 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 24 May 2017 13:59:50 +0300 Subject: [PATCH] mdbx: refine includes, drop midl.h and mdbx_osal.h --- Makefile | 35 ++- dll.vcxproj | 3 +- libmdbx.files | 2 - mdbx.h | 108 ++++++++- mdbx_osal.h | 131 ----------- src/bits.h | 525 +++++++++++++++++++++++++++++++++---------- src/defs.h | 72 +++++- src/mdbx.c | 338 +--------------------------- src/midl.h | 38 ---- src/osal.h | 114 ++++++---- src/tools/mdbx_chk.c | 1 - test/test.vcxproj | 1 + 12 files changed, 672 insertions(+), 696 deletions(-) delete mode 100644 mdbx_osal.h delete mode 100644 src/midl.h diff --git a/Makefile b/Makefile index e1fd41f4..a8bae342 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ XCFLAGS ?= -DNDEBUG=1 -DMDBX_DEBUG=0 -DLIBMDBX_EXPORTS=1 CFLAGS ?= -O2 -g3 -Wall -Werror -Wextra -ffunction-sections -fPIC -fvisibility=hidden CFLAGS += -D_GNU_SOURCE=1 -std=gnu99 -pthread $(XCFLAGS) CXXFLAGS = -std=c++11 $(filter-out -std=gnu99,$(CFLAGS)) -TESTDB ?= /tmp/mdbx-check.db +TESTDB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-check.db # LY: '--no-as-needed,-lrt' for ability to built with modern glibc, but then run with the old LDFLAGS ?= -Wl,--gc-sections,-z,relro,-O,--no-as-needed,-lrt @@ -44,6 +44,13 @@ TOOLS := mdbx_stat mdbx_copy mdbx_dump mdbx_load mdbx_chk MANPAGES := mdbx_stat.1 mdbx_copy.1 mdbx_dump.1 mdbx_load.1 SHELL := /bin/bash +CORE_SRC := $(filter-out src/lck-windows.c, $(wildcard src/*.c)) +CORE_INC := $(wildcard src/*.h) +CORE_OBJ := $(patsubst %.c,%.o,$(CORE_SRC)) +TEST_SRC := $(filter-out test/osal-windows.cc, $(wildcard test/*.cc)) +TEST_INC := $(wildcard test/*.h) +TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) + .PHONY: mdbx all install clean check coverage all: $(LIBRARIES) $(TOOLS) test/test @@ -63,27 +70,35 @@ install: $(LIBRARIES) $(TOOLS) $(HEADERS) && cp -t $(SANDBOX)$(mandir)/man1 $(MANPAGES) clean: - rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* 
*.gcov *.log *.err + rm -rf $(TOOLS) test/test @* *.[ao] *.[ls]o *~ tmp.db/* *.gcov *.log *.err src/*.o test/*.o check: test/test rm -f $(TESTDB) && (set -o pipefail; test/test --pathname=$(TESTDB) --dont-cleanup-after basic | tee test.log | tail -n 42) && ./mdbx_chk -vn $(TESTDB) -src/%.o: src/%.c mdbx.h mdbx_osal.h $(addprefix src/, defs.h bits.h osal.h midl.h) Makefile - $(CC) $(CFLAGS) -c $(filter %.c, $^) -o $@ +define core-rule +$(patsubst %.c,%.o,$(1)): $(1) $(CORE_INC) mdbx.h Makefile + $(CC) $(CFLAGS) -c $(1) -o $$@ -libmdbx.a: $(addprefix src/, mdbx.o osal.o lck-posix.o version.o) +endef +$(foreach file,$(CORE_SRC),$(eval $(call core-rule,$(file)))) + +define test-rule +$(patsubst %.cc,%.o,$(1)): $(1) $(TEST_INC) mdbx.h Makefile + $(CXX) $(CXXFLAGS) -c $(1) -o $$@ + +endef +$(foreach file,$(TEST_SRC),$(eval $(call test-rule,$(file)))) + +libmdbx.a: $(CORE_OBJ) $(AR) rs $@ $? -libmdbx.so: libmdbx.a +libmdbx.so: $(CORE_OBJ) $(CC) $(CFLAGS) -save-temps $^ -pthread -shared $(LDFLAGS) -o $@ mdbx_%: src/tools/mdbx_%.c libmdbx.a $(CC) $(CFLAGS) $^ $(LDFLAGS) -o $@ -test/%.o: test/%.cc $(wildcard test/*.h) Makefile - $(CXX) $(CXXFLAGS) -Isrc -c $(filter %.cc, $^) -o $@ - -test/test: $(patsubst %.cc,%.o,$(filter-out test/osal-windows.cc, $(wildcard test/*.cc))) libmdbx.a +test/test: $(TEST_OBJ) libmdbx.a $(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@ ifneq ($(wildcard $(IOARENA)),) diff --git a/dll.vcxproj b/dll.vcxproj index 8c179768..44c71646 100644 --- a/dll.vcxproj +++ b/dll.vcxproj @@ -146,14 +146,13 @@ + - - diff --git a/libmdbx.files b/libmdbx.files index 63bee3c9..3f51a9b5 100644 --- a/libmdbx.files +++ b/libmdbx.files @@ -3,13 +3,11 @@ LICENSE Makefile README.md mdbx.h -mdbx_osal.h src/bits.h src/defs.h src/lck-posix.c src/lck-windows.c src/mdbx.c -src/midl.h src/osal.c src/osal.h src/tools/mdbx_chk.c diff --git a/mdbx.h b/mdbx.h index eb2f4cfd..44802f30 100644 --- a/mdbx.h +++ b/mdbx.h @@ -49,14 +49,108 @@ #ifndef LIBMDBX_H #define LIBMDBX_H 
+/*--------------------------------------------------------------------------*/ + +#ifdef _MSC_VER +#pragma warning(push, 1) +#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ + * semantics are not enabled. Specify /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ + * mode specified; termination on exception is \ + * not guaranteed. Specify /EHsc */ +#endif /* _MSC_VER (warnings) */ + +#include +#include +#include + +#if defined(_WIN32) || defined(_WIN64) + +#include +#include +typedef unsigned mode_t; +typedef HANDLE mdbx_filehandle_t; +typedef DWORD mdbx_pid_t; +typedef DWORD mdbx_tid_t; +#define MDBX_ENODATA ERROR_HANDLE_EOF +#define MDBX_EINVAL ERROR_INVALID_PARAMETER +#define MDBX_EACCESS ERROR_ACCESS_DENIED +#define MDBX_ENOMEM ERROR_OUTOFMEMORY +#define MDBX_EROFS ERROR_FILE_READ_ONLY +#define MDBX_ENOSYS ERROR_NOT_SUPPORTED +#define MDBX_EIO ERROR_WRITE_FAULT + +#else + +#include <errno.h> /* for error codes */ +#include <pthread.h> /* for pthread_t */ +#include <sys/types.h> /* for pid_t */ +#include <sys/uio.h> /* for struct iovec */ +#define HAVE_STRUCT_IOVEC 1 +typedef int mdbx_filehandle_t; +typedef pid_t mdbx_pid_t; +typedef pthread_t mdbx_tid_t; +#define MDBX_ENODATA ENODATA +#define MDBX_EINVAL EINVAL +#define MDBX_EACCESS EACCES +#define MDBX_ENOMEM ENOMEM +#define MDBX_EROFS EROFS +#define MDBX_ENOSYS ENOSYS +#define MDBX_EIO EIO +#endif + +#ifdef _MSC_VER +#pragma warning(pop) +#endif + +/*--------------------------------------------------------------------------*/ + +#ifndef __has_attribute +#define __has_attribute(x) (0) +#endif + +#ifndef __dll_export +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllexport) +#define __dll_export __attribute__((dllexport)) +#elif defined(_MSC_VER) +#define __dll_export __declspec(dllexport) +#else +#define __dll_export +#endif +#elif defined(__GNUC__) || __has_attribute(visibility) +#define __dll_export __attribute__((visibility("default"))) +#else 
+#define __dll_export +#endif +#endif /* __dll_export */ + +#ifndef __dll_import +#if defined(_WIN32) || defined(__CYGWIN__) +#if defined(__GNUC__) || __has_attribute(dllimport) +#define __dll_import __attribute__((dllimport)) +#elif defined(_MSC_VER) +#define __dll_import __declspec(dllimport) +#else +#define __dll_import +#endif +#else +#define __dll_import +#endif +#endif /* __dll_import */ + +/*--------------------------------------------------------------------------*/ + #define MDBX_VERSION_MAJOR 0 #define MDBX_VERSION_MINOR 0 -#ifdef _MSC_VER -#pragma warning(push) -#endif - -#include "mdbx_osal.h" +#if defined(LIBMDBX_EXPORTS) +#define LIBMDBX_API __dll_export +#elif defined(LIBMDBX_IMPORTS) +#define LIBMDBX_API __dll_import +#else +#define LIBMDBX_API +#endif /* LIBMDBX_API */ #ifdef __cplusplus extern "C" { @@ -1511,8 +1605,4 @@ LIBMDBX_API int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, } #endif -#ifdef _MSC_VER -#pragma warning(pop) -#endif - #endif /* LIBMDBX_H */ diff --git a/mdbx_osal.h b/mdbx_osal.h deleted file mode 100644 index 10237c5f..00000000 --- a/mdbx_osal.h +++ /dev/null @@ -1,131 +0,0 @@ -/* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ - -/* - * Copyright 2015-2017 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . 
- */ - -#pragma once - -#ifndef __has_attribute -#define __has_attribute(x) (0) -#endif - -#ifndef __dll_export -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllexport) -#define __dll_export __attribute__((dllexport)) -#elif defined(_MSC_VER) -#define __dll_export __declspec(dllexport) -#else -#define __dll_export -#endif -#elif defined(__GNUC__) || __has_attribute(visibility) -#define __dll_export __attribute__((visibility("default"))) -#else -#define __dll_export -#endif -#endif /* __dll_export */ - -#ifndef __dll_import -#if defined(_WIN32) || defined(__CYGWIN__) -#if defined(__GNUC__) || __has_attribute(dllimport) -#define __dll_import __attribute__((dllimport)) -#elif defined(_MSC_VER) -#define __dll_import __declspec(dllimport) -#else -#define __dll_import -#endif -#else -#define __dll_import -#endif -#endif /* __dll_import */ - -/*--------------------------------------------------------------------------*/ - -#ifdef _MSC_VER -#pragma warning(push) -#pragma warning(disable : 4514) /* 'xyz': unreferenced inline function \ - has been removed */ -#pragma warning(disable : 4710) /* 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* function 'xyz' selected for \ - automatic inline expansion */ -#pragma warning(disable : 4061) /* enumerator 'abc' in switch of enum \ - 'xyz' is not explicitly handled by a case \ - label */ -#pragma warning(disable : 4201) /* nonstandard extension used : \ - nameless struct / union */ -#pragma warning(disable : 4127) /* conditional expression is constant \ - */ - -#pragma warning(push, 1) -#pragma warning(disable : 4530) /* C++ exception handler used, but \ - unwind semantics are not enabled. Specify \ - /EHsc */ -#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ - handling mode specified; termination on \ - exception is not guaranteed. 
Specify /EHsc \ - */ -#endif /* _MSC_VER (warnings) */ - -#include -#include -#include - -#if defined(_WIN32) || defined(_WIN64) - -#include -#include -typedef unsigned mode_t; -typedef HANDLE mdbx_filehandle_t; -typedef DWORD mdbx_pid_t; -typedef DWORD mdbx_tid_t; -#define MDBX_ENODATA ERROR_HANDLE_EOF -#define MDBX_EINVAL ERROR_INVALID_PARAMETER -#define MDBX_EACCESS ERROR_ACCESS_DENIED -#define MDBX_ENOMEM ERROR_OUTOFMEMORY -#define MDBX_EROFS ERROR_FILE_READ_ONLY -#define MDBX_ENOSYS ERROR_NOT_SUPPORTED -#define MDBX_EIO ERROR_WRITE_FAULT - -#else - -#include /* for error codes */ -#include /* for pthread_t */ -#include /* for pid_t */ -#include /* for truct iovec */ -#define HAVE_STRUCT_IOVEC 1 -typedef int mdbx_filehandle_t; -typedef pid_t mdbx_pid_t; -typedef pthread_t mdbx_tid_t; -#define MDBX_ENODATA ENODATA -#define MDBX_EINVAL EINVAL -#define MDBX_EACCESS EACCES -#define MDBX_ENOMEM ENOMEM -#define MDBX_EROFS EROFS -#define MDBX_ENOSYS ENOSYS -#define MDBX_EIO EIO -#endif - -#ifdef _MSC_VER -#pragma warning(pop) -#endif - -/*--------------------------------------------------------------------------*/ - -#if defined(LIBMDBX_EXPORTS) -#define LIBMDBX_API __dll_export -#elif defined(LIBMDBX_IMPORTS) -#define LIBMDBX_API __dll_import -#else -#define LIBMDBX_API -#endif /* LIBMDBX_API */ diff --git a/src/bits.h b/src/bits.h index 897e1eb6..1ef6a799 100644 --- a/src/bits.h +++ b/src/bits.h @@ -15,60 +15,6 @@ /* *INDENT-OFF* */ /* clang-format off */ -#ifndef _FILE_OFFSET_BITS -# define _FILE_OFFSET_BITS 64 -#endif - -#if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_WARNINGS) -# define _CRT_SECURE_NO_WARNINGS -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4464) /* C4464: relative include path contains '..' 
*/ -#pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ -#pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ -#pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ -#pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ -#pragma warning(disable : 4127) /* C4127: conditional expression is constant */ -#endif /* _MSC_VER (warnings) */ - -#include "../mdbx.h" -#include "./defs.h" - -#if defined(USE_VALGRIND) -# include -# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE - /* LY: available since Valgrind 3.10 */ -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# endif -#else -# define VALGRIND_CREATE_MEMPOOL(h,r,z) -# define VALGRIND_DESTROY_MEMPOOL(h) -# define VALGRIND_MEMPOOL_TRIM(h,a,s) -# define VALGRIND_MEMPOOL_ALLOC(h,a,s) -# define VALGRIND_MEMPOOL_FREE(h,a) -# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) -# define VALGRIND_MAKE_MEM_NOACCESS(a,s) -# define VALGRIND_MAKE_MEM_DEFINED(a,s) -# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) -# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) -# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) -# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) -#endif /* USE_VALGRIND */ - -#ifdef __SANITIZE_ADDRESS__ -# include -#else -# define ASAN_POISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ - ((void)(addr), (void)(size)) -#endif /* __SANITIZE_ADDRESS__ */ - -#include "./osal.h" - #ifndef MDBX_DEBUG # define MDBX_DEBUG 0 #endif @@ -77,6 +23,14 @@ # undef NDEBUG #endif +/* Features under development */ +#ifndef MDBX_DEVEL +# define MDBX_DEVEL 0 +#endif + +#include "../mdbx.h" +#include "./defs.h" + #if defined(__GNUC__) && !__GNUC_PREREQ(4,2) /* Actualy libmdbx was not tested with 
compilers older than GCC from RHEL6. * But you could remove this #error and try to continue at your own risk. @@ -93,53 +47,66 @@ # warning "libmdbx required at least GLIBC 2.12." #endif -#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) -# define UNALIGNED_OK 1 /* TODO */ -#endif -#ifndef UNALIGNED_OK -# define UNALIGNED_OK 0 -#endif /* UNALIGNED_OK */ +#ifdef __SANITIZE_THREAD__ +# warning "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues." +#endif /* __SANITIZE_THREAD__ */ -#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF -# error "Sanity checking failed: Two's complement, reasonably sized integer types" -#endif +#ifdef _MSC_VER +#pragma warning(disable : 4464) /* C4464: relative include path contains '..' */ +#pragma warning(disable : 4710) /* C4710: 'xyz': function not inlined */ +#pragma warning(disable : 4711) /* C4711: function 'xyz' selected for automatic inline expansion */ +#pragma warning(disable : 4201) /* C4201: nonstandard extension used : nameless struct / union */ +#pragma warning(disable : 4706) /* C4706: assignment within conditional expression */ +#pragma warning(disable : 4127) /* C4127: conditional expression is constant */ +#endif /* _MSC_VER (warnings) */ -/*----------------------------------------------------------------------------*/ - -#ifndef ARRAY_LENGTH -# ifdef __cplusplus - template - char (&__ArraySizeHelper(T (&array)[N]))[N]; -# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) -# else -# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) -# endif -#endif /* ARRAY_LENGTH */ - -#ifndef ARRAY_END -# define ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) -#endif /* ARRAY_END */ - -#ifndef STRINGIFY -# define STRINGIFY_HELPER(x) #x -# define STRINGIFY(x) STRINGIFY_HELPER(x) -#endif /* STRINGIFY */ - -#ifndef offsetof -# define offsetof(type, member) __builtin_offsetof(type, member) -#endif /* offsetof */ - -#ifndef container_of -# 
define container_of(ptr, type, member) \ - ((type *)((char *)(ptr) - offsetof(type, member))) -#endif /* container_of */ +#include "./osal.h" /* *INDENT-ON* */ /* clang-format on */ -#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) - /*----------------------------------------------------------------------------*/ +/* Basic constants and types */ + +/* The maximum size of a database page. + * + * It is 32k or 64k, since value-PAGEBASE must fit in + * MDBX_page.mp_upper. + * + * MDBX will use database pages < OS pages if needed. + * That causes more I/O in write transactions: The OS must + * know (read) the whole page before writing a partial page. + * + * Note that we don't currently support Huge pages. On Linux, + * regular data files cannot use Huge pages, and in general + * Huge pages aren't actually pageable. We rely on the OS + * demand-pager to read our data and page it out when memory + * pressure from other processes is high. So until OSs have + * actual paging support for Huge pages, they're not viable. */ +#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) + +/* The minimum number of keys required in a database page. + * Setting this to a larger value will place a smaller bound on the + * maximum size of a data item. Data items larger than this size will + * be pushed into overflow pages instead of being stored directly in + * the B-tree node. This value used to default to 4. With a page size + * of 4096 bytes that meant that any item larger than 1024 bytes would + * go into an overflow page. That also meant that on average 2-3KB of + * each overflow page was wasted space. The value cannot be lower than + * 2 because then there would no longer be a tree structure. With this + * value, items larger than 2KB will go into overflow pages, and on + * average only 1KB will be wasted. */ +#define MDBX_MINKEYS 2 + +/* A stamp that identifies a file as an MDBX file. 
+ * There's nothing special about this value other than that it is easily + * recognizable, and it will reflect any byte order mismatches. */ +#define MDBX_MAGIC 0xBEEFC0DE + +/* The version number for a database's datafile format. */ +#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) +/* The version number for a database's lockfile format. */ +#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -162,32 +129,53 @@ typedef uint32_t pgno_t; typedef uint64_t txnid_t; #define PRIaTXN PRIi64 -/* An IDL is an ID List, a sorted array of IDs. The first - * element of the array is a counter for how many actual - * IDs are in the list. In the original back-bdb code, IDLs are - * sorted in ascending order. For libmdb IDLs are sorted in - * descending order. */ -typedef pgno_t *MDBX_IDL; - -/* An ID2 is an ID/pointer pair. */ -typedef struct MDBX_ID2 { - pgno_t mid; /* The ID */ - void *mptr; /* The pointer */ -} MDBX_ID2; - -/* An ID2L is an ID2 List, a sorted array of ID2s. - * The first element's mid member is a count of how many actual - * elements are in the array. The mptr member of the first element is - * unused. The array is sorted in ascending order by mid. */ -typedef MDBX_ID2 *MDBX_ID2L; - /* Used for offsets within a single page. * Since memory pages are typically 4 or 8KB in size, 12-13 bits, * this is plenty. */ typedef uint16_t indx_t; +/*----------------------------------------------------------------------------*/ +/* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) +/* Reader Lock Table + * + * Readers don't acquire any locks for their data access. Instead, they + * simply record their transaction ID in the reader table. The reader + * mutex is needed just to find an empty slot in the reader table. 
The + * slot's address is saved in thread-specific data so that subsequent + * read transactions started by the same thread need no further locking to + * proceed. + * + * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. + * No reader table is used if the database is on a read-only filesystem. + * + * Since the database uses multi-version concurrency control, readers don't + * actually need any locking. This table is used to keep track of which + * readers are using data from which old transactions, so that we'll know + * when a particular old transaction is no longer in use. Old transactions + * that have discarded any data pages can then have those pages reclaimed + * for use by a later write transaction. + * + * The lock table is constructed such that reader slots are aligned with the + * processor's cache line size. Any slot is only ever used by one thread. + * This alignment guarantees that there will be no contention or cache + * thrashing as threads update their own slot info, and also eliminates + * any need for locking when accessing a slot. + * + * A writer thread will scan every slot in the table to determine the oldest + * outstanding reader transaction. Any freed pages older than this will be + * reclaimed by the writer. The writer doesn't use any locks when scanning + * this table. This means that there's no guarantee that the writer will + * see the most up-to-date reader info, but that's not required for correct + * operation - all we need is to know the upper bound on the oldest reader, + * we don't care at all about the newest reader. So the only consequence of + * reading stale information here is that old pages might hang around a + * while longer before being reclaimed. That's actually good anyway, because + * the longer we delay reclaiming old pages, the more likely it is that a + * string of contiguous pages can be found after coalescing old pages from + * many old transactions together. 
*/ + /* The actual reader record, with cacheline padding. */ typedef struct MDBX_reader { /* Current Transaction ID when this transaction began, or (txnid_t)-1. @@ -343,6 +331,54 @@ typedef struct MDBX_lockinfo { } MDBX_lockinfo; #pragma pack(pop) +/*----------------------------------------------------------------------------*/ +/* Two kind lists of pages (aka IDL) */ + +/* An IDL is an ID List, a sorted array of IDs. The first + * element of the array is a counter for how many actual + * IDs are in the list. In the libmdbx IDLs are sorted in + * descending order. */ +typedef pgno_t *MDBX_IDL; + +/* An ID2 is an ID/pointer pair. */ +typedef struct MDBX_ID2 { + pgno_t mid; /* The ID */ + void *mptr; /* The pointer */ +} MDBX_ID2; + +/* An ID2L is an ID2 List, a sorted array of ID2s. + * The first element's mid member is a count of how many actual + * elements are in the array. The mptr member of the first element is + * unused. The array is sorted in ascending order by mid. */ +typedef MDBX_ID2 *MDBX_ID2L; + +/* IDL sizes - likely should be even bigger + * limiting factors: sizeof(pgno_t), thread stack size */ +#define MDBX_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ +#define MDBX_IDL_DB_SIZE (1 << MDBX_IDL_LOGN) +#define MDBX_IDL_UM_SIZE (1 << (MDBX_IDL_LOGN + 1)) + +#define MDBX_IDL_DB_MAX (MDBX_IDL_DB_SIZE - 1) +#define MDBX_IDL_UM_MAX (MDBX_IDL_UM_SIZE - 1) + +#define MDBX_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) +#define MDBX_IDL_IS_ZERO(ids) ((ids)[0] == 0) +#define MDBX_IDL_CPY(dst, src) (memcpy(dst, src, MDBX_IDL_SIZEOF(src))) +#define MDBX_IDL_FIRST(ids) ((ids)[1]) +#define MDBX_IDL_LAST(ids) ((ids)[(ids)[0]]) + +/* Current max length of an mdbx_midl_alloc()ed IDL */ +#define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1]) + +/* Append ID to IDL. The IDL must be big enough. 
*/ +#define mdbx_midl_xappend(idl, id) \ + do { \ + pgno_t *xidl = (idl), xlen = ++(xidl[0]); \ + xidl[xlen] = (id); \ + } while (0) + +/*----------------------------------------------------------------------------*/ +/* Internal structures */ /* Auxiliary DB info. * The information here is mostly static/read-only. There is @@ -598,6 +634,7 @@ typedef struct MDBX_ntxn { } MDBX_ntxn; /*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ extern int mdbx_runtime_flags; extern MDBX_debug_func *mdbx_debug_logger; @@ -638,8 +675,6 @@ void mdbx_panic(const char *fmt, ...) #define mdbx_print(fmt, ...) \ mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) -/*----------------------------------------------------------------------------*/ - #define mdbx_trace(fmt, ...) \ do { \ if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ @@ -689,8 +724,6 @@ void mdbx_panic(const char *fmt, ...) fmt "\n", ##__VA_ARGS__); \ } while (0) -/*----------------------------------------------------------------------------*/ - #define mdbx_debug(fmt, ...) \ do { \ if (mdbx_debug_enabled(MDBX_DBG_TRACE)) \ @@ -738,8 +771,6 @@ void mdbx_panic(const char *fmt, ...) 
/* assert(3) variant in transaction context */ #define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) -/*----------------------------------------------------------------------------*/ - static __inline void mdbx_jitter4testing(bool tiny) { #ifndef NDEBUG mdbx_osal_jitter(tiny); @@ -748,6 +779,9 @@ static __inline void mdbx_jitter4testing(bool tiny) { #endif } +/*----------------------------------------------------------------------------*/ +/* Internal prototypes and inlines */ + int mdbx_reader_check0(MDBX_env *env, int rlocked, int *dead); #define METAPAGE_1(env) (&((MDBX_metabuf *)(env)->me_map)->mb_metabuf.mm_meta) @@ -782,3 +816,256 @@ static __inline size_t roundup2(size_t value, size_t granularity) { #define MDBX_IS_ERROR(rc) \ ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE) + +/* Internal error codes, not exposed outside libmdbx */ +#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) + +/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ +#define DDBI(mc) \ + (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) + +/* Key size which fits in a DKBUF. */ +#define DKBUF_MAXKEYSIZE 511 /* FIXME */ + +#if MDBX_DEBUG +#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] +#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) +#define DVAL(x) \ + mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) +#else +#define DKBUF ((void)(0)) +#define DKEY(x) ("-") +#define DVAL(x) ("-") +#endif + +/* An invalid page number. + * Mainly used to denote an empty tree. */ +#define P_INVALID (~(pgno_t)0) + +/* Test if the flags f are set in a flag word w. */ +#define F_ISSET(w, f) (((w) & (f)) == (f)) + +/* Round n up to an even number. */ +#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ + +/* Default size of memory map. + * This is certainly too small for any actual applications. Apps should + * always set the size explicitly using mdbx_env_set_mapsize(). 
*/ +#define DEFAULT_MAPSIZE 1048576 + +/* Number of slots in the reader table. + * This value was chosen somewhat arbitrarily. The 61 is a prime number, + * and such readers plus a couple mutexes fit into single 4KB page. + * Applications should set the table size using mdbx_env_set_maxreaders(). */ +#define DEFAULT_READERS 61 + +/* Address of first usable data byte in a page, after the header */ +#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) + +/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ +#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0) + +/* Number of nodes on a page */ +#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) + +/* The amount of space remaining in the page */ +#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) + +/* The percentage of space used in the page, in tenths of a percent. */ +#define PAGEFILL(env, p) \ + (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ + ((env)->me_psize - PAGEHDRSZ)) +/* The minimum page fill factor, in tenths of a percent. + * Pages emptier than this are candidates for merging. */ +#define FILL_THRESHOLD 250 + +/* Test if a page is a leaf page */ +#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) +/* Test if a page is a LEAF2 page */ +#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) +/* Test if a page is a branch page */ +#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) +/* Test if a page is an overflow page */ +#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) +/* Test if a page is a sub page */ +#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) + +/* The number of overflow pages needed to store the given size. */ +#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) + +/* Link in MDBX_txn.mt_loose_pages list. + * Kept outside the page header, which is needed when reusing the page. */ +#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) + +/* Header for a single key/data pair within a page. 
+ * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. + * We guarantee 2-byte alignment for 'MDBX_node's. + * + * mn_lo and mn_hi are used for data size on leaf nodes, and for child + * pgno on branch nodes. On 64 bit platforms, mn_flags is also used + * for pgno. (Branch nodes have no flags). Lo and hi are in host byte + * order in case some accesses can be optimized to 32-bit word access. + * + * Leaf node flags describe node contents. F_BIGDATA says the node's + * data part is the page number of an overflow page with actual data. + * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in + * a sub-page/sub-database, and named databases (just F_SUBDATA). */ +typedef struct MDBX_node { + union { + struct { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + union { + struct { + uint16_t mn_lo, mn_hi; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; + uint16_t mn_flags; /* see mdbx_node */ + uint16_t mn_ksize; /* key size */ +#else + uint16_t mn_ksize; /* key size */ + uint16_t mn_flags; /* see mdbx_node */ + union { + struct { + uint16_t mn_hi, mn_lo; /* part of data size or pgno */ + }; + uint32_t mn_dsize; + }; +#endif + }; + pgno_t mn_ksize_and_pgno; + }; + +/* mdbx_node Flags */ +#define F_BIGDATA 0x01 /* data put on overflow page */ +#define F_SUBDATA 0x02 /* data is a sub-database */ +#define F_DUPDATA 0x04 /* data has duplicates */ + +/* valid flags for mdbx_node_add() */ +#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) + uint8_t mn_data[1]; /* key and data are appended here */ +} MDBX_node; + +/* Size of the node header, excluding dynamic data at the end */ +#define NODESIZE offsetof(MDBX_node, mn_data) + +/* Bit position of top word in page number, for shifting mn_flags */ +#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) + +/* Size of a node in a branch page with a given key. + * This is just the node header plus the key, there is no data. */ +#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 
0 : (k)->iov_len)) + +/* Size of a node in a leaf page with a given key and data. + * This is node header plus key plus data size. */ +#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) + +/* Address of node i in page p */ +static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { + assert(NUMKEYS(p) > (unsigned)(i)); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); +} + +/* Address of the key for the node */ +#define NODEKEY(node) (void *)((node)->mn_data) + +/* Address of the data for a node */ +#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) + +/* Get the page number pointed to by a branch node (inverse of SETPGNO: mn_lo holds bits 0..15, mn_hi holds bits 16..31) */ +static __inline pgno_t NODEPGNO(const MDBX_node *node) { + pgno_t pgno; + if (UNALIGNED_OK) { + pgno = node->mn_ksize_and_pgno; + if (sizeof(pgno_t) > 4) + pgno &= UINT64_C(0xffffFFFFffff); + } else { + pgno = node->mn_lo | ((pgno_t)node->mn_hi << 16); + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_flags) << 32; + } + return pgno; +} + +/* Set the page number in a branch node */ +static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { + assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff)); + + if (UNALIGNED_OK) { + if (sizeof(pgno_t) > 4) + pgno |= ((uint64_t)node->mn_ksize) << 48; + node->mn_ksize_and_pgno = pgno; + } else { + node->mn_lo = (uint16_t)pgno; + node->mn_hi = (uint16_t)(pgno >> 16); + if (sizeof(pgno_t) > 4) + node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); + } +} + +/* Get the size of the data in a leaf node */ +static __inline size_t NODEDSZ(const MDBX_node *node) { + size_t size; + if (UNALIGNED_OK) { + size = node->mn_dsize; + } else { + size = node->mn_lo | ((size_t)node->mn_hi << 16); + } + return size; +} + +/* Set the size of the data for a leaf node */ +static __inline void SETDSZ(MDBX_node *node, unsigned size) { + if (UNALIGNED_OK) { + node->mn_dsize = size; + } else { + node->mn_lo = (uint16_t)size; + node->mn_hi = (uint16_t)(size >> 16); + } +} + +/* The size of a 
key in a node */ +#define NODEKSZ(node) ((node)->mn_ksize) + +/* The address of a key in a LEAF2 page. + * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. + * There are no node headers, keys are stored contiguously. */ +#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) + +/* Set the node's key into keyptr, if requested. */ +#define MDBX_GET_KEY(node, keyptr) \ + do { \ + if ((keyptr) != NULL) { \ + (keyptr)->iov_len = NODEKSZ(node); \ + (keyptr)->iov_base = NODEKEY(node); \ + } \ + } while (0) + +/* Set the node's key into key. */ +#define MDBX_GET_KEY2(node, key) \ + do { \ + key.iov_len = NODEKSZ(node); \ + key.iov_base = NODEKEY(node); \ + } while (0) + +#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ +#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) +/* mdbx_dbi_open() flags */ +#define VALID_FLAGS \ + (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ + MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE) + +/* max number of pages to commit in one writev() call */ +#define MDBX_COMMIT_PAGES 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ +#undef MDBX_COMMIT_PAGES +#define MDBX_COMMIT_PAGES IOV_MAX +#endif + +/* Check txn and dbi arguments to a function */ +#define TXN_DBI_EXIST(txn, dbi, validity) \ + ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) + +/* Check for misused dbi handles */ +#define TXN_DBI_CHANGED(txn, dbi) \ + ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) diff --git a/src/defs.h b/src/defs.h index 6cae9714..e4a4c49f 100644 --- a/src/defs.h +++ b/src/defs.h @@ -290,8 +290,6 @@ # define __noop(...) 
__do_noop(0, __VA_ARGS__) #endif /* __noop */ -/*----------------------------------------------------------------------------*/ - /* Wrapper around __func__, which is a C99 feature */ #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L # define mdbx_func_ __func__ @@ -301,8 +299,74 @@ # define mdbx_func_ "" #endif -/* *INDENT-ON* */ -/* clang-format on */ +/*----------------------------------------------------------------------------*/ + +#if defined(USE_VALGRIND) +# include <valgrind/memcheck.h> +# ifndef VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE + /* LY: available since Valgrind 3.10 */ +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# endif +#else +# define VALGRIND_CREATE_MEMPOOL(h,r,z) +# define VALGRIND_DESTROY_MEMPOOL(h) +# define VALGRIND_MEMPOOL_TRIM(h,a,s) +# define VALGRIND_MEMPOOL_ALLOC(h,a,s) +# define VALGRIND_MEMPOOL_FREE(h,a) +# define VALGRIND_MEMPOOL_CHANGE(h,a,b,s) +# define VALGRIND_MAKE_MEM_NOACCESS(a,s) +# define VALGRIND_MAKE_MEM_DEFINED(a,s) +# define VALGRIND_MAKE_MEM_UNDEFINED(a,s) +# define VALGRIND_DISABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_ENABLE_ADDR_ERROR_REPORTING_IN_RANGE(a,s) +# define VALGRIND_CHECK_MEM_IS_ADDRESSABLE(a,s) (0) +# define VALGRIND_CHECK_MEM_IS_DEFINED(a,s) (0) +#endif /* USE_VALGRIND */ + +#ifdef __SANITIZE_ADDRESS__ +# include <sanitizer/asan_interface.h> +#else +# define ASAN_POISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +# define ASAN_UNPOISON_MEMORY_REGION(addr, size) \ + ((void)(addr), (void)(size)) +#endif /* __SANITIZE_ADDRESS__ */ + +/*----------------------------------------------------------------------------*/ + +#ifndef ARRAY_LENGTH +# ifdef __cplusplus + template <typename T, size_t N> + char (&__ArraySizeHelper(T (&array)[N]))[N]; +# define ARRAY_LENGTH(array) (sizeof(::__ArraySizeHelper(array))) +# else +# define ARRAY_LENGTH(array) (sizeof(array) / sizeof(array[0])) +# endif +#endif /* ARRAY_LENGTH */ + +#ifndef ARRAY_END +# define
ARRAY_END(array) (&array[ARRAY_LENGTH(array)]) +#endif /* ARRAY_END */ + +#ifndef STRINGIFY +# define STRINGIFY_HELPER(x) #x +# define STRINGIFY(x) STRINGIFY_HELPER(x) +#endif /* STRINGIFY */ + +#ifndef offsetof +# define offsetof(type, member) __builtin_offsetof(type, member) +#endif /* offsetof */ + +#ifndef container_of +# define container_of(ptr, type, member) \ + ((type *)((char *)(ptr) - offsetof(type, member))) +#endif /* container_of */ #define MDBX_TETRAD(a, b, c, d) \ ((uint32_t)(a) << 24 | (uint32_t)(b) << 16 | (uint32_t)(c) << 8 | (d)) + +#define FIXME "FIXME: " __FILE__ ", " STRINGIFY(__LINE__) + +/* *INDENT-ON* */ +/* clang-format on */ diff --git a/src/mdbx.c b/src/mdbx.c index 6e71ea2a..d829bb60 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -36,7 +36,6 @@ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include "./bits.h" -#include "./midl.h" /*----------------------------------------------------------------------------*/ /* rthc (tls keys and destructors) */ @@ -255,342 +254,6 @@ int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn); txnid_t mdbx_debug_edge; #endif -/* Features under development */ -#ifndef MDBX_DEVEL -#define MDBX_DEVEL 0 -#endif - -/* Internal error codes, not exposed outside libmdbx */ -#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10) - -/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */ -#define DDBI(mc) \ - (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi) - -/* The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * MDBX_page.mp_upper. - * - * MDBX will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. 
We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) - -/* The minimum number of keys required in a database page. - * Setting this to a larger value will place a smaller bound on the - * maximum size of a data item. Data items larger than this size will - * be pushed into overflow pages instead of being stored directly in - * the B-tree node. This value used to default to 4. With a page size - * of 4096 bytes that meant that any item larger than 1024 bytes would - * go into an overflow page. That also meant that on average 2-3KB of - * each overflow page was wasted space. The value cannot be lower than - * 2 because then there would no longer be a tree structure. With this - * value, items larger than 2KB will go into overflow pages, and on - * average only 1KB will be wasted. */ -#define MDBX_MINKEYS 2 - -/* A stamp that identifies a file as an MDBX file. - * There's nothing special about this value other than that it is easily - * recognizable, and it will reflect any byte order mismatches. */ -#define MDBX_MAGIC 0xBEEFC0DE - -/* The version number for a database's datafile format. */ -#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1) -/* The version number for a database's lockfile format. */ -#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1) - -/* Key size which fits in a DKBUF. */ -#define DKBUF_MAXKEYSIZE 511 /* FIXME */ - -#if MDBX_DEBUG -#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2] -#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1) -#define DVAL(x) \ - mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1) -#else -#define DKBUF ((void)(0)) -#define DKEY(x) ("-") -#define DVAL(x) ("-") -#endif - -/* An invalid page number. - * Mainly used to denote an empty tree. 
*/ -#define P_INVALID (~(pgno_t)0) - -/* Test if the flags f are set in a flag word w. */ -#define F_ISSET(w, f) (((w) & (f)) == (f)) - -/* Round n up to an even number. */ -#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */ - -/* Default size of memory map. - * This is certainly too small for any actual applications. Apps should - * always set the size explicitly using mdbx_env_set_mapsize(). */ -#define DEFAULT_MAPSIZE 1048576 - -/* Reader Lock Table - * - * Readers don't acquire any locks for their data access. Instead, they - * simply record their transaction ID in the reader table. The reader - * mutex is needed just to find an empty slot in the reader table. The - * slot's address is saved in thread-specific data so that subsequent - * read transactions started by the same thread need no further locking to - * proceed. - * - * If MDBX_NOTLS is set, the slot address is not saved in thread-specific data. - * No reader table is used if the database is on a read-only filesystem. - * - * Since the database uses multi-version concurrency control, readers don't - * actually need any locking. This table is used to keep track of which - * readers are using data from which old transactions, so that we'll know - * when a particular old transaction is no longer in use. Old transactions - * that have discarded any data pages can then have those pages reclaimed - * for use by a later write transaction. - * - * The lock table is constructed such that reader slots are aligned with the - * processor's cache line size. Any slot is only ever used by one thread. - * This alignment guarantees that there will be no contention or cache - * thrashing as threads update their own slot info, and also eliminates - * any need for locking when accessing a slot. - * - * A writer thread will scan every slot in the table to determine the oldest - * outstanding reader transaction. Any freed pages older than this will be - * reclaimed by the writer. 
The writer doesn't use any locks when scanning - * this table. This means that there's no guarantee that the writer will - * see the most up-to-date reader info, but that's not required for correct - * operation - all we need is to know the upper bound on the oldest reader, - * we don't care at all about the newest reader. So the only consequence of - * reading stale information here is that old pages might hang around a - * while longer before being reclaimed. That's actually good anyway, because - * the longer we delay reclaiming old pages, the more likely it is that a - * string of contiguous pages can be found after coalescing old pages from - * many old transactions together. */ - -/* Number of slots in the reader table. - * This value was chosen somewhat arbitrarily. The 61 is a prime number, - * and such readers plus a couple mutexes fit into single 4KB page. - * Applications should set the table size using mdbx_env_set_maxreaders(). */ -#define DEFAULT_READERS 61 - -/* Address of first usable data byte in a page, after the header */ -#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) - -/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0) - -/* Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) - -/* The amount of space remaining in the page */ -#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) - -/* The percentage of space used in the page, in tenths of a percent. */ -#define PAGEFILL(env, p) \ - (1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \ - ((env)->me_psize - PAGEHDRSZ)) -/* The minimum page fill factor, in tenths of a percent. - * Pages emptier than this are candidates for merging. 
*/ -#define FILL_THRESHOLD 250 - -/* Test if a page is a leaf page */ -#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF) -/* Test if a page is a LEAF2 page */ -#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2) -/* Test if a page is a branch page */ -#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH) -/* Test if a page is an overflow page */ -#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW) -/* Test if a page is a sub page */ -#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP) - -/* The number of overflow pages needed to store the given size. */ -#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1) - -/* Link in MDBX_txn.mt_loose_pages list. - * Kept outside the page header, which is needed when reusing the page. */ -#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2)) - -/* Header for a single key/data pair within a page. - * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. - * We guarantee 2-byte alignment for 'MDBX_node's. - * - * mn_lo and mn_hi are used for data size on leaf nodes, and for child - * pgno on branch nodes. On 64 bit platforms, mn_flags is also used - * for pgno. (Branch nodes have no flags). Lo and hi are in host byte - * order in case some accesses can be optimized to 32-bit word access. - * - * Leaf node flags describe node contents. F_BIGDATA says the node's - * data part is the page number of an overflow page with actual data. - * F_DUPDATA and F_SUBDATA can be combined giving duplicate data in - * a sub-page/sub-database, and named databases (just F_SUBDATA). 
*/ -typedef struct MDBX_node { - union { - struct { -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ - union { - struct { - uint16_t mn_lo, mn_hi; /* part of data size or pgno */ - }; - uint32_t mn_dsize; - }; - uint16_t mn_flags; /* see mdbx_node */ - uint16_t mn_ksize; /* key size */ -#else - uint16_t mn_ksize; /* key size */ - uint16_t mn_flags; /* see mdbx_node */ - union { - struct { - uint16_t mn_hi, mn_lo; /* part of data size or pgno */ - }; - uint32_t mn_dsize; - }; -#endif - }; - pgno_t mn_ksize_and_pgno; - }; - -/* mdbx_node Flags */ -#define F_BIGDATA 0x01 /* data put on overflow page */ -#define F_SUBDATA 0x02 /* data is a sub-database */ -#define F_DUPDATA 0x04 /* data has duplicates */ - -/* valid flags for mdbx_node_add() */ -#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND) - uint8_t mn_data[1]; /* key and data are appended here */ -} MDBX_node; - -/* Size of the node header, excluding dynamic data at the end */ -#define NODESIZE offsetof(MDBX_node, mn_data) - -/* Bit position of top word in page number, for shifting mn_flags */ -#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0) - -/* Size of a node in a branch page with a given key. - * This is just the node header plus the key, there is no data. */ -#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len)) - -/* Size of a node in a leaf page with a given key and data. - * This is node header plus key plus data size. 
*/ -#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len) - -/* Address of node i in page p */ -static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { - assert(NUMKEYS(p) > (unsigned)(i)); - return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); -} - -/* Address of the key for the node */ -#define NODEKEY(node) (void *)((node)->mn_data) - -/* Address of the data for a node */ -#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize) - -/* Get the page number pointed to by a branch node */ -static __inline pgno_t NODEPGNO(const MDBX_node *node) { - pgno_t pgno; - if (UNALIGNED_OK) { - pgno = node->mn_ksize_and_pgno; - if (sizeof(pgno_t) > 4) - pgno &= UINT64_C(0xffffFFFFffff); - } else { - pgno = node->mn_lo | ((pgno_t)node->mn_lo << 16); - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_flags) << 32; - } - return pgno; -} - -/* Set the page number in a branch node */ -static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) { - assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff)); - - if (UNALIGNED_OK) { - if (sizeof(pgno_t) > 4) - pgno |= ((uint64_t)node->mn_ksize) << 48; - node->mn_ksize_and_pgno = pgno; - } else { - node->mn_lo = (uint16_t)pgno; - node->mn_hi = (uint16_t)(pgno >> 16); - if (sizeof(pgno_t) > 4) - node->mn_flags = (uint16_t)((uint64_t)pgno >> 32); - } -} - -/* Get the size of the data in a leaf node */ -static __inline size_t NODEDSZ(const MDBX_node *node) { - size_t size; - if (UNALIGNED_OK) { - size = node->mn_dsize; - } else { - size = node->mn_lo | ((size_t)node->mn_hi << 16); - } - return size; -} - -/* Set the size of the data for a leaf node */ -static __inline void SETDSZ(MDBX_node *node, unsigned size) { - if (UNALIGNED_OK) { - node->mn_dsize = size; - } else { - node->mn_lo = (uint16_t)size; - node->mn_hi = (uint16_t)(size >> 16); - } -} - -/* The size of a key in a node */ -#define NODEKSZ(node) ((node)->mn_ksize) - -/* The address of a key in a LEAF2 page. 
- * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. - * There are no node headers, keys are stored contiguously. */ -#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks))) - -/* Set the node's key into keyptr, if requested. */ -#define MDBX_GET_KEY(node, keyptr) \ - do { \ - if ((keyptr) != NULL) { \ - (keyptr)->iov_len = NODEKSZ(node); \ - (keyptr)->iov_base = NODEKEY(node); \ - } \ - } while (0) - -/* Set the node's key into key. */ -#define MDBX_GET_KEY2(node, key) \ - do { \ - key.iov_len = NODEKSZ(node); \ - key.iov_base = NODEKEY(node); \ - } while (0) - -#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */ -#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID)) -/* mdbx_dbi_open() flags */ -#define VALID_FLAGS \ - (MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \ - MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE) - -/* max number of pages to commit in one writev() call */ -#define MDBX_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ -#undef MDBX_COMMIT_PAGES -#define MDBX_COMMIT_PAGES IOV_MAX -#endif - -/* Check txn and dbi arguments to a function */ -#define TXN_DBI_EXIST(txn, dbi, validity) \ - ((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity))) - -/* Check for misused dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) - static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, int flags); static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, MDBX_page **mp); @@ -622,6 +285,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode); static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp, int *lvl); static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, int modify); + #define MDBX_PS_MODIFY 1 #define MDBX_PS_ROOTONLY 2 #define MDBX_PS_FIRST 4 diff --git a/src/midl.h b/src/midl.h deleted file mode 100644 index 8c983c24..00000000 
--- a/src/midl.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright 2015-2017 Leonid Yuriev - * and other libmdbx authors: please see AUTHORS file. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted only as authorized by the OpenLDAP - * Public License. - * - * A copy of this license is available in the file LICENSE in the - * top-level directory of the distribution or, alternatively, at - * . - */ - -/* IDL sizes - likely should be even bigger - * limiting factors: sizeof(pgno_t), thread stack size */ -#define MDBX_IDL_LOGN 16 /* DB_SIZE is 2^16, UM_SIZE is 2^17 */ -#define MDBX_IDL_DB_SIZE (1 << MDBX_IDL_LOGN) -#define MDBX_IDL_UM_SIZE (1 << (MDBX_IDL_LOGN + 1)) - -#define MDBX_IDL_DB_MAX (MDBX_IDL_DB_SIZE - 1) -#define MDBX_IDL_UM_MAX (MDBX_IDL_UM_SIZE - 1) - -#define MDBX_IDL_SIZEOF(ids) (((ids)[0] + 1) * sizeof(pgno_t)) -#define MDBX_IDL_IS_ZERO(ids) ((ids)[0] == 0) -#define MDBX_IDL_CPY(dst, src) (memcpy(dst, src, MDBX_IDL_SIZEOF(src))) -#define MDBX_IDL_FIRST(ids) ((ids)[1]) -#define MDBX_IDL_LAST(ids) ((ids)[(ids)[0]]) - -/* Current max length of an #mdbx_midl_alloc()ed IDL */ -#define MDBX_IDL_ALLOCLEN(ids) ((ids)[-1]) - -/* Append ID to IDL. The IDL must be big enough. */ -#define mdbx_midl_xappend(idl, id) \ - do { \ - pgno_t *xidl = (idl), xlen = ++(xidl[0]); \ - xidl[xlen] = (id); \ - } while (0) diff --git a/src/osal.h b/src/osal.h index 16a12302..846e5673 100644 --- a/src/osal.h +++ b/src/osal.h @@ -16,16 +16,27 @@ #pragma once +/*----------------------------------------------------------------------------*/ +/* Microsoft compiler generates a lot of warning for self includes... */ + #ifdef _MSC_VER #pragma warning(push, 1) -#pragma warning(disable : 4530) /* C++ exception handler used, but \ - unwind semantics are not enabled. 
Specify \ - /EHsc */ -#pragma warning(disable : 4577) /* 'noexcept' used with no exception \ - handling mode specified; termination on \ - exception is not guaranteed. Specify /EHsc \ - */ -#endif /* _MSC_VER (warnings) */ +#pragma warning(disable : 4530) /* C++ exception handler used, but unwind \ + * semantics are not enabled. Specify /EHsc */ +#pragma warning(disable : 4577) /* 'noexcept' used with no exception handling \ + * mode specified; termination on exception is \ + * not guaranteed. Specify /EHsc */ +#if !defined(_CRT_SECURE_NO_WARNINGS) +#define _CRT_SECURE_NO_WARNINGS +#endif +#endif /* _MSC_VER (warnings) */ + +/*----------------------------------------------------------------------------*/ +/* C99 includes */ + +#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif #include #include @@ -52,6 +63,9 @@ #define _XOPEN_SOURCE 0 #endif +/*----------------------------------------------------------------------------*/ +/* Systems includes */ + #if defined(_WIN32) || defined(_WIN64) #include #include @@ -103,7 +117,20 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #include #endif +#if defined(__i386) || defined(__x86_64) || defined(_M_IX86) +#define UNALIGNED_OK 1 /* TODO */ +#endif +#ifndef UNALIGNED_OK +#define UNALIGNED_OK 0 +#endif /* UNALIGNED_OK */ + +#if (-6 & 5) || CHAR_BIT != 8 || UINT_MAX < 0xffffffff || ULONG_MAX % 0xFFFF +#error \ + "Sanity checking failed: Two's complement, reasonably sized integer types" +#endif + /*----------------------------------------------------------------------------*/ +/* Compiler's includes for builtins/intrinsics */ #ifdef _MSC_VER @@ -162,10 +189,6 @@ typedef pthread_mutex_t mdbx_fastmutex_t; #include /* defines BYTE_ORDER on HPUX and Solaris */ #endif -#ifdef _MSC_VER -#pragma warning(pop) -#endif - #if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) #define __ORDER_LITTLE_ENDIAN__ __LITTLE_ENDIAN #define __ORDER_BIG_ENDIAN__ __BIG_ENDIAN @@ -197,36 +220,7 @@ typedef 
pthread_mutex_t mdbx_fastmutex_t; #endif /*----------------------------------------------------------------------------*/ -/* Cache coherence */ - -#if defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64) || \ - defined(_M_IX86) || defined(__i386) || defined(__amd64) || \ - defined(i386) || defined(__x86_64) || defined(_AMD64_) || defined(_M_X64) -#define MDBX_CACHE_IS_COHERENT 1 -#elif defined(__hppa) || defined(__hppa__) -#define MDBX_CACHE_IS_COHERENT 1 -#endif - -#ifndef MDBX_CACHE_IS_COHERENT -#define MDBX_CACHE_IS_COHERENT 0 -#endif - -#ifndef MDBX_CACHELINE_SIZE -#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) -#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE -#elif defined(__ia64__) || defined(__ia64) || defined(_M_IA64) -#define MDBX_CACHELINE_SIZE 128 -#else -#define MDBX_CACHELINE_SIZE 64 -#endif -#endif /* MDBX_CACHELINE_SIZE */ - -#ifndef __cache_aligned -#define __cache_aligned __aligned(MDBX_CACHELINE_SIZE) -#endif - -/*----------------------------------------------------------------------------*/ -/* Memory/Compiler barriers */ +/* Memory/Compiler barriers, cache coherence */ static __inline void mdbx_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) @@ -286,6 +280,35 @@ static __inline void mdbx_memory_barrier(void) { #endif } +/*----------------------------------------------------------------------------*/ +/* Cache coherence and invalidation */ + +#if defined(__i386__) || defined(__x86_64__) || defined(_M_AMD64) || \ + defined(_M_IX86) || defined(__i386) || defined(__amd64) || \ + defined(i386) || defined(__x86_64) || defined(_AMD64_) || defined(_M_X64) +#define MDBX_CACHE_IS_COHERENT 1 +#elif defined(__hppa) || defined(__hppa__) +#define MDBX_CACHE_IS_COHERENT 1 +#endif + +#ifndef MDBX_CACHE_IS_COHERENT +#define MDBX_CACHE_IS_COHERENT 0 +#endif + +#ifndef MDBX_CACHELINE_SIZE +#if defined(SYSTEM_CACHE_ALIGNMENT_SIZE) +#define MDBX_CACHELINE_SIZE SYSTEM_CACHE_ALIGNMENT_SIZE +#elif defined(__ia64__) || defined(__ia64) || 
defined(_M_IA64) +#define MDBX_CACHELINE_SIZE 128 +#else +#define MDBX_CACHELINE_SIZE 64 +#endif +#endif /* MDBX_CACHELINE_SIZE */ + +#ifndef __cache_aligned +#define __cache_aligned __aligned(MDBX_CACHELINE_SIZE) +#endif + #if MDBX_CACHE_IS_COHERENT #define mdbx_coherent_barrier() mdbx_compiler_barrier() #else @@ -313,6 +336,7 @@ static __inline void mdbx_invalidate_cache(void *addr, size_t nbytes) { } /*----------------------------------------------------------------------------*/ +/* libc compatibility stuff */ #ifndef mdbx_assert_fail void mdbx_assert_fail(MDBX_env *env, const char *msg, const char *func, @@ -338,6 +362,7 @@ int mdbx_asprintf(char **strp, const char *fmt, ...); #endif /* _MSC_VER */ /*----------------------------------------------------------------------------*/ +/* OS abstraction layer stuff */ /* max bytes to write in one call */ #define MAX_WRITE UINT32_C(0x3fff0000) @@ -444,6 +469,7 @@ static __inline mdbx_pid_t mdbx_getpid(void) { void mdbx_osal_jitter(bool tiny); /*----------------------------------------------------------------------------*/ +/* lck stuff */ #if defined(_WIN32) || defined(_WIN64) #undef MDBX_OSAL_LOCK @@ -477,6 +503,7 @@ int mdbx_rpid_clear(MDBX_env *env); int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); /*----------------------------------------------------------------------------*/ +/* Atomics */ #if (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) && \ (__GNUC_PREREQ(4, 9) || __CLANG_PREREQ(3, 8) || \ @@ -485,7 +512,6 @@ int mdbx_rpid_check(MDBX_env *env, mdbx_pid_t pid); #elif defined(__GNUC__) || defined(__clang__) /* LY: nothing required */ #elif defined(_MSC_VER) -#pragma warning(push) #pragma warning(disable : 4163) /* 'xyz': not available as an intrinsic */ #pragma warning(disable : 4133) /* 'function': incompatible types - from \ 'size_t' to 'LONGLONG' */ @@ -558,6 +584,8 @@ static __inline bool mdbx_atomic_compare_and_swap(volatile size_t *p, size_t c, #endif } 
+/*----------------------------------------------------------------------------*/ + #ifdef _MSC_VER #pragma warning(pop) #endif diff --git a/src/tools/mdbx_chk.c b/src/tools/mdbx_chk.c index 930f8b58..71e8e103 100644 --- a/src/tools/mdbx_chk.c +++ b/src/tools/mdbx_chk.c @@ -27,7 +27,6 @@ #include "../../mdbx.h" #include "../bits.h" -#include "../midl.h" typedef struct flagbit { int bit; diff --git a/test/test.vcxproj b/test/test.vcxproj index 7afeb1c7..6676ffc0 100644 --- a/test/test.vcxproj +++ b/test/test.vcxproj @@ -164,6 +164,7 @@ +