2019-09-09 13:40:24 +03:00
|
|
|
/*
|
2019-02-03 12:28:01 +03:00
|
|
|
* Copyright 2015-2019 Leonid Yuriev <leo@yuriev.ru>
|
2017-03-16 18:09:27 +03:00
|
|
|
* and other libmdbx authors: please see AUTHORS file.
|
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted only as authorized by the OpenLDAP
|
|
|
|
* Public License.
|
|
|
|
*
|
|
|
|
* A copy of this license is available in the file LICENSE in the
|
|
|
|
* top-level directory of the distribution or, alternatively, at
|
2017-05-23 14:44:53 +03:00
|
|
|
* <http://www.OpenLDAP.org/license.html>. */
|
2017-03-16 18:09:27 +03:00
|
|
|
|
|
|
|
#pragma once
|
2019-09-10 14:32:17 +03:00
|
|
|
#ifdef MDBX_CONFIG_H
|
|
|
|
#include MDBX_CONFIG_H
|
|
|
|
#endif
|
|
|
|
|
2017-03-16 18:09:27 +03:00
|
|
|
/* *INDENT-OFF* */
|
|
|
|
/* clang-format off */
|
|
|
|
|
2019-08-26 22:57:14 +03:00
|
|
|
/* MDBX_DEBUG may be preset by the build. When it is not, derive it from
 * NDEBUG: 0 if NDEBUG is defined, 1 otherwise. */
#ifndef MDBX_DEBUG
#  ifdef NDEBUG
#    define MDBX_DEBUG 0
#  else
#    define MDBX_DEBUG 1
#  endif
#endif

/* A non-zero MDBX_DEBUG enforces debugging, so NDEBUG is dropped
 * (NDEBUG is what turns the standard assert() into a no-op). */
#if MDBX_DEBUG
#  undef NDEBUG
#endif
|
|
|
|
|
2019-08-20 02:45:03 +03:00
|
|
|
/* Possible values of MDBX_OSX_SPEED_INSTEADOF_DURABILITY. */
#define MDBX_OSX_WANNA_DURABILITY 0 /* using fcntl(F_FULLFSYNC) with 5-10 times slowdown */
#define MDBX_OSX_WANNA_SPEED 1 /* using fsync() with chance of data lost on power failure */

/* Unless the build overrides it, durability is preferred over speed. */
#ifndef MDBX_OSX_SPEED_INSTEADOF_DURABILITY
#  define MDBX_OSX_SPEED_INSTEADOF_DURABILITY MDBX_OSX_WANNA_DURABILITY
#endif
|
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
/* Storage class for library-internal functions and variables:
 *  - amalgamated build (MDBX_ALLOY is defined): both are `static`;
 *  - non-amalgamated build: functions get no storage-class specifier and
 *    variables are declared `extern`. */
#ifdef MDBX_ALLOY
/* Amalgamated build */
#  define MDBX_INTERNAL_FUNC static
#  define MDBX_INTERNAL_VAR static
#else
/* Non-amalgamated build */
#  define MDBX_INTERNAL_FUNC
#  define MDBX_INTERNAL_VAR extern
#endif /* MDBX_ALLOY */
|
|
|
|
|
2017-05-24 18:50:24 +03:00
|
|
|
/*----------------------------------------------------------------------------*/
|
|
|
|
|
|
|
|
/* Feature-test macros. Should be defined before any includes. */
#ifndef _FILE_OFFSET_BITS
#  define _FILE_OFFSET_BITS 64
#endif

/* Guarded like the macro above, so a -D_DARWIN_C_SOURCE=... on the command
 * line does not turn into an incompatible macro redefinition. */
#if defined(__APPLE__) && !defined(_DARWIN_C_SOURCE)
#  define _DARWIN_C_SOURCE
#endif
|
|
|
|
|
2017-07-02 09:07:57 +03:00
|
|
|
#ifdef _MSC_VER
|
2018-06-12 22:56:26 +03:00
|
|
|
# if _MSC_VER < 1400
|
|
|
|
# error "Microsoft Visual C++ 8.0 (Visual Studio 2005) or later version is required"
|
|
|
|
# endif
|
2017-07-02 09:07:57 +03:00
|
|
|
# ifndef _CRT_SECURE_NO_WARNINGS
|
|
|
|
# define _CRT_SECURE_NO_WARNINGS
|
|
|
|
# endif
|
2017-07-03 06:30:43 +03:00
|
|
|
#if _MSC_VER > 1800
|
|
|
|
# pragma warning(disable : 4464) /* relative include path contains '..' */
|
|
|
|
#endif
|
2018-05-21 16:31:36 +03:00
|
|
|
#if _MSC_VER > 1913
|
|
|
|
# pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... */
|
|
|
|
#endif
|
2017-07-02 09:07:57 +03:00
|
|
|
#pragma warning(disable : 4710) /* 'xyz': function not inlined */
|
|
|
|
#pragma warning(disable : 4711) /* function 'xyz' selected for automatic inline expansion */
|
|
|
|
#pragma warning(disable : 4201) /* nonstandard extension used : nameless struct / union */
|
2017-10-29 03:23:39 +03:00
|
|
|
#pragma warning(disable : 4702) /* unreachable code */
|
2017-07-02 09:07:57 +03:00
|
|
|
#pragma warning(disable : 4706) /* assignment within conditional expression */
|
|
|
|
#pragma warning(disable : 4127) /* conditional expression is constant */
|
|
|
|
#pragma warning(disable : 4324) /* 'xyz': structure was padded due to alignment specifier */
|
|
|
|
#pragma warning(disable : 4310) /* cast truncates constant value */
|
2017-07-03 09:56:46 +03:00
|
|
|
#pragma warning(disable : 4820) /* bytes padding added after data member for aligment */
|
2017-07-03 12:50:48 +03:00
|
|
|
#pragma warning(disable : 4548) /* expression before comma has no effect; expected expression with side - effect */
|
2018-03-22 20:34:09 +03:00
|
|
|
#pragma warning(disable : 4366) /* the result of the unary '&' operator may be unaligned */
|
2017-07-02 09:07:57 +03:00
|
|
|
#endif /* _MSC_VER (warnings) */
|
2017-05-24 18:50:24 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
#include "../../mdbx.h"
|
2019-09-10 02:19:35 +03:00
|
|
|
#include "defs.h"
|
2017-05-24 13:59:50 +03:00
|
|
|
|
2017-03-16 18:09:27 +03:00
|
|
|
#if defined(__GNUC__) && !__GNUC_PREREQ(4,2)
|
2017-05-24 13:37:06 +03:00
|
|
|
/* Actually libmdbx was not tested with compilers older than GCC from RHEL6.
 * But you could ignore this #warning and try to continue at your own risk.
 * In such a case please don't raise issues related ONLY to old compilers.
 */
|
2018-03-07 13:06:39 +03:00
|
|
|
# warning "libmdbx required GCC >= 4.2"
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(__clang__) && !__CLANG_PREREQ(3,8)
|
|
|
|
/* Actually libmdbx was not tested with CLANG older than 3.8.
 * But you could ignore this #warning and try to continue at your own risk.
 * In such a case please don't raise issues related ONLY to old compilers.
 */
|
|
|
|
# warning "libmdbx required CLANG >= 3.8"
|
2017-03-16 18:09:27 +03:00
|
|
|
#endif
|
|
|
|
|
|
|
|
#if defined(__GLIBC__) && !__GLIBC_PREREQ(2,12)
|
2017-05-24 13:37:06 +03:00
|
|
|
/* Actually libmdbx was not tested with something older than glibc 2.12 (from RHEL6).
 * But you could ignore this #warning and try to continue at your own risk.
 * In such a case please don't raise issues related ONLY to old systems.
 */
|
|
|
|
# warning "libmdbx required at least GLIBC 2.12."
|
2017-03-16 18:09:27 +03:00
|
|
|
#endif
|
|
|
|
|
2017-05-24 13:59:50 +03:00
|
|
|
#ifdef __SANITIZE_THREAD__
|
|
|
|
# warning "libmdbx don't compatible with ThreadSanitizer, you will get a lot of false-positive issues."
|
|
|
|
#endif /* __SANITIZE_THREAD__ */
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2018-03-07 13:07:58 +03:00
|
|
|
#if __has_warning("-Wconstant-logical-operand")
|
2018-03-14 15:16:49 +03:00
|
|
|
# if defined(__clang__)
|
|
|
|
# pragma clang diagnostic ignored "-Wconstant-logical-operand"
|
|
|
|
# elif defined(__GNUC__)
|
|
|
|
# pragma GCC diagnostic ignored "-Wconstant-logical-operand"
|
|
|
|
# else
|
|
|
|
# pragma warning disable "constant-logical-operand"
|
|
|
|
# endif
|
2018-03-07 13:07:58 +03:00
|
|
|
#endif /* -Wconstant-logical-operand */
|
|
|
|
|
2018-03-14 15:16:49 +03:00
|
|
|
#if defined(__LCC__) && (__LCC__ <= 121)
|
|
|
|
/* bug #2798 */
|
|
|
|
# pragma diag_suppress alignment_reduction_ignored
|
2018-03-14 14:57:46 +03:00
|
|
|
#elif defined(__ICC)
|
2018-03-14 15:16:49 +03:00
|
|
|
# pragma warning(disable: 3453 1366)
|
|
|
|
#elif __has_warning("-Walignment-reduction-ignored")
|
|
|
|
# if defined(__clang__)
|
|
|
|
# pragma clang diagnostic ignored "-Walignment-reduction-ignored"
|
|
|
|
# elif defined(__GNUC__)
|
|
|
|
# pragma GCC diagnostic ignored "-Walignment-reduction-ignored"
|
|
|
|
# else
|
|
|
|
# pragma warning disable "alignment-reduction-ignored"
|
|
|
|
# endif
|
2018-03-14 15:32:32 +03:00
|
|
|
#endif /* -Walignment-reduction-ignored */
|
2018-03-07 13:31:33 +03:00
|
|
|
|
2019-09-10 02:19:35 +03:00
|
|
|
#include "osal.h"
|
2017-03-16 18:09:27 +03:00
|
|
|
|
|
|
|
/* *INDENT-ON* */
|
|
|
|
/* clang-format on */
|
|
|
|
|
2018-08-25 18:17:50 +03:00
|
|
|
/* Word size of the target: 64 when either uintptr_t or unsigned long is
 * wider than 32 bits, otherwise 32. */
#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul
#define MDBX_WORDBITS 64
#else
#define MDBX_WORDBITS 32
#endif /* MDBX_WORDBITS */

/* MDBX_64BIT_ATOMIC selects the layout of mdbx_safe64_t (a single 64-bit
 * member vs. two 32-bit halves). May be preset by the build; by default it
 * is 1 on 64-bit targets and 0 otherwise. */
#ifndef MDBX_64BIT_ATOMIC
#if MDBX_WORDBITS >= 64
#define MDBX_64BIT_ATOMIC 1
#else
#define MDBX_64BIT_ATOMIC 0
#endif
#endif /* MDBX_64BIT_ATOMIC */
|
|
|
|
|
2019-09-05 11:57:52 +03:00
|
|
|
/* Some platforms define the EOWNERDEAD error code even though they
 * don't support Robust Mutexes. Compile with -DMDBX_USE_ROBUST=0.
 *
 * When MDBX_USE_ROBUST is not preset it is auto-detected, and
 * MDBX_USE_ROBUST_CONFIG records whether the value was automatic (AUTO)
 * or given by the build. */
#ifndef MDBX_USE_ROBUST
#define MDBX_USE_ROBUST_CONFIG AUTO
/* Howard Chu: Android currently lacks Robust Mutex support.
 * LY: glibc before 2.10 has troubles with Robust Mutex too. */
#if defined(EOWNERDEAD) && !defined(__ANDROID__) && !defined(__APPLE__) &&     \
    (!defined(__GLIBC__) || __GLIBC_PREREQ(2, 10) ||                           \
     _POSIX_C_SOURCE >= 200809L)
#define MDBX_USE_ROBUST 1
#else
#define MDBX_USE_ROBUST 0
#endif
#else
#define MDBX_USE_ROBUST_CONFIG MDBX_USE_ROBUST
#endif /* MDBX_USE_ROBUST */
|
|
|
|
|
|
|
|
/* MDBX_USE_OFDLOCKS: when not preset by the build it is enabled only if all
 * of F_OFD_SETLK, F_OFD_SETLKW and F_OFD_GETLK are defined and
 * MDBX_SAFE4QEMU is not. MDBX_USE_OFDLOCKS_CONFIG records whether the value
 * was automatic (AUTO) or given by the build. */
#ifndef MDBX_USE_OFDLOCKS
#define MDBX_USE_OFDLOCKS_CONFIG AUTO
#if defined(F_OFD_SETLK) && defined(F_OFD_SETLKW) && defined(F_OFD_GETLK) &&   \
    !defined(MDBX_SAFE4QEMU)
#define MDBX_USE_OFDLOCKS 1
#else
#define MDBX_USE_OFDLOCKS 0
#endif
#else
#define MDBX_USE_OFDLOCKS_CONFIG MDBX_USE_OFDLOCKS
#endif /* MDBX_USE_OFDLOCKS */
|
|
|
|
|
|
|
|
/* Controls checking the PID to guard against reuse of a DB environment
 * after the fork(). MDBX_TXN_CHECKPID_CONFIG records whether the value was
 * automatic (AUTO) or given by the build. */
#ifndef MDBX_TXN_CHECKPID
#define MDBX_TXN_CHECKPID_CONFIG AUTO
#if defined(MADV_DONTFORK) || defined(_WIN32) || defined(_WIN64)
/* PID check could be omitted:
 *  - on Linux when madvise(MADV_DONTFORK) is available, i.e. after the fork()
 *    mapped pages will not be available for child process.
 *  - on Windows where fork() is not available. */
#define MDBX_TXN_CHECKPID 0
#else
#define MDBX_TXN_CHECKPID 1
#endif
#else
#define MDBX_TXN_CHECKPID_CONFIG MDBX_TXN_CHECKPID
#endif /* MDBX_TXN_CHECKPID */
|
|
|
|
|
2019-09-18 19:52:50 +03:00
|
|
|
/* MDBX_TXN_CHECKOWNER defaults to 1 unless preset by the build.
 * MDBX_TXN_CHECKOWNER_CONFIG records whether the value was automatic (AUTO)
 * or given by the build; it is defined in both branches, like the other
 * *_CONFIG macros in this header, so it is always available. */
#ifndef MDBX_TXN_CHECKOWNER
#define MDBX_TXN_CHECKOWNER_CONFIG AUTO
#define MDBX_TXN_CHECKOWNER 1
#else
#define MDBX_TXN_CHECKOWNER_CONFIG MDBX_TXN_CHECKOWNER
#endif /* MDBX_TXN_CHECKOWNER */
|
|
|
|
|
2019-09-05 11:57:52 +03:00
|
|
|
/* Name of a symbol built by pasting MDBX_BUILD_SOURCERY onto the
 * "mdbx_sourcery_" prefix; builds with MDBX_TOOLS defined declare it as an
 * external string constant.
 * NOTE(review): presumably referencing it makes tools fail to link against a
 * library built from different sources — confirm against the definition. */
#define mdbx_sourcery_anchor XCONCAT(mdbx_sourcery_, MDBX_BUILD_SOURCERY)
#if defined(MDBX_TOOLS)
extern LIBMDBX_API const char *const mdbx_sourcery_anchor;
#endif
|
|
|
|
|
2017-03-16 18:09:27 +03:00
|
|
|
/*----------------------------------------------------------------------------*/
|
2017-05-24 13:59:50 +03:00
|
|
|
/* Basic constants and types */
|
|
|
|
|
|
|
|
/* The minimum number of keys required in a database page.
 * Setting this to a larger value will place a smaller bound on the
 * maximum size of a data item. Data items larger than this size will
 * be pushed into overflow pages instead of being stored directly in
 * the B-tree node. This value used to default to 4. With a page size
 * of 4096 bytes that meant that any item larger than 1024 bytes would
 * go into an overflow page. That also meant that on average 2-3KB of
 * each overflow page was wasted space. The value cannot be lower than
 * 2 because then there would no longer be a tree structure. With this
 * value, items larger than 2KB will go into overflow pages, and on
 * average only 1KB will be wasted. */
#define MDBX_MINKEYS 2

/* A stamp that identifies a file as an MDBX file.
 * There's nothing special about this value other than that it is easily
 * recognizable, and it will reflect any byte order mismatches.
 * It occupies 56 bits, which leaves the low byte free once it is shifted
 * left by 8 to carry a format version. */
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)

/* The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 2
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION 3
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
/* handle for the default DB. */
#define MAIN_DBI 1
/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2
/* Upper bound for DB handles: INT16_MAX less the CORE_DBS handles above. */
#define MAX_DBI (INT16_MAX - CORE_DBS)
|
2019-09-12 03:30:10 +03:00
|
|
|
/* Compile-time check that the internal MAX_DBI matches MDBX_MAX_DBI. */
#if MAX_DBI != MDBX_MAX_DBI
#error "Opps, MAX_DBI != MDBX_MAX_DBI"
#endif
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 3

/* A page number in the database.
 *
 * MDBX uses 32 bit for page numbers, and MAX_PAGENO caps them at 2^31-1.
 * This limits database size to just under 2^43 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
#define PRIaPGNO PRIu32 /* printf conversion for pgno_t */
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS /* the lowest page numbers are the meta pages */
|
2017-05-23 14:49:12 +03:00
|
|
|
|
|
|
|
/* A transaction ID. */
typedef uint64_t txnid_t;
/* printf conversion for txnid_t.
 * NOTE(review): PRIi64 is a signed conversion although txnid_t is unsigned;
 * kept as-is because switching to PRIu64 would change how values such as
 * (txnid_t)-1 are printed — confirm the intent. */
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Used for offsets within a single page.
 * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
 * this is plenty. */
typedef uint16_t indx_t;

/* One mebibyte (2^20 bytes) as a size_t. */
#define MEGABYTE ((size_t)1 << 20)
|
|
|
|
|
2017-05-24 13:59:50 +03:00
|
|
|
/*----------------------------------------------------------------------------*/
|
|
|
|
/* Core structures for database and shared memory (i.e. format definition) */
|
2017-03-16 18:09:27 +03:00
|
|
|
#pragma pack(push, 1)
|
|
|
|
|
2019-09-23 21:22:55 +03:00
|
|
|
/* A 64-bit value with two overlapping views:
 *  - `inconsistent`: the whole value as one plain 64-bit member;
 *  - with MDBX_64BIT_ATOMIC: `atomic`, another whole 64-bit member;
 *  - without MDBX_64BIT_ATOMIC: the `low` and `high` 32-bit halves, ordered
 *    by the target byte order so that they overlay `inconsistent` exactly.
 * NOTE(review): the member names suggest `inconsistent` is the view that may
 * be read torn on targets without 64-bit atomics — confirm against users. */
typedef union mdbx_safe64 {
  volatile uint64_t inconsistent;
#if MDBX_64BIT_ATOMIC
  volatile uint64_t atomic;
#else
  struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    volatile uint32_t low;
    volatile uint32_t high;
#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
    volatile uint32_t high;
    volatile uint32_t low;
#else
#error "FIXME: Unsupported byte order"
#endif /* __BYTE_ORDER__ */
  };
#endif /* MDBX_64BIT_ATOMIC */
} mdbx_safe64_t;

/* Threshold whose upper 32 bits are all ones.
 * NOTE(review): presumably values at or above it are treated as invalid —
 * confirm against the code that compares with it. */
#define SAFE64_INVALID_THRESHOLD UINT64_C(0xffffFFFF00000000)
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Information about a single database in the environment. */
|
2017-05-24 01:42:10 +03:00
|
|
|
typedef struct MDBX_db {
|
2017-06-05 14:22:52 +03:00
|
|
|
uint16_t md_flags; /* see mdbx_dbi_open */
|
|
|
|
uint16_t md_depth; /* depth of this tree */
|
|
|
|
uint32_t md_xsize; /* also ksize for LEAF2 pages */
|
|
|
|
pgno_t md_root; /* the root page of this tree */
|
|
|
|
pgno_t md_branch_pages; /* number of internal pages */
|
|
|
|
pgno_t md_leaf_pages; /* number of leaf pages */
|
|
|
|
pgno_t md_overflow_pages; /* number of overflow pages */
|
|
|
|
uint64_t md_seq; /* table sequence counter */
|
|
|
|
uint64_t md_entries; /* number of data items */
|
|
|
|
uint64_t md_merkle; /* Merkle tree checksum */
|
2017-05-24 01:42:10 +03:00
|
|
|
} MDBX_db;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-09-17 23:52:51 +03:00
|
|
|
/* database size-related parameters */
|
|
|
|
typedef struct mdbx_geo_t {
|
|
|
|
uint16_t grow; /* datafile growth step in pages */
|
|
|
|
uint16_t shrink; /* datafile shrink threshold in pages */
|
|
|
|
pgno_t lower; /* minimal size of datafile in pages */
|
|
|
|
pgno_t upper; /* maximal size of datafile in pages */
|
|
|
|
pgno_t now; /* current size of datafile in pages */
|
|
|
|
pgno_t next; /* first unused page in the datafile,
|
|
|
|
* but actually the file may be shorter. */
|
|
|
|
} mdbx_geo_t;
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Meta page content.
|
|
|
|
* A meta page is the start point for accessing a database snapshot.
|
|
|
|
* Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */
|
2017-05-24 01:42:10 +03:00
|
|
|
typedef struct MDBX_meta {
|
2017-05-30 16:22:42 +03:00
|
|
|
/* Stamp identifying this as an MDBX file.
|
|
|
|
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
|
|
|
|
uint64_t mm_magic_and_version;
|
|
|
|
|
|
|
|
/* txnid that committed this page, the first of a two-phase-update pair */
|
2019-09-23 21:22:55 +03:00
|
|
|
mdbx_safe64_t mm_txnid_a;
|
2017-05-30 16:22:42 +03:00
|
|
|
|
|
|
|
uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */
|
|
|
|
uint8_t mm_validator_id; /* ID of checksum and page validation method,
|
|
|
|
* zero (nothing) for now */
|
|
|
|
uint8_t mm_extra_pagehdr; /* extra bytes in the page header,
|
|
|
|
* zero (nothing) for now */
|
|
|
|
|
2019-09-17 23:52:51 +03:00
|
|
|
mdbx_geo_t mm_geo; /* database size-related parameters */
|
2017-06-21 01:34:56 +03:00
|
|
|
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
|
|
|
|
/* The size of pages used in this DB */
|
2017-03-16 18:09:27 +03:00
|
|
|
#define mm_psize mm_dbs[FREE_DBI].md_xsize
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Any persistent environment flags, see mdbx_env */
|
2017-03-16 18:09:27 +03:00
|
|
|
#define mm_flags mm_dbs[FREE_DBI].md_flags
|
2017-05-26 15:46:45 +03:00
|
|
|
mdbx_canary mm_canary;
|
2017-05-30 16:22:42 +03:00
|
|
|
|
2017-05-24 01:42:10 +03:00
|
|
|
#define MDBX_DATASIGN_NONE 0u
|
|
|
|
#define MDBX_DATASIGN_WEAK 1u
|
|
|
|
#define SIGN_IS_WEAK(sign) ((sign) == MDBX_DATASIGN_WEAK)
|
|
|
|
#define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK)
|
2017-04-27 15:17:30 +03:00
|
|
|
#define META_IS_WEAK(meta) SIGN_IS_WEAK((meta)->mm_datasync_sign)
|
|
|
|
#define META_IS_STEADY(meta) SIGN_IS_STEADY((meta)->mm_datasync_sign)
|
2017-05-30 16:22:42 +03:00
|
|
|
volatile uint64_t mm_datasync_sign;
|
|
|
|
|
|
|
|
/* txnid that committed this page, the second of a two-phase-update pair */
|
2019-09-23 21:22:55 +03:00
|
|
|
mdbx_safe64_t mm_txnid_b;
|
2019-08-23 03:36:56 +03:00
|
|
|
|
|
|
|
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
|
|
|
|
* DB was previously handled by libmdbx without corresponding feature.
|
|
|
|
* This value in couple with mr_snapshot_pages_retired allows fast estimation
|
|
|
|
* of "how much reader is restraining GC recycling". */
|
|
|
|
uint64_t mm_pages_retired;
|
2017-05-24 01:42:10 +03:00
|
|
|
} MDBX_meta;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Common header for all page types. The page type depends on mp_flags.
|
|
|
|
*
|
2017-05-23 21:02:39 +03:00
|
|
|
* P_BRANCH and P_LEAF pages have unsorted 'MDBX_node's at the end, with
|
2017-05-23 14:44:53 +03:00
|
|
|
* sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
|
2017-05-24 01:42:10 +03:00
|
|
|
* omit mp_ptrs and pack sorted MDBX_DUPFIXED values after the page header.
|
2017-05-23 14:44:53 +03:00
|
|
|
*
|
|
|
|
* P_OVERFLOW records occupy one or more contiguous pages where only the
|
|
|
|
* first has a page header. They hold the real data of F_BIGDATA nodes.
|
|
|
|
*
|
|
|
|
* P_SUBP sub-pages are small leaf "pages" with duplicate data.
|
|
|
|
* A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
|
|
|
|
* (Duplicate data can also go in sub-databases, which use normal pages.)
|
|
|
|
*
|
2017-05-24 01:42:10 +03:00
|
|
|
* P_META pages contain MDBX_meta, the start point of an MDBX snapshot.
|
2017-05-23 14:44:53 +03:00
|
|
|
*
|
2017-05-24 01:42:10 +03:00
|
|
|
* Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once
|
2017-05-23 14:44:53 +03:00
|
|
|
* in the snapshot: Either used by a database or listed in a freeDB record. */
|
2017-05-23 21:04:23 +03:00
|
|
|
typedef struct MDBX_page {
|
2017-03-16 18:09:27 +03:00
|
|
|
union {
|
2017-05-31 17:10:17 +03:00
|
|
|
struct MDBX_page *mp_next; /* for in-memory list of freed pages,
|
|
|
|
* must be first field, see NEXT_LOOSE_PAGE */
|
2017-05-30 16:22:42 +03:00
|
|
|
uint64_t mp_validator; /* checksum of page content or a txnid during
|
|
|
|
* which the page has been updated */
|
2017-05-23 21:04:23 +03:00
|
|
|
};
|
2017-05-23 14:44:53 +03:00
|
|
|
uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
|
|
|
|
#define P_BRANCH 0x01 /* branch page */
|
|
|
|
#define P_LEAF 0x02 /* leaf page */
|
|
|
|
#define P_OVERFLOW 0x04 /* overflow page */
|
|
|
|
#define P_META 0x08 /* meta page */
|
|
|
|
#define P_DIRTY 0x10 /* dirty page, also set for P_SUBP pages */
|
2017-05-24 01:42:10 +03:00
|
|
|
#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */
|
|
|
|
#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */
|
2017-05-23 14:44:53 +03:00
|
|
|
#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */
|
|
|
|
#define P_KEEP 0x8000 /* leave this page alone during spill */
|
|
|
|
uint16_t mp_flags;
|
2017-03-16 18:09:27 +03:00
|
|
|
union {
|
|
|
|
struct {
|
2017-05-23 21:04:23 +03:00
|
|
|
indx_t mp_lower; /* lower bound of free space */
|
|
|
|
indx_t mp_upper; /* upper bound of free space */
|
|
|
|
};
|
|
|
|
uint32_t mp_pages; /* number of overflow pages */
|
|
|
|
};
|
2017-05-30 16:22:42 +03:00
|
|
|
pgno_t mp_pgno; /* page number */
|
2017-05-24 21:43:29 +03:00
|
|
|
|
|
|
|
/* dynamic size */
|
|
|
|
union {
|
|
|
|
indx_t mp_ptrs[1];
|
|
|
|
MDBX_meta mp_meta;
|
|
|
|
uint8_t mp_data[1];
|
|
|
|
};
|
2017-05-23 21:04:23 +03:00
|
|
|
} MDBX_page;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Size of the page header, excluding dynamic data at the end */
|
2017-05-24 21:43:29 +03:00
|
|
|
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data))
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
#pragma pack(pop)
|
|
|
|
|
|
|
|
/* Reader Lock Table
|
|
|
|
*
|
|
|
|
* Readers don't acquire any locks for their data access. Instead, they
|
|
|
|
* simply record their transaction ID in the reader table. The reader
|
|
|
|
* mutex is needed just to find an empty slot in the reader table. The
|
|
|
|
* slot's address is saved in thread-specific data so that subsequent
|
|
|
|
* read transactions started by the same thread need no further locking to
|
|
|
|
* proceed.
|
|
|
|
*
|
|
|
|
* If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
|
|
|
|
* No reader table is used if the database is on a read-only filesystem.
|
|
|
|
*
|
|
|
|
* Since the database uses multi-version concurrency control, readers don't
|
|
|
|
* actually need any locking. This table is used to keep track of which
|
|
|
|
* readers are using data from which old transactions, so that we'll know
|
|
|
|
* when a particular old transaction is no longer in use. Old transactions
|
|
|
|
* that have discarded any data pages can then have those pages reclaimed
|
|
|
|
* for use by a later write transaction.
|
|
|
|
*
|
|
|
|
* The lock table is constructed such that reader slots are aligned with the
|
|
|
|
* processor's cache line size. Any slot is only ever used by one thread.
|
|
|
|
* This alignment guarantees that there will be no contention or cache
|
|
|
|
* thrashing as threads update their own slot info, and also eliminates
|
|
|
|
* any need for locking when accessing a slot.
|
|
|
|
*
|
|
|
|
* A writer thread will scan every slot in the table to determine the oldest
|
|
|
|
* outstanding reader transaction. Any freed pages older than this will be
|
|
|
|
* reclaimed by the writer. The writer doesn't use any locks when scanning
|
|
|
|
* this table. This means that there's no guarantee that the writer will
|
|
|
|
* see the most up-to-date reader info, but that's not required for correct
|
|
|
|
* operation - all we need is to know the upper bound on the oldest reader,
|
|
|
|
* we don't care at all about the newest reader. So the only consequence of
|
|
|
|
* reading stale information here is that old pages might hang around a
|
|
|
|
* while longer before being reclaimed. That's actually good anyway, because
|
|
|
|
* the longer we delay reclaiming old pages, the more likely it is that a
|
|
|
|
* string of contiguous pages can be found after coalescing old pages from
|
|
|
|
* many old transactions together. */
|
|
|
|
|
|
|
|
/* The actual reader record, with cacheline padding. */
|
|
|
|
typedef struct MDBX_reader {
|
|
|
|
/* Current Transaction ID when this transaction began, or (txnid_t)-1.
|
|
|
|
* Multiple readers that start at the same time will probably have the
|
|
|
|
* same ID here. Again, it's not important to exclude them from
|
|
|
|
* anything; all we need to know is which version of the DB they
|
|
|
|
* started from so we can avoid overwriting any data used in that
|
|
|
|
* particular version. */
|
2019-09-23 21:22:55 +03:00
|
|
|
mdbx_safe64_t /* txnid_t */ mr_txnid;
|
2019-08-23 03:36:56 +03:00
|
|
|
|
|
|
|
/* The information we store in a single slot of the reader table.
|
|
|
|
* In addition to a transaction ID, we also record the process and
|
|
|
|
* thread ID that owns a slot, so that we can detect stale information,
|
|
|
|
* e.g. threads or processes that went away without cleaning up.
|
|
|
|
*
|
|
|
|
* NOTE: We currently don't check for stale records.
|
|
|
|
* We simply re-init the table when we know that we're the only process
|
|
|
|
* opening the lock file. */
|
|
|
|
|
|
|
|
/* The thread ID of the thread owning this txn. */
|
2019-09-23 15:32:29 +03:00
|
|
|
#if MDBX_WORDBITS >= 64
|
|
|
|
volatile uint64_t mr_tid;
|
|
|
|
#else
|
|
|
|
volatile uint32_t mr_tid;
|
|
|
|
volatile uint32_t mr_aba_curer; /* CSN to resolve ABA_problems on 32-bit arch,
|
|
|
|
unused for now */
|
|
|
|
#endif
|
2019-08-23 03:36:56 +03:00
|
|
|
/* The process ID of the process owning this reader txn. */
|
2019-09-23 15:32:29 +03:00
|
|
|
volatile uint32_t mr_pid;
|
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
/* The number of pages used in the reader's MVCC snapshot,
|
|
|
|
* i.e. the value of meta->mm_geo.next and txn->mt_next_pgno */
|
|
|
|
volatile pgno_t mr_snapshot_pages_used;
|
|
|
|
/* Number of retired pages at the time this reader starts transaction. So,
|
|
|
|
* at any time the difference mm_pages_retired - mr_snapshot_pages_retired
|
|
|
|
* will give the number of pages which this reader restraining from reuse. */
|
|
|
|
volatile uint64_t mr_snapshot_pages_retired;
|
|
|
|
} MDBX_reader;
|
|
|
|
|
2017-03-16 18:09:27 +03:00
|
|
|
/* The header for the reader table (a memory-mapped lock file). */
|
|
|
|
typedef struct MDBX_lockinfo {
|
2017-05-30 16:22:42 +03:00
|
|
|
/* Stamp identifying this as an MDBX file.
|
|
|
|
* It must be set to MDBX_MAGIC with with MDBX_LOCK_VERSION. */
|
|
|
|
uint64_t mti_magic_and_version;
|
|
|
|
|
2017-05-24 01:42:10 +03:00
|
|
|
/* Format of this lock file. Must be set to MDBX_LOCK_FORMAT. */
|
2017-05-30 16:22:42 +03:00
|
|
|
uint32_t mti_os_and_format;
|
|
|
|
|
2017-04-21 19:00:33 +03:00
|
|
|
/* Flags which environment was opened. */
|
2017-05-30 16:22:42 +03:00
|
|
|
volatile uint32_t mti_envmode;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
/* Threshold of un-synced-with-disk pages for auto-sync feature,
|
|
|
|
* zero means no-threshold, i.e. auto-sync is disabled. */
|
|
|
|
volatile pgno_t mti_autosync_threshold;
|
2019-09-11 19:12:57 +03:00
|
|
|
|
|
|
|
uint32_t reserved_pad;
|
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
/* Period for timed auto-sync feature, i.e. at the every steady checkpoint
|
|
|
|
* the mti_unsynced_timeout sets to the current_time + mti_autosync_period.
|
|
|
|
* The time value is represented in a suitable system-dependent form, for
|
|
|
|
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
|
|
|
|
* Zero means timed auto-sync is disabled. */
|
|
|
|
volatile uint64_t mti_autosync_period;
|
|
|
|
|
2019-08-31 00:55:15 +03:00
|
|
|
/* Marker to distinguish uniqueness of DB/CLK.*/
|
|
|
|
volatile uint64_t mti_bait_uniqueness;
|
|
|
|
|
2019-09-11 19:12:57 +03:00
|
|
|
/* the hash of /proc/sys/kernel/random/boot_id or analogue */
|
2019-08-31 00:55:15 +03:00
|
|
|
volatile uint64_t mti_boot_id;
|
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
2017-03-16 18:09:27 +03:00
|
|
|
#ifdef MDBX_OSAL_LOCK
|
2019-08-23 03:36:56 +03:00
|
|
|
/* Mutex protecting write-txn. */
|
|
|
|
MDBX_OSAL_LOCK mti_wmutex;
|
2018-03-22 20:34:09 +03:00
|
|
|
#endif
|
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
volatile txnid_t mti_oldest_reader;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
/* Timestamp for auto-sync feature, i.e. the steady checkpoint should be
|
|
|
|
* created at the first commit that will be not early this timestamp.
|
|
|
|
* The time value is represented in a suitable system-dependent form, for
|
|
|
|
* example clock_gettime(CLOCK_BOOTTIME) or clock_gettime(CLOCK_MONOTONIC).
|
|
|
|
* Zero means timed auto-sync is not pending. */
|
|
|
|
volatile uint64_t mti_unsynced_timeout;
|
2017-06-14 23:33:13 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
/* Number un-synced-with-disk pages for auto-sync feature. */
|
|
|
|
volatile pgno_t mti_unsynced_pages;
|
2017-06-30 00:20:33 +03:00
|
|
|
|
2019-08-28 04:57:07 +03:00
|
|
|
/* Number of page which was discarded last time by madvise(MADV_FREE). */
|
|
|
|
volatile pgno_t mti_discarded_tail;
|
|
|
|
|
2019-09-11 19:12:57 +03:00
|
|
|
/* Timestamp of the last readers check. */
|
|
|
|
volatile uint64_t mti_reader_check_timestamp;
|
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
2018-03-22 20:34:09 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
#ifdef MDBX_OSAL_LOCK
|
|
|
|
/* Mutex protecting readers registration access to this table. */
|
|
|
|
MDBX_OSAL_LOCK mti_rmutex;
|
|
|
|
#endif
|
2017-06-30 00:20:33 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
/* The number of slots that have been used in the reader table.
|
|
|
|
* This always records the maximum count, it is not decremented
|
|
|
|
* when readers release their slots. */
|
|
|
|
volatile unsigned mti_numreaders;
|
|
|
|
volatile unsigned mti_readers_refresh_flag;
|
2017-06-14 23:33:13 +03:00
|
|
|
|
2019-08-23 03:36:56 +03:00
|
|
|
alignas(MDBX_CACHELINE_SIZE) /* cacheline ---------------------------------*/
|
|
|
|
MDBX_reader mti_readers[1];
|
2017-03-16 18:09:27 +03:00
|
|
|
} MDBX_lockinfo;
|
|
|
|
|
2017-05-30 16:22:42 +03:00
|
|
|
/* Lockfile format signature: version, features and field layout.
 * It mixes the OS lock signature with the size of a reader slot and the
 * offsets of selected members, so a different layout of MDBX_reader or
 * MDBX_lockinfo yields a different value. */
#define MDBX_LOCK_FORMAT                                                       \
  (MDBX_OSAL_LOCK_SIGN * 27733 + (unsigned)sizeof(MDBX_reader) * 13 +          \
   (unsigned)offsetof(MDBX_reader, mr_snapshot_pages_used) * 251 +             \
   (unsigned)offsetof(MDBX_lockinfo, mti_oldest_reader) * 83 +                 \
   (unsigned)offsetof(MDBX_lockinfo, mti_numreaders) * 29)

/* Stamps stored in the files: MDBX_MAGIC in the upper 56 bits and a format
 * version in the low byte (255 marks a development datafile format). */
#define MDBX_DATA_MAGIC ((MDBX_MAGIC << 8) + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_DEVEL ((MDBX_MAGIC << 8) + 255)

#define MDBX_LOCK_MAGIC ((MDBX_MAGIC << 8) + MDBX_LOCK_VERSION)
|
|
|
|
|
2018-09-01 19:36:45 +03:00
|
|
|
/* Assumed per-allocation bookkeeping overhead of malloc(), in bytes.
 * May be overridden by defining the macro before this point. */
#ifndef MDBX_ASSUME_MALLOC_OVERHEAD
#define MDBX_ASSUME_MALLOC_OVERHEAD (sizeof(void *) * 2u)
#endif /* MDBX_ASSUME_MALLOC_OVERHEAD */
|
|
|
|
|
2019-09-02 13:23:39 +03:00
|
|
|
/* The maximum size of a database page.
 *
 * It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper.
 *
 * MDBX will use database pages < OS pages if needed.
 * That causes more I/O in write transactions: The OS must
 * know (read) the whole page before writing a partial page.
 *
 * Note that we don't currently support Huge pages. On Linux,
 * regular data files cannot use Huge pages, and in general
 * Huge pages aren't actually pageable. We rely on the OS
 * demand-pager to read our data and page it out when memory
 * pressure from other processes is high. So until OSs have
 * actual paging support for Huge pages, they're not viable. */
#define MAX_PAGESIZE 0x10000u
#define MIN_PAGESIZE 512u

#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
#if defined(_WIN32) || defined(_WIN64)
#define MAX_MAPSIZE32 UINT32_C(0x38000000)
#else
#define MAX_MAPSIZE32 UINT32_C(0x7ff80000)
#endif
#define MAX_MAPSIZE64 (MAX_PAGENO * (uint64_t)MAX_PAGESIZE)

/* Pick the map-size ceiling and the reader-table limit by word size. */
#if MDBX_WORDBITS >= 64
#define MAX_MAPSIZE MAX_MAPSIZE64
#define MDBX_READERS_LIMIT                                                     \
  ((65536 - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader) + 1)
#else
#define MDBX_READERS_LIMIT 1024
#define MAX_MAPSIZE MAX_MAPSIZE32
#endif /* MDBX_WORDBITS */
|
|
|
|
|
2017-05-24 13:59:50 +03:00
|
|
|
/*----------------------------------------------------------------------------*/
|
2017-07-24 00:54:10 +03:00
|
|
|
/* Two kind lists of pages (aka PNL) */
|
2017-05-24 13:59:50 +03:00
|
|
|
|
2017-07-24 00:54:10 +03:00
|
|
|
/* An PNL is an Page Number List, a sorted array of IDs. The first element of
|
|
|
|
* the array is a counter for how many actual page-numbers are in the list.
|
|
|
|
* PNLs are sorted in descending order, this allow cut off a page with lowest
|
|
|
|
* pgno (at the tail) just truncating the list */
|
|
|
|
#define MDBX_PNL_ASCENDING 0
|
2017-07-26 09:31:22 +03:00
|
|
|
typedef pgno_t *MDBX_PNL;
|
2017-05-24 13:59:50 +03:00
|
|
|
|
2017-07-24 00:54:10 +03:00
|
|
|
/* Ordering predicates for two adjacent PNL items, following the direction
 * selected by MDBX_PNL_ASCENDING. DISORDERED is the exact negation of
 * ORDERED, so equal items count as disordered. */
#if MDBX_PNL_ASCENDING
#define MDBX_PNL_ORDERED(first, last) ((first) < (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) >= (last))
#else
#define MDBX_PNL_ORDERED(first, last) ((first) > (last))
#define MDBX_PNL_DISORDERED(first, last) ((first) <= (last))
#endif
|
|
|
|
|
2017-06-05 14:02:44 +03:00
|
|
|
/* List of txnid, only for MDBX_env.mt_lifo_reclaimed */
|
|
|
|
typedef txnid_t *MDBX_TXL;
|
|
|
|
|
2018-09-01 19:36:45 +03:00
|
|
|
/* An Dirty-Page list item is an pgno/pointer pair. */
|
|
|
|
typedef union MDBX_DP {
|
2018-08-25 20:17:26 +03:00
|
|
|
struct {
|
|
|
|
pgno_t pgno;
|
2018-08-25 21:12:30 +03:00
|
|
|
void *ptr;
|
2018-08-25 20:17:26 +03:00
|
|
|
};
|
2018-09-01 19:36:45 +03:00
|
|
|
struct {
|
2019-09-21 19:57:05 +03:00
|
|
|
unsigned sorted;
|
2018-09-01 19:36:45 +03:00
|
|
|
unsigned length;
|
|
|
|
};
|
|
|
|
} MDBX_DP;
|
|
|
|
|
|
|
|
/* An DPL (dirty-page list) is a sorted array of MDBX_DPs.
|
|
|
|
* The first element's length member is a count of how many actual
|
|
|
|
* elements are in the array. */
|
|
|
|
typedef MDBX_DP *MDBX_DPL;
|
|
|
|
|
|
|
|
/* PNL sizes - likely should be even bigger.
 *
 * The INITIAL/MAX values are expressed in items: a round number, minus 2
 * items, minus the assumed malloc overhead converted to items. */
#define MDBX_PNL_GRANULATE 1024
#define MDBX_PNL_INITIAL                                                       \
  (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_PNL_MAX                                                           \
  ((1u << 24) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t))
#define MDBX_DPL_TXNFULL (MDBX_PNL_MAX / 4)

/* TXL sizes, computed the same way with txnid_t items. */
#define MDBX_TXL_GRANULATE 32
#define MDBX_TXL_INITIAL                                                       \
  (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
#define MDBX_TXL_MAX                                                           \
  ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t))
|
2017-05-24 13:59:50 +03:00
|
|
|
|
2017-07-26 09:31:22 +03:00
|
|
|
/* PNL accessors. `pl` points at the item counter: pl[-1] holds the allocated
 * length, pl[0] the number of items, and pl[1]..pl[pl[0]] the items. */
#define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1])
#define MDBX_PNL_SIZE(pl) ((pl)[0])
#define MDBX_PNL_FIRST(pl) ((pl)[1])
#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)])
#define MDBX_PNL_BEGIN(pl) (&(pl)[1])
#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1])

/* Size in bytes of the counter plus all items. */
#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t))
#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0)
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/*----------------------------------------------------------------------------*/
|
|
|
|
/* Internal structures */
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-04-26 18:12:48 +03:00
|
|
|
/* Auxiliary DB info.
|
|
|
|
* The information here is mostly static/read-only. There is
|
|
|
|
* only a single copy of this record in the environment. */
|
2017-05-24 01:42:10 +03:00
|
|
|
typedef struct MDBX_dbx {
|
|
|
|
MDBX_val md_name; /* name of the database */
|
|
|
|
MDBX_cmp_func *md_cmp; /* function for comparing keys */
|
|
|
|
MDBX_cmp_func *md_dcmp; /* function for comparing data items */
|
|
|
|
} MDBX_dbx;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-04-26 18:12:48 +03:00
|
|
|
/* A database transaction.
|
|
|
|
* Every operation requires a transaction handle. */
|
2017-05-23 21:36:09 +03:00
|
|
|
struct MDBX_txn {
|
2017-06-21 01:19:04 +03:00
|
|
|
#define MDBX_MT_SIGNATURE UINT32_C(0x93D53A31)
|
2017-07-03 09:56:46 +03:00
|
|
|
size_t mt_signature;
|
2017-05-23 21:36:09 +03:00
|
|
|
MDBX_txn *mt_parent; /* parent of a nested txn */
|
2017-05-24 01:42:10 +03:00
|
|
|
/* Nested txn under this txn, set together with flag MDBX_TXN_HAS_CHILD */
|
2017-05-23 21:36:09 +03:00
|
|
|
MDBX_txn *mt_child;
|
2019-09-17 23:52:51 +03:00
|
|
|
mdbx_geo_t mt_geo;
|
|
|
|
/* next unallocated page */
|
|
|
|
#define mt_next_pgno mt_geo.next
|
|
|
|
/* corresponding to the current size of datafile */
|
|
|
|
#define mt_end_pgno mt_geo.now
|
|
|
|
|
|
|
|
/* Transaction Flags */
|
|
|
|
/* mdbx_txn_begin() flags */
|
|
|
|
#define MDBX_TXN_BEGIN_FLAGS \
|
|
|
|
(MDBX_NOMETASYNC | MDBX_NOSYNC | MDBX_MAPASYNC | MDBX_RDONLY | MDBX_TRYTXN)
|
|
|
|
/* internal txn flags */
|
|
|
|
#define MDBX_TXN_FINISHED 0x01 /* txn is finished or never began */
|
|
|
|
#define MDBX_TXN_ERROR 0x02 /* txn is unusable after an error */
|
|
|
|
#define MDBX_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */
|
|
|
|
#define MDBX_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */
|
|
|
|
#define MDBX_TXN_HAS_CHILD 0x10 /* txn has an MDBX_txn.mt_child */
|
|
|
|
/* most operations on the txn are currently illegal */
|
|
|
|
#define MDBX_TXN_BLOCKED \
|
|
|
|
(MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_HAS_CHILD)
|
|
|
|
unsigned mt_flags;
|
2017-04-26 18:12:48 +03:00
|
|
|
/* The ID of this transaction. IDs are integers incrementing from 1.
|
|
|
|
* Only committed write transactions increment the ID. If a transaction
|
|
|
|
* aborts, the ID may be re-used by the next writer. */
|
2017-03-16 18:09:27 +03:00
|
|
|
txnid_t mt_txnid;
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_env *mt_env; /* the DB environment */
|
|
|
|
/* The list of reclaimed txns from freeDB */
|
2017-06-05 14:02:44 +03:00
|
|
|
MDBX_TXL mt_lifo_reclaimed;
|
2017-04-26 18:12:48 +03:00
|
|
|
/* The list of pages that became unused during this transaction. */
|
2019-09-20 20:44:35 +03:00
|
|
|
MDBX_PNL mt_retired_pages;
|
2017-04-26 18:12:48 +03:00
|
|
|
/* The list of loose pages that became unused and may be reused
|
2017-05-23 14:44:53 +03:00
|
|
|
* in this transaction, linked through NEXT_LOOSE_PAGE(page). */
|
2017-05-23 21:36:09 +03:00
|
|
|
MDBX_page *mt_loose_pages;
|
|
|
|
/* Number of loose pages (mt_loose_pages) */
|
2017-05-23 14:44:53 +03:00
|
|
|
unsigned mt_loose_count;
|
2017-04-26 18:12:48 +03:00
|
|
|
/* The sorted list of dirty pages we temporarily wrote to disk
|
|
|
|
* because the dirty list was full. page numbers in here are
|
|
|
|
* shifted left by 1, deleted slots have the LSB set. */
|
2017-07-26 09:31:22 +03:00
|
|
|
MDBX_PNL mt_spill_pages;
|
2017-03-16 18:09:27 +03:00
|
|
|
union {
|
2017-05-24 01:42:10 +03:00
|
|
|
/* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */
|
2018-09-01 19:36:45 +03:00
|
|
|
MDBX_DPL mt_rw_dirtylist;
|
2017-04-26 18:12:48 +03:00
|
|
|
/* For read txns: This thread/txn's reader table slot, or NULL. */
|
2017-05-23 21:36:09 +03:00
|
|
|
MDBX_reader *mt_ro_reader;
|
|
|
|
};
|
2017-04-26 18:12:48 +03:00
|
|
|
/* Array of records for each DB known in the environment. */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_dbx *mt_dbxs;
|
|
|
|
/* Array of MDBX_db records for each known DB */
|
|
|
|
MDBX_db *mt_dbs;
|
2017-04-26 18:12:48 +03:00
|
|
|
/* Array of sequence numbers for each DB handle */
|
2017-03-16 18:09:27 +03:00
|
|
|
unsigned *mt_dbiseqs;
|
2017-04-26 18:12:48 +03:00
|
|
|
|
2017-05-17 20:06:57 +03:00
|
|
|
/* Transaction DB Flags */
|
2017-06-09 16:35:41 +03:00
|
|
|
#define DB_DIRTY MDBX_TBL_DIRTY /* DB was written in this txn */
|
|
|
|
#define DB_STALE MDBX_TBL_STALE /* Named-DB record is older than txnID */
|
2018-02-04 12:57:36 +03:00
|
|
|
#define DB_FRESH MDBX_TBL_FRESH /* Named-DB handle opened in this txn */
|
|
|
|
#define DB_CREAT MDBX_TBL_CREAT /* Named-DB handle created in this txn */
|
|
|
|
#define DB_VALID 0x10 /* DB handle is valid, see also MDBX_VALID */
|
|
|
|
#define DB_USRVALID 0x20 /* As DB_VALID, but not set for FREE_DBI */
|
|
|
|
#define DB_DUPDATA 0x40 /* DB is MDBX_DUPSORT data */
|
2017-05-17 20:06:57 +03:00
|
|
|
/* In write txns, array of cursors for each DB */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_cursor **mt_cursors;
|
2017-05-17 20:06:57 +03:00
|
|
|
/* Array of flags for each DB */
|
|
|
|
uint8_t *mt_dbflags;
|
|
|
|
/* Number of DB records in use, or 0 when the txn is finished.
|
2017-05-23 14:44:53 +03:00
|
|
|
* This number only ever increments until the txn finishes; we
|
|
|
|
* don't decrement it when individual DB handles are closed. */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_dbi mt_numdbs;
|
2017-05-23 21:36:09 +03:00
|
|
|
/* dirtylist room: Array size - dirty pages visible to this txn.
|
2017-05-23 14:44:53 +03:00
|
|
|
* Includes ancestor txns' dirty pages not hidden by other txns'
|
|
|
|
* dirty/spilled pages. Thus commit(nested txn) has room to merge
|
2017-05-23 21:36:09 +03:00
|
|
|
* dirtylist into mt_parent after freeing hidden mt_parent pages. */
|
|
|
|
unsigned mt_dirtyroom;
|
2019-09-23 15:32:29 +03:00
|
|
|
size_t mt_owner; /* thread ID that owns this transaction */
|
2017-07-03 09:56:46 +03:00
|
|
|
mdbx_canary mt_canary;
|
2017-03-16 18:09:27 +03:00
|
|
|
};
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Depth of the cursor's page stack.
 * Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
 * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
 * raise this on a 64 bit machine. */
#define CURSOR_STACK 32
|
|
|
|
|
2017-05-24 01:42:10 +03:00
|
|
|
struct MDBX_xcursor;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Cursors are used for all DB operations.
|
|
|
|
* A cursor holds a path of (page pointer, key index) from the DB
|
2017-05-24 01:42:10 +03:00
|
|
|
* root to a position in the DB, plus other state. MDBX_DUPSORT
|
2017-05-23 14:44:53 +03:00
|
|
|
* cursors include an xcursor to the current data item. Write txns
|
|
|
|
* track their cursors and keep them up to date when data moves.
|
|
|
|
* Exception: An xcursor's pointer to a P_SUBP page can be stale.
|
|
|
|
* (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */
|
2017-05-24 01:42:10 +03:00
|
|
|
struct MDBX_cursor {
|
2017-06-21 01:19:04 +03:00
|
|
|
#define MDBX_MC_SIGNATURE UINT32_C(0xFE05D5B1)
|
|
|
|
#define MDBX_MC_READY4CLOSE UINT32_C(0x2817A047)
|
|
|
|
#define MDBX_MC_WAIT4EOT UINT32_C(0x90E297A7)
|
|
|
|
uint32_t mc_signature;
|
2017-07-03 09:56:46 +03:00
|
|
|
/* The database handle this cursor operates on */
|
|
|
|
MDBX_dbi mc_dbi;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Next cursor on this DB in this txn */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_cursor *mc_next;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Backup of the original cursor if this cursor is a shadow */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_cursor *mc_backup;
|
|
|
|
/* Context used for databases with MDBX_DUPSORT, otherwise NULL */
|
|
|
|
struct MDBX_xcursor *mc_xcursor;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The transaction that owns this cursor */
|
2017-05-23 21:36:09 +03:00
|
|
|
MDBX_txn *mc_txn;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The database record for this cursor */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_db *mc_db;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The database auxiliary record for this cursor */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_dbx *mc_dbx;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The mt_dbflag for this database */
|
2017-03-16 18:09:27 +03:00
|
|
|
uint8_t *mc_dbflag;
|
2017-05-23 21:04:23 +03:00
|
|
|
uint16_t mc_snum; /* number of pushed pages */
|
|
|
|
uint16_t mc_top; /* index of top page, normally mc_snum-1 */
|
|
|
|
/* Cursor state flags. */
|
|
|
|
#define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
|
|
|
|
#define C_EOF 0x02 /* No more data */
|
|
|
|
#define C_SUB 0x04 /* Cursor is a sub-cursor */
|
|
|
|
#define C_DEL 0x08 /* last op was a cursor_del */
|
2018-08-28 11:04:45 +03:00
|
|
|
#define C_UNTRACK 0x10 /* Un-track cursor when closing */
|
|
|
|
#define C_RECLAIMING 0x20 /* FreeDB lookup is prohibited */
|
|
|
|
#define C_GCFREEZE 0x40 /* me_reclaimed_pglist must not be updated */
|
2017-05-23 21:04:23 +03:00
|
|
|
unsigned mc_flags; /* see mdbx_cursor */
|
|
|
|
MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
|
|
|
|
indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
|
2017-03-16 18:09:27 +03:00
|
|
|
};
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Context for sorted-dup records.
|
|
|
|
* We could have gone to a fully recursive design, with arbitrarily
|
|
|
|
* deep nesting of sub-databases. But for now we only handle these
|
|
|
|
* levels - main DB, optional sub-DB, sorted-duplicate DB. */
|
2017-05-24 01:42:10 +03:00
|
|
|
typedef struct MDBX_xcursor {
|
2017-05-23 14:44:53 +03:00
|
|
|
/* A sub-cursor for traversing the Dup DB */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_cursor mx_cursor;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The database record for this Dup DB */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_db mx_db;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The auxiliary DB record for this Dup DB */
|
2017-05-24 01:42:10 +03:00
|
|
|
MDBX_dbx mx_dbx;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The mt_dbflag for this Dup DB */
|
|
|
|
uint8_t mx_dbflag;
|
2017-05-24 01:42:10 +03:00
|
|
|
} MDBX_xcursor;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2018-08-29 19:15:59 +03:00
|
|
|
typedef struct MDBX_cursor_couple {
|
|
|
|
MDBX_cursor outer;
|
|
|
|
MDBX_xcursor inner;
|
|
|
|
} MDBX_cursor_couple;
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper.
 * Evaluates to non-zero only when mc->mc_xcursor is non-NULL and its
 * sub-cursor has C_INITIALIZED set. */
#define XCURSOR_INITED(mc)                                                     \
  ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Update sub-page pointer, if any, in mc->mc_xcursor.
 * Needed when the node which contains the sub-page may have moved.
 * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top].
 *
 * mc->mc_xcursor is dereferenced without a NULL check, so the caller must
 * make sure it is set (see XCURSOR_INITED) before using this macro. */
#define XCURSOR_REFRESH(mc, mp, ki)                                            \
  do {                                                                         \
    MDBX_page *xr_pg = (mp);                                                   \
    MDBX_node *xr_node = NODEPTR(xr_pg, ki);                                   \
    if ((xr_node->mn_flags & (F_DUPDATA | F_SUBDATA)) == F_DUPDATA)            \
      (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node);                \
  } while (0)
|
|
|
|
|
2017-05-24 01:42:10 +03:00
|
|
|
/* State of FreeDB old pages, stored in the MDBX_env */
|
|
|
|
typedef struct MDBX_pgstate {
|
2017-06-21 01:34:56 +03:00
|
|
|
pgno_t *mf_reclaimed_pglist; /* Reclaimed freeDB pages, or NULL before use */
|
|
|
|
txnid_t mf_last_reclaimed; /* ID of last used record, or 0 if
|
|
|
|
!mf_reclaimed_pglist */
|
2017-05-24 01:42:10 +03:00
|
|
|
} MDBX_pgstate;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* The database environment. */
|
2017-05-24 01:42:10 +03:00
|
|
|
struct MDBX_env {
|
2017-06-21 01:19:04 +03:00
|
|
|
#define MDBX_ME_SIGNATURE UINT32_C(0x9A899641)
|
2017-07-03 09:56:46 +03:00
|
|
|
size_t me_signature;
|
2017-07-11 14:10:24 +03:00
|
|
|
mdbx_mmap_t me_dxb_mmap; /* The main data file */
|
|
|
|
#define me_map me_dxb_mmap.dxb
|
|
|
|
#define me_fd me_dxb_mmap.fd
|
2017-07-12 21:13:17 +03:00
|
|
|
#define me_mapsize me_dxb_mmap.length
|
|
|
|
mdbx_mmap_t me_lck_mmap; /* The lock file */
|
2017-07-11 14:10:24 +03:00
|
|
|
#define me_lfd me_lck_mmap.fd
|
2017-07-12 21:13:17 +03:00
|
|
|
#define me_lck me_lck_mmap.lck
|
2017-07-11 14:10:24 +03:00
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Failed to update the meta page. Probably an I/O error. */
|
2017-06-21 01:19:04 +03:00
|
|
|
#define MDBX_FATAL_ERROR UINT32_C(0x80000000)
|
2017-07-21 10:30:46 +03:00
|
|
|
/* Additional flag for mdbx_sync_locked() */
|
|
|
|
#define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000)
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Some fields are initialized. */
|
2017-06-21 01:19:04 +03:00
|
|
|
#define MDBX_ENV_ACTIVE UINT32_C(0x20000000)
|
2017-05-23 14:44:53 +03:00
|
|
|
/* me_txkey is set */
|
2017-06-21 01:19:04 +03:00
|
|
|
#define MDBX_ENV_TXKEY UINT32_C(0x10000000)
|
2017-05-23 14:44:53 +03:00
|
|
|
uint32_t me_flags; /* see mdbx_env */
|
|
|
|
unsigned me_psize; /* DB page size, inited from me_os_psize */
|
2017-06-05 17:16:21 +03:00
|
|
|
unsigned me_psize2log; /* log2 of DB page size */
|
2017-05-23 14:44:53 +03:00
|
|
|
unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */
|
|
|
|
unsigned me_maxreaders; /* size of the reader table */
|
2017-05-28 18:50:09 +03:00
|
|
|
mdbx_fastmutex_t me_dbi_lock;
|
2018-06-30 11:58:57 +03:00
|
|
|
MDBX_dbi me_numdbs; /* number of DBs opened */
|
|
|
|
MDBX_dbi me_maxdbs; /* size of the DB table */
|
2019-09-23 15:32:29 +03:00
|
|
|
uint32_t me_pid; /* process ID of this env */
|
2018-06-30 11:58:57 +03:00
|
|
|
mdbx_thread_key_t me_txkey; /* thread-key for readers */
|
|
|
|
char *me_path; /* path to the DB files */
|
|
|
|
void *me_pbuf; /* scratch area for DUPSORT put() */
|
|
|
|
MDBX_txn *me_txn; /* current write transaction */
|
|
|
|
MDBX_txn *me_txn0; /* prealloc'd write transaction */
|
|
|
|
#ifdef MDBX_OSAL_LOCK
|
|
|
|
MDBX_OSAL_LOCK *me_wmutex; /* write-txn mutex */
|
|
|
|
#endif
|
2017-06-14 23:33:13 +03:00
|
|
|
MDBX_dbx *me_dbxs; /* array of static DB info */
|
|
|
|
uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */
|
|
|
|
unsigned *me_dbiseqs; /* array of dbi sequence numbers */
|
|
|
|
volatile txnid_t *me_oldest; /* ID of oldest reader last time we looked */
|
|
|
|
MDBX_pgstate me_pgstate; /* state of old pages from freeDB */
|
2017-06-21 01:34:56 +03:00
|
|
|
#define me_last_reclaimed me_pgstate.mf_last_reclaimed
|
|
|
|
#define me_reclaimed_pglist me_pgstate.mf_reclaimed_pglist
|
2017-05-23 21:04:23 +03:00
|
|
|
MDBX_page *me_dpages; /* list of malloc'd blocks for re-use */
|
2017-07-26 09:31:22 +03:00
|
|
|
/* PNL of pages that became unused in a write txn */
|
2019-09-20 20:44:35 +03:00
|
|
|
MDBX_PNL me_retired_pages;
|
2018-09-01 19:36:45 +03:00
|
|
|
/* MDBX_DP of pages written during a write txn. Length MDBX_DPL_TXNFULL. */
|
|
|
|
MDBX_DPL me_dirtylist;
|
2018-08-13 01:20:24 +03:00
|
|
|
/* Number of freelist items that can fit in a single overflow page */
|
|
|
|
unsigned me_maxgc_ov1page;
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Max size of a node on a page */
|
2017-03-16 18:09:27 +03:00
|
|
|
unsigned me_nodemax;
|
2019-09-23 15:32:29 +03:00
|
|
|
unsigned me_maxkey_limit; /* max size of a key */
|
|
|
|
uint32_t me_live_reader; /* have liveness lock in reader table */
|
|
|
|
void *me_userctx; /* User-settable context */
|
2019-08-23 13:13:20 +03:00
|
|
|
volatile uint64_t *me_unsynced_timeout;
|
|
|
|
volatile uint64_t *me_autosync_period;
|
2019-08-23 11:40:30 +03:00
|
|
|
volatile pgno_t *me_unsynced_pages;
|
|
|
|
volatile pgno_t *me_autosync_threshold;
|
2019-08-28 04:57:07 +03:00
|
|
|
volatile pgno_t *me_discarded_tail;
|
2017-05-23 14:44:53 +03:00
|
|
|
MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
|
2019-08-23 11:40:30 +03:00
|
|
|
struct {
|
|
|
|
#ifdef MDBX_OSAL_LOCK
|
|
|
|
MDBX_OSAL_LOCK wmutex;
|
|
|
|
#endif
|
|
|
|
txnid_t oldest;
|
2019-08-23 13:13:20 +03:00
|
|
|
uint64_t unsynced_timeout;
|
|
|
|
uint64_t autosync_period;
|
2019-08-23 11:40:30 +03:00
|
|
|
pgno_t autosync_pending;
|
|
|
|
pgno_t autosync_threshold;
|
2019-08-28 04:57:07 +03:00
|
|
|
pgno_t discarded_tail;
|
2019-08-23 11:40:30 +03:00
|
|
|
} me_lckless_stub;
|
2017-06-14 23:33:13 +03:00
|
|
|
#if MDBX_DEBUG
|
|
|
|
MDBX_assert_func *me_assert_func; /* Callback for assertion failures */
|
|
|
|
#endif
|
2017-03-16 18:09:27 +03:00
|
|
|
#ifdef USE_VALGRIND
|
|
|
|
int me_valgrind_handle;
|
|
|
|
#endif
|
2019-08-31 00:55:15 +03:00
|
|
|
MDBX_env *me_lcklist_next;
|
2017-07-12 21:13:17 +03:00
|
|
|
|
2017-06-21 01:34:56 +03:00
|
|
|
struct {
|
|
|
|
size_t lower; /* minimal size of datafile */
|
|
|
|
size_t upper; /* maximal size of datafile */
|
|
|
|
size_t now; /* current size of datafile */
|
|
|
|
size_t grow; /* step to grow datafile */
|
|
|
|
size_t shrink; /* threshold to shrink datafile */
|
|
|
|
} me_dbgeo; /* */
|
2018-01-07 14:37:38 +03:00
|
|
|
|
|
|
|
#if defined(_WIN32) || defined(_WIN64)
|
2018-06-12 16:43:33 +03:00
|
|
|
MDBX_srwlock me_remap_guard;
|
2018-01-31 15:15:54 +03:00
|
|
|
/* Workaround for LockFileEx and WriteFile multithread bug */
|
|
|
|
CRITICAL_SECTION me_windowsbug_lock;
|
2018-01-07 14:37:38 +03:00
|
|
|
#else
|
|
|
|
mdbx_fastmutex_t me_remap_guard;
|
|
|
|
#endif
|
2017-03-16 18:09:27 +03:00
|
|
|
};
|
|
|
|
|
2017-05-23 14:44:53 +03:00
|
|
|
/* Nested transaction */
|
2017-05-24 01:42:10 +03:00
|
|
|
typedef struct MDBX_ntxn {
|
|
|
|
MDBX_txn mnt_txn; /* the transaction */
|
|
|
|
MDBX_pgstate mnt_pgstate; /* parent transaction's saved freestate */
|
|
|
|
} MDBX_ntxn;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
|
|
|
/*----------------------------------------------------------------------------*/
|
2017-05-24 13:59:50 +03:00
|
|
|
/* Debug and Logging stuff */
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
/* Initial runtime flags derived from the build-time MDBX_DEBUG level:
 * MDBX_DBG_ASSERT when MDBX_DEBUG > 0, plus MDBX_DBG_AUDIT when
 * MDBX_DEBUG > 1. The whole expansion is parenthesized so that the macro
 * can be used safely inside a larger expression. */
#define MDBX_RUNTIME_FLAGS_INIT                                                \
  (((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT)
|
2019-08-31 17:10:04 +03:00
|
|
|
|
2019-08-27 00:29:15 +03:00
|
|
|
#ifndef mdbx_runtime_flags /* avoid override from tools */
|
2019-09-24 02:07:00 +03:00
|
|
|
MDBX_INTERNAL_VAR uint8_t mdbx_runtime_flags;
|
|
|
|
#endif
|
|
|
|
#ifndef mdbx_runtime_flags /* avoid override from tools */
|
|
|
|
MDBX_INTERNAL_VAR uint8_t mdbx_loglevel;
|
2019-08-27 00:29:15 +03:00
|
|
|
#endif
|
2019-08-31 17:10:04 +03:00
|
|
|
MDBX_INTERNAL_VAR MDBX_debug_func *mdbx_debug_logger;
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
MDBX_INTERNAL_FUNC void mdbx_debug_log(int type, const char *function, int line,
|
|
|
|
const char *fmt, ...)
|
|
|
|
__printf_args(4, 5);
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
MDBX_INTERNAL_FUNC void mdbx_panic(const char *fmt, ...) __printf_args(1, 2);
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2017-05-24 01:42:10 +03:00
|
|
|
/* Predicates deciding whether assertions, audit and a given log level are
 * active. With MDBX_DEBUG they consult the runtime flags/level; otherwise
 * they are compile-time constants. MDBX_LOGLEVEL_BUILD, when defined, caps
 * the levels that can ever be enabled. */
#if MDBX_DEBUG

#define mdbx_assert_enabled() unlikely(mdbx_runtime_flags & MDBX_DBG_ASSERT)

#define mdbx_audit_enabled() unlikely(mdbx_runtime_flags & MDBX_DBG_AUDIT)

#ifdef MDBX_LOGLEVEL_BUILD
#define mdbx_log_enabled(msg)                                                  \
  ((msg) <= MDBX_LOGLEVEL_BUILD && unlikely((msg) <= mdbx_loglevel))
#else
#define mdbx_log_enabled(msg) unlikely((msg) <= mdbx_loglevel)
#endif /* MDBX_LOGLEVEL_BUILD */

#else /* MDBX_DEBUG */

#define mdbx_audit_enabled() (0)

#if !defined(NDEBUG) || defined(MDBX_FORCE_ASSERT)
#define mdbx_assert_enabled() (1)
#else
#define mdbx_assert_enabled() (0)
#endif /* NDEBUG */

#ifdef MDBX_LOGLEVEL_BUILD
#define mdbx_log_enabled(msg) ((msg) <= MDBX_LOGLEVEL_BUILD)
#else
#define mdbx_log_enabled(msg) (0)
#endif /* MDBX_LOGLEVEL_BUILD */

#endif /* MDBX_DEBUG */
|
2017-03-16 18:09:27 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
/* Reports a failed check; `msg` is the message or the stringized expression,
 * `func` and `line` identify the call site (see mdbx_ensure_msg). */
MDBX_INTERNAL_FUNC void mdbx_assert_fail(const MDBX_env *env, const char *msg,
                                         const char *func, int line);
|
2018-10-14 01:11:14 +03:00
|
|
|
|
2019-09-24 02:07:00 +03:00
|
|
|
/* Logging front-ends, one per level. Each forwards to mdbx_debug_log() only
 * when mdbx_log_enabled() accepts its level. The *_print variants pass no
 * call-site information and do not append a newline; mdbx_debug_extra()
 * passes the call site but also appends no newline. */
#define mdbx_debug_extra(fmt, ...)                                             \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_EXTRA))                                      \
      mdbx_debug_log(MDBX_LOG_EXTRA, __FUNCTION__, __LINE__, fmt,              \
                     ##__VA_ARGS__);                                           \
  } while (0)

#define mdbx_debug_extra_print(fmt, ...)                                       \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_EXTRA))                                      \
      mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, ##__VA_ARGS__);             \
  } while (0)

#define mdbx_trace(fmt, ...)                                                   \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_TRACE))                                      \
      mdbx_debug_log(MDBX_LOG_TRACE, __FUNCTION__, __LINE__, fmt "\n",         \
                     ##__VA_ARGS__);                                           \
  } while (0)

#define mdbx_debug(fmt, ...)                                                   \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_DEBUG))                                      \
      mdbx_debug_log(MDBX_LOG_DEBUG, __FUNCTION__, __LINE__, fmt "\n",         \
                     ##__VA_ARGS__);                                           \
  } while (0)

#define mdbx_debug_print(fmt, ...)                                             \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_DEBUG))                                      \
      mdbx_debug_log(MDBX_LOG_DEBUG, NULL, 0, fmt, ##__VA_ARGS__);             \
  } while (0)

#define mdbx_verbose(fmt, ...)                                                 \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_VERBOSE))                                    \
      mdbx_debug_log(MDBX_LOG_VERBOSE, __FUNCTION__, __LINE__, fmt "\n",       \
                     ##__VA_ARGS__);                                           \
  } while (0)

#define mdbx_notice(fmt, ...)                                                  \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_NOTICE))                                     \
      mdbx_debug_log(MDBX_LOG_NOTICE, __FUNCTION__, __LINE__, fmt "\n",        \
                     ##__VA_ARGS__);                                           \
  } while (0)

#define mdbx_warning(fmt, ...)                                                 \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_WARN))                                       \
      mdbx_debug_log(MDBX_LOG_WARN, __FUNCTION__, __LINE__, fmt "\n",          \
                     ##__VA_ARGS__);                                           \
  } while (0)

#define mdbx_error(fmt, ...)                                                   \
  do {                                                                         \
    if (mdbx_log_enabled(MDBX_LOG_ERROR))                                      \
      mdbx_debug_log(MDBX_LOG_ERROR, __FUNCTION__, __LINE__, fmt "\n",         \
                     ##__VA_ARGS__);                                           \
  } while (0)

/* Fatal messages bypass mdbx_log_enabled() and are always forwarded.
 * The expansion deliberately carries no trailing semicolon: the caller
 * supplies it, which keeps `if (...) mdbx_fatal(...); else ...` valid. */
#define mdbx_fatal(fmt, ...)                                                   \
  mdbx_debug_log(MDBX_LOG_FATAL, __FUNCTION__, __LINE__, fmt "\n",             \
                 ##__VA_ARGS__)
|
2017-03-16 18:09:27 +03:00
|
|
|
|
|
|
|
/* Always-on check: when `expr` is false, report `msg` together with the
 * call site through mdbx_assert_fail(). */
#define mdbx_ensure_msg(env, expr, msg)                                        \
  do {                                                                         \
    if (unlikely(!(expr)))                                                     \
      mdbx_assert_fail(env, msg, __FUNCTION__, __LINE__);                      \
  } while (0)

/* Same as mdbx_ensure_msg(), using the expression text as the message. */
#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr)
|
|
|
|
|
|
|
|
/* assert(3) variant in environment context.
 * The expression is checked only when mdbx_assert_enabled() is true. */
#define mdbx_assert(env, expr)                                                 \
  do {                                                                         \
    if (mdbx_assert_enabled())                                                 \
      mdbx_ensure(env, expr);                                                  \
  } while (0)

/* assert(3) variant in cursor context */
#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr)

/* assert(3) variant in transaction context */
#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr)
|
|
|
|
|
2019-08-31 17:13:02 +03:00
|
|
|
/* Inside the library, route the standard assert() to mdbx_assert() with no
 * environment; the tools define MDBX_TOOLS to keep their own assert(). */
#ifndef MDBX_TOOLS /* Avoid using internal mdbx_assert() */
#undef assert
#define assert(expr) mdbx_assert(NULL, expr)
#endif
|
2018-10-14 01:11:14 +03:00
|
|
|
|
2017-05-24 13:59:50 +03:00
|
|
|
/*----------------------------------------------------------------------------*/
|
2018-09-19 00:21:42 +03:00
|
|
|
/* Internal prototypes */
|
2017-05-24 13:59:50 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
MDBX_INTERNAL_FUNC int mdbx_reader_check0(MDBX_env *env, int rlocked,
|
|
|
|
int *dead);
|
|
|
|
MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key,
|
|
|
|
MDBX_reader *begin, MDBX_reader *end);
|
|
|
|
MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key);
|
2018-03-28 15:57:16 +03:00
|
|
|
|
2019-08-31 17:10:04 +03:00
|
|
|
MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void);
|
|
|
|
MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void);
|
|
|
|
MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr);
|
2017-04-05 18:33:19 +03:00
|
|
|
|
2017-04-21 16:02:27 +03:00
|
|
|
/* True for any result code other than the two boolean results
 * MDBX_RESULT_TRUE and MDBX_RESULT_FALSE. Evaluates `rc` twice. */
#define MDBX_IS_ERROR(rc)                                                      \
  ((rc) != MDBX_RESULT_TRUE && (rc) != MDBX_RESULT_FALSE)
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/* Internal error codes, not exposed outside libmdbx */
#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10)
|
|
|
|
|
|
|
|
/* Debugging output value of a cursor DBI: Negative in a sub-cursor. */
#define DDBI(mc)                                                               \
  (((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
|
|
|
|
|
|
|
|
/* Key size which fits in a DKBUF. */
#define DKBUF_MAXKEYSIZE 511 /* FIXME */

/* Helpers for dumping keys and values in debug messages.
 * In debug builds DKBUF declares a local buffer of two equal halves: DKEY()
 * dumps into the first half and DVAL() into the second, so both may appear
 * in one log call. Otherwise they collapse to a no-op and the string "-". */
#if MDBX_DEBUG
#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2]
#define DKEY(x) mdbx_dump_val(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1)
#define DVAL(x)                                                                \
  mdbx_dump_val(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1)
#else
#define DKBUF ((void)(0))
#define DKEY(x) ("-")
#define DVAL(x) ("-")
#endif
|
|
|
|
|
|
|
|
/* An invalid page number (all bits of pgno_t set).
 * Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)
|
|
|
|
|
|
|
|
/* Test if the flags f are set in a flag word w.
 * True only when every bit of f is present in w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))
|
|
|
|
|
|
|
|
/* Round n up to an even number. */
|
|
|
|
#define EVEN(n) (((n) + 1U) & ~1) /* ~1 is -2, sign-extended to match n+1U */
|
|
|
|
|
|
|
|
/* Default size of memory map.
|
|
|
|
* This is certainly too small for any actual applications. Apps should
|
|
|
|
* always set the size explicitly using mdbx_env_set_mapsize(). */
|
|
|
|
#define DEFAULT_MAPSIZE 1048576
|
|
|
|
|
|
|
|
/* Number of slots in the reader table.
|
|
|
|
* This value was chosen somewhat arbitrarily. The 61 is a prime number,
|
|
|
|
 * and that many reader slots plus a couple of mutexes fit into a single 4KB page.
|
|
|
|
* Applications should set the table size using mdbx_env_set_maxreaders(). */
|
|
|
|
#define DEFAULT_READERS 61
|
|
|
|
|
|
|
|
/* Address of first usable data byte in a page, after the header */
|
|
|
|
#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
|
|
|
|
|
|
|
|
/* Number of nodes on a page */
|
2017-06-05 20:48:05 +03:00
|
|
|
#define NUMKEYS(p) ((unsigned)(p)->mp_lower >> 1)
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/* The amount of space remaining in the page */
|
2018-09-06 17:10:59 +03:00
|
|
|
#define SIZELEFT(p) ((indx_t)((p)->mp_upper - (p)->mp_lower))
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/* The fraction of space used in the page, in 1/1024ths (0 = empty, 1024 = full). */
|
|
|
|
#define PAGEFILL(env, p) \
|
2017-07-26 12:23:01 +03:00
|
|
|
(1024UL * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
|
2017-05-24 13:59:50 +03:00
|
|
|
((env)->me_psize - PAGEHDRSZ))
|
|
|
|
/* The minimum page fill factor, in 1/1024ths like PAGEFILL (256 is 25%).
|
|
|
|
* Pages emptier than this are candidates for merging. */
|
2017-06-05 16:56:10 +03:00
|
|
|
#define FILL_THRESHOLD 256
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/* Test if a page is a leaf page */
|
2018-09-05 22:00:47 +03:00
|
|
|
#define IS_LEAF(p) (((p)->mp_flags & P_LEAF) != 0)
|
2017-05-24 13:59:50 +03:00
|
|
|
/* Test if a page is a LEAF2 page */
|
2018-09-05 22:00:47 +03:00
|
|
|
#define IS_LEAF2(p) unlikely(((p)->mp_flags & P_LEAF2) != 0)
|
2017-05-24 13:59:50 +03:00
|
|
|
/* Test if a page is a branch page */
|
2018-09-05 22:00:47 +03:00
|
|
|
#define IS_BRANCH(p) (((p)->mp_flags & P_BRANCH) != 0)
|
2017-05-24 13:59:50 +03:00
|
|
|
/* Test if a page is an overflow page */
|
2018-09-05 22:00:47 +03:00
|
|
|
#define IS_OVERFLOW(p) unlikely(((p)->mp_flags & P_OVERFLOW) != 0)
|
2017-05-24 13:59:50 +03:00
|
|
|
/* Test if a page is a sub page */
|
2018-09-05 22:00:47 +03:00
|
|
|
#define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0)
|
|
|
|
/* Test if a page is dirty */
|
|
|
|
#define IS_DIRTY(p) (((p)->mp_flags & P_DIRTY) != 0)
|
|
|
|
|
|
|
|
#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/* The number of overflow pages needed to store the given size. */
|
2017-06-05 17:16:21 +03:00
|
|
|
#define OVPAGES(env, size) (bytes2pgno(env, PAGEHDRSZ - 1 + (size)) + 1)
|
2017-05-24 13:59:50 +03:00
|
|
|
|
|
|
|
/* Link in MDBX_txn.mt_loose_pages list.
|
|
|
|
* Kept outside the page header, which is needed when reusing the page. */
|
|
|
|
#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2))
|
|
|
|
|
|
|
|
/* Header for a single key/data pair within a page.
|
|
|
|
* Used in pages of type P_BRANCH and P_LEAF without P_LEAF2.
|
|
|
|
* We guarantee 2-byte alignment for 'MDBX_node's.
|
|
|
|
*
|
|
|
|
* mn_lo and mn_hi are used for data size on leaf nodes, and for child
|
|
|
|
* pgno on branch nodes. On 64 bit platforms, mn_flags is also used
|
|
|
|
* for pgno. (Branch nodes have no flags). Lo and hi are in host byte
|
|
|
|
* order in case some accesses can be optimized to 32-bit word access.
|
|
|
|
*
|
|
|
|
* Leaf node flags describe node contents. F_BIGDATA says the node's
|
|
|
|
* data part is the page number of an overflow page with actual data.
|
|
|
|
* F_DUPDATA and F_SUBDATA can be combined giving duplicate data in
|
|
|
|
* a sub-page/sub-database, and named databases (just F_SUBDATA). */
|
|
|
|
typedef struct MDBX_node {
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
uint16_t mn_lo, mn_hi; /* part of data size or pgno */
|
|
|
|
};
|
|
|
|
uint32_t mn_dsize;
|
|
|
|
};
|
|
|
|
uint16_t mn_flags; /* see mdbx_node */
|
|
|
|
uint16_t mn_ksize; /* key size */
|
|
|
|
#else
|
|
|
|
uint16_t mn_ksize; /* key size */
|
|
|
|
uint16_t mn_flags; /* see mdbx_node */
|
|
|
|
union {
|
|
|
|
struct {
|
|
|
|
uint16_t mn_hi, mn_lo; /* part of data size or pgno */
|
|
|
|
};
|
|
|
|
uint32_t mn_dsize;
|
|
|
|
};
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
pgno_t mn_ksize_and_pgno;
|
|
|
|
};
|
|
|
|
|
|
|
|
/* mdbx_node Flags */
|
|
|
|
#define F_BIGDATA 0x01 /* data put on overflow page */
|
|
|
|
#define F_SUBDATA 0x02 /* data is a sub-database */
|
|
|
|
#define F_DUPDATA 0x04 /* data has duplicates */
|
|
|
|
|
|
|
|
/* valid flags for mdbx_node_add() */
|
|
|
|
#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND)
|
|
|
|
uint8_t mn_data[1]; /* key and data are appended here */
|
|
|
|
} MDBX_node;
|
|
|
|
|
|
|
|
/* Size of the node header, excluding dynamic data at the end */
|
|
|
|
#define NODESIZE offsetof(MDBX_node, mn_data)
|
|
|
|
|
|
|
|
/* Bit position of top word in page number, for shifting mn_flags */
|
|
|
|
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
|
|
|
|
|
|
|
|
/* Size of a node in a branch page with a given key.
|
|
|
|
* This is just the node header plus the key, there is no data. */
|
|
|
|
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len))
|
|
|
|
|
|
|
|
/* Size of a node in a leaf page with a given key and data.
|
|
|
|
* This is node header plus key plus data size. */
|
|
|
|
#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len)
|
|
|
|
|
|
|
|
/* Address of the key for the node */
|
|
|
|
/* Yields a `void *` pointing at the node's mn_data, where the key begins.
 * The expansion is fully parenthesized so that a postfix operator written
 * after the macro applies to the whole result instead of slipping under
 * the cast. */
#define NODEKEY(node) ((void *)((node)->mn_data))
|
|
|
|
|
|
|
|
/* Address of the data for a node */
|
|
|
|
/* Yields a `void *` pointing mn_ksize bytes past the start of mn_data,
 * i.e. just after the key. The expansion is fully parenthesized so that a
 * postfix operator written after the macro applies to the whole result
 * instead of slipping under the cast. */
#define NODEDATA(node) ((void *)((char *)(node)->mn_data + (node)->mn_ksize))
|
|
|
|
|
|
|
|
/* The size of a key in a node */
|
|
|
|
#define NODEKSZ(node) ((node)->mn_ksize)
|
|
|
|
|
|
|
|
/* The address of a key in a LEAF2 page.
|
|
|
|
* LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs.
|
|
|
|
* There are no node headers, keys are stored contiguously. */
|
|
|
|
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks)))
|
|
|
|
|
|
|
|
/* Store the node's key (its size and its address) into *keyptr.
 * Does nothing when keyptr is NULL. */
#define MDBX_GET_MAYNULL_KEYPTR(node, keyptr)                                  \
  do {                                                                         \
    if (NULL != (keyptr)) {                                                    \
      (keyptr)->iov_len = NODEKSZ(node);                                       \
      (keyptr)->iov_base = NODEKEY(node);                                      \
    }                                                                          \
  } while (0)
|
|
|
|
|
|
|
|
/* Set the node's key into key.
 * `key` must be an lvalue expression of a structure type with iov_len and
 * iov_base members (not a pointer to one); it receives the key size
 * (NODEKSZ) and the key address (NODEKEY) of `node`.
 * The argument is parenthesized so that expressions such as `*ptr` or
 * `array[i]` can be passed safely. */
#define MDBX_GET_KEYVALUE(node, key)                                           \
  do {                                                                         \
    (key).iov_len = NODEKSZ(node);                                             \
    (key).iov_base = NODEKEY(node);                                            \
  } while (0)
|
|
|
|
|
|
|
|
#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */
|
|
|
|
#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID))
|
|
|
|
/* mdbx_dbi_open() flags */
|
|
|
|
#define VALID_FLAGS \
|
|
|
|
(MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \
|
|
|
|
MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE)
|
|
|
|
|
|
|
|
/* max number of pages to commit in one writev() call */
|
|
|
|
#define MDBX_COMMIT_PAGES 64
|
|
|
|
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
|
|
|
|
#undef MDBX_COMMIT_PAGES
|
|
|
|
#define MDBX_COMMIT_PAGES IOV_MAX
|
|
|
|
#endif
|
|
|
|
|
2017-05-24 15:50:03 +03:00
|
|
|
/* LY: fast enough on most systems
 *
 * Three-way comparison of two values:
 *
 *                /
 *                | -1, a < b
 * cmp2int(a,b) = <  0, a == b
 *                |  1, a > b
 *                \
 *
 * Both arguments may be evaluated more than once. */
#if 1
#define mdbx_cmp2int(a, b) (((a) < (b)) ? -1 : ((a) > (b)))
#else
#define mdbx_cmp2int(a, b) (((a) > (b)) - ((a) < (b)))
#endif
|
2017-06-05 17:16:21 +03:00
|
|
|
|
2018-09-19 00:21:42 +03:00
|
|
|
/* Do not spill pages to disk if txn is getting full, may fail instead */
|
|
|
|
#define MDBX_NOSPILL 0x8000
|
2017-07-26 10:28:09 +03:00
|
|
|
|
2019-09-19 13:23:47 +03:00
|
|
|
/* Saturating page-number addition.
 * Returns base + augend while augend is below the distance from base to
 * MAX_PAGENO; otherwise returns MAX_PAGENO.
 * The caller must pass base <= MAX_PAGENO (checked by assert). */
static __maybe_unused __inline pgno_t pgno_add(pgno_t base, pgno_t augend) {
  assert(base <= MAX_PAGENO);
  if (augend < MAX_PAGENO - base)
    return base + augend;
  return MAX_PAGENO;
}
|
2017-07-26 10:19:05 +03:00
|
|
|
|
2019-09-19 13:23:47 +03:00
|
|
|
/* Saturating page-number subtraction.
 * Returns base - subtrahend while subtrahend is below the distance from
 * MIN_PAGENO to base; otherwise returns MIN_PAGENO.
 * The caller must pass base >= MIN_PAGENO (checked by assert). */
static __maybe_unused __inline pgno_t pgno_sub(pgno_t base, pgno_t subtrahend) {
  assert(base >= MIN_PAGENO);
  if (subtrahend < base - MIN_PAGENO)
    return base - subtrahend;
  return MIN_PAGENO;
}
|
|
|
|
|
2019-09-19 13:23:47 +03:00
|
|
|
/* Testing aid. When MDBX_DEBUG is non-zero, calls mdbx_osal_jitter(tiny) if
 * the MDBX_DBG_JITTER bit is set in mdbx_runtime_flags. When MDBX_DEBUG is
 * zero the function body is empty. */
static __maybe_unused __inline void mdbx_jitter4testing(bool tiny) {
#if !MDBX_DEBUG
  (void)tiny; /* parameter is unused in non-debug builds */
#else
  if ((mdbx_runtime_flags & MDBX_DBG_JITTER) != 0) {
    mdbx_osal_jitter(tiny);
  }
#endif
}
|