libmdbx/src/layout-dxb.h

301 lines
10 KiB
C
Raw Normal View History

/// \copyright SPDX-License-Identifier: Apache-2.0
/// \note Please refer to the COPYRIGHT file for explanations license change,
/// credits and acknowledgments.
/// \author Леонид Юрьев aka Leonid Yuriev <leo@yuriev.ru> \date 2015-2024
#pragma once
#include "essentials.h"
#pragma pack(push, 4)
/* A stamp that identifies a file as an MDBX file.
* There's nothing special about this value other than that it is easily
* recognizable, and it will reflect any byte order mismatches. */
#define MDBX_MAGIC UINT64_C(/* 56-bit prime */ 0x59659DBDEF4C11)
/* FROZEN: The version number for a database's datafile format. */
#define MDBX_DATA_VERSION 3
#define MDBX_DATA_MAGIC \
((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + MDBX_DATA_VERSION)
#define MDBX_DATA_MAGIC_LEGACY_COMPAT \
((MDBX_MAGIC << 8) + MDBX_PNL_ASCENDING * 64 + 2)
#define MDBX_DATA_MAGIC_LEGACY_DEVEL ((MDBX_MAGIC << 8) + 255)
/* handle for the DB used to track free pages. */
#define FREE_DBI 0
/* handle for the default DB. */
#define MAIN_DBI 1
/* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2
/* Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 3
/* A page number in the database.
*
* MDBX uses 32 bit for page numbers. This limits database
* size up to 2^44 bytes, in case of 4K pages. */
typedef uint32_t pgno_t;
typedef mdbx_atomic_uint32_t atomic_pgno_t;
#define PRIaPGNO PRIu32
#define MAX_PAGENO UINT32_C(0x7FFFffff)
#define MIN_PAGENO NUM_METAS
/* An invalid page number.
* Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)
/* A transaction ID. */
typedef uint64_t txnid_t;
typedef mdbx_atomic_uint64_t atomic_txnid_t;
#define PRIaTXN PRIi64
#define MIN_TXNID UINT64_C(1)
#define MAX_TXNID (SAFE64_INVALID_THRESHOLD - 1)
#define INITIAL_TXNID (MIN_TXNID + NUM_METAS - 1)
#define INVALID_TXNID UINT64_MAX
/* Used for offsets within a single page. */
typedef uint16_t indx_t;
typedef struct tree {
uint16_t flags; /* see mdbx_dbi_open */
uint16_t height; /* height of this tree */
uint32_t dupfix_size; /* key-size for MDBX_DUPFIXED (DUPFIX pages) */
pgno_t root; /* the root page of this tree */
pgno_t branch_pages; /* number of internal pages */
pgno_t leaf_pages; /* number of leaf pages */
pgno_t large_pages; /* number of large pages */
uint64_t sequence; /* table sequence counter */
uint64_t items; /* number of data items */
uint64_t mod_txnid; /* txnid of last committed modification */
} tree_t;
/* database size-related parameters */
typedef struct geo {
uint16_t grow_pv; /* datafile growth step as a 16-bit packed (exponential
quantized) value */
uint16_t shrink_pv; /* datafile shrink threshold as a 16-bit packed
(exponential quantized) value */
pgno_t lower; /* minimal size of datafile in pages */
pgno_t upper; /* maximal size of datafile in pages */
union {
pgno_t now; /* current size of datafile in pages */
pgno_t end_pgno;
};
union {
pgno_t first_unallocated; /* first unused page in the datafile,
but actually the file may be shorter. */
pgno_t next_pgno;
};
} geo_t;
/* Meta page content.
* A meta page is the start point for accessing a database snapshot.
* Pages 0-2 are meta pages. */
typedef struct meta {
/* Stamp identifying this as an MDBX file.
* It must be set to MDBX_MAGIC with MDBX_DATA_VERSION. */
uint32_t magic_and_version[2];
/* txnid that committed this meta, the first of a two-phase-update pair */
union {
mdbx_atomic_uint32_t txnid_a[2];
uint64_t unsafe_txnid;
};
uint16_t reserve16; /* extra flags, zero (nothing) for now */
uint8_t validator_id; /* ID of checksum and page validation method,
* zero (nothing) for now */
int8_t extra_pagehdr; /* extra bytes in the page header,
* zero (nothing) for now */
geo_t geometry; /* database size-related parameters */
union {
struct {
tree_t gc, main;
} trees;
__anonymous_struct_extension__ struct {
uint16_t gc_flags;
uint16_t gc_height;
uint32_t pagesize;
};
};
MDBX_canary canary;
#define DATASIGN_NONE 0u
#define DATASIGN_WEAK 1u
#define SIGN_IS_STEADY(sign) ((sign) > DATASIGN_WEAK)
union {
uint32_t sign[2];
uint64_t unsafe_sign;
};
/* txnid that committed this meta, the second of a two-phase-update pair */
mdbx_atomic_uint32_t txnid_b[2];
/* Number of non-meta pages which were put in GC after COW. May be 0 in case
* DB was previously handled by libmdbx without corresponding feature.
* This value in couple with reader.snapshot_pages_retired allows fast
* estimation of "how much reader is restraining GC recycling". */
uint32_t pages_retired[2];
/* The analogue /proc/sys/kernel/random/boot_id or similar to determine
* whether the system was rebooted after the last use of the database files.
* If there was no reboot, but there is no need to rollback to the last
* steady sync point. Zeros mean that no relevant information is available
* from the system. */
bin128_t bootid;
/* GUID базы данных, начиная с v0.13.1 */
bin128_t dxbid;
} meta_t;
#pragma pack(1)
typedef enum page_type {
P_BRANCH = 0x01u /* branch page */,
P_LEAF = 0x02u /* leaf page */,
P_LARGE = 0x04u /* large/overflow page */,
P_META = 0x08u /* meta page */,
P_LEGACY_DIRTY = 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */,
P_BAD = P_LEGACY_DIRTY /* explicit flag for invalid/bad page */,
P_DUPFIX = 0x20u /* for MDBX_DUPFIXED records */,
P_SUBP = 0x40u /* for MDBX_DUPSORT sub-pages */,
P_SPILLED = 0x2000u /* spilled in parent txn */,
P_LOOSE = 0x4000u /* page was dirtied then freed, can be reused */,
P_FROZEN = 0x8000u /* used for retire page with known status */,
P_ILL_BITS = (uint16_t)~(P_BRANCH | P_LEAF | P_DUPFIX | P_LARGE | P_SPILLED),
page_broken = 0,
page_large = P_LARGE,
page_branch = P_BRANCH,
page_leaf = P_LEAF,
page_dupfix_leaf = P_DUPFIX,
page_sub_leaf = P_SUBP | P_LEAF,
page_sub_dupfix_leaf = P_SUBP | P_DUPFIX,
page_sub_broken = P_SUBP,
} page_type_t;
/* Common header for all page types. The page type depends on flags.
*
* P_BRANCH and P_LEAF pages have unsorted 'node_t's at the end, with
* sorted entries[] entries referring to them. Exception: P_DUPFIX pages
* omit entries and pack sorted MDBX_DUPFIXED values after the page header.
*
* P_LARGE records occupy one or more contiguous pages where only the
* first has a page header. They hold the real data of N_BIGDATA nodes.
*
* P_SUBP sub-pages are small leaf "pages" with duplicate data.
* A node with flag N_DUPDATA but not N_SUBDATA contains a sub-page.
* (Duplicate data can also go in tables, which use normal pages.)
*
* P_META pages contain meta_t, the start point of an MDBX snapshot.
*
* Each non-metapage up to meta_t.mm_last_pg is reachable exactly once
* in the snapshot: Either used by a database or listed in a GC record. */
typedef struct page {
uint64_t txnid; /* txnid which created page, maybe zero in legacy DB */
uint16_t dupfix_ksize; /* key size if this is a DUPFIX page */
uint16_t flags;
union {
uint32_t pages; /* number of overflow pages */
__anonymous_struct_extension__ struct {
indx_t lower; /* lower bound of free space */
indx_t upper; /* upper bound of free space */
};
};
pgno_t pgno; /* page number */
#if FLEXIBLE_ARRAY_MEMBERS
indx_t entries[] /* dynamic size */;
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} page_t;
/* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ 20u
/* Header for a single key/data pair within a page.
* Used in pages of type P_BRANCH and P_LEAF without P_DUPFIX.
* We guarantee 2-byte alignment for 'node_t's.
*
* Leaf node flags describe node contents. N_BIGDATA says the node's
* data part is the page number of an overflow page with actual data.
* N_DUPDATA and N_SUBDATA can be combined giving duplicate data in
* a sub-page/table, and named databases (just N_SUBDATA). */
typedef struct node {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
union {
uint32_t dsize;
uint32_t child_pgno;
};
uint8_t flags; /* see node_flags */
uint8_t extra;
uint16_t ksize; /* key size */
#else
uint16_t ksize; /* key size */
uint8_t extra;
uint8_t flags; /* see node_flags */
union {
uint32_t child_pgno;
uint32_t dsize;
};
#endif /* __BYTE_ORDER__ */
#if FLEXIBLE_ARRAY_MEMBERS
uint8_t payload[] /* key and data are appended here */;
#endif /* FLEXIBLE_ARRAY_MEMBERS */
} node_t;
/* Size of the node header, excluding dynamic data at the end */
#define NODESIZE 8u
typedef enum node_flags {
N_BIGDATA = 0x01 /* data put on large page */,
N_SUBDATA = 0x02 /* data is a table */,
N_DUPDATA = 0x04 /* data has duplicates */
} node_flags_t;
#pragma pack(pop)
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
page_type(const page_t *mp) {
return mp->flags;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline uint8_t
page_type_compat(const page_t *mp) {
/* Drop legacy P_DIRTY flag for sub-pages for compatilibity,
* for assertions only. */
return unlikely(mp->flags & P_SUBP) ? mp->flags & ~(P_SUBP | P_LEGACY_DIRTY)
: mp->flags;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_leaf(const page_t *mp) {
return (mp->flags & P_LEAF) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_dupfix_leaf(const page_t *mp) {
return (mp->flags & P_DUPFIX) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_branch(const page_t *mp) {
return (mp->flags & P_BRANCH) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_largepage(const page_t *mp) {
return (mp->flags & P_LARGE) != 0;
}
MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static inline bool
is_subpage(const page_t *mp) {
return (mp->flags & P_SUBP) != 0;
}