mdbx: cleanup internals (mostly formatting, but not only).

This commit is contained in:
Leo Yuriev 2017-05-23 14:44:53 +03:00
parent 277bdfb4c4
commit 8b9e391dd0
7 changed files with 872 additions and 1031 deletions

View File

@ -9,8 +9,7 @@
* *
* A copy of this license is available in the file LICENSE in the * A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at * top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>. */
*/
#pragma once #pragma once
/* *INDENT-OFF* */ /* *INDENT-OFF* */
@ -143,14 +142,14 @@
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
/** handle for the DB used to track free pages. */ /* handle for the DB used to track free pages. */
#define FREE_DBI 0 #define FREE_DBI 0
/** handle for the default DB. */ /* handle for the default DB. */
#define MAIN_DBI 1 #define MAIN_DBI 1
/** Number of DBs in metapage (free and main) - also hardcoded elsewhere */ /* Number of DBs in metapage (free and main) - also hardcoded elsewhere */
#define CORE_DBS 2 #define CORE_DBS 2
/** Number of meta pages - also hardcoded elsewhere */ /* Number of meta pages - also hardcoded elsewhere */
#define NUM_METAS 2 #define NUM_METAS 2
/* A generic unsigned ID number. These were entryIDs in back-bdb. /* A generic unsigned ID number. These were entryIDs in back-bdb.
@ -181,100 +180,92 @@ typedef MDB_ID txnid_t;
*/ */
typedef MDB_ID *MDB_IDL; typedef MDB_ID *MDB_IDL;
/* An ID2 is an ID/pointer pair. /* An ID2 is an ID/pointer pair. */
*/
typedef struct MDB_ID2 { typedef struct MDB_ID2 {
MDB_ID mid; /* The ID */ MDB_ID mid; /* The ID */
void *mptr; /* The pointer */ void *mptr; /* The pointer */
} MDB_ID2; } MDB_ID2;
/* An ID2L is an ID2 List, a sorted array of ID2s. /* An ID2L is an ID2 List, a sorted array of ID2s.
* The first element's \b mid member is a count of how many actual * The first element's mid member is a count of how many actual
* elements are in the array. The \b mptr member of the first element is * elements are in the array. The mptr member of the first element is
* unused. * unused. The array is sorted in ascending order by mid. */
* The array is sorted in ascending order by \b mid.
*/
typedef MDB_ID2 *MDB_ID2L; typedef MDB_ID2 *MDB_ID2L;
/** Used for offsets within a single page. /* Used for offsets within a single page.
* Since memory pages are typically 4 or 8KB in size, 12-13 bits, * Since memory pages are typically 4 or 8KB in size, 12-13 bits,
* this is plenty. * this is plenty. */
*/
typedef uint16_t indx_t; typedef uint16_t indx_t;
#pragma pack(push, 1) #pragma pack(push, 1)
/** The information we store in a single slot of the reader table. /* The information we store in a single slot of the reader table.
* In addition to a transaction ID, we also record the process and * In addition to a transaction ID, we also record the process and
* thread ID that owns a slot, so that we can detect stale information, * thread ID that owns a slot, so that we can detect stale information,
* e.g. threads or processes that went away without cleaning up. * e.g. threads or processes that went away without cleaning up.
* @note We currently don't check for stale records. We simply re-init * NOTE: We currently don't check for stale records. We simply re-init
* the table when we know that we're the only process opening the * the table when we know that we're the only process opening the
* lock file. * lock file. */
*/
typedef struct MDB_rxbody { typedef struct MDB_rxbody {
/** Current Transaction ID when this transaction began, or (txnid_t)-1. /* Current Transaction ID when this transaction began, or (txnid_t)-1.
* Multiple readers that start at the same time will probably have the * Multiple readers that start at the same time will probably have the
* same ID here. Again, it's not important to exclude them from * same ID here. Again, it's not important to exclude them from
* anything; all we need to know is which version of the DB they * anything; all we need to know is which version of the DB they
* started from so we can avoid overwriting any data used in that * started from so we can avoid overwriting any data used in that
* particular version. * particular version. */
*/
volatile txnid_t mrb_txnid; volatile txnid_t mrb_txnid;
/** The process ID of the process owning this reader txn. */ /* The process ID of the process owning this reader txn. */
volatile mdbx_pid_t mrb_pid; volatile mdbx_pid_t mrb_pid;
/** The thread ID of the thread owning this txn. */ /* The thread ID of the thread owning this txn. */
volatile mdbx_tid_t mrb_tid; volatile mdbx_tid_t mrb_tid;
} MDB_rxbody; } MDB_rxbody;
/** The actual reader record, with cacheline padding. */ /* The actual reader record, with cacheline padding. */
typedef struct MDB_reader { typedef struct MDB_reader {
union { union {
MDB_rxbody mrx; MDB_rxbody mrx;
/** shorthand for mrb_txnid */ /* shorthand for mrb_txnid */
#define mr_txnid mru.mrx.mrb_txnid #define mr_txnid mru.mrx.mrb_txnid
#define mr_pid mru.mrx.mrb_pid #define mr_pid mru.mrx.mrb_pid
#define mr_tid mru.mrx.mrb_tid #define mr_tid mru.mrx.mrb_tid
/** cache line alignment */ /* cache line alignment */
char pad[(sizeof(MDB_rxbody) + MDBX_CACHELINE_SIZE - 1) & char pad[(sizeof(MDB_rxbody) + MDBX_CACHELINE_SIZE - 1) &
~(MDBX_CACHELINE_SIZE - 1)]; ~(MDBX_CACHELINE_SIZE - 1)];
} mru; } mru;
} MDB_reader; } MDB_reader;
/** Information about a single database in the environment. */ /* Information about a single database in the environment. */
typedef struct MDB_db { typedef struct MDB_db {
uint32_t md_xsize; /**< also ksize for LEAF2 pages */ uint32_t md_xsize; /* also ksize for LEAF2 pages */
uint16_t md_flags; /**< @ref mdbx_dbi_open */ uint16_t md_flags; /* see mdbx_dbi_open */
uint16_t md_depth; /**< depth of this tree */ uint16_t md_depth; /* depth of this tree */
uint64_t md_seq; /* table sequence counter */ uint64_t md_seq; /* table sequence counter */
pgno_t md_branch_pages; /**< number of internal pages */ pgno_t md_branch_pages; /* number of internal pages */
pgno_t md_leaf_pages; /**< number of leaf pages */ pgno_t md_leaf_pages; /* number of leaf pages */
pgno_t md_overflow_pages; /**< number of overflow pages */ pgno_t md_overflow_pages; /* number of overflow pages */
size_t md_entries; /**< number of data items */ pgno_t md_root; /* the root page of this tree */
pgno_t md_root; /**< the root page of this tree */ uint64_t md_entries; /* number of data items */
} MDB_db; } MDB_db;
/** Meta page content. /* Meta page content.
* A meta page is the start point for accessing a database snapshot. * A meta page is the start point for accessing a database snapshot.
* Pages 0-1 are meta pages. Transaction N writes meta page #(N % 2). * Pages 0-1 are meta pages. Transaction N writes meta page (N % 2). */
*/
typedef struct MDB_meta { typedef struct MDB_meta {
/** Stamp identifying this as an LMDB file. It must be set /* Stamp identifying this as an LMDB file. It must be set
* to #MDB_MAGIC. */ * to MDB_MAGIC. */
uint32_t mm_magic; uint32_t mm_magic;
/** Version number of this file. Must be set to #MDB_DATA_VERSION. */ /* Version number of this file. Must be set to MDB_DATA_VERSION. */
uint32_t mm_version; uint32_t mm_version;
size_t mm_mapsize; /**< size of mmap region */ size_t mm_mapsize; /* size of mmap region */
MDB_db mm_dbs[CORE_DBS]; /**< first is free space, 2nd is main db */ MDB_db mm_dbs[CORE_DBS]; /* first is free space, 2nd is main db */
/** The size of pages used in this DB */ /* The size of pages used in this DB */
#define mm_psize mm_dbs[FREE_DBI].md_xsize #define mm_psize mm_dbs[FREE_DBI].md_xsize
/** Any persistent environment flags. @ref mdbx_env */ /* Any persistent environment flags, see mdbx_env */
#define mm_flags mm_dbs[FREE_DBI].md_flags #define mm_flags mm_dbs[FREE_DBI].md_flags
/** Last used page in the datafile. /* Last used page in the datafile.
* Actually the file may be shorter if the freeDB lists the final pages. * Actually the file may be shorter if the freeDB lists the final pages. */
*/
pgno_t mm_last_pg; pgno_t mm_last_pg;
volatile txnid_t mm_txnid; /**< txnid that committed this page */ volatile txnid_t mm_txnid; /* txnid that committed this page */
#define MDB_DATASIGN_NONE 0u #define MDB_DATASIGN_NONE 0u
#define MDB_DATASIGN_WEAK 1u #define MDB_DATASIGN_WEAK 1u
volatile uint64_t mm_datasync_sign; volatile uint64_t mm_datasync_sign;
@ -285,69 +276,61 @@ typedef struct MDB_meta {
volatile mdbx_canary mm_canary; volatile mdbx_canary mm_canary;
} MDB_meta; } MDB_meta;
/** Common header for all page types. The page type depends on #mp_flags. /* Common header for all page types. The page type depends on mp_flags.
* *
* #P_BRANCH and #P_LEAF pages have unsorted '#MDB_node's at the end, with * P_BRANCH and P_LEAF pages have unsorted 'MDB_node's at the end, with
* sorted #mp_ptrs[] entries referring to them. Exception: #P_LEAF2 pages * sorted mp_ptrs[] entries referring to them. Exception: P_LEAF2 pages
* omit mp_ptrs and pack sorted #MDB_DUPFIXED values after the page header. * omit mp_ptrs and pack sorted MDB_DUPFIXED values after the page header.
* *
* #P_OVERFLOW records occupy one or more contiguous pages where only the * P_OVERFLOW records occupy one or more contiguous pages where only the
* first has a page header. They hold the real data of #F_BIGDATA nodes. * first has a page header. They hold the real data of F_BIGDATA nodes.
* *
* #P_SUBP sub-pages are small leaf "pages" with duplicate data. * P_SUBP sub-pages are small leaf "pages" with duplicate data.
* A node with flag #F_DUPDATA but not #F_SUBDATA contains a sub-page. * A node with flag F_DUPDATA but not F_SUBDATA contains a sub-page.
* (Duplicate data can also go in sub-databases, which use normal pages.) * (Duplicate data can also go in sub-databases, which use normal pages.)
* *
* #P_META pages contain #MDB_meta, the start point of an LMDB snapshot. * P_META pages contain MDB_meta, the start point of an LMDB snapshot.
* *
* Each non-metapage up to #MDB_meta.%mm_last_pg is reachable exactly once * Each non-metapage up to MDB_meta.mm_last_pg is reachable exactly once
* in the snapshot: Either used by a database or listed in a freeDB record. * in the snapshot: Either used by a database or listed in a freeDB record. */
*/
typedef struct MDB_page { typedef struct MDB_page {
#define mp_pgno mp_p.p_pgno #define mp_pgno mp_p.p_pgno
#define mp_next mp_p.p_next #define mp_next mp_p.p_next
union { union {
pgno_t p_pgno; /**< page number */ pgno_t p_pgno; /* page number */
struct MDB_page *p_next; /**< for in-memory list of freed pages */ struct MDB_page *p_next; /* for in-memory list of freed pages */
} mp_p; } mp_p;
uint16_t mp_leaf2_ksize; /**< key size if this is a LEAF2 page */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
/** @defgroup mdbx_page Page Flags #define P_BRANCH 0x01 /* branch page */
* @ingroup internal #define P_LEAF 0x02 /* leaf page */
* Flags for the page headers. #define P_OVERFLOW 0x04 /* overflow page */
* @{ #define P_META 0x08 /* meta page */
*/ #define P_DIRTY 0x10 /* dirty page, also set for P_SUBP pages */
#define P_BRANCH 0x01 /**< branch page */ #define P_LEAF2 0x20 /* for MDB_DUPFIXED records */
#define P_LEAF 0x02 /**< leaf page */ #define P_SUBP 0x40 /* for MDB_DUPSORT sub-pages */
#define P_OVERFLOW 0x04 /**< overflow page */ #define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */
#define P_META 0x08 /**< meta page */ #define P_KEEP 0x8000 /* leave this page alone during spill */
#define P_DIRTY 0x10 /**< dirty page, also set for #P_SUBP pages */ uint16_t mp_flags;
#define P_LEAF2 0x20 /**< for #MDB_DUPFIXED records */
#define P_SUBP 0x40 /**< for #MDB_DUPSORT sub-pages */
#define P_LOOSE 0x4000 /**< page was dirtied then freed, can be reused */
#define P_KEEP 0x8000 /**< leave this page alone during spill */
/** @} */
uint16_t mp_flags; /**< @ref mdbx_page */
#define mp_lower mp_pb.pb.pb_lower #define mp_lower mp_pb.pb.pb_lower
#define mp_upper mp_pb.pb.pb_upper #define mp_upper mp_pb.pb.pb_upper
#define mp_pages mp_pb.pb_pages #define mp_pages mp_pb.pb_pages
union { union {
struct { struct {
indx_t pb_lower; /**< lower bound of free space */ indx_t pb_lower; /* lower bound of free space */
indx_t pb_upper; /**< upper bound of free space */ indx_t pb_upper; /* upper bound of free space */
} pb; } pb;
uint32_t pb_pages; /**< number of overflow pages */ uint32_t pb_pages; /* number of overflow pages */
} mp_pb; } mp_pb;
indx_t mp_ptrs[1]; /**< dynamic size */ indx_t mp_ptrs[1]; /* dynamic size */
} MDB_page; } MDB_page;
/** Size of the page header, excluding dynamic data at the end */ /* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs)) #define PAGEHDRSZ ((unsigned)offsetof(MDB_page, mp_ptrs))
/** Buffer for a stack-allocated meta page. /* Buffer for a stack-allocated meta page.
* The members define size and alignment, and silence type * The members define size and alignment, and silence type
* aliasing warnings. They are not used directly; that could * aliasing warnings. They are not used directly; that could
* mean incorrectly using several union members in parallel. * mean incorrectly using several union members in parallel. */
*/
typedef union MDB_metabuf { typedef union MDB_metabuf {
MDB_page mb_page; MDB_page mb_page;
struct { struct {
@ -386,9 +369,9 @@ typedef struct MDBX_lockinfo {
* The information here is mostly static/read-only. There is * The information here is mostly static/read-only. There is
* only a single copy of this record in the environment. */ * only a single copy of this record in the environment. */
typedef struct MDB_dbx { typedef struct MDB_dbx {
MDB_val md_name; /**< name of the database */ MDB_val md_name; /* name of the database */
MDB_cmp_func *md_cmp; /**< function for comparing keys */ MDB_cmp_func *md_cmp; /* function for comparing keys */
MDB_cmp_func *md_dcmp; /**< function for comparing data items */ MDB_cmp_func *md_dcmp; /* function for comparing data items */
} MDB_dbx; } MDB_dbx;
/* A database transaction. /* A database transaction.
@ -396,24 +379,24 @@ typedef struct MDB_dbx {
struct MDB_txn { struct MDB_txn {
#define MDBX_MT_SIGNATURE (0x93D53A31) #define MDBX_MT_SIGNATURE (0x93D53A31)
unsigned mt_signature; unsigned mt_signature;
MDB_txn *mt_parent; /**< parent of a nested txn */ MDB_txn *mt_parent; /* parent of a nested txn */
/** Nested txn under this txn, set together with flag #MDB_TXN_HAS_CHILD */ /* Nested txn under this txn, set together with flag MDB_TXN_HAS_CHILD */
MDB_txn *mt_child; MDB_txn *mt_child;
pgno_t mt_next_pgno; /**< next unallocated page */ pgno_t mt_next_pgno; /* next unallocated page */
/* The ID of this transaction. IDs are integers incrementing from 1. /* The ID of this transaction. IDs are integers incrementing from 1.
* Only committed write transactions increment the ID. If a transaction * Only committed write transactions increment the ID. If a transaction
* aborts, the ID may be re-used by the next writer. */ * aborts, the ID may be re-used by the next writer. */
txnid_t mt_txnid; txnid_t mt_txnid;
MDB_env *mt_env; /**< the DB environment */ MDB_env *mt_env; /* the DB environment */
/** The list of reclaimed txns from freeDB */ /* The list of reclaimed txns from freeDB */
MDB_IDL mt_lifo_reclaimed; MDB_IDL mt_lifo_reclaimed;
/* The list of pages that became unused during this transaction. */ /* The list of pages that became unused during this transaction. */
MDB_IDL mt_free_pgs; MDB_IDL mt_free_pgs;
/* The list of loose pages that became unused and may be reused /* The list of loose pages that became unused and may be reused
* in this transaction, linked through #NEXT_LOOSE_PAGE(page). */ * in this transaction, linked through NEXT_LOOSE_PAGE(page). */
MDB_page *mt_loose_pgs; MDB_page *mt_loose_pgs;
/** Number of loose pages (#mt_loose_pgs) */ /* Number of loose pages (mt_loose_pgs) */
int mt_loose_count; unsigned mt_loose_count;
/* The sorted list of dirty pages we temporarily wrote to disk /* The sorted list of dirty pages we temporarily wrote to disk
* because the dirty list was full. page numbers in here are * because the dirty list was full. page numbers in here are
* shifted left by 1, deleted slots have the LSB set. */ * shifted left by 1, deleted slots have the LSB set. */
@ -435,9 +418,9 @@ struct MDB_txn {
#define DB_DIRTY 0x01 /* DB was written in this txn */ #define DB_DIRTY 0x01 /* DB was written in this txn */
#define DB_STALE 0x02 /* Named-DB record is older than txnID */ #define DB_STALE 0x02 /* Named-DB record is older than txnID */
#define DB_NEW 0x04 /* Named-DB handle opened in this txn */ #define DB_NEW 0x04 /* Named-DB handle opened in this txn */
#define DB_VALID 0x08 /* DB handle is valid, see also #MDB_VALID */ #define DB_VALID 0x08 /* DB handle is valid, see also MDB_VALID */
#define DB_USRVALID 0x10 /* As #DB_VALID, but not set for #FREE_DBI */ #define DB_USRVALID 0x10 /* As DB_VALID, but not set for FREE_DBI */
#define DB_DUPDATA 0x20 /* DB is #MDB_DUPSORT data */ #define DB_DUPDATA 0x20 /* DB is MDB_DUPSORT data */
/* In write txns, array of cursors for each DB */ /* In write txns, array of cursors for each DB */
MDB_cursor **mt_cursors; MDB_cursor **mt_cursors;
/* Array of flags for each DB */ /* Array of flags for each DB */
@ -447,118 +430,102 @@ struct MDB_txn {
* don't decrement it when individual DB handles are closed. */ * don't decrement it when individual DB handles are closed. */
MDB_dbi mt_numdbs; MDB_dbi mt_numdbs;
/** @defgroup mdbx_txn Transaction Flags /* Transaction Flags */
* @ingroup internal /* mdbx_txn_begin() flags */
* @{
*/
/** #mdbx_txn_begin() flags */
#define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY) #define MDB_TXN_BEGIN_FLAGS (MDB_NOMETASYNC | MDB_NOSYNC | MDB_RDONLY)
#define MDB_TXN_NOMETASYNC \ #define MDB_TXN_NOMETASYNC \
MDB_NOMETASYNC /**< don't sync meta for this txn on commit */ MDB_NOMETASYNC /* don't sync meta for this txn on commit */
#define MDB_TXN_NOSYNC MDB_NOSYNC /**< don't sync this txn on commit */ #define MDB_TXN_NOSYNC MDB_NOSYNC /* don't sync this txn on commit */
#define MDB_TXN_RDONLY MDB_RDONLY /**< read-only transaction */ #define MDB_TXN_RDONLY MDB_RDONLY /* read-only transaction */
/* internal txn flags */ /* internal txn flags */
#define MDB_TXN_WRITEMAP \ #define MDB_TXN_WRITEMAP MDB_WRITEMAP /* copy of MDB_env flag in writers */
MDB_WRITEMAP /**< copy of #MDB_env flag in writers \ #define MDB_TXN_FINISHED 0x01 /* txn is finished or never began */
*/ #define MDB_TXN_ERROR 0x02 /* txn is unusable after an error */
#define MDB_TXN_FINISHED 0x01 /**< txn is finished or never began */ #define MDB_TXN_DIRTY 0x04 /* must write, even if dirty list is empty */
#define MDB_TXN_ERROR 0x02 /**< txn is unusable after an error */ #define MDB_TXN_SPILLS 0x08 /* txn or a parent has spilled pages */
#define MDB_TXN_DIRTY 0x04 /**< must write, even if dirty list is empty */ #define MDB_TXN_HAS_CHILD 0x10 /* txn has an MDB_txn.mt_child */
#define MDB_TXN_SPILLS 0x08 /**< txn or a parent has spilled pages */ /* most operations on the txn are currently illegal */
#define MDB_TXN_HAS_CHILD 0x10 /**< txn has an #MDB_txn.%mt_child */
/** most operations on the txn are currently illegal */
#define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD) #define MDB_TXN_BLOCKED (MDB_TXN_FINISHED | MDB_TXN_ERROR | MDB_TXN_HAS_CHILD)
/** @} */ unsigned mt_flags;
unsigned mt_flags; /**< @ref mdbx_txn */ /* dirty_list room: Array size - dirty pages visible to this txn.
/** #dirty_list room: Array size - \#dirty pages visible to this txn.
* Includes ancestor txns' dirty pages not hidden by other txns' * Includes ancestor txns' dirty pages not hidden by other txns'
* dirty/spilled pages. Thus commit(nested txn) has room to merge * dirty/spilled pages. Thus commit(nested txn) has room to merge
* dirty_list into mt_parent after freeing hidden mt_parent pages. * dirty_list into mt_parent after freeing hidden mt_parent pages. */
*/
unsigned mt_dirty_room; unsigned mt_dirty_room;
mdbx_canary mt_canary; mdbx_canary mt_canary;
}; };
/** Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty. /* Enough space for 2^32 nodes with minimum of 2 keys per node. I.e., plenty.
* At 4 keys per node, enough for 2^64 nodes, so there's probably no need to * At 4 keys per node, enough for 2^64 nodes, so there's probably no need to
* raise this on a 64 bit machine. * raise this on a 64 bit machine. */
*/
#define CURSOR_STACK 32 #define CURSOR_STACK 32
struct MDB_xcursor; struct MDB_xcursor;
/** Cursors are used for all DB operations. /* Cursors are used for all DB operations.
* A cursor holds a path of (page pointer, key index) from the DB * A cursor holds a path of (page pointer, key index) from the DB
* root to a position in the DB, plus other state. #MDB_DUPSORT * root to a position in the DB, plus other state. MDB_DUPSORT
* cursors include an xcursor to the current data item. Write txns * cursors include an xcursor to the current data item. Write txns
* track their cursors and keep them up to date when data moves. * track their cursors and keep them up to date when data moves.
* Exception: An xcursor's pointer to a #P_SUBP page can be stale. * Exception: An xcursor's pointer to a P_SUBP page can be stale.
* (A node with #F_DUPDATA but no #F_SUBDATA contains a subpage). * (A node with F_DUPDATA but no F_SUBDATA contains a subpage). */
*/
struct MDB_cursor { struct MDB_cursor {
#define MDBX_MC_SIGNATURE (0xFE05D5B1) #define MDBX_MC_SIGNATURE (0xFE05D5B1)
#define MDBX_MC_READY4CLOSE (0x2817A047) #define MDBX_MC_READY4CLOSE (0x2817A047)
#define MDBX_MC_WAIT4EOT (0x90E297A7) #define MDBX_MC_WAIT4EOT (0x90E297A7)
unsigned mc_signature; unsigned mc_signature;
/** Next cursor on this DB in this txn */ /* Next cursor on this DB in this txn */
MDB_cursor *mc_next; MDB_cursor *mc_next;
/** Backup of the original cursor if this cursor is a shadow */ /* Backup of the original cursor if this cursor is a shadow */
MDB_cursor *mc_backup; MDB_cursor *mc_backup;
/** Context used for databases with #MDB_DUPSORT, otherwise NULL */ /* Context used for databases with MDB_DUPSORT, otherwise NULL */
struct MDB_xcursor *mc_xcursor; struct MDB_xcursor *mc_xcursor;
/** The transaction that owns this cursor */ /* The transaction that owns this cursor */
MDB_txn *mc_txn; MDB_txn *mc_txn;
/** The database handle this cursor operates on */ /* The database handle this cursor operates on */
MDB_dbi mc_dbi; MDB_dbi mc_dbi;
/** The database record for this cursor */ /* The database record for this cursor */
MDB_db *mc_db; MDB_db *mc_db;
/** The database auxiliary record for this cursor */ /* The database auxiliary record for this cursor */
MDB_dbx *mc_dbx; MDB_dbx *mc_dbx;
/** The @ref mt_dbflag for this database */ /* The mt_dbflag for this database */
uint8_t *mc_dbflag; uint8_t *mc_dbflag;
uint16_t mc_snum; /**< number of pushed pages */ uint16_t mc_snum; /* number of pushed pages */
uint16_t mc_top; /**< index of top page, normally mc_snum-1 */ uint16_t mc_top; /* index of top page, normally mc_snum-1 */
/** @defgroup mdbx_cursor Cursor Flags /* Cursor state flags. */
* @ingroup internal #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */
* Cursor state flags. #define C_EOF 0x02 /* No more data */
* @{ #define C_SUB 0x04 /* Cursor is a sub-cursor */
*/ #define C_DEL 0x08 /* last op was a cursor_del */
#define C_INITIALIZED 0x01 /**< cursor has been initialized and is valid */ #define C_UNTRACK 0x40 /* Un-track cursor when closing */
#define C_EOF 0x02 /**< No more data */ #define C_RECLAIMING 0x80 /* FreeDB lookup is prohibited */
#define C_SUB 0x04 /**< Cursor is a sub-cursor */ unsigned mc_flags; /* see mdbx_cursor */
#define C_DEL 0x08 /**< last op was a cursor_del */ MDB_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */
#define C_UNTRACK 0x40 /**< Un-track cursor when closing */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */
#define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */
/** @} */
unsigned mc_flags; /**< @ref mdbx_cursor */
MDB_page *mc_pg[CURSOR_STACK]; /**< stack of pushed pages */
indx_t mc_ki[CURSOR_STACK]; /**< stack of page indices */
}; };
/** Context for sorted-dup records. /* Context for sorted-dup records.
* We could have gone to a fully recursive design, with arbitrarily * We could have gone to a fully recursive design, with arbitrarily
* deep nesting of sub-databases. But for now we only handle these * deep nesting of sub-databases. But for now we only handle these
* levels - main DB, optional sub-DB, sorted-duplicate DB. * levels - main DB, optional sub-DB, sorted-duplicate DB. */
*/
typedef struct MDB_xcursor { typedef struct MDB_xcursor {
/** A sub-cursor for traversing the Dup DB */ /* A sub-cursor for traversing the Dup DB */
MDB_cursor mx_cursor; MDB_cursor mx_cursor;
/** The database record for this Dup DB */ /* The database record for this Dup DB */
MDB_db mx_db; MDB_db mx_db;
/** The auxiliary DB record for this Dup DB */ /* The auxiliary DB record for this Dup DB */
MDB_dbx mx_dbx; MDB_dbx mx_dbx;
/** The @ref mt_dbflag for this Dup DB */ /* The mt_dbflag for this Dup DB */
unsigned char mx_dbflag; uint8_t mx_dbflag;
} MDB_xcursor; } MDB_xcursor;
/** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ /* Check if there is an inited xcursor, so XCURSOR_REFRESH() is proper */
#define XCURSOR_INITED(mc) \ #define XCURSOR_INITED(mc) \
((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))
/** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed /* Update sub-page pointer, if any, in mc->mc_xcursor.
* when the node which contains the sub-page may have moved. Called * Needed when the node which contains the sub-page may have moved.
* with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. * Called with mp = mc->mc_pg[mc->mc_top], ki = mc->mc_ki[mc->mc_top]. */
*/
#define XCURSOR_REFRESH(mc, mp, ki) \ #define XCURSOR_REFRESH(mc, mp, ki) \
do { \ do { \
MDB_page *xr_pg = (mp); \ MDB_page *xr_pg = (mp); \
@ -567,88 +534,87 @@ typedef struct MDB_xcursor {
(mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \
} while (0) } while (0)
/** State of FreeDB old pages, stored in the MDB_env */ /* State of FreeDB old pages, stored in the MDB_env */
typedef struct MDB_pgstate { typedef struct MDB_pgstate {
pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ pgno_t *mf_pghead; /* Reclaimed freeDB pages, or NULL before use */
txnid_t mf_pglast; /**< ID of last used record, or 0 if !mf_pghead */ txnid_t mf_pglast; /* ID of last used record, or 0 if !mf_pghead */
} MDB_pgstate; } MDB_pgstate;
#define MDBX_LOCKINFO_WHOLE_SIZE \ #define MDBX_LOCKINFO_WHOLE_SIZE \
((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \ ((sizeof(MDBX_lockinfo) + MDBX_CACHELINE_SIZE - 1) & \
~((size_t)MDBX_CACHELINE_SIZE - 1)) ~((size_t)MDBX_CACHELINE_SIZE - 1))
/** Lockfile format signature: version, features and field layout */ /* Lockfile format signature: version, features and field layout */
#define MDB_LOCK_FORMAT \ #define MDB_LOCK_FORMAT \
(((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \ (((uint64_t)(MDBX_OSAL_LOCK_SIGN) << 32) + \
((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \ ((MDBX_LOCKINFO_WHOLE_SIZE + MDBX_CACHELINE_SIZE - 1) << 16) + \
(MDB_LOCK_VERSION) /* Flags which describe functionality */) (MDB_LOCK_VERSION) /* Flags which describe functionality */)
/** The database environment. */ /* The database environment. */
struct MDB_env { struct MDB_env {
#define MDBX_ME_SIGNATURE (0x9A899641) #define MDBX_ME_SIGNATURE (0x9A899641)
unsigned me_signature; unsigned me_signature;
mdbx_filehandle_t me_fd; /**< The main data file */ mdbx_filehandle_t me_fd; /* The main data file */
mdbx_filehandle_t me_lfd; /**< The lock file */ mdbx_filehandle_t me_lfd; /* The lock file */
/** Failed to update the meta page. Probably an I/O error. */ /* Failed to update the meta page. Probably an I/O error. */
#define MDB_FATAL_ERROR 0x80000000U #define MDB_FATAL_ERROR 0x80000000U
/** Some fields are initialized. */ /* Some fields are initialized. */
#define MDB_ENV_ACTIVE 0x20000000U #define MDB_ENV_ACTIVE 0x20000000U
/** me_txkey is set */ /* me_txkey is set */
#define MDB_ENV_TXKEY 0x10000000U #define MDB_ENV_TXKEY 0x10000000U
uint32_t me_flags; /**< @ref mdbx_env */ uint32_t me_flags; /* see mdbx_env */
unsigned me_psize; /**< DB page size, inited from me_os_psize */ unsigned me_psize; /* DB page size, inited from me_os_psize */
unsigned me_os_psize; /**< OS page size, from mdbx_syspagesize() */ unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */
unsigned me_maxreaders; /**< size of the reader table */ unsigned me_maxreaders; /* size of the reader table */
/** Max #MDBX_lockinfo.mti_numreaders of interest to #mdbx_env_close() */ /* Max MDBX_lockinfo.mti_numreaders of interest to mdbx_env_close() */
unsigned me_close_readers; unsigned me_close_readers;
MDB_dbi me_numdbs; /**< number of DBs opened */ MDB_dbi me_numdbs; /* number of DBs opened */
MDB_dbi me_maxdbs; /**< size of the DB table */ MDB_dbi me_maxdbs; /* size of the DB table */
mdbx_pid_t me_pid; /**< process ID of this env */ mdbx_pid_t me_pid; /* process ID of this env */
char *me_path; /**< path to the DB files */ char *me_path; /* path to the DB files */
char *me_map; /**< the memory map of the data file */ char *me_map; /* the memory map of the data file */
MDBX_lockinfo *me_lck; /**< the memory map of the lock file, never NULL */ MDBX_lockinfo *me_lck; /* the memory map of the lock file, never NULL */
void *me_pbuf; /**< scratch area for DUPSORT put() */ void *me_pbuf; /* scratch area for DUPSORT put() */
MDB_txn *me_txn; /**< current write transaction */ MDB_txn *me_txn; /* current write transaction */
MDB_txn *me_txn0; /**< prealloc'd write transaction */ MDB_txn *me_txn0; /* prealloc'd write transaction */
size_t me_mapsize; /**< size of the data memory map */ size_t me_mapsize; /* size of the data memory map */
pgno_t me_maxpg; /**< me_mapsize / me_psize */ pgno_t me_maxpg; /* me_mapsize / me_psize */
MDB_dbx *me_dbxs; /**< array of static DB info */ MDB_dbx *me_dbxs; /* array of static DB info */
uint16_t *me_dbflags; /**< array of flags from MDB_db.md_flags */ uint16_t *me_dbflags; /* array of flags from MDB_db.md_flags */
unsigned *me_dbiseqs; /**< array of dbi sequence numbers */ unsigned *me_dbiseqs; /* array of dbi sequence numbers */
mdbx_thread_key_t me_txkey; /**< thread-key for readers */ mdbx_thread_key_t me_txkey; /* thread-key for readers */
txnid_t me_pgoldest; /**< ID of oldest reader last time we looked */ txnid_t me_pgoldest; /* ID of oldest reader last time we looked */
MDB_pgstate me_pgstate; /**< state of old pages from freeDB */ MDB_pgstate me_pgstate; /* state of old pages from freeDB */
#define me_pglast me_pgstate.mf_pglast #define me_pglast me_pgstate.mf_pglast
#define me_pghead me_pgstate.mf_pghead #define me_pghead me_pgstate.mf_pghead
MDB_page *me_dpages; /**< list of malloc'd blocks for re-use */ MDB_page *me_dpages; /* list of malloc'd blocks for re-use */
/** IDL of pages that became unused in a write txn */ /* IDL of pages that became unused in a write txn */
MDB_IDL me_free_pgs; MDB_IDL me_free_pgs;
/** ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */ /* ID2L of pages written during a write txn. Length MDB_IDL_UM_SIZE. */
MDB_ID2L me_dirty_list; MDB_ID2L me_dirty_list;
/** Max number of freelist items that can fit in a single overflow page */ /* Max number of freelist items that can fit in a single overflow page */
unsigned me_maxfree_1pg; unsigned me_maxfree_1pg;
/** Max size of a node on a page */ /* Max size of a node on a page */
unsigned me_nodemax; unsigned me_nodemax;
unsigned me_maxkey_limit; /**< max size of a key */ unsigned me_maxkey_limit; /* max size of a key */
int me_live_reader; /**< have liveness lock in reader table */ int me_live_reader; /* have liveness lock in reader table */
void *me_userctx; /**< User-settable context */ void *me_userctx; /* User-settable context */
#if MDB_DEBUG #if MDB_DEBUG
MDB_assert_func *me_assert_func; /**< Callback for assertion failures */ MDB_assert_func *me_assert_func; /* Callback for assertion failures */
#endif #endif
uint64_t me_sync_pending; /**< Total dirty/committed bytes since the last uint64_t me_sync_pending; /* Total dirty/non-sync'ed bytes
mdbx_env_sync() */ * since the last mdbx_env_sync() */
uint64_t uint64_t me_sync_threshold; /* Threshold of above to force synchronous flush */
me_sync_threshold; /**< Threshold of above to force synchronous flush */ MDBX_oom_func *me_oom_func; /* Callback for kicking laggard readers */
MDBX_oom_func *me_oom_func; /**< Callback for kicking laggard readers */
#ifdef USE_VALGRIND #ifdef USE_VALGRIND
int me_valgrind_handle; int me_valgrind_handle;
#endif #endif
}; };
/** Nested transaction */ /* Nested transaction */
typedef struct MDB_ntxn { typedef struct MDB_ntxn {
MDB_txn mnt_txn; /**< the transaction */ MDB_txn mnt_txn; /* the transaction */
MDB_pgstate mnt_pgstate; /**< parent transaction's saved freestate */ MDB_pgstate mnt_pgstate; /* parent transaction's saved freestate */
} MDB_ntxn; } MDB_ntxn;
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
@ -692,7 +658,7 @@ void mdbx_panic(const char *fmt, ...)
#define mdbx_print(fmt, ...) \ #define mdbx_print(fmt, ...) \
mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__) mdbx_debug_log(MDBX_DBG_PRINT, NULL, 0, fmt, ##__VA_ARGS__)
/*****************************************/ /*----------------------------------------------------------------------------*/
#define mdbx_trace(fmt, ...) \ #define mdbx_trace(fmt, ...) \
do { \ do { \
@ -743,7 +709,7 @@ void mdbx_panic(const char *fmt, ...)
fmt "\n", ##__VA_ARGS__); \ fmt "\n", ##__VA_ARGS__); \
} while (0) } while (0)
/*****************************************/ /*----------------------------------------------------------------------------*/
#define mdbx_debug(fmt, ...) \ #define mdbx_debug(fmt, ...) \
do { \ do { \

1186
src/mdbx.c

File diff suppressed because it is too large Load Diff

View File

@ -11,8 +11,7 @@
* *
* A copy of this license is available in the file LICENSE in the * A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at * top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>. */
*/
#include <ctype.h> #include <ctype.h>
#include <errno.h> #include <errno.h>

View File

@ -11,8 +11,7 @@
* *
* A copy of this license is available in the file LICENSE in the * A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at * top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>. */
*/
#include "../../mdbx.h" #include "../../mdbx.h"
#include <signal.h> #include <signal.h>

View File

@ -11,8 +11,7 @@
* *
* A copy of this license is available in the file LICENSE in the * A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at * top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>. */
*/
#include "../../mdbx.h" #include "../../mdbx.h"
#include <ctype.h> #include <ctype.h>

View File

@ -11,8 +11,7 @@
* *
* A copy of this license is available in the file LICENSE in the * A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at * top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>. */
*/
#include "../../mdbx.h" #include "../../mdbx.h"
#include <ctype.h> #include <ctype.h>

View File

@ -11,8 +11,7 @@
* *
* A copy of this license is available in the file LICENSE in the * A copy of this license is available in the file LICENSE in the
* top-level directory of the distribution or, alternatively, at * top-level directory of the distribution or, alternatively, at
* <http://www.OpenLDAP.org/license.html>. * <http://www.OpenLDAP.org/license.html>. */
*/
#include "../../mdbx.h" #include "../../mdbx.h"
#include <inttypes.h> #include <inttypes.h>