mdbx: refine includes, drop midl.h and mdbx_osal.h

This commit is contained in:
Leo Yuriev
2017-05-24 13:59:50 +03:00
parent dd7855c30d
commit 924e81ed92
12 changed files with 672 additions and 696 deletions

View File

@@ -36,7 +36,6 @@
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#include "./bits.h"
#include "./midl.h"
/*----------------------------------------------------------------------------*/
/* rthc (tls keys and destructors) */
@@ -255,342 +254,6 @@ int mdbx_setup_debug(int flags, MDBX_debug_func *logger, long edge_txn);
txnid_t mdbx_debug_edge;
#endif
/* Features under development */
#ifndef MDBX_DEVEL
#define MDBX_DEVEL 0
#endif
/* Internal error codes, not exposed outside libmdbx */
#define MDBX_NO_ROOT (MDBX_LAST_ERRCODE + 10)
/* Debuging output value of a cursor DBI: Negative in a sub-cursor. */
#define DDBI(mc) \
(((mc)->mc_flags & C_SUB) ? -(int)(mc)->mc_dbi : (int)(mc)->mc_dbi)
/* The maximum size of a database page.
*
* It is 32k or 64k, since value-PAGEBASE must fit in
* MDBX_page.mp_upper.
*
* MDBX will use database pages < OS pages if needed.
* That causes more I/O in write transactions: The OS must
* know (read) the whole page before writing a partial page.
*
* Note that we don't currently support Huge pages. On Linux,
* regular data files cannot use Huge pages, and in general
* Huge pages aren't actually pageable. We rely on the OS
* demand-pager to read our data and page it out when memory
* pressure from other processes is high. So until OSs have
* actual paging support for Huge pages, they're not viable. */
#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
/* The minimum number of keys required in a database page.
* Setting this to a larger value will place a smaller bound on the
* maximum size of a data item. Data items larger than this size will
* be pushed into overflow pages instead of being stored directly in
* the B-tree node. This value used to default to 4. With a page size
* of 4096 bytes that meant that any item larger than 1024 bytes would
* go into an overflow page. That also meant that on average 2-3KB of
* each overflow page was wasted space. The value cannot be lower than
* 2 because then there would no longer be a tree structure. With this
* value, items larger than 2KB will go into overflow pages, and on
* average only 1KB will be wasted. */
#define MDBX_MINKEYS 2
/* A stamp that identifies a file as an MDBX file.
* There's nothing special about this value other than that it is easily
* recognizable, and it will reflect any byte order mismatches. */
#define MDBX_MAGIC 0xBEEFC0DE
/* The version number for a database's datafile format. */
#define MDBX_DATA_VERSION ((MDBX_DEVEL) ? 999 : 1)
/* The version number for a database's lockfile format. */
#define MDBX_LOCK_VERSION ((MDBX_DEVEL) ? 999 : 1)
/* Key size which fits in a DKBUF. */
#define DKBUF_MAXKEYSIZE 511 /* FIXME */
#if MDBX_DEBUG
#define DKBUF char _kbuf[DKBUF_MAXKEYSIZE * 4 + 2]
#define DKEY(x) mdbx_dkey(x, _kbuf, DKBUF_MAXKEYSIZE * 2 + 1)
#define DVAL(x) \
mdbx_dkey(x, _kbuf + DKBUF_MAXKEYSIZE * 2 + 1, DKBUF_MAXKEYSIZE * 2 + 1)
#else
#define DKBUF ((void)(0))
#define DKEY(x) ("-")
#define DVAL(x) ("-")
#endif
/* An invalid page number.
* Mainly used to denote an empty tree. */
#define P_INVALID (~(pgno_t)0)
/* Test if the flags f are set in a flag word w. */
#define F_ISSET(w, f) (((w) & (f)) == (f))
/* Round n up to an even number. */
#define EVEN(n) (((n) + 1U) & -2) /* sign-extending -2 to match n+1U */
/* Default size of memory map.
* This is certainly too small for any actual applications. Apps should
* always set the size explicitly using mdbx_env_set_mapsize(). */
#define DEFAULT_MAPSIZE 1048576
/* Reader Lock Table
*
* Readers don't acquire any locks for their data access. Instead, they
* simply record their transaction ID in the reader table. The reader
* mutex is needed just to find an empty slot in the reader table. The
* slot's address is saved in thread-specific data so that subsequent
* read transactions started by the same thread need no further locking to
* proceed.
*
* If MDBX_NOTLS is set, the slot address is not saved in thread-specific data.
* No reader table is used if the database is on a read-only filesystem.
*
* Since the database uses multi-version concurrency control, readers don't
* actually need any locking. This table is used to keep track of which
* readers are using data from which old transactions, so that we'll know
* when a particular old transaction is no longer in use. Old transactions
* that have discarded any data pages can then have those pages reclaimed
* for use by a later write transaction.
*
* The lock table is constructed such that reader slots are aligned with the
* processor's cache line size. Any slot is only ever used by one thread.
* This alignment guarantees that there will be no contention or cache
* thrashing as threads update their own slot info, and also eliminates
* any need for locking when accessing a slot.
*
* A writer thread will scan every slot in the table to determine the oldest
* outstanding reader transaction. Any freed pages older than this will be
* reclaimed by the writer. The writer doesn't use any locks when scanning
* this table. This means that there's no guarantee that the writer will
* see the most up-to-date reader info, but that's not required for correct
* operation - all we need is to know the upper bound on the oldest reader,
* we don't care at all about the newest reader. So the only consequence of
* reading stale information here is that old pages might hang around a
* while longer before being reclaimed. That's actually good anyway, because
* the longer we delay reclaiming old pages, the more likely it is that a
* string of contiguous pages can be found after coalescing old pages from
* many old transactions together. */
/* Number of slots in the reader table.
* This value was chosen somewhat arbitrarily. The 61 is a prime number,
* and such readers plus a couple mutexes fit into single 4KB page.
* Applications should set the table size using mdbx_env_set_maxreaders(). */
#define DEFAULT_READERS 61
/* Address of first usable data byte in a page, after the header */
#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
/* ITS#7713, change PAGEBASE to handle 65536 byte pages */
#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0)
/* Number of nodes on a page */
#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1)
/* The amount of space remaining in the page */
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
/* The percentage of space used in the page, in tenths of a percent. */
#define PAGEFILL(env, p) \
(1000L * ((env)->me_psize - PAGEHDRSZ - SIZELEFT(p)) / \
((env)->me_psize - PAGEHDRSZ))
/* The minimum page fill factor, in tenths of a percent.
* Pages emptier than this are candidates for merging. */
#define FILL_THRESHOLD 250
/* Test if a page is a leaf page */
#define IS_LEAF(p) F_ISSET((p)->mp_flags, P_LEAF)
/* Test if a page is a LEAF2 page */
#define IS_LEAF2(p) F_ISSET((p)->mp_flags, P_LEAF2)
/* Test if a page is a branch page */
#define IS_BRANCH(p) F_ISSET((p)->mp_flags, P_BRANCH)
/* Test if a page is an overflow page */
#define IS_OVERFLOW(p) F_ISSET((p)->mp_flags, P_OVERFLOW)
/* Test if a page is a sub page */
#define IS_SUBP(p) F_ISSET((p)->mp_flags, P_SUBP)
/* The number of overflow pages needed to store the given size. */
#define OVPAGES(size, psize) ((PAGEHDRSZ - 1 + (size)) / (psize) + 1)
/* Link in MDBX_txn.mt_loose_pages list.
* Kept outside the page header, which is needed when reusing the page. */
#define NEXT_LOOSE_PAGE(p) (*(MDBX_page **)((p) + 2))
/* Header for a single key/data pair within a page.
* Used in pages of type P_BRANCH and P_LEAF without P_LEAF2.
* We guarantee 2-byte alignment for 'MDBX_node's.
*
* mn_lo and mn_hi are used for data size on leaf nodes, and for child
* pgno on branch nodes. On 64 bit platforms, mn_flags is also used
* for pgno. (Branch nodes have no flags). Lo and hi are in host byte
* order in case some accesses can be optimized to 32-bit word access.
*
* Leaf node flags describe node contents. F_BIGDATA says the node's
* data part is the page number of an overflow page with actual data.
* F_DUPDATA and F_SUBDATA can be combined giving duplicate data in
* a sub-page/sub-database, and named databases (just F_SUBDATA). */
typedef struct MDBX_node {
union {
struct {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
union {
struct {
uint16_t mn_lo, mn_hi; /* part of data size or pgno */
};
uint32_t mn_dsize;
};
uint16_t mn_flags; /* see mdbx_node */
uint16_t mn_ksize; /* key size */
#else
uint16_t mn_ksize; /* key size */
uint16_t mn_flags; /* see mdbx_node */
union {
struct {
uint16_t mn_hi, mn_lo; /* part of data size or pgno */
};
uint32_t mn_dsize;
};
#endif
};
pgno_t mn_ksize_and_pgno;
};
/* mdbx_node Flags */
#define F_BIGDATA 0x01 /* data put on overflow page */
#define F_SUBDATA 0x02 /* data is a sub-database */
#define F_DUPDATA 0x04 /* data has duplicates */
/* valid flags for mdbx_node_add() */
#define NODE_ADD_FLAGS (F_DUPDATA | F_SUBDATA | MDBX_RESERVE | MDBX_APPEND)
uint8_t mn_data[1]; /* key and data are appended here */
} MDBX_node;
/* Size of the node header, excluding dynamic data at the end */
#define NODESIZE offsetof(MDBX_node, mn_data)
/* Bit position of top word in page number, for shifting mn_flags */
#define PGNO_TOPWORD ((pgno_t)-1 > 0xffffffffu ? 32 : 0)
/* Size of a node in a branch page with a given key.
* This is just the node header plus the key, there is no data. */
#define INDXSIZE(k) (NODESIZE + ((k) == NULL ? 0 : (k)->iov_len))
/* Size of a node in a leaf page with a given key and data.
* This is node header plus key plus data size. */
#define LEAFSIZE(k, d) (NODESIZE + (k)->iov_len + (d)->iov_len)
/* Address of node i in page p */
static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) {
assert(NUMKEYS(p) > (unsigned)(i));
return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE);
}
/* Address of the key for the node */
#define NODEKEY(node) (void *)((node)->mn_data)
/* Address of the data for a node */
#define NODEDATA(node) (void *)((char *)(node)->mn_data + (node)->mn_ksize)
/* Get the page number pointed to by a branch node */
static __inline pgno_t NODEPGNO(const MDBX_node *node) {
pgno_t pgno;
if (UNALIGNED_OK) {
pgno = node->mn_ksize_and_pgno;
if (sizeof(pgno_t) > 4)
pgno &= UINT64_C(0xffffFFFFffff);
} else {
pgno = node->mn_lo | ((pgno_t)node->mn_lo << 16);
if (sizeof(pgno_t) > 4)
pgno |= ((uint64_t)node->mn_flags) << 32;
}
return pgno;
}
/* Set the page number in a branch node */
static __inline void SETPGNO(MDBX_node *node, pgno_t pgno) {
assert(pgno <= (pgno_t)UINT64_C(0xffffFFFFffff));
if (UNALIGNED_OK) {
if (sizeof(pgno_t) > 4)
pgno |= ((uint64_t)node->mn_ksize) << 48;
node->mn_ksize_and_pgno = pgno;
} else {
node->mn_lo = (uint16_t)pgno;
node->mn_hi = (uint16_t)(pgno >> 16);
if (sizeof(pgno_t) > 4)
node->mn_flags = (uint16_t)((uint64_t)pgno >> 32);
}
}
/* Get the size of the data in a leaf node */
static __inline size_t NODEDSZ(const MDBX_node *node) {
size_t size;
if (UNALIGNED_OK) {
size = node->mn_dsize;
} else {
size = node->mn_lo | ((size_t)node->mn_hi << 16);
}
return size;
}
/* Set the size of the data for a leaf node */
static __inline void SETDSZ(MDBX_node *node, unsigned size) {
if (UNALIGNED_OK) {
node->mn_dsize = size;
} else {
node->mn_lo = (uint16_t)size;
node->mn_hi = (uint16_t)(size >> 16);
}
}
/* The size of a key in a node */
#define NODEKSZ(node) ((node)->mn_ksize)
/* The address of a key in a LEAF2 page.
* LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs.
* There are no node headers, keys are stored contiguously. */
#define LEAF2KEY(p, i, ks) ((char *)(p) + PAGEHDRSZ + ((i) * (ks)))
/* Set the node's key into keyptr, if requested. */
#define MDBX_GET_KEY(node, keyptr) \
do { \
if ((keyptr) != NULL) { \
(keyptr)->iov_len = NODEKSZ(node); \
(keyptr)->iov_base = NODEKEY(node); \
} \
} while (0)
/* Set the node's key into key. */
#define MDBX_GET_KEY2(node, key) \
do { \
key.iov_len = NODEKSZ(node); \
key.iov_base = NODEKEY(node); \
} while (0)
#define MDBX_VALID 0x8000 /* DB handle is valid, for me_dbflags */
#define PERSISTENT_FLAGS (0xffff & ~(MDBX_VALID))
/* mdbx_dbi_open() flags */
#define VALID_FLAGS \
(MDBX_REVERSEKEY | MDBX_DUPSORT | MDBX_INTEGERKEY | MDBX_DUPFIXED | \
MDBX_INTEGERDUP | MDBX_REVERSEDUP | MDBX_CREATE)
/* max number of pages to commit in one writev() call */
#define MDBX_COMMIT_PAGES 64
#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */
#undef MDBX_COMMIT_PAGES
#define MDBX_COMMIT_PAGES IOV_MAX
#endif
/* Check txn and dbi arguments to a function */
#define TXN_DBI_EXIST(txn, dbi, validity) \
((dbi) < (txn)->mt_numdbs && ((txn)->mt_dbflags[dbi] & (validity)))
/* Check for misused dbi handles */
#define TXN_DBI_CHANGED(txn, dbi) \
((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi])
static int mdbx_page_alloc(MDBX_cursor *mc, int num, MDBX_page **mp, int flags);
static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num,
MDBX_page **mp);
@@ -622,6 +285,7 @@ static int mdbx_txn_end(MDBX_txn *txn, unsigned mode);
static int mdbx_page_get(MDBX_cursor *mc, pgno_t pgno, MDBX_page **mp,
int *lvl);
static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, int modify);
#define MDBX_PS_MODIFY 1
#define MDBX_PS_ROOTONLY 2
#define MDBX_PS_FIRST 4