mirror of
https://github.com/isar/libmdbx.git
synced 2025-01-04 17:14:12 +08:00
mdbx: MAX_PAGESIZE always 64K.
This commit is contained in:
parent
19dd181b6f
commit
95ebdb7065
59
src/bits.h
59
src/bits.h
@ -82,24 +82,6 @@
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* Basic constants and types */
|
||||
|
||||
/* The maximum size of a database page.
|
||||
*
|
||||
* It is 32k or 64k, since value-PAGEBASE must fit in
|
||||
* MDBX_page.mp_upper.
|
||||
*
|
||||
* MDBX will use database pages < OS pages if needed.
|
||||
* That causes more I/O in write transactions: The OS must
|
||||
* know (read) the whole page before writing a partial page.
|
||||
*
|
||||
* Note that we don't currently support Huge pages. On Linux,
|
||||
* regular data files cannot use Huge pages, and in general
|
||||
* Huge pages aren't actually pageable. We rely on the OS
|
||||
* demand-pager to read our data and page it out when memory
|
||||
* pressure from other processes is high. So until OSs have
|
||||
* actual paging support for Huge pages, they're not viable. */
|
||||
#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
|
||||
#define MIN_PAGESIZE 1024
|
||||
|
||||
/* The minimum number of keys required in a database page.
|
||||
* Setting this to a larger value will place a smaller bound on the
|
||||
* maximum size of a data item. Data items larger than this size will
|
||||
@ -151,13 +133,6 @@ typedef uint64_t txnid_t;
|
||||
* this is plenty. */
|
||||
typedef uint16_t indx_t;
|
||||
|
||||
#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
|
||||
#define MAX_MAPSIZE \
|
||||
((sizeof(size_t) < 8) \
|
||||
? UINT32_C(0x7ff80000) \
|
||||
: ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \
|
||||
: MAX_PAGENO * (uint64_t)MAX_PAGESIZE))
|
||||
|
||||
/*----------------------------------------------------------------------------*/
|
||||
/* Core structures for database and shared memory (i.e. format definition) */
|
||||
#pragma pack(push, 1)
|
||||
@ -308,9 +283,10 @@ typedef struct MDBX_meta {
|
||||
* in the snapshot: Either used by a database or listed in a freeDB record. */
|
||||
typedef struct MDBX_page {
|
||||
union {
|
||||
struct MDBX_page *mp_next; /* for in-memory list of freed pages,
|
||||
* must be first field, see NEXT_LOOSE_PAGE */
|
||||
uint64_t mp_validator; /* checksum of page content or a txnid during
|
||||
* which the page has been updated */
|
||||
struct MDBX_page *mp_next; /* for in-memory list of freed pages */
|
||||
};
|
||||
uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
|
||||
#define P_BRANCH 0x01 /* branch page */
|
||||
@ -343,6 +319,30 @@ typedef struct MDBX_page {
|
||||
/* Size of the page header, excluding dynamic data at the end */
|
||||
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data))
|
||||
|
||||
/* The maximum size of a database page.
|
||||
*
|
||||
* It is 64K; this works because only (value - PAGEHDRSZ), not the raw page offset, must fit in the 16-bit MDBX_page.mp_upper.
|
||||
*
|
||||
* MDBX will use database pages < OS pages if needed.
|
||||
* That causes more I/O in write transactions: The OS must
|
||||
* know (read) the whole page before writing a partial page.
|
||||
*
|
||||
* Note that we don't currently support Huge pages. On Linux,
|
||||
* regular data files cannot use Huge pages, and in general
|
||||
* Huge pages aren't actually pageable. We rely on the OS
|
||||
* demand-pager to read our data and page it out when memory
|
||||
* pressure from other processes is high. So until OSs have
|
||||
* actual paging support for Huge pages, they're not viable. */
|
||||
#define MAX_PAGESIZE 0x10000u
|
||||
#define MIN_PAGESIZE 512u
|
||||
|
||||
#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
|
||||
#define MAX_MAPSIZE \
|
||||
((sizeof(size_t) < 8) \
|
||||
? UINT32_C(0x7ff80000) \
|
||||
: ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \
|
||||
: MAX_PAGENO * (uint64_t)MAX_PAGESIZE))
|
||||
|
||||
#pragma pack(pop)
|
||||
|
||||
/* The header for the reader table (a memory-mapped lock file). */
|
||||
@ -885,11 +885,8 @@ static __inline size_t roundup2(size_t value, size_t granularity) {
|
||||
/* Address of first usable data byte in a page, after the header */
|
||||
#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
|
||||
|
||||
/* ITS#7713, change PAGEBASE to handle 65536 byte pages */
|
||||
#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0)
|
||||
|
||||
/* Number of nodes on a page */
|
||||
#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1)
|
||||
#define NUMKEYS(p) ((p)->mp_lower >> 1)
|
||||
|
||||
/* The amount of space remaining in the page */
|
||||
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
|
||||
@ -986,7 +983,7 @@ typedef struct MDBX_node {
|
||||
/* Address of node i in page p */
|
||||
static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) {
|
||||
assert(NUMKEYS(p) > (unsigned)(i));
|
||||
return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE);
|
||||
return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ);
|
||||
}
|
||||
|
||||
/* Address of the key for the node */
|
||||
|
45
src/mdbx.c
45
src/mdbx.c
@ -833,7 +833,7 @@ static void mdbx_page_list(MDBX_page *mp) {
|
||||
total = EVEN(total);
|
||||
}
|
||||
mdbx_print("Total: header %u + contents %u + unused %u\n",
|
||||
IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total,
|
||||
IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total,
|
||||
SIZELEFT(mp));
|
||||
}
|
||||
|
||||
@ -1816,14 +1816,16 @@ done:
|
||||
* [in] src page to copy from
|
||||
* [in] psize size of a page */
|
||||
static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) {
|
||||
STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ);
|
||||
STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 42);
|
||||
enum { Align = sizeof(pgno_t) };
|
||||
indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower;
|
||||
|
||||
/* If page isn't full, just copy the used portion. Adjust
|
||||
* alignment so memcpy may copy words instead of bytes. */
|
||||
if ((unused &= -Align) && !IS_LEAF2(src)) {
|
||||
upper = (upper + PAGEBASE) & -Align;
|
||||
memcpy(dst, src, (lower + PAGEBASE + (Align - 1)) & -Align);
|
||||
upper = (upper + PAGEHDRSZ) & -Align;
|
||||
memcpy(dst, src, (lower + PAGEHDRSZ + (Align - 1)) & -Align);
|
||||
memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper),
|
||||
psize - upper);
|
||||
} else {
|
||||
@ -6068,7 +6070,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
|
||||
fp_flags = P_LEAF | P_DIRTY;
|
||||
fp = env->me_pbuf;
|
||||
fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */
|
||||
fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE);
|
||||
fp->mp_lower = fp->mp_upper = 0;
|
||||
olddata.iov_len = PAGEHDRSZ;
|
||||
goto prep_subDB;
|
||||
}
|
||||
@ -6140,7 +6142,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
|
||||
|
||||
/* Make sub-page header for the dup items, with dummy body */
|
||||
fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP;
|
||||
fp->mp_lower = (PAGEHDRSZ - PAGEBASE);
|
||||
fp->mp_lower = 0;
|
||||
xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len;
|
||||
if (mc->mc_db->md_flags & MDBX_DUPFIXED) {
|
||||
fp->mp_flags |= P_LEAF2;
|
||||
@ -6150,7 +6152,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
|
||||
xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) +
|
||||
(dkey.iov_len & 1) + (data->iov_len & 1);
|
||||
}
|
||||
fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEBASE);
|
||||
fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ);
|
||||
olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */
|
||||
} else if (leaf->mn_flags & F_SUBDATA) {
|
||||
/* Data is on sub-DB, just store it */
|
||||
@ -6218,9 +6220,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
|
||||
if (fp_flags & P_LEAF2) {
|
||||
memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize);
|
||||
} else {
|
||||
memcpy((char *)mp + mp->mp_upper + PAGEBASE,
|
||||
(char *)fp + fp->mp_upper + PAGEBASE,
|
||||
olddata.iov_len - fp->mp_upper - PAGEBASE);
|
||||
memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ,
|
||||
(char *)fp + fp->mp_upper + PAGEHDRSZ,
|
||||
olddata.iov_len - fp->mp_upper - PAGEHDRSZ);
|
||||
for (i = 0; i < NUMKEYS(fp); i++)
|
||||
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
|
||||
}
|
||||
@ -6594,8 +6596,8 @@ static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num,
|
||||
mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno,
|
||||
mc->mc_txn->mt_env->me_psize);
|
||||
np->mp_flags = flags | P_DIRTY;
|
||||
np->mp_lower = (PAGEHDRSZ - PAGEBASE);
|
||||
np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE;
|
||||
np->mp_lower = 0;
|
||||
np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
|
||||
|
||||
if (IS_BRANCH(np))
|
||||
mc->mc_db->md_branch_pages++;
|
||||
@ -6850,7 +6852,7 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) {
|
||||
}
|
||||
}
|
||||
|
||||
base = (char *)mp + mp->mp_upper + PAGEBASE;
|
||||
base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
|
||||
memmove(base + sz, base, ptr - mp->mp_upper);
|
||||
|
||||
mp->mp_lower -= sizeof(indx_t);
|
||||
@ -6888,7 +6890,7 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) {
|
||||
SETDSZ(node, nsize);
|
||||
|
||||
/* Shift <lower nodes...initial part of subpage> upward */
|
||||
base = (char *)mp + mp->mp_upper + PAGEBASE;
|
||||
base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
|
||||
memmove(base + delta, base, (char *)sp + len - base);
|
||||
|
||||
ptr = mp->mp_ptrs[indx];
|
||||
@ -7231,7 +7233,7 @@ static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key) {
|
||||
mp->mp_ptrs[i] -= delta;
|
||||
}
|
||||
|
||||
base = (char *)mp + mp->mp_upper + PAGEBASE;
|
||||
base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
|
||||
len = ptr - mp->mp_upper + NODESIZE;
|
||||
memmove(base - delta, base, len);
|
||||
mp->mp_upper -= delta;
|
||||
@ -8130,8 +8132,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
|
||||
}
|
||||
copy->mp_pgno = mp->mp_pgno;
|
||||
copy->mp_flags = mp->mp_flags;
|
||||
copy->mp_lower = (PAGEHDRSZ - PAGEBASE);
|
||||
copy->mp_upper = env->me_psize - PAGEBASE;
|
||||
copy->mp_lower = 0;
|
||||
copy->mp_upper = env->me_psize - PAGEHDRSZ;
|
||||
|
||||
/* prepare to insert */
|
||||
for (i = 0, j = 0; i < nkeys; i++) {
|
||||
@ -8173,7 +8175,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
|
||||
psize += nsize;
|
||||
node = NULL;
|
||||
} else {
|
||||
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
|
||||
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ);
|
||||
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
|
||||
if (IS_LEAF(mp)) {
|
||||
if (F_ISSET(node->mn_flags, F_BIGDATA))
|
||||
@ -8193,7 +8195,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
|
||||
sepkey.iov_len = newkey->iov_len;
|
||||
sepkey.iov_base = newkey->iov_base;
|
||||
} else {
|
||||
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE);
|
||||
node =
|
||||
(MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEHDRSZ);
|
||||
sepkey.iov_len = node->mn_ksize;
|
||||
sepkey.iov_base = NODEKEY(node);
|
||||
}
|
||||
@ -8272,7 +8275,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
|
||||
/* Update index for the new key. */
|
||||
mc->mc_ki[mc->mc_top] = j;
|
||||
} else {
|
||||
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE);
|
||||
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ);
|
||||
rkey.iov_base = NODEKEY(node);
|
||||
rkey.iov_len = node->mn_ksize;
|
||||
if (IS_LEAF(mp)) {
|
||||
@ -8308,7 +8311,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
|
||||
mp->mp_lower = copy->mp_lower;
|
||||
mp->mp_upper = copy->mp_upper;
|
||||
memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1),
|
||||
env->me_psize - copy->mp_upper - PAGEBASE);
|
||||
env->me_psize - copy->mp_upper - PAGEHDRSZ);
|
||||
|
||||
/* reset back to original page */
|
||||
if (newindx < split_indx) {
|
||||
@ -9806,7 +9809,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
|
||||
return MDBX_CORRUPTED;
|
||||
|
||||
nkeys = NUMKEYS(mp);
|
||||
header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower;
|
||||
header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower;
|
||||
unused_size = SIZELEFT(mp);
|
||||
payload_size = 0;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user