mdbx: MAX_PAGESIZE always 64K.

This commit is contained in:
Leo Yuriev 2017-05-31 17:10:17 +03:00
parent 19dd181b6f
commit 95ebdb7065
2 changed files with 52 additions and 52 deletions

View File

@ -82,24 +82,6 @@
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
/* Basic constants and types */ /* Basic constants and types */
/* The maximum size of a database page.
*
* It is 32k or 64k, since value-PAGEBASE must fit in
* MDBX_page.mp_upper.
*
* MDBX will use database pages < OS pages if needed.
* That causes more I/O in write transactions: The OS must
* know (read) the whole page before writing a partial page.
*
* Note that we don't currently support Huge pages. On Linux,
* regular data files cannot use Huge pages, and in general
* Huge pages aren't actually pageable. We rely on the OS
* demand-pager to read our data and page it out when memory
* pressure from other processes is high. So until OSs have
* actual paging support for Huge pages, they're not viable. */
#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000)
#define MIN_PAGESIZE 1024
/* The minimum number of keys required in a database page. /* The minimum number of keys required in a database page.
* Setting this to a larger value will place a smaller bound on the * Setting this to a larger value will place a smaller bound on the
* maximum size of a data item. Data items larger than this size will * maximum size of a data item. Data items larger than this size will
@ -151,13 +133,6 @@ typedef uint64_t txnid_t;
* this is plenty. */ * this is plenty. */
typedef uint16_t indx_t; typedef uint16_t indx_t;
#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
#define MAX_MAPSIZE \
((sizeof(size_t) < 8) \
? UINT32_C(0x7ff80000) \
: ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \
: MAX_PAGENO * (uint64_t)MAX_PAGESIZE))
/*----------------------------------------------------------------------------*/ /*----------------------------------------------------------------------------*/
/* Core structures for database and shared memory (i.e. format definition) */ /* Core structures for database and shared memory (i.e. format definition) */
#pragma pack(push, 1) #pragma pack(push, 1)
@ -308,9 +283,10 @@ typedef struct MDBX_meta {
* in the snapshot: Either used by a database or listed in a freeDB record. */ * in the snapshot: Either used by a database or listed in a freeDB record. */
typedef struct MDBX_page { typedef struct MDBX_page {
union { union {
struct MDBX_page *mp_next; /* for in-memory list of freed pages,
* must be first field, see NEXT_LOOSE_PAGE */
uint64_t mp_validator; /* checksum of page content or a txnid during uint64_t mp_validator; /* checksum of page content or a txnid during
* which the page has been updated */ * which the page has been updated */
struct MDBX_page *mp_next; /* for in-memory list of freed pages */
}; };
uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */
#define P_BRANCH 0x01 /* branch page */ #define P_BRANCH 0x01 /* branch page */
@ -343,6 +319,30 @@ typedef struct MDBX_page {
/* Size of the page header, excluding dynamic data at the end */ /* Size of the page header, excluding dynamic data at the end */
#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data))
/* The maximum size of a database page.
*
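* It is 64K. Offsets stored in a page are counted from the end of the
* page header, so it is (page size - PAGEHDRSZ) that must fit in the
* 16-bit MDBX_page.mp_upper.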
*
* MDBX will use database pages smaller than OS pages if needed.
* That causes more I/O in write transactions: the OS must
* read the whole OS page before it can write a partial page.
*
* Note that we don't currently support Huge pages. On Linux,
* regular data files cannot use Huge pages, and in general
* Huge pages aren't actually pageable. We rely on the OS
* demand-pager to read our data and page it out when memory
* pressure from other processes is high. So until OSs have
* actual paging support for Huge pages, they're not viable. */
#define MAX_PAGESIZE 0x10000u
#define MIN_PAGESIZE 512u
#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO)
#define MAX_MAPSIZE \
((sizeof(size_t) < 8) \
? UINT32_C(0x7ff80000) \
: ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \
: MAX_PAGENO * (uint64_t)MAX_PAGESIZE))
#pragma pack(pop) #pragma pack(pop)
/* The header for the reader table (a memory-mapped lock file). */ /* The header for the reader table (a memory-mapped lock file). */
@ -885,11 +885,8 @@ static __inline size_t roundup2(size_t value, size_t granularity) {
/* Address of first usable data byte in a page, after the header */ /* Address of first usable data byte in a page, after the header */
#define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ))
/* ITS#7713, change PAGEBASE to handle 65536 byte pages */
#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0)
/* Number of nodes on a page */ /* Number of nodes on a page */
#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) #define NUMKEYS(p) ((p)->mp_lower >> 1)
/* The amount of space remaining in the page */ /* The amount of space remaining in the page */
#define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower)
@ -986,7 +983,7 @@ typedef struct MDBX_node {
/* Address of node i in page p */ /* Address of node i in page p */
static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) {
assert(NUMKEYS(p) > (unsigned)(i)); assert(NUMKEYS(p) > (unsigned)(i));
return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ);
} }
/* Address of the key for the node */ /* Address of the key for the node */

View File

@ -833,7 +833,7 @@ static void mdbx_page_list(MDBX_page *mp) {
total = EVEN(total); total = EVEN(total);
} }
mdbx_print("Total: header %u + contents %u + unused %u\n", mdbx_print("Total: header %u + contents %u + unused %u\n",
IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total,
SIZELEFT(mp)); SIZELEFT(mp));
} }
@ -1816,14 +1816,16 @@ done:
* [in] src page to copy from * [in] src page to copy from
* [in] psize size of a page */ * [in] psize size of a page */
static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) {
STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ);
STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 42);
enum { Align = sizeof(pgno_t) }; enum { Align = sizeof(pgno_t) };
indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower;
/* If page isn't full, just copy the used portion. Adjust /* If page isn't full, just copy the used portion. Adjust
* alignment so memcpy may copy words instead of bytes. */ * alignment so memcpy may copy words instead of bytes. */
if ((unused &= -Align) && !IS_LEAF2(src)) { if ((unused &= -Align) && !IS_LEAF2(src)) {
upper = (upper + PAGEBASE) & -Align; upper = (upper + PAGEHDRSZ) & -Align;
memcpy(dst, src, (lower + PAGEBASE + (Align - 1)) & -Align); memcpy(dst, src, (lower + PAGEHDRSZ + (Align - 1)) & -Align);
memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper), memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper),
psize - upper); psize - upper);
} else { } else {
@ -6068,7 +6070,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
fp_flags = P_LEAF | P_DIRTY; fp_flags = P_LEAF | P_DIRTY;
fp = env->me_pbuf; fp = env->me_pbuf;
fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */ fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */
fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); fp->mp_lower = fp->mp_upper = 0;
olddata.iov_len = PAGEHDRSZ; olddata.iov_len = PAGEHDRSZ;
goto prep_subDB; goto prep_subDB;
} }
@ -6140,7 +6142,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
/* Make sub-page header for the dup items, with dummy body */ /* Make sub-page header for the dup items, with dummy body */
fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP;
fp->mp_lower = (PAGEHDRSZ - PAGEBASE); fp->mp_lower = 0;
xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len;
if (mc->mc_db->md_flags & MDBX_DUPFIXED) { if (mc->mc_db->md_flags & MDBX_DUPFIXED) {
fp->mp_flags |= P_LEAF2; fp->mp_flags |= P_LEAF2;
@ -6150,7 +6152,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) +
(dkey.iov_len & 1) + (data->iov_len & 1); (dkey.iov_len & 1) + (data->iov_len & 1);
} }
fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEBASE); fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ);
olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */
} else if (leaf->mn_flags & F_SUBDATA) { } else if (leaf->mn_flags & F_SUBDATA) {
/* Data is on sub-DB, just store it */ /* Data is on sub-DB, just store it */
@ -6218,9 +6220,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data,
if (fp_flags & P_LEAF2) { if (fp_flags & P_LEAF2) {
memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize);
} else { } else {
memcpy((char *)mp + mp->mp_upper + PAGEBASE, memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ,
(char *)fp + fp->mp_upper + PAGEBASE, (char *)fp + fp->mp_upper + PAGEHDRSZ,
olddata.iov_len - fp->mp_upper - PAGEBASE); olddata.iov_len - fp->mp_upper - PAGEHDRSZ);
for (i = 0; i < NUMKEYS(fp); i++) for (i = 0; i < NUMKEYS(fp); i++)
mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset;
} }
@ -6594,8 +6596,8 @@ static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num,
mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno,
mc->mc_txn->mt_env->me_psize); mc->mc_txn->mt_env->me_psize);
np->mp_flags = flags | P_DIRTY; np->mp_flags = flags | P_DIRTY;
np->mp_lower = (PAGEHDRSZ - PAGEBASE); np->mp_lower = 0;
np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ;
if (IS_BRANCH(np)) if (IS_BRANCH(np))
mc->mc_db->md_branch_pages++; mc->mc_db->md_branch_pages++;
@ -6850,7 +6852,7 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) {
} }
} }
base = (char *)mp + mp->mp_upper + PAGEBASE; base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
memmove(base + sz, base, ptr - mp->mp_upper); memmove(base + sz, base, ptr - mp->mp_upper);
mp->mp_lower -= sizeof(indx_t); mp->mp_lower -= sizeof(indx_t);
@ -6888,7 +6890,7 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) {
SETDSZ(node, nsize); SETDSZ(node, nsize);
/* Shift <lower nodes...initial part of subpage> upward */ /* Shift <lower nodes...initial part of subpage> upward */
base = (char *)mp + mp->mp_upper + PAGEBASE; base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
memmove(base + delta, base, (char *)sp + len - base); memmove(base + delta, base, (char *)sp + len - base);
ptr = mp->mp_ptrs[indx]; ptr = mp->mp_ptrs[indx];
@ -7231,7 +7233,7 @@ static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key) {
mp->mp_ptrs[i] -= delta; mp->mp_ptrs[i] -= delta;
} }
base = (char *)mp + mp->mp_upper + PAGEBASE; base = (char *)mp + mp->mp_upper + PAGEHDRSZ;
len = ptr - mp->mp_upper + NODESIZE; len = ptr - mp->mp_upper + NODESIZE;
memmove(base - delta, base, len); memmove(base - delta, base, len);
mp->mp_upper -= delta; mp->mp_upper -= delta;
@ -8130,8 +8132,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
} }
copy->mp_pgno = mp->mp_pgno; copy->mp_pgno = mp->mp_pgno;
copy->mp_flags = mp->mp_flags; copy->mp_flags = mp->mp_flags;
copy->mp_lower = (PAGEHDRSZ - PAGEBASE); copy->mp_lower = 0;
copy->mp_upper = env->me_psize - PAGEBASE; copy->mp_upper = env->me_psize - PAGEHDRSZ;
/* prepare to insert */ /* prepare to insert */
for (i = 0, j = 0; i < nkeys; i++) { for (i = 0, j = 0; i < nkeys; i++) {
@ -8173,7 +8175,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
psize += nsize; psize += nsize;
node = NULL; node = NULL;
} else { } else {
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ);
psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t);
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
if (F_ISSET(node->mn_flags, F_BIGDATA)) if (F_ISSET(node->mn_flags, F_BIGDATA))
@ -8193,7 +8195,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
sepkey.iov_len = newkey->iov_len; sepkey.iov_len = newkey->iov_len;
sepkey.iov_base = newkey->iov_base; sepkey.iov_base = newkey->iov_base;
} else { } else {
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); node =
(MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEHDRSZ);
sepkey.iov_len = node->mn_ksize; sepkey.iov_len = node->mn_ksize;
sepkey.iov_base = NODEKEY(node); sepkey.iov_base = NODEKEY(node);
} }
@ -8272,7 +8275,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
/* Update index for the new key. */ /* Update index for the new key. */
mc->mc_ki[mc->mc_top] = j; mc->mc_ki[mc->mc_top] = j;
} else { } else {
node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ);
rkey.iov_base = NODEKEY(node); rkey.iov_base = NODEKEY(node);
rkey.iov_len = node->mn_ksize; rkey.iov_len = node->mn_ksize;
if (IS_LEAF(mp)) { if (IS_LEAF(mp)) {
@ -8308,7 +8311,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata,
mp->mp_lower = copy->mp_lower; mp->mp_lower = copy->mp_lower;
mp->mp_upper = copy->mp_upper; mp->mp_upper = copy->mp_upper;
memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1), memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1),
env->me_psize - copy->mp_upper - PAGEBASE); env->me_psize - copy->mp_upper - PAGEHDRSZ);
/* reset back to original page */ /* reset back to original page */
if (newindx < split_indx) { if (newindx < split_indx) {
@ -9806,7 +9809,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi,
return MDBX_CORRUPTED; return MDBX_CORRUPTED;
nkeys = NUMKEYS(mp); nkeys = NUMKEYS(mp);
header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower; header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower;
unused_size = SIZELEFT(mp); unused_size = SIZELEFT(mp);
payload_size = 0; payload_size = 0;