From 95ebdb706519edaeaf6144a6ca3fa083deeda1ea Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Wed, 31 May 2017 17:10:17 +0300 Subject: [PATCH] mdbx: MAX_PAGESIZE always 64K. --- src/bits.h | 59 ++++++++++++++++++++++++++---------------------------- src/mdbx.c | 45 ++++++++++++++++++++++------------------- 2 files changed, 52 insertions(+), 52 deletions(-) diff --git a/src/bits.h b/src/bits.h index 734c1fe7..89d3f808 100644 --- a/src/bits.h +++ b/src/bits.h @@ -82,24 +82,6 @@ /*----------------------------------------------------------------------------*/ /* Basic constants and types */ -/* The maximum size of a database page. - * - * It is 32k or 64k, since value-PAGEBASE must fit in - * MDBX_page.mp_upper. - * - * MDBX will use database pages < OS pages if needed. - * That causes more I/O in write transactions: The OS must - * know (read) the whole page before writing a partial page. - * - * Note that we don't currently support Huge pages. On Linux, - * regular data files cannot use Huge pages, and in general - * Huge pages aren't actually pageable. We rely on the OS - * demand-pager to read our data and page it out when memory - * pressure from other processes is high. So until OSs have - * actual paging support for Huge pages, they're not viable. */ -#define MAX_PAGESIZE (PAGEBASE ? 0x10000 : 0x8000) -#define MIN_PAGESIZE 1024 - /* The minimum number of keys required in a database page. * Setting this to a larger value will place a smaller bound on the * maximum size of a data item. Data items larger than this size will @@ -151,13 +133,6 @@ typedef uint64_t txnid_t; * this is plenty. */ typedef uint16_t indx_t; -#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) -#define MAX_MAPSIZE \ - ((sizeof(size_t) < 8) \ - ? UINT32_C(0x7ff80000) \ - : ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ - : MAX_PAGENO * (uint64_t)MAX_PAGESIZE)) - /*----------------------------------------------------------------------------*/ /* Core structures for database and shared memory (i.e. format definition) */ #pragma pack(push, 1) @@ -308,9 +283,10 @@ typedef struct MDBX_meta { * in the snapshot: Either used by a database or listed in a freeDB record. */ typedef struct MDBX_page { union { + struct MDBX_page *mp_next; /* for in-memory list of freed pages, + * must be first field, see NEXT_LOOSE_PAGE */ uint64_t mp_validator; /* checksum of page content or a txnid during * which the page has been updated */ - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ }; uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ #define P_BRANCH 0x01 /* branch page */ @@ -343,6 +319,30 @@ typedef struct MDBX_page { /* Size of the page header, excluding dynamic data at the end */ #define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_data)) +/* The maximum size of a database page. +* +* It is 64K, but value-PAGEHDRSZ must fit in MDBX_page.mp_upper. +* +* MDBX will use database pages < OS pages if needed. +* That causes more I/O in write transactions: The OS must +* know (read) the whole page before writing a partial page. +* +* Note that we don't currently support Huge pages. On Linux, +* regular data files cannot use Huge pages, and in general +* Huge pages aren't actually pageable. We rely on the OS +* demand-pager to read our data and page it out when memory +* pressure from other processes is high. So until OSs have +* actual paging support for Huge pages, they're not viable. */ +#define MAX_PAGESIZE 0x10000u +#define MIN_PAGESIZE 512u + +#define MIN_MAPSIZE (MIN_PAGESIZE * MIN_PAGENO) +#define MAX_MAPSIZE \ + ((sizeof(size_t) < 8) \ + ? UINT32_C(0x7ff80000) \ + : ((sizeof(pgno_t) > 4) ? UINT64_C(0x7fffFFFFfff80000) \ + : MAX_PAGENO * (uint64_t)MAX_PAGESIZE)) + #pragma pack(pop) /* The header for the reader table (a memory-mapped lock file). */ @@ -885,11 +885,8 @@ static __inline size_t roundup2(size_t value, size_t granularity) { /* Address of first usable data byte in a page, after the header */ #define PAGEDATA(p) ((void *)((char *)(p) + PAGEHDRSZ)) -/* ITS#7713, change PAGEBASE to handle 65536 byte pages */ -#define PAGEBASE ((MDBX_DEVEL) ? PAGEHDRSZ : 0) - /* Number of nodes on a page */ -#define NUMKEYS(p) (((p)->mp_lower - (PAGEHDRSZ - PAGEBASE)) >> 1) +#define NUMKEYS(p) ((p)->mp_lower >> 1) /* The amount of space remaining in the page */ #define SIZELEFT(p) (indx_t)((p)->mp_upper - (p)->mp_lower) @@ -986,7 +983,7 @@ typedef struct MDBX_node { /* Address of node i in page p */ static __inline MDBX_node *NODEPTR(MDBX_page *p, unsigned i) { assert(NUMKEYS(p) > (unsigned)(i)); - return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEBASE); + return (MDBX_node *)((char *)(p) + (p)->mp_ptrs[i] + PAGEHDRSZ); } /* Address of the key for the node */ diff --git a/src/mdbx.c b/src/mdbx.c index e7f55865..f3c1567e 100644 --- a/src/mdbx.c +++ b/src/mdbx.c @@ -833,7 +833,7 @@ static void mdbx_page_list(MDBX_page *mp) { total = EVEN(total); } mdbx_print("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower, total, + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, SIZELEFT(mp)); } @@ -1816,14 +1816,16 @@ done: * [in] src page to copy from * [in] psize size of a page */ static void mdbx_page_copy(MDBX_page *dst, MDBX_page *src, unsigned psize) { + STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); + STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 42); enum { Align = sizeof(pgno_t) }; indx_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; /* If page isn't full, just copy the used portion. Adjust * alignment so memcpy may copy words instead of bytes. */ if ((unused &= -Align) && !IS_LEAF2(src)) { - upper = (upper + PAGEBASE) & -Align; - memcpy(dst, src, (lower + PAGEBASE + (Align - 1)) & -Align); + upper = (upper + PAGEHDRSZ) & -Align; + memcpy(dst, src, (lower + PAGEHDRSZ + (Align - 1)) & -Align); memcpy((pgno_t *)((char *)dst + upper), (pgno_t *)((char *)src + upper), psize - upper); } else { @@ -6068,7 +6070,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, fp_flags = P_LEAF | P_DIRTY; fp = env->me_pbuf; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; /* used if MDBX_DUPFIXED */ - fp->mp_lower = fp->mp_upper = (PAGEHDRSZ - PAGEBASE); + fp->mp_lower = fp->mp_upper = 0; olddata.iov_len = PAGEHDRSZ; goto prep_subDB; } @@ -6140,7 +6142,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, /* Make sub-page header for the dup items, with dummy body */ fp->mp_flags = P_LEAF | P_DIRTY | P_SUBP; - fp->mp_lower = (PAGEHDRSZ - PAGEBASE); + fp->mp_lower = 0; xdata.iov_len = PAGEHDRSZ + dkey.iov_len + data->iov_len; if (mc->mc_db->md_flags & MDBX_DUPFIXED) { fp->mp_flags |= P_LEAF2; @@ -6150,7 +6152,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.iov_len & 1) + (data->iov_len & 1); } - fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEBASE); + fp->mp_upper = (uint16_t)(xdata.iov_len - PAGEHDRSZ); olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ } else if (leaf->mn_flags & F_SUBDATA) { /* Data is on sub-DB, just store it */ @@ -6218,9 +6220,9 @@ int mdbx_cursor_put(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if (fp_flags & P_LEAF2) { memcpy(PAGEDATA(mp), PAGEDATA(fp), NUMKEYS(fp) * fp->mp_leaf2_ksize); } else { - memcpy((char *)mp + mp->mp_upper + PAGEBASE, - (char *)fp + fp->mp_upper + PAGEBASE, - olddata.iov_len - fp->mp_upper - PAGEBASE); + memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, + (char *)fp + fp->mp_upper + PAGEHDRSZ, + olddata.iov_len - fp->mp_upper - PAGEHDRSZ); for (i = 0; i < NUMKEYS(fp); i++) mp->mp_ptrs[i] = fp->mp_ptrs[i] + offset; } @@ -6594,8 +6596,8 @@ static int mdbx_page_new(MDBX_cursor *mc, uint32_t flags, int num, mdbx_debug("allocated new page #%" PRIaPGNO ", size %u", np->mp_pgno, mc->mc_txn->mt_env->me_psize); np->mp_flags = flags | P_DIRTY; - np->mp_lower = (PAGEHDRSZ - PAGEBASE); - np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEBASE; + np->mp_lower = 0; + np->mp_upper = mc->mc_txn->mt_env->me_psize - PAGEHDRSZ; if (IS_BRANCH(np)) mc->mc_db->md_branch_pages++; @@ -6850,7 +6852,7 @@ static void mdbx_node_del(MDBX_cursor *mc, int ksize) { } } - base = (char *)mp + mp->mp_upper + PAGEBASE; + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + sz, base, ptr - mp->mp_upper); mp->mp_lower -= sizeof(indx_t); @@ -6888,7 +6890,7 @@ static void mdbx_node_shrink(MDBX_page *mp, indx_t indx) { SETDSZ(node, nsize); /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEBASE; + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; memmove(base + delta, base, (char *)sp + len - base); ptr = mp->mp_ptrs[indx]; @@ -7231,7 +7233,7 @@ static int mdbx_update_key(MDBX_cursor *mc, MDBX_val *key) { mp->mp_ptrs[i] -= delta; } - base = (char *)mp + mp->mp_upper + PAGEBASE; + base = (char *)mp + mp->mp_upper + PAGEHDRSZ; len = ptr - mp->mp_upper + NODESIZE; memmove(base - delta, base, len); mp->mp_upper -= delta; @@ -8130,8 +8132,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, } copy->mp_pgno = mp->mp_pgno; copy->mp_flags = mp->mp_flags; - copy->mp_lower = (PAGEHDRSZ - PAGEBASE); - copy->mp_upper = env->me_psize - PAGEBASE; + copy->mp_lower = 0; + copy->mp_upper = env->me_psize - PAGEHDRSZ; /* prepare to insert */ for (i = 0, j = 0; i < nkeys; i++) { @@ -8173,7 +8175,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, psize += nsize; node = NULL; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); psize += NODESIZE + NODEKSZ(node) + sizeof(indx_t); if (IS_LEAF(mp)) { if (F_ISSET(node->mn_flags, F_BIGDATA)) @@ -8193,7 +8195,8 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, sepkey.iov_len = newkey->iov_len; sepkey.iov_base = newkey->iov_base; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEBASE); + node = + (MDBX_node *)((char *)mp + copy->mp_ptrs[split_indx] + PAGEHDRSZ); sepkey.iov_len = node->mn_ksize; sepkey.iov_base = NODEKEY(node); } @@ -8272,7 +8275,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = j; } else { - node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEBASE); + node = (MDBX_node *)((char *)mp + copy->mp_ptrs[i] + PAGEHDRSZ); rkey.iov_base = NODEKEY(node); rkey.iov_len = node->mn_ksize; if (IS_LEAF(mp)) { @@ -8308,7 +8311,7 @@ static int mdbx_page_split(MDBX_cursor *mc, MDBX_val *newkey, MDBX_val *newdata, mp->mp_lower = copy->mp_lower; mp->mp_upper = copy->mp_upper; memcpy(NODEPTR(mp, nkeys - 1), NODEPTR(copy, nkeys - 1), - env->me_psize - copy->mp_upper - PAGEBASE); + env->me_psize - copy->mp_upper - PAGEHDRSZ); /* reset back to original page */ if (newindx < split_indx) { @@ -9806,7 +9809,7 @@ static int __cold mdbx_env_walk(mdbx_walk_ctx_t *ctx, const char *dbi, return MDBX_CORRUPTED; nkeys = NUMKEYS(mp); - header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEBASE + mp->mp_lower; + header_size = IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower; unused_size = SIZELEFT(mp); payload_size = 0;