From e381191c0fc3c6587b2379e26daeebb7b36792e9 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sun, 27 Nov 2016 12:56:27 +0300 Subject: [PATCH 01/16] mdbx: fix typo. Change-Id: I46344bf13a71b04b32d84bf0e0bc0a34ae6ef162 --- mdb_chk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mdb_chk.c b/mdb_chk.c index 4fe83ff0..b86b8f96 100644 --- a/mdb_chk.c +++ b/mdb_chk.c @@ -52,7 +52,7 @@ flagbit dbflags[] = { static volatile sig_atomic_t gotsignal; -static void signal_hanlder( int sig ) { +static void signal_handler( int sig ) { (void) sig; gotsignal = 1; } @@ -660,13 +660,13 @@ int main(int argc, char *argv[]) usage(prog); #ifdef SIGPIPE - signal(SIGPIPE, signal_hanlder); + signal(SIGPIPE, signal_handler); #endif #ifdef SIGHUP - signal(SIGHUP, signal_hanlder); + signal(SIGHUP, signal_handler); #endif - signal(SIGINT, signal_hanlder); - signal(SIGTERM, signal_hanlder); + signal(SIGINT, signal_handler); + signal(SIGTERM, signal_handler); envname = argv[optind]; print("Running mdbx_chk for '%s' in %s mode...\n", From 533ad276bbf68b4c62f18a3325dc144ddd13cf4c Mon Sep 17 00:00:00 2001 From: Howard Chu Date: Tue, 29 Nov 2016 19:19:45 +0000 Subject: [PATCH 02/16] mdbx: backport - more for cursor tracking after deletion (ITS#8406). 
xcursor fixup depends on init state Change-Id: I13139c401e2ae6bbe3d7e6b9fda3739f9ec789cf --- mdb.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mdb.c b/mdb.c index 408cebdb..6d713ba7 100644 --- a/mdb.c +++ b/mdb.c @@ -8451,6 +8451,11 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] == ki) { m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDB_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); + } + continue; } else if (m3->mc_ki[mc->mc_top] > ki) { m3->mc_ki[mc->mc_top]--; } @@ -8496,10 +8501,14 @@ mdb_cursor_del0(MDB_cursor *mc) if (mc->mc_db->md_flags & MDB_DUPSORT) { MDB_node *node = NODEPTR(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); /* If this node is a fake page, it needs to be reinited - * because its data has moved. + * because its data has moved. But just reset mc_pg[0] + * if the xcursor is already live. */ if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) { - mdb_xcursor_init1(m3, node); + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) + m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); + else + mdb_xcursor_init1(m3, node); } } } From 2196a9b72c741fc922db1e5933df23f9b0e978ad Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 1 Dec 2016 21:17:42 +0100 Subject: [PATCH 03/16] mdbx: backport - factor out refreshing sub-page pointers. 
Change-Id: If2d3efde19ff751da208959f6f2834ece1f64e56 --- mdb.c | 88 ++++++++++++++++++++++++----------------------------------- 1 file changed, 36 insertions(+), 52 deletions(-) diff --git a/mdb.c b/mdb.c index 6d713ba7..302a4256 100644 --- a/mdb.c +++ b/mdb.c @@ -973,6 +973,21 @@ typedef struct MDB_xcursor { unsigned char mx_dbflag; } MDB_xcursor; + /** Check if there is an inited xcursor, so #XCURSOR_REFRESH() is proper */ +#define XCURSOR_INITED(mc) \ + ((mc)->mc_xcursor && ((mc)->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) + + /** Update sub-page pointer, if any, in \b mc->mc_xcursor. Needed + * when the node which contains the sub-page may have moved. Called + * with \b mp = mc->mc_pg[mc->mc_top], \b ki = mc->mc_ki[mc->mc_top]. + */ +#define XCURSOR_REFRESH(mc, mp, ki) do { \ + MDB_page *xr_pg = (mp); \ + MDB_node *xr_node = NODEPTR(xr_pg, ki); \ + if ((xr_node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) \ + (mc)->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(xr_node); \ +} while (0) + /** State of FreeDB old pages, stored in the MDB_env */ typedef struct MDB_pgstate { pgno_t *mf_pghead; /**< Reclaimed freeDB pages, or NULL before use */ @@ -1445,7 +1460,7 @@ mdb_cursor_chk(MDB_cursor *mc) } if (unlikely(mc->mc_ki[i] >= NUMKEYS(mc->mc_pg[i]))) mdb_print("ack!\n"); - if (mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { + if (XCURSOR_INITED(mc)) { node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) && mc->mc_xcursor->mx_cursor.mc_pg[0] != NODEDATA(node)) { @@ -2544,14 +2559,8 @@ done: if (m2 == mc) continue; if (m2->mc_pg[mc->mc_top] == mp) { m2->mc_pg[mc->mc_top] = np; - if ((mc->mc_db->md_flags & MDB_DUPSORT) && - IS_LEAF(np) && - (m2->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) - { - MDB_node *leaf = NODEPTR(np, m2->mc_ki[mc->mc_top]); - if ((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } + 
if (XCURSOR_INITED(m2) && IS_LEAF(np)) + XCURSOR_REFRESH(m2, np, m2->mc_ki[mc->mc_top]); } } } @@ -6926,11 +6935,8 @@ new_sub: if (m3->mc_ki[i] >= mc->mc_ki[i] && insert_key) { m3->mc_ki[i]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - MDB_node *n2 = NODEPTR(mp, m3->mc_ki[i]); - if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); - } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, mp, m3->mc_ki[i]); } } } @@ -6981,9 +6987,7 @@ put_sub: if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); } else if (!insert_key && m2->mc_ki[i] < nkeys) { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); - if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + XCURSOR_REFRESH(m2, mp, m2->mc_ki[i]); } } } @@ -7094,13 +7098,12 @@ mdb_cursor_del(MDB_cursor *mc, unsigned flags) if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) continue; if (m2->mc_pg[mc->mc_top] == mp) { - if (m2->mc_ki[mc->mc_top] == mc->mc_ki[mc->mc_top]) { - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(leaf); - } else { - MDB_node *n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); - if (!(n2->mn_flags & F_SUBDATA)) - m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); + MDB_node *n2 = leaf; + if (m2->mc_ki[mc->mc_top] != mc->mc_ki[mc->mc_top]) { + n2 = NODEPTR(mp, m2->mc_ki[mc->mc_top]); + if (n2->mn_flags & F_SUBDATA) continue; } + m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); } } } @@ -7962,12 +7965,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; m3->mc_ki[csrc->mc_top-1]++; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mps)) { - MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - 
m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); } } else /* Adding on the right, bump others down */ @@ -7988,12 +7987,8 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) } else { m3->mc_ki[csrc->mc_top]--; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mps)) { - MDB_node *node = NODEPTR(m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (XCURSOR_INITED(m3) && IS_LEAF(mps)) + XCURSOR_REFRESH(m3, m3->mc_pg[csrc->mc_top], m3->mc_ki[csrc->mc_top]); } } } @@ -8192,12 +8187,8 @@ mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst) m3->mc_ki[top-1] > csrc->mc_ki[top-1]) { m3->mc_ki[top-1]--; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(psrc)) { - MDB_node *node = NODEPTR(m3->mc_pg[top], m3->mc_ki[top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) + XCURSOR_REFRESH(m3, m3->mc_pg[top], m3->mc_ki[top]); } } { @@ -8459,11 +8450,8 @@ mdb_cursor_del0(MDB_cursor *mc) } else if (m3->mc_ki[mc->mc_top] > ki) { m3->mc_ki[mc->mc_top]--; } - if (m3->mc_xcursor && (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } } @@ -8997,12 +8985,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_ki[ptop] >= mc->mc_ki[ptop]) { m3->mc_ki[ptop]++; } - if (m3->mc_xcursor && 
(m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) && - IS_LEAF(mp)) { - MDB_node *node = NODEPTR(m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - if ((node->mn_flags & (F_DUPDATA|F_SUBDATA)) == F_DUPDATA) - m3->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(node); - } + if (XCURSOR_INITED(m3) && IS_LEAF(mp)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } } mdb_debug("mp left: %d, rp left: %d", SIZELEFT(mp), SIZELEFT(rp)); From 2fb5a5426427bd49f5e227e3173edd23914cbbf5 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Sat, 3 Dec 2016 17:39:26 +0300 Subject: [PATCH 04/16] mdbx: minor simplify mc_signature. Change-Id: Ib3952853350d220dd62910bcd55ac74cf5f47886 --- mdb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mdb.c b/mdb.c index 302a4256..6a2782ff 100644 --- a/mdb.c +++ b/mdb.c @@ -7548,7 +7548,6 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; #endif */ - mc->mc_signature = MDBX_MC_SIGNATURE; } @@ -7587,6 +7586,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) static void mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) { + mc->mc_signature = MDBX_MC_SIGNATURE; mc->mc_next = NULL; mc->mc_backup = NULL; mc->mc_dbi = dbi; @@ -7610,7 +7610,6 @@ mdb_cursor_init(MDB_cursor *mc, MDB_txn *txn, MDB_dbi dbi, MDB_xcursor *mx) if (*mc->mc_dbflag & DB_STALE) { mdb_page_search(mc, NULL, MDB_PS_ROOTONLY); } - mc->mc_signature = MDBX_MC_SIGNATURE; } int From ba688e63dccb7a613874375bfb97279aeab9feaa Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 7 Dec 2016 18:55:21 +0100 Subject: [PATCH 05/16] mdbx: backport (comments) - Note functions which must set MDB_TXN_ERROR on failure. Other functions depend on them to do so. For mdb_node_read(), instead remove such a dependence. 
Change-Id: I49c1c8bbb1c20527cbf76ef004cb7a1300ef465c --- mdb.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mdb.c b/mdb.c index 6a2782ff..d918fa79 100644 --- a/mdb.c +++ b/mdb.c @@ -1542,6 +1542,7 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) /** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. */ static MDB_page * mdb_page_malloc(MDB_txn *txn, unsigned num) @@ -2045,7 +2046,7 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } /** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. @@ -2472,6 +2473,7 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) } /** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. */ @@ -5355,7 +5357,9 @@ mdb_cursor_pop(MDB_cursor *mc) } } -/** Push a page onto the top of the cursor's stack. */ +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { @@ -5375,6 +5379,7 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) } /** Find the address of the page corresponding to a given page number. + * Set #MDB_TXN_ERROR on failure. * @param[in] txn the transaction for this access. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. @@ -7148,6 +7153,7 @@ fail: } /** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc a cursor on the database being added to. 
* @param[in] flags flags defining what type of page is being allocated. * @param[in] num the number of pages to allocate. This is usually 1, @@ -7233,6 +7239,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) } /** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc The cursor for this operation. * @param[in] indx The index on the page where the new node should be added. * @param[in] key The key for the new node. @@ -7747,6 +7754,7 @@ mdb_cursor_dbi(MDB_cursor *mc) } /** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -8577,6 +8585,7 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, } /** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. * @param[in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. From baf61da42bbd058d9da7c62e43420906d206fed4 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 7 Dec 2016 19:04:19 +0100 Subject: [PATCH 06/16] mdbx: backport (minor) - doxygen cleanup. Change-Id: Ide60614f4fc631aa2bfba3609115f39ec294b3de --- mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdb.c b/mdb.c index d918fa79..0208b73a 100644 --- a/mdb.c +++ b/mdb.c @@ -10276,7 +10276,7 @@ mdb_reader_check(MDB_env *env, int *dead) return mdb_reader_check0(env, 0, dead); } -/** As #mdb_reader_check(). rlocked = . */ +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ static int __cold mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { From 207f43003ce81a67c71571b71255d0eb6f0c2281 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 7 Dec 2016 19:06:11 +0100 Subject: [PATCH 07/16] mdbx: backport (comments) - MDB_CP_COMPACT comments. 
Change-Id: I4965d5e511395fceafcd922f513dcf0d5050c9b8 --- mdb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mdb.c b/mdb.c index 0208b73a..07cbeb9a 100644 --- a/mdb.c +++ b/mdb.c @@ -9057,7 +9057,10 @@ typedef struct mdb_copy { HANDLE mc_fd; int mc_toggle; /**< Buffer number in provider */ int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - volatile int mc_error; /**< Error code, never cleared if set */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -9135,7 +9138,11 @@ mdb_env_cthr_toggle(mdb_copy *my, int adjust) return my->mc_error; } - /** Depth-first tree traversal for compacting copy. */ + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ static int __cold mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { From b33fe4a3f04397ad1bc016010041bb7ea631da35 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 12 Jan 2016 23:18:06 +0100 Subject: [PATCH 08/16] mdbx: backport - note about reserved vs. actual mem/disk usage. Change-Id: Ibd75bdafac646f4a577c7cbebda8173e5b7e5ef1 --- lmdb.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lmdb.h b/lmdb.h index 2d2aabb7..52abd372 100644 --- a/lmdb.h +++ b/lmdb.h @@ -66,6 +66,11 @@ * This does not use actual memory or disk space, but users may need * to understand the difference so they won't be scared off. * + * - An LMDB configuration will often reserve considerable \b unused + * memory address space and maybe file size for future growth. + * This does not use actual memory or disk space, but users may need + * to understand the difference so they won't be scared off. 
+ * * - By default, in versions before 0.9.10, unused portions of the data * file might receive garbage data from memory freed by other code. * (This does not happen when using the #MDB_WRITEMAP flag.) As of From 7b773e6f2db24192dd0d22e7c031251cbaa1f36e Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 9 Dec 2016 00:03:36 +0100 Subject: [PATCH 09/16] mdbx: backport - Cleanup: Add flag DB_DUPDATA, drop DB_DIRTY hack. Change-Id: I5d30367104d025c1d2f8c39d29455faca59d7f19 --- mdb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mdb.c b/mdb.c index 07cbeb9a..1611e7fe 100644 --- a/mdb.c +++ b/mdb.c @@ -859,11 +859,12 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; @@ -6465,7 +6466,8 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -7550,7 +7552,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } mdb_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; /* #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = 
mdb_cmp_clong; @@ -7576,7 +7578,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; #if UINT_MAX < SIZE_MAX mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; #endif From ca97abb7f3d9c11cf03943581c157082fc0c04cc Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 10 Dec 2016 09:16:17 +0100 Subject: [PATCH 10/16] mdbx: backport - mdb_dbi_open(): Protect mainDB cursors (ITS#8542). Change-Id: I5bdd3727eddc16a518c4b88534a3e7253e9789fd --- mdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mdb.c b/mdb.c index 1611e7fe..d520f5bf 100644 --- a/mdb.c +++ b/mdb.c @@ -9874,7 +9874,8 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi) memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); dbflag |= DB_DIRTY; } From fe4e9993d633e6019d9964f30de53691bd02be28 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 23 Aug 2015 20:33:02 +0200 Subject: [PATCH 11/16] mdbx: backport - Pass cursor to mdb_page_get(), mdb_node_read(). No change in behavior. 
Change-Id: I19054cfd96fa883970a0dc66a0088596a142ea07 --- mdb.c | 100 ++++++++++++++++++++++++++++----------------------------- mdbx.c | 11 +++++-- 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/mdb.c b/mdb.c index d520f5bf..21983552 100644 --- a/mdb.c +++ b/mdb.c @@ -1092,10 +1092,10 @@ typedef struct MDB_ntxn { #define METAPAGE_2(env) \ (&((MDB_metabuf*) ((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) -static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); -static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); -static int mdb_page_touch(MDB_cursor *mc); -static int mdb_cursor_touch(MDB_cursor *mc); +static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); +static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); +static int mdb_page_touch(MDB_cursor *mc); +static int mdb_cursor_touch(MDB_cursor *mc); #define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ "reset-tmp", "fail-begin", "fail-beginchild"} @@ -1108,16 +1108,16 @@ enum { #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static int mdb_txn_end(MDB_txn *txn, unsigned mode); +static int mdb_txn_end(MDB_txn *txn, unsigned mode); -static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); -static int mdb_page_search_root(MDB_cursor *mc, +static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 #define MDB_PS_ROOTONLY 2 #define MDB_PS_FIRST 4 #define MDB_PS_LAST 8 -static int mdb_page_search(MDB_cursor *mc, +static int mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags); static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); @@ -1125,17 +1125,17 @@ static int 
mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned nflags); -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static int mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); -static void mdb_env_close0(MDB_env *env); +static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); +static void mdb_env_close0(MDB_env *env); -static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); -static int mdb_node_add(MDB_cursor *mc, indx_t indx, +static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); -static void mdb_node_del(MDB_cursor *mc, int ksize); -static void mdb_node_shrink(MDB_page *mp, indx_t indx); +static void mdb_node_del(MDB_cursor *mc, int ksize); +static void mdb_node_shrink(MDB_page *mp, indx_t indx); static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); +static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdb_branch_size(MDB_env *env, MDB_val *key); @@ -1161,8 +1161,8 @@ static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); static int mdb_drop0(MDB_cursor *mc, int subs); -static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); -static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); +static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); +static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); /** @cond */ static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int_ai, mdb_cmp_int_a2, mdb_cmp_int_ua; 
@@ -1716,7 +1716,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3; + MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; MDB_page *dp, *mp; MDB_node *leaf; @@ -1759,7 +1759,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + if (unlikely((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)) break; if ((dp->mp_flags & Mask) == pflags && level <= 1) dp->mp_flags ^= P_KEEP; @@ -2215,7 +2215,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if (unlikely((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { @@ -5381,15 +5381,16 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) /** Find the address of the page corresponding to a given page number. * Set #MDB_TXN_ERROR on failure. - * @param[in] txn the transaction for this access. + * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. * @return 0 on success, non-zero on failure. 
*/ static int -mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) +mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { + MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; MDB_page *p = NULL; int level; @@ -5483,7 +5484,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, i < NUMKEYS(mp)); node = NODEPTR(mp, i); - if (unlikely((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)) + if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) return rc; mc->mc_ki[mc->mc_top] = i; @@ -5525,7 +5526,7 @@ mdb_page_search_lowest(MDB_cursor *mc) MDB_node *node = NODEPTR(mp, 0); int rc; - if (unlikely((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)) + if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -5577,7 +5578,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) return MDB_NOTFOUND; if (unlikely((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)) return MDB_INCOMPATIBLE; /* not a named DB */ - rc = mdb_node_read(mc->mc_txn, leaf, &data); + rc = mdb_node_read(&mc2, leaf, &data); if (rc) return rc; memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), @@ -5601,7 +5602,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, root > 1); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if (unlikely((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0)) + if (unlikely((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) return rc; mc->mc_snum = 1; @@ -5698,13 +5699,13 @@ release: } /** Return the data associated with a given node. - * @param[in] txn The transaction for this operation. + * @param[in] mc The cursor for this operation. * @param[in] leaf The node being read. * @param[out] data Updated to point to the node's data. * @return 0 on success, non-zero on failure. 
*/ static MDBX_INLINE int -mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) +mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) { MDB_page *omp; /* overflow page */ pgno_t pgno; @@ -5720,7 +5721,7 @@ mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) */ data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if (unlikely((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0)) { + if (unlikely((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0)) { mdb_debug("read overflow page %zu failed", pgno); return rc; } @@ -5801,7 +5802,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0)) { + if (unlikely((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; @@ -5884,7 +5885,7 @@ skip: mdb_xcursor_init1(mc, leaf); } if (data) { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5967,7 +5968,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -6156,7 +6157,7 @@ set1: } } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { MDB_val olddata; - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, &olddata)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) return rc; rc = mc->mc_dbx->md_dcmp(data, &olddata); if (rc) { @@ -6169,7 +6170,7 @@ set1: } else { if 
(mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; } } @@ -6218,7 +6219,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; } } @@ -6263,7 +6264,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; } } @@ -6312,7 +6313,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); } else { - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc, leaf, data); } } } @@ -6429,7 +6430,7 @@ fetchm: MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { MDB_GET_KEY(leaf, key); - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc, leaf, data); break; } } @@ -6830,7 +6831,7 @@ current: int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); memcpy(&pg, olddata.mv_data, sizeof(pg)); - if (unlikely((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)) + if (unlikely((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0)) return rc2; ovpages = omp->mp_pages; @@ -7141,7 +7142,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned flags) pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if (unlikely((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + if (unlikely((rc = mdb_page_get(mc, pg, &omp, NULL)) || (rc = mdb_ovpage_free(mc, omp)))) goto fail; } @@ -8313,7 +8314,7 @@ 
mdb_rebalance(MDB_cursor *mc) if (unlikely(rc)) return rc; mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); if (unlikely(rc)) return rc; mc->mc_db->md_depth--; @@ -8374,7 +8375,7 @@ mdb_rebalance(MDB_cursor *mc) mdb_debug("reading right neighbor"); mn.mc_ki[ptop]++; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); if (unlikely(rc)) return rc; mn.mc_ki[mn.mc_top] = 0; @@ -8386,7 +8387,7 @@ mdb_rebalance(MDB_cursor *mc) mdb_debug("reading left neighbor"); mn.mc_ki[ptop]--; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); if (unlikely(rc)) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; @@ -9149,7 +9150,6 @@ static int __cold mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { MDB_cursor mc; - MDB_txn *txn = my->mc_txn; MDB_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; @@ -9162,9 +9162,9 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) memset(&mc, 0, sizeof(mc)); mc.mc_snum = 1; - mc.mc_txn = txn; + mc.mc_txn = my->mc_txn; - rc = mdb_page_get(txn, *pg, &mc.mc_pg[0], NULL); + rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); if (rc) return rc; rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); @@ -9209,7 +9209,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) memcpy(&pg, NODEDATA(ni), sizeof(pg)); memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); - rc = mdb_page_get(txn, pg, &omp, NULL); + rc = mdb_page_get(&mc, pg, &omp, NULL); if (rc) goto done; if (my->mc_wlen[toggle] >= MDB_WBUF) { @@ -9259,7 +9259,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) again: ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); pg = 
NODEPGNO(ni); - rc = mdb_page_get(txn, pg, &mp, NULL); + rc = mdb_page_get(&mc, pg, &mp, NULL); if (rc) goto done; mc.mc_top++; @@ -10006,7 +10006,7 @@ mdb_drop0(MDB_cursor *mc, int subs) MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); + rc = mdb_page_get(mc, pg, &omp, NULL); if (unlikely(rc)) goto done; mdb_cassert(mc, IS_OVERFLOW(omp)); diff --git a/mdbx.c b/mdbx.c index fb4aac85..9f07548e 100644 --- a/mdbx.c +++ b/mdbx.c @@ -182,7 +182,12 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee if (pg == P_INVALID) return MDB_SUCCESS; /* empty db */ - rc = mdb_page_get(ctx->mw_txn, pg, &mp, NULL); + MDB_cursor mc; + memset(&mc, 0, sizeof(mc)); + mc.mc_snum = 1; + mc.mc_txn = ctx->mw_txn; + + rc = mdb_page_get(&mc, pg, &mp, NULL); if (rc) return rc; if (pg != mp->mp_p.p_pgno) @@ -220,7 +225,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee } for (align_bytes = i = 0; i < nkeys; - align_bytes += ((payload_size + align_bytes) & 1), i++) { + align_bytes += ((payload_size + align_bytes) & 1), i++) { MDB_node *node; if (IS_LEAF2(mp)) { @@ -249,7 +254,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee payload_size += sizeof(pgno_t); opg = NODEDATA(node); - rc = mdb_page_get(ctx->mw_txn, *opg, &omp, NULL); + rc = mdb_page_get(&mc, *opg, &omp, NULL); if (rc) return rc; if (*opg != omp->mp_p.p_pgno) From fca2f4d9295a136c5d9e151f03dd077d71f990c2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 10 Dec 2016 21:42:39 +0100 Subject: [PATCH 12/16] mdbx: backport - clean up and comment C_UNTRACK. Don't use it as a "cursor is tracked" hint in mdb_pages_xkeep(). It's been harmless so far, but would break after mdb_cursor_copy(). Checking m0 directly short-circuits better anyway. 
Change-Id: Ibf180214db603e08ed11e298cff85866eb79f4bb --- mdb.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mdb.c b/mdb.c index 21983552..6e778cb0 100644 --- a/mdb.c +++ b/mdb.c @@ -1723,13 +1723,9 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) unsigned i, j; int rc = MDB_SUCCESS, level; - /* Mark pages seen by cursors */ - if (mc->mc_flags & C_UNTRACK) - mc = NULL; /* will find mc in mt_cursors */ - for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { - for (; mc; mc=mc->mc_next) { - if (!(mc->mc_flags & C_INITIALIZED)) - continue; + /* Mark pages seen by cursors: First m0, then tracked cursors */ + for (i = txn->mt_numdbs;; ) { + if (mc->mc_flags & C_INITIALIZED) { for (m3 = mc;; m3 = &mx->mx_cursor) { mp = NULL; for (j=0; j<m3->mc_snum; j++) { @@ -1748,10 +1744,13 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) break; } } - if (i == 0) - break; + mc = mc->mc_next; + for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + if (i == 0) + goto mark_done; } +mark_done: if (all) { /* Mark dirty root pages */ for (i=0; i<txn->mt_numdbs; i++) { @@ -7727,7 +7726,10 @@ mdb_cursor_close(MDB_cursor *mc) if (mc) { mdb_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE); if (!mc->mc_backup) { - /* remove from txn, if tracked */ + /* Remove from txn, if tracked. + * A read-only txn (!C_UNTRACK) may have been freed already, + * so do not peek inside it. Only write txns track cursors. + */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; @@ -8578,7 +8580,6 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, * run out of space, triggering a split. We need this * cursor to be consistent until the end of the rebalance. 
*/ - mc.mc_flags |= C_UNTRACK; mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; rc = mdb_cursor_del(&mc, flags); From 362714512939ec5239787a7eb58ac134dbf0a3e9 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 10 Dec 2016 22:00:31 +0100 Subject: [PATCH 13/16] mdbx: backport - catch mdb_cursor_sibling() error (ITS#7377). Change-Id: I440ff1f9f92156e19935195d656f4d77b088f605 --- mdb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mdb.c b/mdb.c index 6e778cb0..ce69a040 100644 --- a/mdb.c +++ b/mdb.c @@ -8678,7 +8678,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno split_indx = newindx; nkeys = 0; } else { - split_indx = (nkeys+1) / 2; if (IS_LEAF2(rp)) { @@ -8838,7 +8837,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - mdb_cursor_sibling(mc, 0); + rc = mdb_cursor_sibling(mc, 0); } } } else { @@ -8846,8 +8845,11 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDB_SUCCESS)) { + if (rc == MDB_NOTFOUND) /* improper mdb_cursor_sibling() result */ + rc = MDB_PROBLEM; goto done; + } if (nflags & MDB_APPEND) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[mc->mc_top] = 0; From cbff64757953b6ccb1ab8853a6d41491d2e94fc5 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 15 Dec 2016 20:09:07 +0300 Subject: [PATCH 14/16] mdbx: fix ov-pages copying in cursor_put(). I think I just lost one line of code. 
This bug was added by 09d790431710f6456cb80bcfc5962da5851893ed --- mdb.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mdb.c b/mdb.c index ce69a040..81dea96e 100644 --- a/mdb.c +++ b/mdb.c @@ -6864,13 +6864,8 @@ current: * parent txn, in case the user peeks at MDB_RESERVEd * or unused parts. Some users treat ovpages specially. */ -#if MDBX_MODE_ENABLED - /* LY: New page will contain only header from origin, - * but no any payload */ - memcpy(np, omp, PAGEHDRSZ); -#else size_t sz = (size_t) env->me_psize * ovpages, off; - if (!(flags & MDB_RESERVE)) { + if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) { /* Skip the part where LMDB will put *data. * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. @@ -6881,7 +6876,6 @@ current: sz = PAGEHDRSZ; } memcpy(np, omp, sz); /* Copy whole or header of page */ -#endif /* MDBX_MODE_ENABLED */ omp = np; } SETDSZ(leaf, data->mv_size); From 70a138472b1ba6a9307fade117375b084a444997 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Thu, 15 Dec 2016 22:12:45 +0100 Subject: [PATCH 15/16] mdbx: backport - Mention MDB_PREV_MULTIPLE along with MDB_NEXT_MULTIPLE. Change-Id: I0c216203c3aa2005ef254293c1c472c9b7f257f3 --- lmdb.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lmdb.h b/lmdb.h index 52abd372..d0b8b37c 100644 --- a/lmdb.h +++ b/lmdb.h @@ -1149,8 +1149,9 @@ int mdb_txn_renew(MDB_txn *txn); * This flag may only be used in combination with #MDB_DUPSORT. This option * tells the library that the data items for this database are all the same * size, which allows further optimizations in storage and retrieval. When - * all data items are the same size, the #MDB_GET_MULTIPLE and #MDB_NEXT_MULTIPLE - * cursor operations may be used to retrieve multiple items at once. + * all data items are the same size, the #MDB_GET_MULTIPLE, #MDB_NEXT_MULTIPLE + * and #MDB_PREV_MULTIPLE cursor operations may be used to retrieve multiple + * items at once. *
<li>#MDB_INTEGERDUP * This option specifies that duplicate data items are binary integers, * similar to #MDB_INTEGERKEY keys. From b950e39c10ace74431310a190aa3041adbd37d65 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 20 Oct 2016 09:51:22 +0200 Subject: [PATCH 16/16] mdbx: backport - mdb_env_copyfd2(): Don't abort on SIGPIPE (ITS#8504). Return EPIPE instead. Never clear mc_error, we could lose a failure in the other thread. Change-Id: Ief08803ed56293309f07be116e69123c10907e77 --- mdb.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mdb.c b/mdb.c index 81dea96e..4c2cd1ed 100644 --- a/mdb.c +++ b/mdb.c @@ -9072,6 +9072,14 @@ mdb_env_copythr(void *arg) int toggle = 0, wsize, rc = 0; int len; +#ifdef SIGPIPE + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGPIPE); + if ((rc = pthread_sigmask(SIG_BLOCK, &set, NULL)) != 0) + my->mc_error = rc; +#endif + pthread_mutex_lock(&my->mc_mutex); for(;;) { while (!my->mc_new) @@ -9086,6 +9094,15 @@ again: len = write(my->mc_fd, ptr, wsize); if (len < 0) { rc = errno; +#ifdef SIGPIPE + if (rc == EPIPE) { + /* Collect the pending SIGPIPE, otherwise at least OS X + * gives it to the process on thread-exit (ITS#8504). + */ + int tmp; + sigwait(&set, &tmp); + } +#endif break; } else if (len > 0) { rc = MDB_SUCCESS;