From ba688e63dccb7a613874375bfb97279aeab9feaa Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 7 Dec 2016 18:55:21 +0100 Subject: [PATCH 01/10] mdbx: backport (comments) - Note functions which must set MDB_TXN_ERROR on failure. Other functions depend on them to do so. For mdb_node_read(), instead remove such a dependence. Change-Id: I49c1c8bbb1c20527cbf76ef004cb7a1300ef465c --- mdb.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mdb.c b/mdb.c index 6a2782ff..d918fa79 100644 --- a/mdb.c +++ b/mdb.c @@ -1542,6 +1542,7 @@ mdb_dcmp(MDB_txn *txn, MDB_dbi dbi, const MDB_val *a, const MDB_val *b) /** Allocate memory for a page. * Re-use old malloc'd pages first for singletons, otherwise just malloc. + * Set #MDB_TXN_ERROR on failure. */ static MDB_page * mdb_page_malloc(MDB_txn *txn, unsigned num) @@ -2045,7 +2046,7 @@ mdb_page_dirty(MDB_txn *txn, MDB_page *mp) } /** Allocate page numbers and memory for writing. Maintain me_pglast, - * me_pghead and mt_next_pgno. + * me_pghead and mt_next_pgno. Set #MDB_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. @@ -2472,6 +2473,7 @@ mdb_page_unspill(MDB_txn *txn, MDB_page *mp, MDB_page **ret) } /** Touch a page: make it dirty and re-insert into tree with updated pgno. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc cursor pointing to the page to be touched * @return 0 on success, non-zero on failure. */ @@ -5355,7 +5357,9 @@ mdb_cursor_pop(MDB_cursor *mc) } } -/** Push a page onto the top of the cursor's stack. */ +/** Push a page onto the top of the cursor's stack. + * Set #MDB_TXN_ERROR on failure. + */ static int mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) { @@ -5375,6 +5379,7 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) } /** Find the address of the page corresponding to a given page number. + * Set #MDB_TXN_ERROR on failure. 
* @param[in] txn the transaction for this access. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. @@ -7148,6 +7153,7 @@ fail: } /** Allocate and initialize new pages for a database. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc a cursor on the database being added to. * @param[in] flags flags defining what type of page is being allocated. * @param[in] num the number of pages to allocate. This is usually 1, @@ -7233,6 +7239,7 @@ mdb_branch_size(MDB_env *env, MDB_val *key) } /** Add a node to the page pointed to by the cursor. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc The cursor for this operation. * @param[in] indx The index on the page where the new node should be added. * @param[in] key The key for the new node. @@ -7747,6 +7754,7 @@ mdb_cursor_dbi(MDB_cursor *mc) } /** Replace the key for a branch node with a new key. + * Set #MDB_TXN_ERROR on failure. * @param[in] mc Cursor pointing to the node to operate on. * @param[in] key The new key to use. * @return 0 on success, non-zero on failure. @@ -8577,6 +8585,7 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, } /** Split a page and insert a new node. + * Set #MDB_TXN_ERROR on failure. * @param[in,out] mc Cursor pointing to the page and desired insertion index. * The cursor will be updated to point to the actual page and index where * the node got inserted after the split. From baf61da42bbd058d9da7c62e43420906d206fed4 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 7 Dec 2016 19:04:19 +0100 Subject: [PATCH 02/10] mdbx: backport (minor) - doxygen cleanup. Change-Id: Ide60614f4fc631aa2bfba3609115f39ec294b3de --- mdb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mdb.c b/mdb.c index d918fa79..0208b73a 100644 --- a/mdb.c +++ b/mdb.c @@ -10276,7 +10276,7 @@ mdb_reader_check(MDB_env *env, int *dead) return mdb_reader_check0(env, 0, dead); } -/** As #mdb_reader_check(). rlocked = <caller locked the reader mutex>.
*/ +/** As #mdb_reader_check(). \b rlocked is set if caller locked #me_rmutex. */ static int __cold mdb_reader_check0(MDB_env *env, int rlocked, int *dead) { From 207f43003ce81a67c71571b71255d0eb6f0c2281 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Wed, 7 Dec 2016 19:06:11 +0100 Subject: [PATCH 03/10] mdbx: backport (comments) - MDB_CP_COMPACT comments. Change-Id: I4965d5e511395fceafcd922f513dcf0d5050c9b8 --- mdb.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mdb.c b/mdb.c index 0208b73a..07cbeb9a 100644 --- a/mdb.c +++ b/mdb.c @@ -9057,7 +9057,10 @@ typedef struct mdb_copy { HANDLE mc_fd; int mc_toggle; /**< Buffer number in provider */ int mc_new; /**< (0-2 buffers to write) | (#MDB_EOF at end) */ - volatile int mc_error; /**< Error code, never cleared if set */ + /** Error code. Never cleared if set. Both threads can set nonzero + * to fail the copy. Not mutex-protected, LMDB expects atomic int. + */ + volatile int mc_error; } mdb_copy; /** Dedicated writer thread for compacting copy. */ @@ -9135,7 +9138,11 @@ mdb_env_cthr_toggle(mdb_copy *my, int adjust) return my->mc_error; } - /** Depth-first tree traversal for compacting copy. */ + /** Depth-first tree traversal for compacting copy. + * @param[in] my control structure. + * @param[in,out] pg database root. + * @param[in] flags includes #F_DUPDATA if it is a sorted-duplicate sub-DB. + */ static int __cold mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { From b33fe4a3f04397ad1bc016010041bb7ea631da35 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Tue, 12 Jan 2016 23:18:06 +0100 Subject: [PATCH 04/10] mdbx: backport - note about reserved vs. actual mem/disk usage. 
Change-Id: Ibd75bdafac646f4a577c7cbebda8173e5b7e5ef1 --- lmdb.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lmdb.h b/lmdb.h index 2d2aabb7..52abd372 100644 --- a/lmdb.h +++ b/lmdb.h @@ -66,6 +66,11 @@ * This does not use actual memory or disk space, but users may need * to understand the difference so they won't be scared off. * + * - An LMDB configuration will often reserve considerable \b unused + * memory address space and maybe file size for future growth. + * This does not use actual memory or disk space, but users may need + * to understand the difference so they won't be scared off. + * * - By default, in versions before 0.9.10, unused portions of the data * file might receive garbage data from memory freed by other code. * (This does not happen when using the #MDB_WRITEMAP flag.) As of From 7b773e6f2db24192dd0d22e7c031251cbaa1f36e Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Fri, 9 Dec 2016 00:03:36 +0100 Subject: [PATCH 05/10] mdbx: backport - Cleanup: Add flag DB_DUPDATA, drop DB_DIRTY hack. 
Change-Id: I5d30367104d025c1d2f8c39d29455faca59d7f19 --- mdb.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mdb.c b/mdb.c index 07cbeb9a..1611e7fe 100644 --- a/mdb.c +++ b/mdb.c @@ -859,11 +859,12 @@ struct MDB_txn { * @ingroup internal * @{ */ -#define DB_DIRTY 0x01 /**< DB was modified or is DUPSORT data */ +#define DB_DIRTY 0x01 /**< DB was written in this txn */ #define DB_STALE 0x02 /**< Named-DB record is older than txnID */ #define DB_NEW 0x04 /**< Named-DB handle opened in this txn */ #define DB_VALID 0x08 /**< DB handle is valid, see also #MDB_VALID */ #define DB_USRVALID 0x10 /**< As #DB_VALID, but not set for #FREE_DBI */ +#define DB_DUPDATA 0x20 /**< DB is #MDB_DUPSORT data */ /** @} */ /** In write txns, array of cursors for each DB */ MDB_cursor **mt_cursors; @@ -6465,7 +6466,8 @@ mdb_cursor_touch(MDB_cursor *mc) { int rc = MDB_SUCCESS; - if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & DB_DIRTY)) { + if (mc->mc_dbi >= CORE_DBS && !(*mc->mc_dbflag & (DB_DIRTY|DB_DUPDATA))) { + /* Touch DB record of named DB */ MDB_cursor mc2; MDB_xcursor mcx; if (TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi)) @@ -7550,7 +7552,7 @@ mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node) } mdb_debug("Sub-db -%u root page %zu", mx->mx_cursor.mc_dbi, mx->mx_db.md_root); - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; /* #if UINT_MAX < SIZE_MAX if (mx->mx_dbx.md_cmp == mdb_cmp_int && mx->mx_db.md_pad == sizeof(size_t)) mx->mx_dbx.md_cmp = mdb_cmp_clong; @@ -7576,7 +7578,7 @@ mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int new_dupdata) mx->mx_cursor.mc_top = 0; mx->mx_cursor.mc_flags |= C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; - mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DIRTY; /* DB_DIRTY guides mdb_cursor_touch */ + mx->mx_dbflag = DB_VALID|DB_USRVALID|DB_DUPDATA; #if UINT_MAX < SIZE_MAX mx->mx_dbx.md_cmp = src_mx->mx_dbx.md_cmp; #endif From 
ca97abb7f3d9c11cf03943581c157082fc0c04cc Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 10 Dec 2016 09:16:17 +0100 Subject: [PATCH 06/10] mdbx: backport - mdb_dbi_open(): Protect mainDB cursors (ITS#8542). Change-Id: I5bdd3727eddc16a518c4b88534a3e7253e9789fd --- mdb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mdb.c b/mdb.c index 1611e7fe..d520f5bf 100644 --- a/mdb.c +++ b/mdb.c @@ -9874,7 +9874,8 @@ int mdb_dbi_open(MDB_txn *txn, const char *name, unsigned flags, MDB_dbi *dbi) memset(&dummy, 0, sizeof(dummy)); dummy.md_root = P_INVALID; dummy.md_flags = flags & PERSISTENT_FLAGS; - rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA); + WITH_CURSOR_TRACKING(mc, + rc = mdb_cursor_put(&mc, &key, &data, F_SUBDATA)); dbflag |= DB_DIRTY; } From fe4e9993d633e6019d9964f30de53691bd02be28 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sun, 23 Aug 2015 20:33:02 +0200 Subject: [PATCH 07/10] mdbx: backport - Pass cursor to mdb_page_get(), mdb_node_read(). No change in behavior. 
Change-Id: I19054cfd96fa883970a0dc66a0088596a142ea07 --- mdb.c | 100 ++++++++++++++++++++++++++++----------------------------- mdbx.c | 11 +++++-- 2 files changed, 58 insertions(+), 53 deletions(-) diff --git a/mdb.c b/mdb.c index d520f5bf..21983552 100644 --- a/mdb.c +++ b/mdb.c @@ -1092,10 +1092,10 @@ typedef struct MDB_ntxn { #define METAPAGE_2(env) \ (&((MDB_metabuf*) ((env)->me_map + env->me_psize))->mb_metabuf.mm_meta) -static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); -static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); -static int mdb_page_touch(MDB_cursor *mc); -static int mdb_cursor_touch(MDB_cursor *mc); +static int mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags); +static int mdb_page_new(MDB_cursor *mc, uint32_t flags, int num, MDB_page **mp); +static int mdb_page_touch(MDB_cursor *mc); +static int mdb_cursor_touch(MDB_cursor *mc); #define MDB_END_NAMES {"committed", "empty-commit", "abort", "reset", \ "reset-tmp", "fail-begin", "fail-beginchild"} @@ -1108,16 +1108,16 @@ enum { #define MDB_END_UPDATE 0x10 /**< update env state (DBIs) */ #define MDB_END_FREE 0x20 /**< free txn unless it is #MDB_env.%me_txn0 */ #define MDB_END_SLOT MDB_NOTLS /**< release any reader slot if #MDB_NOTLS */ -static int mdb_txn_end(MDB_txn *txn, unsigned mode); +static int mdb_txn_end(MDB_txn *txn, unsigned mode); -static int mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **mp, int *lvl); -static int mdb_page_search_root(MDB_cursor *mc, +static int mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **mp, int *lvl); +static int mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int modify); #define MDB_PS_MODIFY 1 #define MDB_PS_ROOTONLY 2 #define MDB_PS_FIRST 4 #define MDB_PS_LAST 8 -static int mdb_page_search(MDB_cursor *mc, +static int mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags); static int mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); @@ -1125,17 +1125,17 @@ static int 
mdb_page_merge(MDB_cursor *csrc, MDB_cursor *cdst); static int mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno, unsigned nflags); -static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); -static int mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); -static void mdb_env_close0(MDB_env *env); +static int mdb_env_read_header(MDB_env *env, MDB_meta *meta); +static int mdb_env_sync0(MDB_env *env, unsigned flags, MDB_meta *pending); +static void mdb_env_close0(MDB_env *env); -static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); -static int mdb_node_add(MDB_cursor *mc, indx_t indx, +static MDB_node *mdb_node_search(MDB_cursor *mc, MDB_val *key, int *exactp); +static int mdb_node_add(MDB_cursor *mc, indx_t indx, MDB_val *key, MDB_val *data, pgno_t pgno, unsigned flags); -static void mdb_node_del(MDB_cursor *mc, int ksize); -static void mdb_node_shrink(MDB_page *mp, indx_t indx); +static void mdb_node_del(MDB_cursor *mc, int ksize); +static void mdb_node_shrink(MDB_page *mp, indx_t indx); static int mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft); -static int mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data); +static int mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data); static size_t mdb_leaf_size(MDB_env *env, MDB_val *key, MDB_val *data); static size_t mdb_branch_size(MDB_env *env, MDB_val *key); @@ -1161,8 +1161,8 @@ static void mdb_xcursor_init1(MDB_cursor *mc, MDB_node *node); static void mdb_xcursor_init2(MDB_cursor *mc, MDB_xcursor *src_mx, int force); static int mdb_drop0(MDB_cursor *mc, int subs); -static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); -static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); +static void mdb_default_cmp(MDB_txn *txn, MDB_dbi dbi); +static int mdb_reader_check0(MDB_env *env, int rlocked, int *dead); /** @cond */ static MDB_cmp_func mdb_cmp_memn, mdb_cmp_memnr, mdb_cmp_int_ai, mdb_cmp_int_a2, mdb_cmp_int_ua; 
@@ -1716,7 +1716,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) { enum { Mask = P_SUBP|P_DIRTY|P_LOOSE|P_KEEP }; MDB_txn *txn = mc->mc_txn; - MDB_cursor *m3; + MDB_cursor *m3, *m0 = mc; MDB_xcursor *mx; MDB_page *dp, *mp; MDB_node *leaf; @@ -1759,7 +1759,7 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) pgno_t pgno = txn->mt_dbs[i].md_root; if (pgno == P_INVALID) continue; - if ((rc = mdb_page_get(txn, pgno, &dp, &level)) != MDB_SUCCESS) + if (unlikely((rc = mdb_page_get(m0, pgno, &dp, &level)) != MDB_SUCCESS)) break; if ((dp->mp_flags & Mask) == pflags && level <= 1) dp->mp_flags ^= P_KEEP; @@ -2215,7 +2215,7 @@ mdb_page_alloc(MDB_cursor *mc, int num, MDB_page **mp, int flags) np = m2.mc_pg[m2.mc_top]; leaf = NODEPTR(np, m2.mc_ki[m2.mc_top]); - if (unlikely((rc = mdb_node_read(txn, leaf, &data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(&m2, leaf, &data)) != MDB_SUCCESS)) goto fail; if ((flags & MDBX_LIFORECLAIM) && !txn->mt_lifo_reclaimed) { @@ -5381,15 +5381,16 @@ mdb_cursor_push(MDB_cursor *mc, MDB_page *mp) /** Find the address of the page corresponding to a given page number. * Set #MDB_TXN_ERROR on failure. - * @param[in] txn the transaction for this access. + * @param[in] mc the cursor accessing the page. * @param[in] pgno the page number for the page to retrieve. * @param[out] ret address of a pointer where the page's address will be stored. * @param[out] lvl dirty_list inheritance level of found page. 1=current txn, 0=mapped page. * @return 0 on success, non-zero on failure. 
*/ static int -mdb_page_get(MDB_txn *txn, pgno_t pgno, MDB_page **ret, int *lvl) +mdb_page_get(MDB_cursor *mc, pgno_t pgno, MDB_page **ret, int *lvl) { + MDB_txn *txn = mc->mc_txn; MDB_env *env = txn->mt_env; MDB_page *p = NULL; int level; @@ -5483,7 +5484,7 @@ mdb_page_search_root(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, i < NUMKEYS(mp)); node = NODEPTR(mp, i); - if (unlikely((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)) + if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) return rc; mc->mc_ki[mc->mc_top] = i; @@ -5525,7 +5526,7 @@ mdb_page_search_lowest(MDB_cursor *mc) MDB_node *node = NODEPTR(mp, 0); int rc; - if (unlikely((rc = mdb_page_get(mc->mc_txn, NODEPGNO(node), &mp, NULL)) != 0)) + if (unlikely((rc = mdb_page_get(mc, NODEPGNO(node), &mp, NULL)) != 0)) return rc; mc->mc_ki[mc->mc_top] = 0; @@ -5577,7 +5578,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) return MDB_NOTFOUND; if (unlikely((leaf->mn_flags & (F_DUPDATA|F_SUBDATA)) != F_SUBDATA)) return MDB_INCOMPATIBLE; /* not a named DB */ - rc = mdb_node_read(mc->mc_txn, leaf, &data); + rc = mdb_node_read(&mc2, leaf, &data); if (rc) return rc; memcpy(&flags, ((char *) data.mv_data + offsetof(MDB_db, md_flags)), @@ -5601,7 +5602,7 @@ mdb_page_search(MDB_cursor *mc, MDB_val *key, int flags) mdb_cassert(mc, root > 1); if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) - if (unlikely((rc = mdb_page_get(mc->mc_txn, root, &mc->mc_pg[0], NULL)) != 0)) + if (unlikely((rc = mdb_page_get(mc, root, &mc->mc_pg[0], NULL)) != 0)) return rc; mc->mc_snum = 1; @@ -5698,13 +5699,13 @@ release: } /** Return the data associated with a given node. - * @param[in] txn The transaction for this operation. + * @param[in] mc The cursor for this operation. * @param[in] leaf The node being read. * @param[out] data Updated to point to the node's data. * @return 0 on success, non-zero on failure. 
*/ static MDBX_INLINE int -mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) +mdb_node_read(MDB_cursor *mc, MDB_node *leaf, MDB_val *data) { MDB_page *omp; /* overflow page */ pgno_t pgno; @@ -5720,7 +5721,7 @@ mdb_node_read(MDB_txn *txn, MDB_node *leaf, MDB_val *data) */ data->mv_size = NODEDSZ(leaf); memcpy(&pgno, NODEDATA(leaf), sizeof(pgno)); - if (unlikely((rc = mdb_page_get(txn, pgno, &omp, NULL)) != 0)) { + if (unlikely((rc = mdb_page_get(mc, pgno, &omp, NULL)) != 0)) { mdb_debug("read overflow page %zu failed", pgno); return rc; } @@ -5801,7 +5802,7 @@ mdb_cursor_sibling(MDB_cursor *mc, int move_right) mdb_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); indx = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdb_page_get(mc->mc_txn, NODEPGNO(indx), &mp, NULL)) != 0)) { + if (unlikely((rc = mdb_page_get(mc, NODEPGNO(indx), &mp, NULL)) != 0)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED|C_EOF); return rc; @@ -5884,7 +5885,7 @@ skip: mdb_xcursor_init1(mc, leaf); } if (data) { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -5967,7 +5968,7 @@ mdb_cursor_prev(MDB_cursor *mc, MDB_val *key, MDB_val *data, MDB_cursor_op op) mdb_xcursor_init1(mc, leaf); } if (data) { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { @@ -6156,7 +6157,7 @@ set1: } } else if (op == MDB_GET_BOTH || op == MDB_GET_BOTH_RANGE) { MDB_val olddata; - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, &olddata)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, &olddata)) != MDB_SUCCESS)) return rc; rc = mc->mc_dbx->md_dcmp(data, &olddata); if (rc) { @@ -6169,7 +6170,7 @@ set1: } else { if 
(mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED|C_EOF); - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; } } @@ -6218,7 +6219,7 @@ mdb_cursor_first(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; } } @@ -6263,7 +6264,7 @@ mdb_cursor_last(MDB_cursor *mc, MDB_val *key, MDB_val *data) if (unlikely(rc)) return rc; } else { - if (unlikely((rc = mdb_node_read(mc->mc_txn, leaf, data)) != MDB_SUCCESS)) + if (unlikely((rc = mdb_node_read(mc, leaf, data)) != MDB_SUCCESS)) return rc; } } @@ -6312,7 +6313,7 @@ mdb_cursor_get(MDB_cursor *mc, MDB_val *key, MDB_val *data, if (F_ISSET(leaf->mn_flags, F_DUPDATA)) { rc = mdb_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, MDB_GET_CURRENT); } else { - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc, leaf, data); } } } @@ -6429,7 +6430,7 @@ fetchm: MDB_node *leaf = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!F_ISSET(leaf->mn_flags, F_DUPDATA)) { MDB_GET_KEY(leaf, key); - rc = mdb_node_read(mc->mc_txn, leaf, data); + rc = mdb_node_read(mc, leaf, data); break; } } @@ -6830,7 +6831,7 @@ current: int level, ovpages, dpages = OVPAGES(data->mv_size, env->me_psize); memcpy(&pg, olddata.mv_data, sizeof(pg)); - if (unlikely((rc2 = mdb_page_get(mc->mc_txn, pg, &omp, &level)) != 0)) + if (unlikely((rc2 = mdb_page_get(mc, pg, &omp, &level)) != 0)) return rc2; ovpages = omp->mp_pages; @@ -7141,7 +7142,7 @@ mdb_cursor_del(MDB_cursor *mc, unsigned flags) pgno_t pg; memcpy(&pg, NODEDATA(leaf), sizeof(pg)); - if (unlikely((rc = mdb_page_get(mc->mc_txn, pg, &omp, NULL)) || + if (unlikely((rc = mdb_page_get(mc, pg, &omp, NULL)) || (rc = mdb_ovpage_free(mc, omp)))) goto fail; } @@ -8313,7 +8314,7 @@ 
mdb_rebalance(MDB_cursor *mc) if (unlikely(rc)) return rc; mc->mc_db->md_root = NODEPGNO(NODEPTR(mp, 0)); - rc = mdb_page_get(mc->mc_txn,mc->mc_db->md_root,&mc->mc_pg[0],NULL); + rc = mdb_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], NULL); if (unlikely(rc)) return rc; mc->mc_db->md_depth--; @@ -8374,7 +8375,7 @@ mdb_rebalance(MDB_cursor *mc) mdb_debug("reading right neighbor"); mn.mc_ki[ptop]++; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); if (unlikely(rc)) return rc; mn.mc_ki[mn.mc_top] = 0; @@ -8386,7 +8387,7 @@ mdb_rebalance(MDB_cursor *mc) mdb_debug("reading left neighbor"); mn.mc_ki[ptop]--; node = NODEPTR(mc->mc_pg[ptop], mn.mc_ki[ptop]); - rc = mdb_page_get(mc->mc_txn,NODEPGNO(node),&mn.mc_pg[mn.mc_top],NULL); + rc = mdb_page_get(mc, NODEPGNO(node), &mn.mc_pg[mn.mc_top], NULL); if (unlikely(rc)) return rc; mn.mc_ki[mn.mc_top] = NUMKEYS(mn.mc_pg[mn.mc_top]) - 1; @@ -9149,7 +9150,6 @@ static int __cold mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) { MDB_cursor mc; - MDB_txn *txn = my->mc_txn; MDB_node *ni; MDB_page *mo, *mp, *leaf; char *buf, *ptr; @@ -9162,9 +9162,9 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) memset(&mc, 0, sizeof(mc)); mc.mc_snum = 1; - mc.mc_txn = txn; + mc.mc_txn = my->mc_txn; - rc = mdb_page_get(txn, *pg, &mc.mc_pg[0], NULL); + rc = mdb_page_get(&mc, *pg, &mc.mc_pg[0], NULL); if (rc) return rc; rc = mdb_page_search_root(&mc, NULL, MDB_PS_FIRST); @@ -9209,7 +9209,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) memcpy(&pg, NODEDATA(ni), sizeof(pg)); memcpy(NODEDATA(ni), &my->mc_next_pgno, sizeof(pgno_t)); - rc = mdb_page_get(txn, pg, &omp, NULL); + rc = mdb_page_get(&mc, pg, &omp, NULL); if (rc) goto done; if (my->mc_wlen[toggle] >= MDB_WBUF) { @@ -9259,7 +9259,7 @@ mdb_env_cwalk(mdb_copy *my, pgno_t *pg, int flags) again: ni = NODEPTR(mp, mc.mc_ki[mc.mc_top]); pg = 
NODEPGNO(ni); - rc = mdb_page_get(txn, pg, &mp, NULL); + rc = mdb_page_get(&mc, pg, &mp, NULL); if (rc) goto done; mc.mc_top++; @@ -10006,7 +10006,7 @@ mdb_drop0(MDB_cursor *mc, int subs) MDB_page *omp; pgno_t pg; memcpy(&pg, NODEDATA(ni), sizeof(pg)); - rc = mdb_page_get(txn, pg, &omp, NULL); + rc = mdb_page_get(mc, pg, &omp, NULL); if (unlikely(rc)) goto done; mdb_cassert(mc, IS_OVERFLOW(omp)); diff --git a/mdbx.c b/mdbx.c index fb4aac85..9f07548e 100644 --- a/mdbx.c +++ b/mdbx.c @@ -182,7 +182,12 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee if (pg == P_INVALID) return MDB_SUCCESS; /* empty db */ - rc = mdb_page_get(ctx->mw_txn, pg, &mp, NULL); + MDB_cursor mc; + memset(&mc, 0, sizeof(mc)); + mc.mc_snum = 1; + mc.mc_txn = ctx->mw_txn; + + rc = mdb_page_get(&mc, pg, &mp, NULL); if (rc) return rc; if (pg != mp->mp_p.p_pgno) @@ -220,7 +225,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee } for (align_bytes = i = 0; i < nkeys; - align_bytes += ((payload_size + align_bytes) & 1), i++) { + align_bytes += ((payload_size + align_bytes) & 1), i++) { MDB_node *node; if (IS_LEAF2(mp)) { @@ -249,7 +254,7 @@ mdb_env_walk(mdb_walk_ctx_t *ctx, const char* dbi, pgno_t pg, int flags, int dee payload_size += sizeof(pgno_t); opg = NODEDATA(node); - rc = mdb_page_get(ctx->mw_txn, *opg, &omp, NULL); + rc = mdb_page_get(&mc, *opg, &omp, NULL); if (rc) return rc; if (*opg != omp->mp_p.p_pgno) From fca2f4d9295a136c5d9e151f03dd077d71f990c2 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 10 Dec 2016 21:42:39 +0100 Subject: [PATCH 08/10] mdbx: backport - clean up and comment C_UNTRACK. Don't use it as a "cursor is tracked" hint in mdb_pages_xkeep(). It's been harmless so far, but would break after mdb_cursor_copy(). Checking m0 directly short-circuits better anyway. 
Change-Id: Ibf180214db603e08ed11e298cff85866eb79f4bb --- mdb.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/mdb.c b/mdb.c index 21983552..6e778cb0 100644 --- a/mdb.c +++ b/mdb.c @@ -1723,13 +1723,9 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) unsigned i, j; int rc = MDB_SUCCESS, level; - /* Mark pages seen by cursors */ - if (mc->mc_flags & C_UNTRACK) - mc = NULL; /* will find mc in mt_cursors */ - for (i = txn->mt_numdbs;; mc = txn->mt_cursors[--i]) { - for (; mc; mc=mc->mc_next) { - if (!(mc->mc_flags & C_INITIALIZED)) - continue; + /* Mark pages seen by cursors: First m0, then tracked cursors */ + for (i = txn->mt_numdbs;; ) { + if (mc->mc_flags & C_INITIALIZED) { for (m3 = mc;; m3 = &mx->mx_cursor) { mp = NULL; for (j=0; j<m3->mc_snum; j++) { @@ -1748,10 +1744,13 @@ mdb_pages_xkeep(MDB_cursor *mc, unsigned pflags, int all) break; } } - if (i == 0) - break; + mc = mc->mc_next; + for (; !mc || mc == m0; mc = txn->mt_cursors[--i]) + if (i == 0) + goto mark_done; } +mark_done: if (all) { /* Mark dirty root pages */ for (i=0; i<txn->mt_numdbs; i++) { @@ -7727,7 +7726,10 @@ mdb_cursor_close(MDB_cursor *mc) if (mc) { mdb_ensure(NULL, mc->mc_signature == MDBX_MC_SIGNATURE); if (!mc->mc_backup) { - /* remove from txn, if tracked */ + /* Remove from txn, if tracked. + * A read-only txn (!C_UNTRACK) may have been freed already, + * so do not peek inside it. Only write txns track cursors. + */ if ((mc->mc_flags & C_UNTRACK) && mc->mc_txn->mt_cursors) { MDB_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; @@ -8578,7 +8580,6 @@ mdb_del0(MDB_txn *txn, MDB_dbi dbi, * run out of space, triggering a split. We need this * cursor to be consistent until the end of the rebalance.
*/ - mc.mc_flags |= C_UNTRACK; mc.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &mc; rc = mdb_cursor_del(&mc, flags); From 362714512939ec5239787a7eb58ac134dbf0a3e9 Mon Sep 17 00:00:00 2001 From: Hallvard Furuseth Date: Sat, 10 Dec 2016 22:00:31 +0100 Subject: [PATCH 09/10] mdbx: backport - catch mdb_cursor_sibling() error (ITS#7377). Change-Id: I440ff1f9f92156e19935195d656f4d77b088f605 --- mdb.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mdb.c b/mdb.c index 6e778cb0..ce69a040 100644 --- a/mdb.c +++ b/mdb.c @@ -8678,7 +8678,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno split_indx = newindx; nkeys = 0; } else { - split_indx = (nkeys+1) / 2; if (IS_LEAF2(rp)) { @@ -8838,7 +8837,7 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - mdb_cursor_sibling(mc, 0); + rc = mdb_cursor_sibling(mc, 0); } } } else { @@ -8846,8 +8845,11 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } - if (unlikely(rc != MDB_SUCCESS)) + if (unlikely(rc != MDB_SUCCESS)) { + if (rc == MDB_NOTFOUND) /* improper mdb_cursor_sibling() result */ + rc = MDB_PROBLEM; goto done; + } if (nflags & MDB_APPEND) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[mc->mc_top] = 0; From cbff64757953b6ccb1ab8853a6d41491d2e94fc5 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Thu, 15 Dec 2016 20:09:07 +0300 Subject: [PATCH 10/10] mdbx: fix ov-pages copying in cursor_put(). I think I just lost one line of code. 
This bug was added by 09d790431710f6456cb80bcfc5962da5851893ed --- mdb.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mdb.c b/mdb.c index ce69a040..81dea96e 100644 --- a/mdb.c +++ b/mdb.c @@ -6864,13 +6864,8 @@ current: * parent txn, in case the user peeks at MDB_RESERVEd * or unused parts. Some users treat ovpages specially. */ -#if MDBX_MODE_ENABLED - /* LY: New page will contain only header from origin, - * but no any payload */ - memcpy(np, omp, PAGEHDRSZ); -#else size_t sz = (size_t) env->me_psize * ovpages, off; - if (!(flags & MDB_RESERVE)) { + if (MDBX_MODE_ENABLED || !(flags & MDB_RESERVE)) { /* Skip the part where LMDB will put *data. * Copy end of page, adjusting alignment so * compiler may copy words instead of bytes. @@ -6881,7 +6876,6 @@ current: sz = PAGEHDRSZ; } memcpy(np, omp, sz); /* Copy whole or header of page */ -#endif /* MDBX_MODE_ENABLED */ omp = np; } SETDSZ(leaf, data->mv_size);