From aeea7ebb083d6856bfb59657736eed5bd75d2a41 Mon Sep 17 00:00:00 2001 From: Leo Yuriev Date: Mon, 23 Nov 2015 09:29:01 +0300 Subject: [PATCH] mdbx: backport - ITS#8321 a lot for cursor tracking. Multiple bugs were fixed in the cursor fixups which adjust other open cursors in response to various write ops. Includes: - ITS#8321 Fix del/dupsort. When deleting a dupsort key, if other cursors pointed at that key, set them to uninit'd, not EOF. They no longer have anything to point at. - ITS#8321 don't skip fixups on splitting cursors. Adjustments can't be skipped, in recursive calls each level must fixup their own level. - ITS#8321 fix mdb_cursor_chk(). It was reporting spurious errors due to uninit'd cursors - ITS#8321 fix mdb_cursor_shadow(). Set a valid txn so that cursor fixup code works on the shadows - ITS#8321 fix mdb_cursor_put. Ignore sub-cursors that shouldn't be fixed up - ITS#8321 track temporary cursors. In rebalance/split operations, temporary cursors need to be visible to propagate fixups - ITS#8321 simplify page_split fixups. - ITS#8321 reorganize page_split fixups. DUPFIXED fixups needed to occur after separator update. MDB_RESERVE handling moved after split fixup. Change-Id: I0c04acf54ebf6e84f32996b5723ec6fafb983ad9 --- CHANGES | 1 + mdb.c | 123 +++++++++++++++++++++++++++++++++++--------------------- 2 files changed, 79 insertions(+), 45 deletions(-) diff --git a/CHANGES b/CHANGES index e9f98c0b..3f9084f0 100644 --- a/CHANGES +++ b/CHANGES @@ -21,6 +21,7 @@ LMDB 0.9.17 Release Engineering Fix ITS#8313 mdb_rebalance dummy cursor Fix ITS#8315 dirty_room in nested txn Fix ITS#8316 page_merge cursor tracking + Fix ITS#8321 cursor tracking Added mdb_txn_id() (ITS#7994) Added robust mutex support Miscellaneous cleanup/simplification diff --git a/mdb.c b/mdb.c index 21174ac3..0bc9a48a 100644 --- a/mdb.c +++ b/mdb.c @@ -889,7 +889,6 @@ struct MDB_cursor { #define C_EOF 0x02 /**< No more data */ #define C_SUB 0x04 /**< Cursor is a sub-cursor */ #define C_DEL 0x08 /**< last op was a cursor_del */ -#define C_SPLITTING 0x20 /**< Cursor is in page_split */ #define C_UNTRACK 0x40 /**< Un-track cursor when closing */ #define C_RECLAIMING 0x80 /**< FreeDB lookup is prohibited */ /** @} */ @@ -1401,7 +1400,7 @@ mdb_cursor_chk(MDB_cursor *mc) MDB_node *node; MDB_page *mp; - if (!mc->mc_snum && !(mc->mc_flags & C_INITIALIZED)) return; + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED)) return; for (i=0; imc_top; i++) { mp = mc->mc_pg[i]; node = NODEPTR(mp, mc->mc_ki[i]); @@ -2604,14 +2603,15 @@ mdb_cursor_shadow(MDB_txn *src, MDB_txn *dst) *bk = *mc; mc->mc_backup = bk; mc->mc_db = &dst->mt_dbs[i]; - /* Kill pointers into src - and dst to reduce abuse: The - * user may not use mc until dst ends. Otherwise we'd... + /* Kill pointers into src to reduce abuse: The + * user may not use mc until dst ends. But we need a valid + * txn pointer here for cursor fixups to keep working. */ - mc->mc_txn = NULL; /* ...set this to dst */ - mc->mc_dbflag = NULL; /* ...and &dst->mt_dbflags[i] */ + mc->mc_txn = dst; + mc->mc_dbflag = &dst->mt_dbflags[i]; if ((mx = mc->mc_xcursor) != NULL) { *(MDB_xcursor *)(bk+1) = *mx; - mx->mx_cursor.mc_txn = NULL; /* ...and dst. */ + mx->mx_cursor.mc_txn = dst; } mc->mc_next = dst->mt_cursors[i]; dst->mt_cursors[i] = mc; @@ -6835,6 +6835,7 @@ put_sub: MDB_xcursor *mx = mc->mc_xcursor; unsigned i = mc->mc_top; MDB_page *mp = mc->mc_pg[i]; + int nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2=m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; @@ -6842,9 +6843,9 @@ put_sub: if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { mdb_xcursor_init2(m2, mx, new_dupdata); - } else if (!insert_key) { + } else if (!insert_key && m2->mc_ki[i] < nkeys) { MDB_node *n2 = NODEPTR(mp, m2->mc_ki[i]); - if (!(n2->mn_flags & F_SUBDATA)) + if ((n2->mn_flags & (F_SUBDATA|F_DUPDATA)) == F_DUPDATA) m2->mc_xcursor->mx_cursor.mc_pg[0] = NODEDATA(n2); } } @@ -7678,6 +7679,22 @@ mdb_update_key(MDB_cursor *mc, MDB_val *key) static void mdb_cursor_copy(const MDB_cursor *csrc, MDB_cursor *cdst); +/** Track a temporary cursor */ +#define CURSOR_TMP_TRACK(mc, mn, dummy, tracked) \ + if (mc->mc_flags & C_SUB) { \ + dummy.mc_flags = C_INITIALIZED; \ + dummy.mc_xcursor = (MDB_xcursor *)&mn; \ + tracked = &dummy; \ + } else { \ + tracked = &mn; \ + } \ + tracked->mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; \ + mc->mc_txn->mt_cursors[mc->mc_dbi] = tracked + +/** Stop tracking a temporary cursor */ +#define CURSOR_TMP_UNTRACK(mc, tracked) \ + mc->mc_txn->mt_cursors[mc->mc_dbi] = tracked->mc_next + /** Move a node from csrc to cdst. */ static int @@ -7833,6 +7850,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) */ if (csrc->mc_ki[csrc->mc_top] == 0) { if (csrc->mc_ki[csrc->mc_top-1] != 0) { + MDB_cursor dummy, *tracked; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_data = LEAF2KEY(csrc->mc_pg[csrc->mc_top], 0, key.mv_size); } else { @@ -7845,7 +7863,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) mdb_cursor_copy(csrc, &mn); mn.mc_snum--; mn.mc_top--; - if (unlikely((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)) + /* We want mdb_rebalance to find mn when doing fixups */ + CURSOR_TMP_TRACK(csrc, mn, dummy, tracked); + rc = mdb_update_key(&mn, &key); + CURSOR_TMP_UNTRACK(csrc, tracked); + if (unlikely(rc != MDB_SUCCESS)) return rc; } if (IS_BRANCH(csrc->mc_pg[csrc->mc_top])) { @@ -7861,6 +7883,7 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) if (cdst->mc_ki[cdst->mc_top] == 0) { if (cdst->mc_ki[cdst->mc_top-1] != 0) { + MDB_cursor dummy, *tracked; if (IS_LEAF2(csrc->mc_pg[csrc->mc_top])) { key.mv_data = LEAF2KEY(cdst->mc_pg[cdst->mc_top], 0, key.mv_size); } else { @@ -7873,7 +7896,11 @@ mdb_node_move(MDB_cursor *csrc, MDB_cursor *cdst, int fromleft) mdb_cursor_copy(cdst, &mn); mn.mc_snum--; mn.mc_top--; - if (unlikely((rc = mdb_update_key(&mn, &key)) != MDB_SUCCESS)) + /* We want mdb_rebalance to find mn when doing fixups */ + CURSOR_TMP_TRACK(cdst, mn, dummy, tracked); + rc = mdb_update_key(&mn, &key); + CURSOR_TMP_UNTRACK(cdst, tracked); + if (unlikely(rc != MDB_SUCCESS)) return rc; } if (IS_BRANCH(cdst->mc_pg[cdst->mc_top])) { @@ -8231,24 +8258,13 @@ mdb_rebalance(MDB_cursor *mc) if (!fromleft) { rc = mdb_page_merge(&mn, mc); } else { - MDB_cursor dummy; + MDB_cursor dummy, *tracked; oldki += NUMKEYS(mn.mc_pg[mn.mc_top]); mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; /* We want mdb_rebalance to find mn when doing fixups */ - if (mc->mc_flags & C_SUB) { - dummy.mc_flags = C_INITIALIZED; - dummy.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; - mc->mc_txn->mt_cursors[mc->mc_dbi] = &dummy; - dummy.mc_xcursor = (MDB_xcursor *)&mn; - } else { - mn.mc_next = mc->mc_txn->mt_cursors[mc->mc_dbi]; - mc->mc_txn->mt_cursors[mc->mc_dbi] = &mn; - } + CURSOR_TMP_TRACK(mc, mn, dummy, tracked); rc = mdb_page_merge(mc, &mn); - if (mc->mc_flags & C_SUB) - mc->mc_txn->mt_cursors[mc->mc_dbi] = dummy.mc_next; - else - mc->mc_txn->mt_cursors[mc->mc_dbi] = mn.mc_next; + CURSOR_TMP_UNTRACK(mc, tracked); mdb_cursor_copy(&mn, mc); } mc->mc_flags &= ~C_EOF; @@ -8286,7 +8302,7 @@ mdb_cursor_del0(MDB_cursor *mc) if (m3->mc_ki[mc->mc_top] > ki) m3->mc_ki[mc->mc_top]--; else if (mc->mc_db->md_flags & MDB_DUPSORT) - m3->mc_xcursor->mx_cursor.mc_flags |= C_EOF; + m3->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } } } @@ -8475,7 +8491,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno mdb_debug("parent branch page is %zu", mc->mc_pg[ptop]->mp_pgno); } - mc->mc_flags |= C_SPLITTING; mdb_cursor_copy(mc, &mn); mn.mc_pg[mn.mc_top] = rp; mn.mc_ki[ptop] = mc->mc_ki[ptop]+1; @@ -8526,8 +8541,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rp->mp_lower += sizeof(indx_t); rp->mp_upper -= ksize - sizeof(indx_t); mc->mc_ki[mc->mc_top] = x; - mc->mc_pg[mc->mc_top] = rp; - mc->mc_ki[ptop]++; } } else { int psize, nsize, k; @@ -8621,11 +8634,15 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno */ if (SIZELEFT(mn.mc_pg[ptop]) < mdb_branch_size(env, &sepkey)) { int snum = mc->mc_snum; + MDB_cursor dummy, *tracked; mn.mc_snum--; mn.mc_top--; did_split = 1; + /* We want other splits to find mn when doing fixups */ + CURSOR_TMP_TRACK(mc, mn, dummy, tracked); rc = mdb_page_split(&mn, &sepkey, NULL, rp->mp_pgno, 0); - if (unlikely(rc)) + CURSOR_TMP_UNTRACK(mc, tracked); + if (unlikely(rc != MDB_SUCCESS)) goto done; /* root split? */ @@ -8655,10 +8672,8 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno rc = mdb_node_add(&mn, mn.mc_ki[ptop], &sepkey, NULL, rp->mp_pgno, 0); mn.mc_top++; } - mc->mc_flags ^= C_SPLITTING; - if (unlikely(rc != MDB_SUCCESS)) { + if (unlikely(rc != MDB_SUCCESS)) goto done; - } if (nflags & MDB_APPEND) { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[mc->mc_top] = 0; @@ -8725,11 +8740,6 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno /* reset back to original page */ if (newindx < split_indx) { mc->mc_pg[mc->mc_top] = mp; - if (nflags & MDB_RESERVE) { - node = NODEPTR(mp, mc->mc_ki[mc->mc_top]); - if (!(node->mn_flags & F_BIGDATA)) - newdata->mv_data = NODEDATA(node); - } } else { mc->mc_pg[mc->mc_top] = rp; mc->mc_ki[ptop]++; @@ -8743,13 +8753,32 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno } } } + if (nflags & MDB_RESERVE) { + node = NODEPTR(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); + if (!(node->mn_flags & F_BIGDATA)) + newdata->mv_data = NODEDATA(node); + } + } else { + if (newindx >= split_indx) { + mc->mc_pg[mc->mc_top] = rp; + mc->mc_ki[ptop]++; + /* Make sure mc_ki is still valid. + */ + if (mn.mc_pg[ptop] != mc->mc_pg[ptop] && + mc->mc_ki[ptop] >= NUMKEYS(mc->mc_pg[ptop])) { + for (i=0; i<=ptop; i++) { + mc->mc_pg[i] = mn.mc_pg[i]; + mc->mc_ki[i] = mn.mc_ki[i]; + } + } + } } { /* Adjust other cursors pointing to mp */ MDB_cursor *m2, *m3; MDB_dbi dbi = mc->mc_dbi; - int fixup = NUMKEYS(mp); + nkeys = NUMKEYS(mp); for (m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2=m2->mc_next) { if (mc->mc_flags & C_SUB) @@ -8762,12 +8791,15 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno continue; if (new_root) { int k; + /* sub cursors may be on different DB */ + if (m3->mc_pg[0] != mp) + continue; /* root split */ for (k=new_root; k>=0; k--) { m3->mc_ki[k+1] = m3->mc_ki[k]; m3->mc_pg[k+1] = m3->mc_pg[k]; } - if (m3->mc_ki[0] >= split_indx) { + if (m3->mc_ki[0] > nkeys) { m3->mc_ki[0] = 1; } else { m3->mc_ki[0] = 0; @@ -8776,15 +8808,16 @@ mdb_page_split(MDB_cursor *mc, MDB_val *newkey, MDB_val *newdata, pgno_t newpgno m3->mc_snum++; m3->mc_top++; } - if (m3->mc_flags & C_SPLITTING) - continue; if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp) { if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDB_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; - if (m3->mc_ki[mc->mc_top] >= fixup) { + if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = rp; - m3->mc_ki[mc->mc_top] -= fixup; - m3->mc_ki[ptop] = mn.mc_ki[ptop]; + m3->mc_ki[mc->mc_top] -= nkeys; + for (i=0; imc_top; i++) { + m3->mc_ki[i] = mn.mc_ki[i]; + m3->mc_pg[i] = mn.mc_pg[i]; + } } } else if (!did_split && m3->mc_top >= ptop && m3->mc_pg[ptop] == mc->mc_pg[ptop] && m3->mc_ki[ptop] >= mc->mc_ki[ptop]) {