mirror of
				https://github.com/isar/libmdbx.git
				synced 2025-10-31 03:29:01 +08:00 
			
		
		
		
	mdbx: refine/optimize mdbx_page_alloc().
Change-Id: Iebbb8a611a82a379cf23b683bb21c9b6626ea9a5
This commit is contained in:
		| @@ -3203,7 +3203,7 @@ bailout: | ||||
|  * | ||||
|  * If there are free pages available from older transactions, they | ||||
|  * are re-used first. Otherwise allocate a new page at mt_next_pgno. | ||||
|  * Do not modify the freedB, just merge GC records into mt_reclaimed_pglist | ||||
|  * Do not modify the GC, just merge GC records into mt_reclaimed_pglist | ||||
|  * and move mt_last_reclaimed to say which records were consumed.  Only this | ||||
|  * function can create mt_reclaimed_pglist and move | ||||
|  * mt_last_reclaimed/mt_next_pgno. | ||||
| @@ -3272,8 +3272,8 @@ skip_cache: | ||||
|  | ||||
|   mdbx_tassert( | ||||
|       txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, txn->mt_next_pgno)); | ||||
|   pgno_t pgno, *repg_list = txn->tw.reclaimed_pglist; | ||||
|   unsigned repg_pos = 0, repg_len = MDBX_PNL_SIZE(repg_list); | ||||
|   pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; | ||||
|   unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); | ||||
|   txnid_t oldest = 0, last = 0; | ||||
|   const unsigned wanna_range = num - 1; | ||||
|  | ||||
| @@ -3293,24 +3293,33 @@ skip_cache: | ||||
|        * Prefer pages with lower pgno. */ | ||||
|       mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, | ||||
|                                               txn->mt_next_pgno)); | ||||
|       if (likely(flags & MDBX_ALLOC_CACHE) && repg_len > wanna_range && | ||||
|       if (likely(flags & MDBX_ALLOC_CACHE) && re_len > wanna_range && | ||||
|           (!(flags & MDBX_COALESCE) || op == MDBX_FIRST)) { | ||||
|         mdbx_tassert(txn, MDBX_PNL_LAST(repg_list) < txn->mt_next_pgno && | ||||
|                               MDBX_PNL_FIRST(repg_list) < txn->mt_next_pgno); | ||||
|         mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && | ||||
|                               MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); | ||||
|         range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; | ||||
|         pgno = MDBX_PNL_LEAST(re_list); | ||||
|         if (likely(wanna_range == 0)) | ||||
|           goto done; | ||||
| #if MDBX_PNL_ASCENDING | ||||
|         for (repg_pos = 1; repg_pos <= repg_len - wanna_range; ++repg_pos) { | ||||
|           pgno = repg_list[repg_pos]; | ||||
|           if (likely(repg_list[repg_pos + wanna_range - 1] == | ||||
|                      pgno + wanna_range - 1)) | ||||
|         mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); | ||||
|         while (true) { | ||||
|           unsigned range_end = range_begin + wanna_range; | ||||
|           if (re_list[range_end] - pgno == wanna_range) | ||||
|             goto done; | ||||
|           if (range_end == re_len) | ||||
|             break; | ||||
|           pgno = re_list[++range_begin]; | ||||
|         } | ||||
| #else | ||||
|         repg_pos = repg_len; | ||||
|         do { | ||||
|           pgno = repg_list[repg_pos]; | ||||
|           if (likely(repg_list[repg_pos - wanna_range] == pgno + wanna_range)) | ||||
|         mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); | ||||
|         while (true) { | ||||
|           if (re_list[range_begin - wanna_range] - pgno == wanna_range) | ||||
|             goto done; | ||||
|         } while (--repg_pos > wanna_range); | ||||
|           if (range_begin == wanna_range) | ||||
|             break; | ||||
|           pgno = re_list[--range_begin]; | ||||
|         } | ||||
| #endif /* MDBX_PNL sort-order */ | ||||
|       } | ||||
|  | ||||
| @@ -3369,7 +3378,15 @@ skip_cache: | ||||
|         goto fail; | ||||
|       } | ||||
|  | ||||
|       last = *(txnid_t *)key.iov_base; | ||||
|       if (unlikely(key.iov_len != sizeof(txnid_t))) { | ||||
|         rc = MDBX_CORRUPTED; | ||||
|         goto fail; | ||||
|       } | ||||
|       memcpy(&last, key.iov_base, sizeof(txnid_t)); | ||||
|       if (unlikely(last < 1 || last >= SAFE64_INVALID_THRESHOLD)) { | ||||
|         rc = MDBX_CORRUPTED; | ||||
|         goto fail; | ||||
|       } | ||||
|       if (oldest <= last) { | ||||
|         oldest = mdbx_find_oldest(txn); | ||||
|         if (oldest <= last) { | ||||
| @@ -3407,20 +3424,20 @@ skip_cache: | ||||
|  | ||||
|       /* Append PNL from GC record to me_reclaimed_pglist */ | ||||
|       mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); | ||||
|       pgno_t *re_pnl = (pgno_t *)data.iov_base; | ||||
|       mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(re_pnl)); | ||||
|       if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(re_pnl) || | ||||
|                    !mdbx_pnl_check(re_pnl, txn->mt_next_pgno))) { | ||||
|       pgno_t *gc_pnl = (pgno_t *)data.iov_base; | ||||
|       mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); | ||||
|       if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || | ||||
|                    !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) { | ||||
|         rc = MDBX_CORRUPTED; | ||||
|         goto fail; | ||||
|       } | ||||
|       repg_pos = MDBX_PNL_SIZE(re_pnl); | ||||
|       rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, repg_pos); | ||||
|       const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); | ||||
|       rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); | ||||
|       if (unlikely(rc != MDBX_SUCCESS)) | ||||
|         goto fail; | ||||
|       repg_list = txn->tw.reclaimed_pglist; | ||||
|       re_list = txn->tw.reclaimed_pglist; | ||||
|  | ||||
|       /* Remember ID of FreeDB record */ | ||||
|       /* Remember ID of GC record */ | ||||
|       if (flags & MDBX_LIFORECLAIM) { | ||||
|         if ((rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, last)) != 0) | ||||
|           goto fail; | ||||
| @@ -3430,66 +3447,76 @@ skip_cache: | ||||
|       if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { | ||||
|         mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO | ||||
|                          " num %u, PNL", | ||||
|                          last, txn->mt_dbs[FREE_DBI].md_root, repg_pos); | ||||
|                          last, txn->mt_dbs[FREE_DBI].md_root, gc_len); | ||||
|         unsigned i; | ||||
|         for (i = repg_pos; i; i--) | ||||
|           mdbx_debug_extra_print(" %" PRIaPGNO, re_pnl[i]); | ||||
|         for (i = gc_len; i; i--) | ||||
|           mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]); | ||||
|         mdbx_debug_extra_print("\n"); | ||||
|       } | ||||
|  | ||||
|       /* Merge in descending sorted order */ | ||||
|       mdbx_pnl_xmerge(repg_list, re_pnl); | ||||
|       const unsigned prev_re_len = MDBX_PNL_SIZE(re_list); | ||||
|       mdbx_pnl_xmerge(re_list, gc_pnl); | ||||
|       /* re-check to avoid duplicates */ | ||||
|       if (unlikely(!mdbx_pnl_check(repg_list, txn->mt_next_pgno))) { | ||||
|       if (unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { | ||||
|         rc = MDBX_CORRUPTED; | ||||
|         goto fail; | ||||
|       } | ||||
|       repg_len = MDBX_PNL_SIZE(repg_list); | ||||
|  | ||||
|       re_len = MDBX_PNL_SIZE(re_list); | ||||
|       mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); | ||||
|       if (re_len && unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { | ||||
|         /* Refund suitable pages into "unallocated" space */ | ||||
|         mdbx_refund(txn); | ||||
|         re_list = txn->tw.reclaimed_pglist; | ||||
|         re_len = MDBX_PNL_SIZE(re_list); | ||||
|       } | ||||
|  | ||||
|       if (unlikely((flags & MDBX_ALLOC_CACHE) == 0)) { | ||||
|         /* Done for a kick-reclaim mode, actually no page needed */ | ||||
|         return MDBX_SUCCESS; | ||||
|       } | ||||
|  | ||||
|       mdbx_tassert(txn, | ||||
|                    repg_len == 0 || repg_list[repg_len] < txn->mt_next_pgno); | ||||
|       if (repg_len && | ||||
|           unlikely(MDBX_PNL_MOST(repg_list) == txn->mt_next_pgno - 1)) { | ||||
|         /* Refund suitable pages into "unallocated" space */ | ||||
|         mdbx_refund(txn); | ||||
|         repg_list = txn->tw.reclaimed_pglist; | ||||
|         repg_len = MDBX_PNL_SIZE(repg_list); | ||||
|       } | ||||
|  | ||||
|       /* Don't try to coalesce too much. */ | ||||
|       if (unlikely(repg_len > MDBX_DPL_TXNFULL / 4)) | ||||
|       if (unlikely(re_len > MDBX_DPL_TXNFULL / 4)) | ||||
|         break; | ||||
|       if (repg_len /* current size */ >= env->me_maxgc_ov1page || | ||||
|           repg_pos /* prev size */ >= env->me_maxgc_ov1page / 2) | ||||
|       if (re_len /* current size */ >= env->me_maxgc_ov1page || | ||||
|           (re_len > prev_re_len && re_len - prev_re_len /* delta from prev */ >= | ||||
|                                        env->me_maxgc_ov1page / 2)) | ||||
|         flags &= ~MDBX_COALESCE; | ||||
|     } | ||||
|  | ||||
|     if ((flags & (MDBX_COALESCE | MDBX_ALLOC_CACHE)) == | ||||
|             (MDBX_COALESCE | MDBX_ALLOC_CACHE) && | ||||
|         repg_len > wanna_range) { | ||||
|         re_len > wanna_range) { | ||||
|       range_begin = MDBX_PNL_ASCENDING ? 1 : re_len; | ||||
|       pgno = MDBX_PNL_LEAST(re_list); | ||||
|       if (likely(wanna_range == 0)) | ||||
|         goto done; | ||||
| #if MDBX_PNL_ASCENDING | ||||
|       for (repg_pos = 1; repg_pos <= repg_len - wanna_range; ++repg_pos) { | ||||
|         pgno = repg_list[repg_pos]; | ||||
|         if (likely(repg_list[repg_pos + wanna_range - 1] == | ||||
|                    pgno + wanna_range - 1)) | ||||
|       mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); | ||||
|       while (true) { | ||||
|         unsigned range_end = range_begin + wanna_range; | ||||
|         if (re_list[range_end] - pgno == wanna_range) | ||||
|           goto done; | ||||
|         if (range_end == re_len) | ||||
|           break; | ||||
|         pgno = re_list[++range_begin]; | ||||
|       } | ||||
| #else | ||||
|       repg_pos = repg_len; | ||||
|       do { | ||||
|         pgno = repg_list[repg_pos]; | ||||
|         if (likely(repg_list[repg_pos - wanna_range] == pgno + wanna_range)) | ||||
|       mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); | ||||
|       while (true) { | ||||
|         if (re_list[range_begin - wanna_range] - pgno == wanna_range) | ||||
|           goto done; | ||||
|       } while (--repg_pos > wanna_range); | ||||
|         if (range_begin == wanna_range) | ||||
|           break; | ||||
|         pgno = re_list[--range_begin]; | ||||
|       } | ||||
| #endif /* MDBX_PNL sort-order */ | ||||
|     } | ||||
|  | ||||
|     /* Use new pages from the map when nothing suitable in the GC */ | ||||
|     repg_pos = 0; | ||||
|     range_begin = 0; | ||||
|     pgno = txn->mt_next_pgno; | ||||
|     rc = MDBX_MAP_FULL; | ||||
|     const pgno_t next = pgno_add(pgno, num); | ||||
| @@ -3590,14 +3617,20 @@ done: | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   if (repg_pos) { | ||||
|   if (range_begin) { | ||||
|     mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); | ||||
|     mdbx_tassert(txn, pgno < txn->mt_next_pgno); | ||||
|     mdbx_tassert(txn, pgno == repg_list[repg_pos]); | ||||
|     mdbx_tassert(txn, pgno == re_list[range_begin]); | ||||
|     /* Cutoff allocated pages from me_reclaimed_pglist */ | ||||
|     MDBX_PNL_SIZE(repg_list) = repg_len -= num; | ||||
|     for (unsigned i = repg_pos - num; i < repg_len;) | ||||
|       repg_list[++i] = repg_list[++repg_pos]; | ||||
| #if MDBX_PNL_ASCENDING | ||||
|     for (unsigned i = range_begin + num; i <= re_len;) | ||||
|       re_list[range_begin++] = re_list[i++]; | ||||
|     MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1; | ||||
| #else | ||||
|     MDBX_PNL_SIZE(re_list) = re_len -= num; | ||||
|     for (unsigned i = range_begin - num; i < re_len;) | ||||
|       re_list[++i] = re_list[++range_begin]; | ||||
| #endif | ||||
|     mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, | ||||
|                                             txn->mt_next_pgno)); | ||||
|   } else { | ||||
| @@ -3606,7 +3639,7 @@ done: | ||||
|   } | ||||
|  | ||||
|   if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) | ||||
|     memset(np, 0x71 /* 'q', 113 */, pgno2bytes(env, num)); | ||||
|     memset(np, -1, pgno2bytes(env, num)); | ||||
|   VALGRIND_MAKE_MEM_UNDEFINED(np, pgno2bytes(env, num)); | ||||
|  | ||||
|   np->mp_pgno = pgno; | ||||
| @@ -5369,7 +5402,7 @@ retry: | ||||
|         rc = mdbx_page_alloc(&mc, 0, NULL, MDBX_ALLOC_GC | MDBX_ALLOC_KICK); | ||||
|         mc.mc_flags |= C_RECLAIMING; | ||||
|         if (likely(rc == MDBX_SUCCESS)) { | ||||
|           /* LY: ok, reclaimed from freedb. */ | ||||
|           /* LY: ok, reclaimed from GC. */ | ||||
|           mdbx_trace("%s: took @%" PRIaTXN " from GC, continue", | ||||
|                      dbg_prefix_mode, MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); | ||||
|           continue; | ||||
| @@ -5378,7 +5411,7 @@ retry: | ||||
|           /* LY: other troubles... */ | ||||
|           goto bailout; | ||||
|  | ||||
|         /* LY: freedb is empty, will look any free txn-id in high2low order. */ | ||||
|         /* LY: GC is empty, will look for any free txn-id in high2low order. */ | ||||
|         do { | ||||
|           --head_gc_id; | ||||
|           mdbx_assert(env, | ||||
| @@ -5479,7 +5512,7 @@ retry: | ||||
|     mdbx_tassert(txn, reservation_gc_id < *env->me_oldest); | ||||
|     if (unlikely(reservation_gc_id < 1 || | ||||
|                  reservation_gc_id >= *env->me_oldest)) { | ||||
|       /* LY: not any txn in the past of freedb. */ | ||||
|       /* LY: there is no txn in the past of GC. */ | ||||
|       rc = MDBX_PROBLEM; | ||||
|       goto bailout; | ||||
|     } | ||||
| @@ -6414,20 +6447,20 @@ static int __cold mdbx_read_header(MDBX_env *env, MDBX_meta *meta, | ||||
|       continue; | ||||
|     } | ||||
|  | ||||
|     /* LY: FreeDB root */ | ||||
|     /* LY: GC root */ | ||||
|     if (page.mp_meta.mm_dbs[FREE_DBI].md_root == P_INVALID) { | ||||
|       if (page.mp_meta.mm_dbs[FREE_DBI].md_branch_pages || | ||||
|           page.mp_meta.mm_dbs[FREE_DBI].md_depth || | ||||
|           page.mp_meta.mm_dbs[FREE_DBI].md_entries || | ||||
|           page.mp_meta.mm_dbs[FREE_DBI].md_leaf_pages || | ||||
|           page.mp_meta.mm_dbs[FREE_DBI].md_overflow_pages) { | ||||
|         mdbx_notice("meta[%u] has false-empty freedb, skip it", meta_number); | ||||
|         mdbx_notice("meta[%u] has false-empty GC, skip it", meta_number); | ||||
|         rc = MDBX_CORRUPTED; | ||||
|         continue; | ||||
|       } | ||||
|     } else if (page.mp_meta.mm_dbs[FREE_DBI].md_root >= | ||||
|                page.mp_meta.mm_geo.next) { | ||||
|       mdbx_notice("meta[%u] has invalid freedb-root %" PRIaPGNO ", skip it", | ||||
|       mdbx_notice("meta[%u] has invalid GC-root %" PRIaPGNO ", skip it", | ||||
|                   meta_number, page.mp_meta.mm_dbs[FREE_DBI].md_root); | ||||
|       rc = MDBX_CORRUPTED; | ||||
|       continue; | ||||
| @@ -8744,8 +8777,8 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, MDBX_val *key, | ||||
|  | ||||
|     mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, | ||||
|                NUMKEYS(mp)); | ||||
|     /* Don't assert on branch pages in the FreeDB. We can get here | ||||
|      * while in the process of rebalancing a FreeDB branch page; we must | ||||
|     /* Don't assert on branch pages in the GC. We can get here | ||||
|      * while in the process of rebalancing a GC branch page; we must | ||||
|      * let that proceed. ITS#8336 */ | ||||
|     mdbx_cassert(mc, !mc->mc_dbi || NUMKEYS(mp) > 1); | ||||
|     mdbx_debug("found index 0 to page %" PRIaPGNO, NODEPGNO(NODEPTR(mp, 0))); | ||||
| @@ -11738,7 +11771,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   /* If not operating on FreeDB, allow this page to be reused | ||||
|   /* If not operating on GC, allow this page to be reused | ||||
|    * in this txn. Otherwise just add to free list. */ | ||||
|   rc = mdbx_page_retire(csrc, psrc); | ||||
|   if (unlikely(rc)) | ||||
|   | ||||
| @@ -438,7 +438,7 @@ typedef struct MDBX_meta { | ||||
|  * P_META pages contain MDBX_meta, the start point of an MDBX snapshot. | ||||
|  * | ||||
|  * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once | ||||
|  * in the snapshot: Either used by a database or listed in a freeDB record. */ | ||||
|  * in the snapshot: Either used by a database or listed in a GC record. */ | ||||
| typedef struct MDBX_page { | ||||
|   union { | ||||
|     struct MDBX_page *mp_next; /* for in-memory list of freed pages */ | ||||
| @@ -829,7 +829,7 @@ struct MDBX_txn { | ||||
|       MDBX_reader *reader; | ||||
|     } to; | ||||
|     struct { | ||||
|       pgno_t *reclaimed_pglist; /* Reclaimed freeDB pages */ | ||||
|       pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ | ||||
|       txnid_t last_reclaimed;   /* ID of last used record */ | ||||
|       pgno_t loose_refund_wl /* FIXME: describe */; | ||||
|       /* dirtylist room: Dirty array size - dirty pages visible to this txn. | ||||
| @@ -904,7 +904,7 @@ struct MDBX_cursor { | ||||
| #define C_SUB 0x04                /* Cursor is a sub-cursor */ | ||||
| #define C_DEL 0x08                /* last op was a cursor_del */ | ||||
| #define C_UNTRACK 0x10            /* Un-track cursor when closing */ | ||||
| #define C_RECLAIMING 0x20         /* FreeDB lookup is prohibited */ | ||||
| #define C_RECLAIMING 0x20         /* GC lookup is prohibited */ | ||||
| #define C_GCFREEZE 0x40           /* reclaimed_pglist must not be updated */ | ||||
|   unsigned mc_flags;              /* see mdbx_cursor */ | ||||
|   MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ | ||||
|   | ||||
		Reference in New Issue
	
	Block a user