mirror of
				https://github.com/isar/libmdbx.git
				synced 2025-10-25 13:18:56 +08:00 
			
		
		
		
	mdbx: использование msync(MS_ASYNC) для спиллинга в режиме MDBX_WRITEMAP вне зависимости от MDBX_AVOID_MSYNC и MDBX_MMAP_USE_MS_ASYNC.
				
					
				
			This commit is contained in:
		
							
								
								
									
										196
									
								
								src/core.c
									
									
									
									
									
								
							
							
						
						
									
										196
									
								
								src/core.c
									
									
									
									
									
								
							| @@ -4179,7 +4179,6 @@ __cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, | |||||||
| static __inline void page_wash(MDBX_txn *txn, const size_t di, | static __inline void page_wash(MDBX_txn *txn, const size_t di, | ||||||
|                                MDBX_page *const mp, const size_t npages) { |                                MDBX_page *const mp, const size_t npages) { | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); |   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); | ||||||
|   tASSERT(txn, (di > 0) == (txn->tw.dirtylist != nullptr)); |  | ||||||
|   mp->mp_txnid = INVALID_TXNID; |   mp->mp_txnid = INVALID_TXNID; | ||||||
|   mp->mp_flags = P_BAD; |   mp->mp_flags = P_BAD; | ||||||
|  |  | ||||||
| @@ -4194,10 +4193,13 @@ static __inline void page_wash(MDBX_txn *txn, const size_t di, | |||||||
|                      (txn->mt_parent ? txn->mt_parent->tw.dirtyroom |                      (txn->mt_parent ? txn->mt_parent->tw.dirtyroom | ||||||
|                                      : txn->mt_env->me_options.dp_limit)); |                                      : txn->mt_env->me_options.dp_limit)); | ||||||
|   } else { |   } else { | ||||||
|     tASSERT(txn, txn->tw.dirtylist == nullptr); |     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); | ||||||
|     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); |     if (txn->tw.dirtylist == nullptr) { | ||||||
|     tASSERT(txn, txn->tw.writemap_dirty_npages >= npages); |       tASSERT(txn, !MDBX_AVOID_MSYNC); | ||||||
|     txn->tw.writemap_dirty_npages -= npages; |       txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) | ||||||
|  |                                            ? npages | ||||||
|  |                                            : txn->tw.writemap_dirty_npages; | ||||||
|  |     } | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); |   VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); | ||||||
| @@ -4686,14 +4688,13 @@ __must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, | |||||||
|  |  | ||||||
| static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, | static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, | ||||||
|                       const size_t npages) { |                       const size_t npages) { | ||||||
|   tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP) || MDBX_AVOID_MSYNC); |   tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); | ||||||
| #if MDBX_ENABLE_PGOP_STAT | #if MDBX_ENABLE_PGOP_STAT | ||||||
|   txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; |   txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; | ||||||
| #endif /* MDBX_ENABLE_PGOP_STAT */ | #endif /* MDBX_ENABLE_PGOP_STAT */ | ||||||
|   const pgno_t pgno = dp->mp_pgno; |   const pgno_t pgno = dp->mp_pgno; | ||||||
|   int err = iov_page(txn, ctx, dp, npages); |   int err = iov_page(txn, ctx, dp, npages); | ||||||
|   if (likely(err == MDBX_SUCCESS) && |   if (likely(err == MDBX_SUCCESS)) | ||||||
|       (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP))) |  | ||||||
|     err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); |     err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); | ||||||
|   return err; |   return err; | ||||||
| } | } | ||||||
| @@ -4702,7 +4703,7 @@ static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, | |||||||
|  * Returns the number of pages marked as unspillable. */ |  * Returns the number of pages marked as unspillable. */ | ||||||
| static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { | static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); |   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); |   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); | ||||||
|   size_t keep = 0; |   size_t keep = 0; | ||||||
|   while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { |   while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { | ||||||
|     tASSERT(txn, mc->mc_top == mc->mc_snum - 1); |     tASSERT(txn, mc->mc_top == mc->mc_snum - 1); | ||||||
| @@ -4736,7 +4737,8 @@ static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { | |||||||
| } | } | ||||||
|  |  | ||||||
| static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { | static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); |   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); | ||||||
|  |   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); | ||||||
|   txn_lru_turn(txn); |   txn_lru_turn(txn); | ||||||
|   size_t keep = m0 ? cursor_keep(txn, m0) : 0; |   size_t keep = m0 ? cursor_keep(txn, m0) : 0; | ||||||
|   for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) |   for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) | ||||||
| @@ -4839,13 +4841,15 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
| static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, | static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, | ||||||
|                               const size_t need) { |                               const size_t need) { | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); |   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); |  | ||||||
|   tASSERT(txn, !m0 || cursor_is_tracked(m0)); |   tASSERT(txn, !m0 || cursor_is_tracked(m0)); | ||||||
|  |  | ||||||
|   intptr_t wanna_spill_entries = need - txn->tw.dirtyroom - txn->tw.loose_count; |   const intptr_t wanna_spill_entries = | ||||||
|   intptr_t wanna_spill_npages = |       txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0; | ||||||
|       need + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count - |   const intptr_t wanna_spill_npages = | ||||||
|       txn->mt_env->me_options.dp_limit; |       need + | ||||||
|  |       (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose | ||||||
|  |                          : txn->tw.writemap_dirty_npages) - | ||||||
|  |       txn->tw.loose_count - txn->mt_env->me_options.dp_limit; | ||||||
|  |  | ||||||
|   /* production mode */ |   /* production mode */ | ||||||
|   if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) |   if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) | ||||||
| @@ -4882,15 +4886,19 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
|                                      const intptr_t wanna_spill_npages, |                                      const intptr_t wanna_spill_npages, | ||||||
|                                      const size_t need) { |                                      const size_t need) { | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); |   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); |  | ||||||
|  |  | ||||||
|   int rc = MDBX_SUCCESS; |   int rc = MDBX_SUCCESS; | ||||||
|   if (unlikely(txn->tw.dirtylist->length <= txn->tw.loose_count)) |   if (unlikely(txn->tw.loose_count >= | ||||||
|  |                (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose | ||||||
|  |                                   : txn->tw.writemap_dirty_npages))) | ||||||
|     goto done; |     goto done; | ||||||
|  |  | ||||||
|   const size_t dirty_entries = txn->tw.dirtylist->length - txn->tw.loose_count; |   const size_t dirty_entries = | ||||||
|  |       txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1; | ||||||
|   const size_t dirty_npages = |   const size_t dirty_npages = | ||||||
|       txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; |       (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose | ||||||
|  |                          : txn->tw.writemap_dirty_npages) - | ||||||
|  |       txn->tw.loose_count; | ||||||
|   const size_t need_spill_entries = |   const size_t need_spill_entries = | ||||||
|       spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); |       spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); | ||||||
|   const size_t need_spill_npages = |   const size_t need_spill_npages = | ||||||
| @@ -4902,17 +4910,18 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
|   if (!need_spill) |   if (!need_spill) | ||||||
|     goto done; |     goto done; | ||||||
|  |  | ||||||
| #if !MDBX_AVOID_MSYNC |  | ||||||
|   if (txn->mt_flags & MDBX_WRITEMAP) { |   if (txn->mt_flags & MDBX_WRITEMAP) { | ||||||
|     NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", |     NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", | ||||||
|            dirty_entries, dirty_npages); |            dirty_entries, dirty_npages); | ||||||
|     tASSERT(txn, txn->tw.spilled.list == nullptr); |  | ||||||
|     const MDBX_env *env = txn->mt_env; |     const MDBX_env *env = txn->mt_env; | ||||||
|  |     tASSERT(txn, txn->tw.spilled.list == nullptr); | ||||||
|     rc = |     rc = | ||||||
|         osal_msync(&txn->mt_env->me_dxb_mmap, 0, |         osal_msync(&txn->mt_env->me_dxb_mmap, 0, | ||||||
|                    pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); |                    pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); | ||||||
|     if (unlikely(rc != MDBX_SUCCESS)) |     if (unlikely(rc != MDBX_SUCCESS)) | ||||||
|       goto bailout; |       goto bailout; | ||||||
|  | #if MDBX_AVOID_MSYNC | ||||||
|  |     tASSERT(txn, dirtylist_check(txn)); | ||||||
|     env->me_lck->mti_unsynced_pages.weak += |     env->me_lck->mti_unsynced_pages.weak += | ||||||
|         txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; |         txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; | ||||||
|     dpl_clear(txn->tw.dirtylist); |     dpl_clear(txn->tw.dirtylist); | ||||||
| @@ -4921,17 +4930,24 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
|       rc = dpl_append(txn, lp->mp_pgno, lp, 1); |       rc = dpl_append(txn, lp->mp_pgno, lp, 1); | ||||||
|       if (unlikely(rc != MDBX_SUCCESS)) |       if (unlikely(rc != MDBX_SUCCESS)) | ||||||
|         goto bailout; |         goto bailout; | ||||||
|  |       MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); | ||||||
|  |       VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); | ||||||
|     } |     } | ||||||
|  |     tASSERT(txn, dirtylist_check(txn)); | ||||||
|  | #else | ||||||
|  |     tASSERT(txn, txn->tw.dirtylist == nullptr); | ||||||
|  |     env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; | ||||||
|  |     txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages; | ||||||
|  |     txn->tw.writemap_dirty_npages = 0; | ||||||
|  | #endif /* MDBX_AVOID_MSYNC */ | ||||||
|     goto done; |     goto done; | ||||||
|   } |   } | ||||||
| #endif /* MDBX_AVOID_MSYNC */ |  | ||||||
|  |  | ||||||
|   NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", |   NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", | ||||||
|          need_spill_entries, need_spill_npages); |          need_spill_entries, need_spill_npages); | ||||||
|   tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); |   tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); | ||||||
|   tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= |   tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= | ||||||
|                    need_spill_npages); |                    need_spill_npages); | ||||||
|   if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { |  | ||||||
|   if (!txn->tw.spilled.list) { |   if (!txn->tw.spilled.list) { | ||||||
|     txn->tw.spilled.least_removed = INT_MAX; |     txn->tw.spilled.least_removed = INT_MAX; | ||||||
|     txn->tw.spilled.list = pnl_alloc(need_spill); |     txn->tw.spilled.list = pnl_alloc(need_spill); | ||||||
| @@ -4949,7 +4965,6 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
|      and pnl_append() will increase pnl on demand */ |      and pnl_append() will increase pnl on demand */ | ||||||
|         ; |         ; | ||||||
|   } |   } | ||||||
|   } |  | ||||||
|  |  | ||||||
|   /* Сортируем чтобы запись на диск была полее последовательна */ |   /* Сортируем чтобы запись на диск была полее последовательна */ | ||||||
|   MDBX_dpl *const dl = dpl_sort(txn); |   MDBX_dpl *const dl = dpl_sort(txn); | ||||||
| @@ -5063,67 +5078,46 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
|     if (unlikely(rc != MDBX_SUCCESS)) |     if (unlikely(rc != MDBX_SUCCESS)) | ||||||
|       goto bailout; |       goto bailout; | ||||||
|  |  | ||||||
|     unsigned prev_prio = 256, prio; |     size_t r = 0, w = 0; | ||||||
|     size_t r, w; |     pgno_t last = 0; | ||||||
|     for (w = 0, r = 1; |     while (r < dl->length && (spilled_entries < need_spill_entries || | ||||||
|          r <= dl->length && (spilled_entries < need_spill_entries || |                               spilled_npages < need_spill_npages)) { | ||||||
|                              spilled_npages < need_spill_npages); |       dl->items[++w] = dl->items[++r]; | ||||||
|          prev_prio = prio, ++r) { |       unsigned prio = spill_prio(txn, w, reciprocal); | ||||||
|       prio = spill_prio(txn, r, reciprocal); |       if (prio > prio2spill && | ||||||
|       MDBX_page *const dp = dl->items[r].ptr; |           (prio >= prio2adjacent || last != dl->items[w].pgno)) | ||||||
|       if (prio < prio2adjacent) { |         continue; | ||||||
|         const pgno_t pgno = dl->items[r].pgno; |  | ||||||
|         const unsigned npages = dpl_npages(dl, r); |  | ||||||
|         if (prio <= prio2spill) { |  | ||||||
|           if (prev_prio < prio2adjacent && prev_prio > prio2spill && |  | ||||||
|               dpl_endpgno(dl, r - 1) == pgno) { |  | ||||||
|             DEBUG("co-spill %u prev-adjacent page %" PRIaPGNO |  | ||||||
|                   " (age %d, prio %u)", |  | ||||||
|                   dpl_npages(dl, w), dl->items[r - 1].pgno, dpl_age(txn, r - 1), |  | ||||||
|                   prev_prio); |  | ||||||
|             --w; |  | ||||||
|             const unsigned co_npages = dpl_npages(dl, r - 1); |  | ||||||
|             rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, co_npages); |  | ||||||
|             if (unlikely(rc != MDBX_SUCCESS)) |  | ||||||
|               break; |  | ||||||
|             ++spilled_entries; |  | ||||||
|             spilled_npages += co_npages; |  | ||||||
|           } |  | ||||||
|  |  | ||||||
|           DEBUG("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, |       const size_t e = w; | ||||||
|                 dp->mp_pgno, dpl_age(txn, r), prio); |       last = dpl_endpgno(dl, w); | ||||||
|           rc = spill_page(txn, &ctx, dp, npages); |       while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && | ||||||
|           if (unlikely(rc != MDBX_SUCCESS)) |              spill_prio(txn, w, reciprocal) < prio2adjacent) | ||||||
|             break; |         ; | ||||||
|  |  | ||||||
|  |       for (size_t i = w; ++i <= e;) { | ||||||
|  |         const unsigned npages = dpl_npages(dl, i); | ||||||
|  |         prio = spill_prio(txn, i, reciprocal); | ||||||
|  |         DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)", | ||||||
|  |               (prio > prio2spill) ? "co-" : "", i, npages, dl->items[i].pgno, | ||||||
|  |               dpl_age(txn, i), prio); | ||||||
|  |         tASSERT(txn, prio < 256); | ||||||
|         ++spilled_entries; |         ++spilled_entries; | ||||||
|         spilled_npages += npages; |         spilled_npages += npages; | ||||||
|           continue; |         rc = spill_page(txn, &ctx, dl->items[i].ptr, npages); | ||||||
|         } |  | ||||||
|  |  | ||||||
|         if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { |  | ||||||
|           DEBUG("co-spill %u next-adjacent page %" PRIaPGNO |  | ||||||
|                 " (age %d, prio %u)", |  | ||||||
|                 npages, dp->mp_pgno, dpl_age(txn, r), prio); |  | ||||||
|           rc = spill_page(txn, &ctx, dp, npages); |  | ||||||
|         if (unlikely(rc != MDBX_SUCCESS)) |         if (unlikely(rc != MDBX_SUCCESS)) | ||||||
|             break; |           goto failed; | ||||||
|           prio = prev_prio /* to continue co-spilling next adjacent pages */; |  | ||||||
|           ++spilled_entries; |  | ||||||
|           spilled_npages += npages; |  | ||||||
|           continue; |  | ||||||
|       } |       } | ||||||
|     } |     } | ||||||
|       dl->items[++w] = dl->items[r]; |  | ||||||
|     } |  | ||||||
|  |  | ||||||
|     VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, |     VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, | ||||||
|             spilled_npages); |             spilled_npages); | ||||||
|     tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); |     tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); | ||||||
|     tASSERT(txn, spilled_npages >= spilled_entries); |     tASSERT(txn, spilled_npages >= spilled_entries); | ||||||
|  |  | ||||||
|     while (r <= dl->length) |   failed: | ||||||
|       dl->items[++w] = dl->items[r++]; |     while (r < dl->length) | ||||||
|     tASSERT(txn, r - 1 - w == spilled_entries); |       dl->items[++w] = dl->items[++r]; | ||||||
|  |     tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS); | ||||||
|  |  | ||||||
|     dl->sorted = dpl_setlen(dl, w); |     dl->sorted = dpl_setlen(dl, w); | ||||||
|     txn->tw.dirtyroom += spilled_entries; |     txn->tw.dirtyroom += spilled_entries; | ||||||
| @@ -5138,10 +5132,8 @@ __cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, | |||||||
|       goto bailout; |       goto bailout; | ||||||
|  |  | ||||||
|     txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; |     txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; | ||||||
|     if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { |  | ||||||
|     pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); |     pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); | ||||||
|     txn->mt_flags |= MDBX_TXN_SPILLS; |     txn->mt_flags |= MDBX_TXN_SPILLS; | ||||||
|     } |  | ||||||
|     NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", |     NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", | ||||||
|            spilled_entries, spilled_npages, txn->tw.dirtyroom); |            spilled_entries, spilled_npages, txn->tw.dirtyroom); | ||||||
|   } else { |   } else { | ||||||
| @@ -5180,11 +5172,6 @@ static int cursor_spill(MDBX_cursor *mc, const MDBX_val *key, | |||||||
|                         const MDBX_val *data) { |                         const MDBX_val *data) { | ||||||
|   MDBX_txn *txn = mc->mc_txn; |   MDBX_txn *txn = mc->mc_txn; | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); |   tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); | ||||||
|   if (!txn->tw.dirtylist) { |  | ||||||
|     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); |  | ||||||
|     return MDBX_SUCCESS; |  | ||||||
|   } |  | ||||||
|   tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); |  | ||||||
|  |  | ||||||
|   /* Estimate how much space this operation will take: */ |   /* Estimate how much space this operation will take: */ | ||||||
|   /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ |   /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ | ||||||
| @@ -5676,16 +5663,12 @@ __hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, | |||||||
|       txn->tw.loose_pages = mp_next(lp); |       txn->tw.loose_pages = mp_next(lp); | ||||||
|       txn->tw.loose_count--; |       txn->tw.loose_count--; | ||||||
|       txn->tw.dirtyroom++; |       txn->tw.dirtyroom++; | ||||||
|       if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { |       if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) | ||||||
|         tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); |  | ||||||
|         dpage_free(txn->mt_env, lp, 1); |         dpage_free(txn->mt_env, lp, 1); | ||||||
|       } |  | ||||||
|     } else { |     } else { | ||||||
|       ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); |       ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); | ||||||
|       if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { |       if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) | ||||||
|         tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); |  | ||||||
|         dpage_free(txn->mt_env, mp, npages); |         dpage_free(txn->mt_env, mp, npages); | ||||||
|       } |  | ||||||
|       return MDBX_TXN_FULL; |       return MDBX_TXN_FULL; | ||||||
|     } |     } | ||||||
|   } |   } | ||||||
| @@ -6059,7 +6042,7 @@ __cold static int map_resize(MDBX_env *env, const pgno_t used_pgno, | |||||||
|     env->me_lck->mti_pgop_stat.msync.weak += 1; |     env->me_lck->mti_pgop_stat.msync.weak += 1; | ||||||
| #endif /* MDBX_ENABLE_PGOP_STAT */ | #endif /* MDBX_ENABLE_PGOP_STAT */ | ||||||
|     rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), |     rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), | ||||||
|                     MDBX_SYNC_KICK); |                     MDBX_SYNC_NONE); | ||||||
|     if (unlikely(rc != MDBX_SUCCESS)) |     if (unlikely(rc != MDBX_SUCCESS)) | ||||||
|       goto bailout; |       goto bailout; | ||||||
|   } |   } | ||||||
| @@ -7834,6 +7817,20 @@ __hot static int page_touch(MDBX_cursor *mc) { | |||||||
|       return MDBX_SUCCESS; |       return MDBX_SUCCESS; | ||||||
|     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); |     tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); | ||||||
|     const size_t n = dpl_search(txn, mp->mp_pgno); |     const size_t n = dpl_search(txn, mp->mp_pgno); | ||||||
|  |     if (MDBX_AVOID_MSYNC && | ||||||
|  |         unlikely(txn->tw.dirtylist->items[n].pgno != mp->mp_pgno)) { | ||||||
|  |       tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); | ||||||
|  |       tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1); | ||||||
|  |       VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); | ||||||
|  |       np = (MDBX_page *)mp; | ||||||
|  | #if MDBX_ENABLE_PGOP_STAT | ||||||
|  |       txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += 1; | ||||||
|  | #endif /* MDBX_ENABLE_PGOP_STAT */ | ||||||
|  |       return page_dirty(txn, np, 1); | ||||||
|  |     } | ||||||
|  |     tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); | ||||||
|  |     tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && | ||||||
|  |                      txn->tw.dirtylist->items[n].ptr == mp); | ||||||
|     txn->tw.dirtylist->items[n].mlru = |     txn->tw.dirtylist->items[n].mlru = | ||||||
|         (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + |         (txn->tw.dirtylist->items[n].mlru & MDBX_dp_multi_mask) + | ||||||
|         txn_lru_turn(txn); |         txn_lru_turn(txn); | ||||||
| @@ -8883,6 +8880,8 @@ static int txn_renew(MDBX_txn *txn, const unsigned flags) { | |||||||
|       txn->tw.dirtyroom = MAX_PAGENO; |       txn->tw.dirtyroom = MAX_PAGENO; | ||||||
|       txn->tw.dirtylru = 0; |       txn->tw.dirtylru = 0; | ||||||
|     } |     } | ||||||
|  |     eASSERT(env, txn->tw.writemap_dirty_npages == 0); | ||||||
|  |     eASSERT(env, txn->tw.writemap_spilled_npages == 0); | ||||||
|   } |   } | ||||||
|  |  | ||||||
|   /* Setup db info */ |   /* Setup db info */ | ||||||
| @@ -9352,7 +9351,8 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { | |||||||
|     info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); |     info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); | ||||||
|     info->txn_space_dirty = pgno2bytes( |     info->txn_space_dirty = pgno2bytes( | ||||||
|         env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose |         env, txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose | ||||||
|                                : txn->tw.writemap_dirty_npages); |                                : (txn->tw.writemap_dirty_npages + | ||||||
|  |                                   txn->tw.writemap_spilled_npages)); | ||||||
|     info->txn_reader_lag = INT64_MAX; |     info->txn_reader_lag = INT64_MAX; | ||||||
|     MDBX_lockinfo *const lck = env->me_lck_mmap.lck; |     MDBX_lockinfo *const lck = env->me_lck_mmap.lck; | ||||||
|     if (scan_rlt && lck) { |     if (scan_rlt && lck) { | ||||||
| @@ -9566,10 +9566,8 @@ static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { | |||||||
|     remove_dl: |     remove_dl: | ||||||
|       npages = dpl_npages(dl, r); |       npages = dpl_npages(dl, r); | ||||||
|       dl->pages_including_loose -= npages; |       dl->pages_including_loose -= npages; | ||||||
|       if (!MDBX_AVOID_MSYNC || !(txn->mt_env->me_flags & MDBX_WRITEMAP)) { |       if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) | ||||||
|         tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); |  | ||||||
|         dpage_free(txn->mt_env, dl->items[r].ptr, npages); |         dpage_free(txn->mt_env, dl->items[r].ptr, npages); | ||||||
|       } |  | ||||||
|       ++r; |       ++r; | ||||||
|     next_i: |     next_i: | ||||||
|       i += step; |       i += step; | ||||||
| @@ -12410,7 +12408,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, | |||||||
|   rc = MDBX_RESULT_FALSE /* carry steady */; |   rc = MDBX_RESULT_FALSE /* carry steady */; | ||||||
|   if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { |   if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { | ||||||
|     eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); |     eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); | ||||||
|     enum osal_syncmode_bits mode_bits = MDBX_SYNC_KICK; |     enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; | ||||||
|     unsigned sync_op = 0; |     unsigned sync_op = 0; | ||||||
|     if ((flags & MDBX_SAFE_NOSYNC) == 0) { |     if ((flags & MDBX_SAFE_NOSYNC) == 0) { | ||||||
|       sync_op = 1; |       sync_op = 1; | ||||||
| @@ -12422,7 +12420,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, | |||||||
|         mode_bits |= MDBX_SYNC_IODQ; |         mode_bits |= MDBX_SYNC_IODQ; | ||||||
|     } else if (unlikely(env->me_incore)) |     } else if (unlikely(env->me_incore)) | ||||||
|       goto skip_incore_sync; |       goto skip_incore_sync; | ||||||
|     if (!MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { |     if (flags & MDBX_WRITEMAP) { | ||||||
| #if MDBX_ENABLE_PGOP_STAT | #if MDBX_ENABLE_PGOP_STAT | ||||||
|       env->me_lck->mti_pgop_stat.msync.weak += sync_op; |       env->me_lck->mti_pgop_stat.msync.weak += sync_op; | ||||||
| #else | #else | ||||||
| @@ -12567,7 +12565,7 @@ static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, | |||||||
| #endif /* MDBX_ENABLE_PGOP_STAT */ | #endif /* MDBX_ENABLE_PGOP_STAT */ | ||||||
|         rc = osal_msync( |         rc = osal_msync( | ||||||
|             &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), |             &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), | ||||||
|             (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_KICK |             (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE | ||||||
|                                       : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); |                                       : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); | ||||||
|       } else { |       } else { | ||||||
| #if MDBX_ENABLE_PGOP_STAT | #if MDBX_ENABLE_PGOP_STAT | ||||||
| @@ -13995,14 +13993,10 @@ __cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, | |||||||
| #if MDBX_ENABLE_PGOP_STAT | #if MDBX_ENABLE_PGOP_STAT | ||||||
|     lck->mti_pgop_stat.wops.weak = 1; |     lck->mti_pgop_stat.wops.weak = 1; | ||||||
| #endif /* MDBX_ENABLE_PGOP_STAT */ | #endif /* MDBX_ENABLE_PGOP_STAT */ | ||||||
|     err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_KICK); |     err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, | ||||||
|  |                      MDBX_SYNC_DATA | MDBX_SYNC_SIZE); | ||||||
|     if (unlikely(err != MDBX_SUCCESS)) { |     if (unlikely(err != MDBX_SUCCESS)) { | ||||||
|       ERROR("initial-%s for lck-file failed", "msync"); |       ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); | ||||||
|       goto bailout; |  | ||||||
|     } |  | ||||||
|     err = osal_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); |  | ||||||
|     if (unlikely(err != MDBX_SUCCESS)) { |  | ||||||
|       ERROR("initial-%s for lck-file failed", "fsync"); |  | ||||||
|       goto bailout; |       goto bailout; | ||||||
|     } |     } | ||||||
|   } else { |   } else { | ||||||
|   | |||||||
| @@ -1110,6 +1110,7 @@ struct MDBX_txn { | |||||||
|           MDBX_PNL list; |           MDBX_PNL list; | ||||||
|         } spilled; |         } spilled; | ||||||
|         size_t writemap_dirty_npages; |         size_t writemap_dirty_npages; | ||||||
|  |         size_t writemap_spilled_npages; | ||||||
|       }; |       }; | ||||||
|     } tw; |     } tw; | ||||||
|   }; |   }; | ||||||
|   | |||||||
| @@ -1566,6 +1566,7 @@ MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, | |||||||
|    * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ |    * see http://www.spinics.net/lists/linux-ext4/msg33714.html */ | ||||||
|   while (1) { |   while (1) { | ||||||
|     switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { |     switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { | ||||||
|  |     case MDBX_SYNC_NONE: | ||||||
|     case MDBX_SYNC_KICK: |     case MDBX_SYNC_KICK: | ||||||
|       return MDBX_SUCCESS /* nothing to do */; |       return MDBX_SUCCESS /* nothing to do */; | ||||||
| #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 | #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 | ||||||
| @@ -1707,7 +1708,7 @@ MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { | |||||||
| MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, | MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, | ||||||
|                                   size_t length, |                                   size_t length, | ||||||
|                                   enum osal_syncmode_bits mode_bits) { |                                   enum osal_syncmode_bits mode_bits) { | ||||||
|   if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_KICK) |   if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE) | ||||||
|     return MDBX_SUCCESS; |     return MDBX_SUCCESS; | ||||||
|  |  | ||||||
|   void *ptr = ptr_disp(map->base, offset); |   void *ptr = ptr_disp(map->base, offset); | ||||||
| @@ -1727,7 +1728,7 @@ MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, | |||||||
|   // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. |   // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. | ||||||
|   // |   // | ||||||
|   // assert(linux_kernel_version > 0x02061300); |   // assert(linux_kernel_version > 0x02061300); | ||||||
|   // if (mode_bits == MDBX_SYNC_KICK) |   // if (mode_bits <= MDBX_SYNC_KICK) | ||||||
|   //   return MDBX_SUCCESS; |   //   return MDBX_SUCCESS; | ||||||
| #endif /* Linux */ | #endif /* Linux */ | ||||||
|   if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) |   if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? MS_SYNC : MS_ASYNC)) | ||||||
|   | |||||||
| @@ -523,10 +523,11 @@ osal_thread_create(osal_thread_t *thread, | |||||||
| MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); | MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); | ||||||
|  |  | ||||||
| enum osal_syncmode_bits { | enum osal_syncmode_bits { | ||||||
|   MDBX_SYNC_KICK = 0, |   MDBX_SYNC_NONE = 0, | ||||||
|   MDBX_SYNC_DATA = 1, |   MDBX_SYNC_KICK = 1, | ||||||
|   MDBX_SYNC_SIZE = 2, |   MDBX_SYNC_DATA = 2, | ||||||
|   MDBX_SYNC_IODQ = 4 |   MDBX_SYNC_SIZE = 4, | ||||||
|  |   MDBX_SYNC_IODQ = 8 | ||||||
| }; | }; | ||||||
|  |  | ||||||
| MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, | MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user