diff --git a/CMakeLists.txt b/CMakeLists.txt index 162ce2be..777a3f30 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ ## -## Copyright 2020-2022 Leonid Yuriev +## Copyright 2020-2023 Leonid Yuriev ## and other libmdbx authors: please see AUTHORS file. ## All rights reserved. ## @@ -235,6 +235,7 @@ if(SUBPROJECT) if(NOT DEFINED CMAKE_POSITION_INDEPENDENT_CODE) option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independent (PIC)" ON) endif() + set(MDBX_MANAGE_BUILD_FLAGS_DEFAULT OFF) else() option(BUILD_SHARED_LIBS "Build shared libraries (DLLs)" ON) option(CMAKE_POSITION_INDEPENDENT_CODE "Generate position independent (PIC)" ON) @@ -351,9 +352,14 @@ else() endif() endif(NOT MDBX_AMALGAMATED_SOURCE) - setup_compile_flags() + set(MDBX_MANAGE_BUILD_FLAGS_DEFAULT ON) endif(SUBPROJECT) +option(MDBX_MANAGE_BUILD_FLAGS "Allow libmdbx to configure/manage/override its own build flags" ${MDBX_MANAGE_BUILD_FLAGS_DEFAULT}) +if(MDBX_MANAGE_BUILD_FLAGS) + setup_compile_flags() +endif() + list(FIND CMAKE_C_COMPILE_FEATURES c_std_11 HAS_C11) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_11 HAS_CXX11) list(FIND CMAKE_CXX_COMPILE_FEATURES cxx_std_14 HAS_CXX14) @@ -500,16 +506,29 @@ if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") if(MDBX_NTDLL_EXTRA_IMPLIB) add_mdbx_option(MDBX_WITHOUT_MSVC_CRT "Avoid dependence from MSVC CRT and use ntdll.dll instead" OFF) endif() + set(MDBX_AVOID_MSYNC_DEFAULT ON) else() add_mdbx_option(MDBX_USE_OFDLOCKS "Use Open file description locks (aka OFD locks, non-POSIX)" AUTO) mark_as_advanced(MDBX_USE_OFDLOCKS) + set(MDBX_AVOID_MSYNC_DEFAULT OFF) endif() -add_mdbx_option(MDBX_LOCKING "Locking method (Win32=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO) +option(MDBX_AVOID_MSYNC "Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP mode" ${MDBX_AVOID_MSYNC_DEFAULT}) +add_mdbx_option(MDBX_LOCKING "Locking method (Windows=-1, SysV=5, POSIX=1988, POSIX=2001, POSIX=2008, Futexes=1995)" AUTO) 
mark_as_advanced(MDBX_LOCKING) add_mdbx_option(MDBX_TRUST_RTC "Does a system have battery-backed Real-Time Clock or just a fake" AUTO) mark_as_advanced(MDBX_TRUST_RTC) option(MDBX_FORCE_ASSERTIONS "Force enable assertion checking" OFF) -option(MDBX_DISABLE_PAGECHECKS "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) +option(MDBX_DISABLE_VALIDATION "Disable some checks to reduce an overhead and detection probability of database corruption to a values closer to the LMDB" OFF) +option(MDBX_ENABLE_REFUND "Zerocost auto-compactification during write-transactions" ON) +option(MDBX_ENABLE_MADVISE "Using POSIX' madvise() and/or similar hints" ON) +if (CMAKE_TARGET_BITNESS GREATER 32) + set(MDBX_BIGFOOT_DEFAULT ON) +else() + set(MDBX_BIGFOOT_DEFAULT OFF) +endif() +option(MDBX_ENABLE_BIGFOOT "Chunking long list of retired pages during huge transactions commit to avoid use sequences of pages" ${MDBX_BIGFOOT_DEFAULT}) +option(MDBX_ENABLE_PGOP_STAT "Gathering statistics for page operations" ON) +option(MDBX_ENABLE_PROFGC "Profiling of GC search and updates" OFF) if(NOT MDBX_AMALGAMATED_SOURCE) if(CMAKE_CONFIGURATION_TYPES OR CMAKE_BUILD_TYPE_UPPERCASE STREQUAL "DEBUG") @@ -966,6 +985,7 @@ if (NOT SUBPROJECT) set(CPACK_PACKAGE_VERSION_COMMIT ${MDBX_VERSION_REVISION}) set(PACKAGE_VERSION "${CPACK_PACKAGE_VERSION_MAJOR}.${CPACK_PACKAGE_VERSION_MINOR}.${CPACK_PACKAGE_VERSION_PATCH}.${CPACK_PACKAGE_VERSION_COMMIT}") message(STATUS "libmdbx package version is ${PACKAGE_VERSION}") + file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/VERSION.txt" "${MDBX_VERSION_MAJOR}.${MDBX_VERSION_MINOR}.${MDBX_VERSION_RELEASE}.${MDBX_VERSION_REVISION}") endif() cmake_policy(POP) diff --git a/COPYRIGHT b/COPYRIGHT index bd3acace..352beaed 100644 --- a/COPYRIGHT +++ b/COPYRIGHT @@ -1,4 +1,4 @@ -Copyright 2015-2022 Leonid Yuriev . +Copyright 2015-2023 Leonid Yuriev . Copyright 2011-2015 Howard Chu, Symas Corp. 
Copyright 2015,2016 Peter-Service R&D LLC. All rights reserved. diff --git a/ChangeLog.md b/ChangeLog.md index c9f10752..a0296737 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -1,53 +1,447 @@ ChangeLog --------- -## v0.11.14 (Sergey Kapitsa) at 2023-02-14 +English version [by Google](https://gitflic-ru.translate.goog/project/erthink/libmdbx/blob?file=ChangeLog.md&_x_tr_sl=ru&_x_tr_tl=en) +and [by Yandex](https://translated.turbopages.org/proxy_u/ru-en.en/https/gitflic.ru/project/erthink/libmdbx/blob?file=ChangeLog.md). -The stable bugfix release in memory of [Sergey Kapitsa](https://en.wikipedia.org/wiki/Sergey_Kapitsa) on his 95th birthday. +## v0.12.4 (Арта-333) от 2023-03-03 + +Стабилизирующий выпуск с исправлением обнаруженных ошибок, устранением +недочетов и технических долгов. Ветка 0.12 считается готовой к +продуктовому использованию, получает статус стабильной и далее будет +получать только исправление ошибок. Разработка будет продолжена в ветке +0.13, а ветка 0.11 становится архивной. ``` -22 files changed, 250 insertions(+), 174 deletions(-) +63 files changed, 1161 insertions(+), 569 deletions(-) Signed-off-by: Леонид Юрьев (Leonid Yuriev) ``` +Благодарности: + + - Max за сообщение о проблеме ERROR_SHARING_VIOLATION + в режиме MDBX_EXCLUSIVE на Windows. + - Alisher Ashyrov за сообщение о проблеме + с assert-проверкой и содействие в отладке. + - Masatoshi Fukunaga за сообщение о проблеме + `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены всех значений в subDb. + +Исправления (без корректировок новых функций): + + - Устранен регресс после коммита 474391c83c5f81def6fdf3b0b6f5716a87b78fbf, + приводящий к возврату ERROR_SHARING_VIOLATION в Windows при открытии БД + в режиме MDBX_EXCLUSIVE для чтения-записи. + + - Добавлено ограничение размера отображения при коротком read-only файле, для + предотвращения ошибки ERROR_NOT_ENOUGH_MEMORY в Windows, которая возникает + в этом случае и совсем не информативна для пользователя. 
+ + - Произведен рефакторинг `dxb_resize()`, в том числе, для устранения срабатывания + assert-проверки `size_bytes == env->me_dxb_mmap.current` в специфических + многопоточных сценариях использования. Проверка срабатывала только в + отладочных сборках, при специфическом наложении во времени читающей и + пишущей транзакции в разных потоках, одновременно с изменением размера БД. + Кроме срабатывание проверки, каких-либо других последствий не возникало. + + - Устранена проблема в `put(MDBX_UPSERT+MDBX_ALLDUPS)` для случая замены + всех значений единственного ключа в subDb. В ходе этой операции subDb + становится полностью пустой, без каких-либо страниц и именно эта + ситуация не была учтена в коде, что приводило к повреждению БД + при фиксации такой транзакции. + + - Устранена излишняя assert-проверка внутри `override_meta()`. + Что в отладочных сборках могло приводить к ложным срабатываниям + при восстановлении БД, в том числе при автоматическом откате слабых + мета-страниц. + + - Скорректированы макросы `__cold`/`__hot`, в том числе для устранения проблемы + `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` + при сборке посредством GCC >10.x для SH4. + +Ликвидация технических долгов и мелочи: + + - Исправлены многочисленные опечатки в документации. + - Доработан тест для полной стохастической проверки `MDBX_EKEYMISMATCH` в режиме `MDBX_APPEND`. + - Расширены сценарии запуска `mdbx_chk` из CMake-тестов для проверки как в обычном, + так и эксклюзивном режимах чтения-записи. + - Уточнены спецификаторы `const` и `noexcept` для нескольких методов в C++ API. + - Устранено использование стека под буферы для `wchar`-преобразования путей. + - Для Windows добавлена функция `mdbx_env_get_path()` для получения пути к БД + в формате многобайтных символов. + - Добавлены doxygen-описания для API с широкими символами. + - Устранены предупреждения статического анализатора MSVC, + все они были несущественные, либо ложные. 
+ - Устранено ложное предупреждение GCC при сборке для SH4. + - Добавлена поддержка ASAN (Address Sanitizer) при сборке посредством MSVC. + - Расширен набор перебираемых режимов в скрипте `test/long_stochastic.sh`, + добавлена опция `--extra`. + - В C++ API добавлена поддержка расширенных опций времени выполнения `mdbx::extra_runtime_option`, + аналогично `enum MDBX_option_t` из C API. + - Вывод всех счетчиков page-operations в `mdbx_stat`. + + +------------------------------------------------------------------------------- + + +## v0.12.3 (Акула) от 2023-01-07 + +Выпуск с существенными доработками и новой функциональностью в память о закрытом open-source +[проекте "Акула"](https://erigon.substack.com/p/winding-down-support-for-akula-project). + +Добавлена prefault-запись, переделан контроль “некогерентности” unified page/buffer cache, изменена тактика слияния страниц и т.д. +Стало ещё быстрее, в некоторых сценариях вдвое. + +``` +20 files changed, 4508 insertions(+), 2928 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Благодарности: + + - [Alex Sharov](https://t.me/AskAlexSharov) и команде [Erigon](https://github.com/ledgerwatch/erigon) за тестирование. + - [Simon Leier](https://t.me/leisim) за сообщение о сбоях и тестирование. + +Новое: + + - Использование адреса [https://libmdbx.dqdkfa.ru/dead-github](https://libmdbx.dqdkfa.ru/dead-github) + для отсылки к сохранённым в web.archive.org копиям ресурсов, уничтоженных администрацией Github. + + - Реализована prefault-запись при выделении страниц для read-write отображений. + Это приводит к кратному снижению системных издержек и существенному увеличению + производительности в соответствующих сценариях использования, когда: + - размер БД и объём данных существенно больше ОЗУ; + - используется режим `MDBX_WRITEMAP`; + - не-мелкие транзакции (по ходу транзакции выделяется многие сотни или тысячи страниц). 
+ + В режиме `MDBX_WRITEMAP` выделение/переиспользование страниц приводит + к page-fault и чтению страницы с диска, даже если содержимое страницы + не нужно (будет перезаписано). Это является следствием работы подсистемы + виртуальной памяти, а штатный способ лечения через `MADV_REMOVE` + работает не на всех ФС и обычно дороже получаемой экономии. + + Теперь в libmdbx используется "упреждающая запись" таких страниц, + которая на системах с [unified page cache](https://www.opennet.ru/base/dev/ubc.txt.html) + приводит к "вталкиванию" данных, устраняя необходимость чтения с диска при + обращении к такой странице памяти. + + Новый функционал работает в согласованности с автоматическим управлением read-ahead + и кэшем статуса присутствия страниц в ОЗУ, посредством [mincore()](https://man7.org/linux/man-pages/man2/mincore.2.html). + + - Добавлена опция `MDBX_opt_prefault_write_enable` для возможности принудительного + включения/выключения prefault-записи. + + - Реализован динамический выбор между сквозной записью на диск и обычной записью + с последующим [fdatasync()](https://man7.org/linux/man-pages/man3/fdatasync.3p.html) + управляемый опцией `MDBX_opt_writethrough_threshold`. + + В долговечных (durable) режимах данные на диск могут быть сброшены двумя способами: + - сквозной записью через файловый дескриптор открытый с `O_DSYNC`; + - обычной записью с последующим вызовом `fdatasync()`. + + Первый способ выгоднее при записи малого количества страниц и/или если + канал взаимодействия с диском/носителем имеет близкую к нулю задержку. + Второй способ выгоднее если требуется записать много страниц и/или канал + взаимодействия имеет весомую задержку (датацентры, облака). Добавленная + опция `MDBX_opt_writethrough_threshold` позволяет во время выполнения + задать порог для динамического выбора способа записи в зависимости от + объема и конкретных условия использования. + + - Автоматическая установка `MDBX_opt_rp_augment_limit` в зависимости от размера БД. 
+ + - Запрещение разного режима `MDBX_WRITEMAP` между процессами в режимах + с отложенной/ленивой записью, так как в этом случае невозможно + обеспечить сброс данных на диск во всех случаях на всех поддерживаемых платформах. + + - Добавлена опция сборки `MDBX_MMAP_USE_MS_ASYNC` позволяющая отключить + использование системного вызова `msync(MS_ASYNC)`, в использовании + которого нет необходимости на подавляющем большинстве актуальных ОС. + По-умолчанию `MDBX_MMAP_USE_MS_ASYNC=0` (выключено) на Linux и других + системах с unified page cache. Такое поведение (без использования + `msync(MS_ASYNC)`) соответствует неизменяемой (hardcoded) логике LMDB. В + результате, в простых/наивных бенчмарках, libmdbx опережает LMDB + примерно также как при реальном применении. + + На всякий случай стоит еще раз отметить/напомнить, что на Windows + предположительно libmdbx будет отставать от LMDB в сценариях с + множеством мелких транзакций, так как libmdbx осознанно использует на + Windows файловые блокировки, которые медленные (плохо реализованы в ядре + ОС), но позволяют застраховать пользователей от массы неверных действий + приводящих к повреждению БД. + + - Поддержка не-печатных имен для subDb. + + - Добавлен явный выбор `tls_model("local-dynamic")` для обхода проблемы + `relocation R_X86_64_TPOFF32 against FOO cannot be used with -shared` + из-за ошибки в CLANG приводящей к использованию неверного режима `ls_model`. + + - Изменение тактики слияния страниц при удалении. + Теперь слияние выполняется преимущественно с уже измененной/грязной страницей. + Если же справа и слева обе страницы с одинаковым статусом, + то с наименее заполненной, как прежде. В сценариях с массивным удалением + это позволяет увеличить производительность до 50%. + + - Добавлен контроль отсутствия LCK-файлов с альтернативным именованием. 
+ +Исправления (без корректировок новых функций): + + - Изменение размера отображения если это требуется для сброса данных на + диск при вызове `mdbx_env_sync()` из параллельного потока выполнения вне + работающей транзакции. + + - Исправление регресса после коммита db72763de049d6e4546f838277fe83b9081ad1de от 2022-10-08 + в логике возврата грязных страниц в режиме `MDBX_WRITEMAP`, из-за чего + освободившиеся страницы использовались не немедленно, а попадали в + retired-список совершаемой транзакции и происходил необоснованный рост + размера транзакции. + + - Устранение SIGSEGV или ошибочного вызова `free()` в ситуациях + повторного открытия среды посредством `mdbx_env_open()`. + + - Устранение ошибки совершенной в коммите fe20de136c22ed3bc4c6d3f673e79c106e824f60 от 2022-09-18, + в результате чего на Linux в режиме `MDBX_WRITEMAP` никогда не вызывался `msync()`. + Проблема существует только в релизе 0.12.2. + + - Добавление подсчета грязных страниц в `MDBX_WRITEMAP` для предоставления посредством `mdbx_txn_info()` + актуальной информации об объеме изменений в процессе транзакций чтения-записи. + + - Исправление несущественной опечатки в условиях `#if` определения порядка байт. + + - Исправление сборки для случая `MDBX_PNL_ASCENDING=1`. + +Ликвидация технических долгов и мелочи: + + - Доработка поддержки авто-слияния записей GC внутри `page_alloc_slowpath()`. + - Устранение несущественных предупреждений Coverity. + - Использование единого курсора для поиска в GC. + - Переработка внутренних флагов связанных с выделением страниц из GC. + - Доработка подготовки резерва перед обновлением GC при включенном BigFoot. + - Оптимизация `pnl_merge()` для случаев неперекрывающихся объединяемых списков. + - Оптимизация поддержки отсортированного списка страниц в `dpl_append()`. + - Ускорение работы `mdbx_chk` при обработке пользовательских записей в `@MAIN`. + - Переработка LRU-отметок для спиллинга. 
+ - Переработка контроля "некогерентности" Unified page cache для уменьшения накладных расходов. + - Рефакторинг и микрооптимизация. + + +------------------------------------------------------------------------------- + + +## v0.12.2 (Иван Ярыгин) от 2022-11-11 + +Выпуск с существенными доработками и новой функциональностью +в память о российском борце [Иване Сергеевиче Ярыгине](https://ru.wikipedia.org/wiki/Ярыгин,_Иван_Сергеевич). + +На Олимпийских играх в Мюнхене в 1972 году Иван Ярыгин уложил всех соперников на лопатки, +суммарно затратив менее 9 минут. Этот рекорд никем не побит до сих пор. + +``` +64 files changed, 5573 insertions(+), 2510 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +Новое: + + - Поддержка всех основных опций при сборке посредством CMake. + + - Требования к CMake понижены до версии 3.0.2 для возможности сборки для устаревших платформ. + + - Добавлена возможность профилирования работы GC в сложных и/или нагруженных + сценариях (например Ethereum/Erigon). По-умолчанию соответствующий код отключен, + а для его активации необходимо указать опцию сборки `MDBX_ENABLE_PROFGC=1`. + + - Добавлена функция `mdbx_env_warmup()` для "прогрева" БД с возможностью + закрепления страниц в памяти. + В утилиты `mdbx_chk`, `mdbx_copy` и `mdbx_dump` добавлены опции `-u` и `-U` + для активации соответствующего функционала. + + - Отключение учета «грязных» страниц в не требующих этого режимах + (`MDBX_WRITEMAP` при `MDBX_AVOID_MSYNC=0`). Доработка позволяет снизить + накладные расходы и была запланирована давно, но откладывалась так как + требовала других изменений. + + - Вытеснение из памяти (спиллинг) «грязных» страниц с учетом размера + large/overflow-страниц. Доработка позволяет корректно соблюдать политику + задаваемую опциями `MDBX_opt_txn_dp_limit`, + `MDBX_opt_spill_max_denominator`, `MDBX_opt_spill_min_denominator` и + была запланирована давно, но откладывалась так как требовала других + изменений. 
+ + - Для Windows в API добавлены UNICODE-зависимые определения макросов + `MDBX_DATANAME`, `MDBX_LOCKNAME` и `MDBX_LOCK_SUFFIX`. + + - Переход на преимущественное использование типа `size_t` для + уменьшения накладных расходов на платформе Эльбрус. + + - В API добавлены функции `mdbx_limits_valsize4page_max()` и + `mdbx_env_get_valsize4page_max()` возвращающие максимальный размер в + байтах значения, которое может быть размещена в одной + large/overflow-странице, а не последовательности из двух или более таких + страниц. Для таблиц с поддержкой дубликатов вынос значений на + large/overflow-страницы не поддерживается, поэтому результат совпадает с + `mdbx_limits_valsize_max()`. + + - В API добавлены функции `mdbx_limits_pairsize4page_max()`и + `mdbx_env_get_pairsize4page_max()` возвращающие в байтах максимальный + суммарный размер пары ключ-значение для их размещения на одной листовой + страницы, без выноса значения на отдельную large/overflow-страницу. Для + таблиц с поддержкой дубликатов вынос значений на large/overflow-страницы + не поддерживается, поэтому результат определяет максимальный/допустимый + суммарный размер пары ключ-значение. + + - Реализовано использование асинхронной (overlapped) записи в Windows, + включая использования небуфферизированного ввода-вывода и `WriteGather()`. + Это позволяет сократить накладные расходы и частично обойти проблемы + Windows с низкой производительностью ввода-вывода, включая большие + задержки `FlushFileBuffers()`. Новый код также обеспечивает консолидацию + записываемых регионов на всех платформах, а на Windows использование + событий (events) сведено к минимум, одновременно с автоматических + использованием `WriteGather()`. Поэтому ожидается существенное снижение + накладных расходов взаимодействия с ОС, а в Windows это ускорение, в + некоторых сценариях, может быть кратным в сравнении с LMDB. 
+ + - Добавлена опция сборки `MDBX_AVOID_MSYNC`, которая определяет + поведение libmdbx в режиме `MDBX_WRITE_MAP` (когда данные изменяются + непосредственно в отображенных в ОЗУ страницах БД): + + * Если `MDBX_AVOID_MSYNC=0` (по умолчанию на всех системах кроме Windows), + то (как прежде) сохранение данных выполняется посредством `msync()`, + либо `FlushViewOfFile()` на Windows. На платформах с полноценной + подсистемой виртуальной памяти и адекватным файловым вводом-выводом + это обеспечивает минимум накладных расходов (один системный вызов) + и максимальную производительность. Однако, на Windows приводит + к значительной деградации, в том числе из-за того что после + `FlushViewOfFile()` требуется также вызов `FlushFileBuffers()` + с массой проблем и суеты внутри ядра ОС. + + * Если `MDBX_AVOID_MSYNC=1` (по умолчанию только на Windows), то + сохранение данных выполняется явной записью в файл каждой измененной + страницы БД. Это требует дополнительных накладных расходов, как + на отслеживание измененных страниц (ведение списков "грязных" + страниц), так и на системные вызовы для их записи. + Кроме этого, с точки зрения подсистемы виртуальной памяти ядра ОС, + страницы БД измененные в ОЗУ и явно записанные в файл, могут либо + оставаться "грязными" и быть повторно записаны ядром ОС позже, + либо требовать дополнительных накладных расходов для отслеживания + PTE (Page Table Entries), их модификации и дополнительного копирования + данных. Тем не менее, по имеющейся информации, на Windows такой путь + записи данных в целом обеспечивает более высокую производительность. + + - Улучшение эвристики включения авто-слияния записей GC. + + - Изменение формата LCK и семантики некоторых внутренних полей. Версии + libmdbx использующие разный формат не смогут работать с одной БД + одновременно, а только поочередно (LCK-файл переписывается при открытии + первым открывающим БД процессом). + + - В `C++` API добавлены методы фиксации транзакции с получением информации + о задержках. 
+ + - Added `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` build option to control use of GCC's + `__builtin_cpu_supports()` function, which could be unavailable on fake + OSes (macos, ios, android, etc). + +Исправления (без корректировок вышеперечисленных новых функций): + + - Устранение ряда предупреждений при сборке посредством MinGW. + - Устранение ложно-положительных сообщений от Valgrind об использовании + не инициализированных данных из-за выравнивающих зазоров в `struct troika`. + - Исправлен возврат неожиданной ошибки `MDBX_BUSY` из функций `mdbx_env_set_option()`, + `mdbx_env_set_syncbytes()` и `mdbx_env_set_syncperiod()`. + - Небольшие исправления для совместимости с CMake 3.8 + - Больше контроля и осторожности (паранойи) для страховки от дефектов `mremap()`. + - Костыль для починки сборки со старыми версиями `stdatomic.h` из GNU Lib C, + где макросы `ATOMIC_*_LOCK_FREE` ошибочно переопределяются через функции. + - Использование `fcntl64(F_GETLK64/F_SETLK64/F_SETLKW64)` при наличии. + Это решает проблему срабатывания проверочного утверждения при сборке для + платформ, где тип `off_t` шире соответствующих полей структуры `flock`, + используемой для блокировки файлов. + - Доработан сбор информации о задержках при фиксации транзакций: + * Устранено искажение замеров длительности обновления GC + при включении отладочного внутреннего аудита; + * Защита от underflow-нуля только общей задержки в метриках, + чтобы исключить ситуации, когда сумма отдельных стадий + больше общей длительности. + - Ряд исправлений для устранения срабатываний проверочных утверждений в + отладочных сборках. + - Более осторожное преобразование к типу `mdbx_tid_t` для устранения + предупреждений. + - Исправление лишнего сброса данных на диск в режиме `MDBX_SAFE_NOSYNC` + при обновлении GC. + - Fixed an extra check for `MDBX_APPENDDUP` inside `mdbx_cursor_put()` + which could result in returning `MDBX_EKEYMISMATCH` for valid cases. 
+ - Fixed nasty `clz()` bug (by using `_BitScanReverse()`, only MSVC builds affected). + +Мелочи: + + - Исторические ссылки, связанные с удалённым на ~~github~~ проектом, перенаправлены на [web.archive.org](https://web.archive.org/web/https://github.com/erthink/libmdbx). + - Синхронизированы конструкции CMake между проектами. + - Добавлено предупреждение о небезопасности RISC-V. + - Добавлено описание параметров `MDBX_debug_func` и `MDBX_debug_func`. + - Добавлено обходное решение для минимизации ложно-положительных + конфликтов при использовании файловых блокировок в Windows. + - Проверка атомарности C11-операций с 32/64-битными данными. + - Уменьшение в 42 раза значения по-умолчанию для `me_options.dp_limit` + в отладочных сборках. + - Добавление платформы `gcc-riscv64-linux-gnu` в список для цели `cross-gcc`. + - Небольшие правки скрипта `long_stochastic.sh` для работы в Windows. + - Удаление ненужного вызова `LockFileEx()` внутри `mdbx_env_copy()`. + - Добавлено описание использования файловых дескрипторов в различных режимах. + - Добавлено использование `_CrtDbgReport()` в отладочных сборках. + - Fixed an extra ensure/assertion check of `oldest_reader` inside `txn_end()`. + - Removed description of deprecated usage of `MDBX_NODUPDATA`. + - Fixed regression in ASAN/Valgrind-enabled builds. + - Fixed minor MinGW warning. + + +------------------------------------------------------------------------------- + + +## v0.12.1 (Positive Proxima) at 2022-08-24 + +The planned frontward release with new superior features on the day of the 20th anniversary of [Positive Technologies](https://ptsecurty.com). + +``` +37 files changed, 7604 insertions(+), 7417 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + +New: + + - The `Big Foot` feature which significantly reduces GC overhead for processing large lists of retired pages from huge transactions. 
+ Now _libmdbx_ avoids creating large chunks of PNLs (page number lists) which require long sequences of free pages, aka large/overflow pages. + Thus avoiding searching, allocating and storing such sequences inside GC. + - Improved hot/online validation and checking of database pages both for more robustness and performance. + - New solid and fast method to latch meta-pages called `Troika`. + The minimum of memory barriers, reads, comparisons and conditional transitions are used. + - New `MDBX_VALIDATION` environment options for extra validation of DB structure and pages content for carefully/safe handling damaged or untrusted DB. + - Accelerated ×16/×8/×4 by AVX512/AVX2/SSE2/Neon implementations of search page sequences. + - Added the `gcrtime_seconds16dot16` counter to the "Page Operation Statistics" that accumulates time spent for GC searching and reclaiming. + - Copy-with-compactification now clears/zeroes unused gaps inside database pages. + - The `C` and `C++` APIs have been extended and/or refined to simplify using `wchar_t` pathnames. + On Windows the `mdbx_env_openW()`, `mdbx_env_get_pathW()`, `mdbx_env_copyW()`, `mdbx_env_open_for_recoveryW()` are available for now, + but the `mdbx_env_get_path()` has been replaced in favor of `mdbx_env_get_pathW()`. + - Added explicit error message for Buildroot's Microblaze toolchain maintainers. + - Added `MDBX_MANAGE_BUILD_FLAGS` build option for CMake. + - Speed-up internal `bsearch`/`lower_bound` implementation using branchless tactic, including workaround for CLANG x86 optimiser bug. + - A lot of internal refinement and micro-optimisations. + - Internally counted volume of dirty pages (unused for now but for coming features). Fixes: - - backport: Fixed insignificant typo of `||` inside `#if` byte-order condition. - - - backport: Fixed `SIGSEGV` or an erroneous call to `free()` in situations where - errors occur when reopening by `mdbx_env_open()` of a previously used - environment. 
- - - backport: Fixed `cursor_put_nochecklen()` internals for case when dupsort'ed named subDb - contains a single key with multiple values (aka duplicates), which are replaced - with a single value by put-operation with the `MDBX_UPSERT+MDBX_ALLDUPS` flags. - In this case, the database becomes completely empty, without any pages. - However exactly this condition was not considered and thus wasn't handled correctly. - See [issue#8](https://gitflic.ru/project/erthink/libmdbx/issue/8) for more information. - - - backport: Fixed extra assertion inside `override_meta()`, which could - lead to false-positive failing of the assertion in a debug builds during - DB recovery and auto-rollback. - - - backport: Refined the `__cold`/`__hot` macros to avoid the - `error: inlining failed in call to ‘always_inline FOO(...)’: target specific option mismatch` - issue during build using GCC >10.x for SH4 arch. - -Minors: - - - backport: Using the https://libmdbx.dqdkfa.ru/dead-github - for resources deleted by the Github' administration. - - backport: Fixed English typos. - - backport: Fixed proto of `__asan_default_options()`. - - backport: Fixed doxygen-description of C++ API, especially of C++20 concepts. - - backport: Refined `const` and `noexcept` for few C++ API methods. - - backport: Fixed copy&paste typo of "Getting started". - - backport: Update MithrilDB status. - - backport: Resolve false-posirive `used uninitialized` warning from GCC >10.x - while build for SH4 arch. + - Never use modern `__cxa_thread_atexit()` on Apple's OSes. + - Don't check owner for finished transactions. + - Fixed typo in `MDBX_EINVAL` which breaks MingGW builds with CLANG. -## v0.11.13 (Swashplate) at 2022-11-10 +## v0.12.0 at 2022-06-19 + +Not a release but preparation for changing feature set and API. 
+ + +------------------------------------------------------------------------------- + + +## v0.11.13 (Swashplate) at 2022-11-10 The stable bugfix release in memory of [Boris Yuryev](https://ru.wikipedia.org/wiki/Юрьев,_Борис_Николаевич) on his 133rd birthday. @@ -113,13 +507,15 @@ Fixes: - Fixed derived C++ builds by removing `MDBX_INTERNAL_FUNC` for `mdbx_w2mb()` and `mdbx_mb2w()`. -------------------------------------------------------------------------------- - - ## v0.11.10 (the TriColor) at 2022-08-22 The stable bugfix release. +``` +14 files changed, 263 insertions(+), 252 deletions(-) +Signed-off-by: Леонид Юрьев (Leonid Yuriev) +``` + New: - The C++ API has been refined to simplify support for `wchar_t` in path names. @@ -141,8 +537,6 @@ Minors: - Minor clarified `iov_page()` failure case. -------------------------------------------------------------------------------- - ## v0.11.9 (Чирчик-1992) at 2022-08-02 
diff --git a/GNUmakefile b/GNUmakefile index 04ca355a..566feee1 100644 --- a/GNUmakefile +++ b/GNUmakefile @@ -53,8 +53,9 @@ CFLAGS_EXTRA ?= LD ?= ld # build options -MDBX_BUILD_OPTIONS ?=-DNDEBUG=1 +MDBX_BUILD_OPTIONS ?=-DNDEBUG=1 MDBX_BUILD_TIMESTAMP ?=$(shell date +%Y-%m-%dT%H:%M:%S%z) +MDBX_BUILD_CXX ?= YES # probe and compose common compiler flags with variable expansion trick (seems this work two times per session for GNU Make 3.81) CFLAGS ?= $(strip $(eval CFLAGS := -std=gnu11 -O2 -g -Wall -Werror -Wextra -Wpedantic -ffunction-sections -fPIC -fvisibility=hidden -pthread -Wno-error=attributes $$(shell for opt in -fno-semantic-interposition -Wno-unused-command-line-argument -Wno-tautological-compare; do [ -z "$$$$($(CC) '-DMDBX_BUILD_FLAGS="probe"' $$$${opt} -c $(SRC_PROBE_C) -o /dev/null >/dev/null 2>&1 || echo failed)" ] && echo "$$$${opt} "; done)$(CFLAGS_EXTRA))$(CFLAGS)) @@ -127,6 +128,9 @@ TIP := // TIP: .PHONY: all help options lib libs tools clean install uninstall check_buildflags_tag tools-static .PHONY: install-strip install-no-strip strip libmdbx mdbx show-options lib-static lib-shared +boolean = $(if $(findstring $(strip $($1)),YES Yes yes y ON On on 1 true True TRUE),1,$(if $(findstring $(strip $($1)),NO No no n OFF Off off 0 false False FALSE),,$(error Wrong value `$($1)` of $1 for YES/NO option))) +select_by = $(if $(call boolean,$(1)),$(2),$(3)) + ifeq ("$(origin V)", "command line") MDBX_BUILD_VERBOSE := $(V) endif @@ -134,7 +138,7 @@ ifndef MDBX_BUILD_VERBOSE MDBX_BUILD_VERBOSE := 0 endif -ifeq ($(MDBX_BUILD_VERBOSE),1) +ifeq ($(call boolean,MDBX_BUILD_VERBOSE),1) QUIET := HUSH := $(info $(TIP) Use `make V=0` for quiet.) @@ -193,12 +197,12 @@ help: show-options: @echo " MDBX_BUILD_OPTIONS = $(MDBX_BUILD_OPTIONS)" + @echo " MDBX_BUILD_CXX = $(MDBX_BUILD_CXX)" @echo " MDBX_BUILD_TIMESTAMP = $(MDBX_BUILD_TIMESTAMP)" @echo '$(TIP) Use `make options` to listing available build options.' 
- @echo " CC =`which $(CC)` | `$(CC) --version | head -1`" - @echo " CFLAGS =$(CFLAGS)" - @echo " CXXFLAGS =$(CXXFLAGS)" - @echo " LDFLAGS =$(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) $(EXE_LDFLAGS)" + @echo $(call select_by,MDBX_BUILD_CXX," CXX =`which $(CXX)` | `$(CXX) --version | head -1`"," CC =`which $(CC)` | `$(CC) --version | head -1`") + @echo $(call select_by,MDBX_BUILD_CXX," CXXFLAGS =$(CXXFLAGS)"," CFLAGS =$(CFLAGS)") + @echo $(call select_by,MDBX_BUILD_CXX," LDFLAGS =$(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) $(EXE_LDFLAGS)"," LDFLAGS =$(LDFLAGS) $(LIBS) $(EXE_LDFLAGS)") @echo '$(TIP) Use `make help` to listing available targets.' options: @@ -254,7 +258,7 @@ clean: config.h src/config.h src/version.c *.tar* buildflags.tag \ mdbx_*.static mdbx_*.static-lto -MDBX_BUILD_FLAGS =$(strip $(MDBX_BUILD_OPTIONS) $(CXXSTD) $(CFLAGS) $(LDFLAGS) $(LIBS)) +MDBX_BUILD_FLAGS =$(strip MDBX_BUILD_CXX=$(MDBX_BUILD_CXX) $(MDBX_BUILD_OPTIONS) $(call select_by,MDBX_BUILD_CXX,$(CXXFLAGS) $(LDFLAGS) $(LIB_STDCXXFS) $(LIBS),$(CFLAGS) $(LDFLAGS) $(LIBS))) check_buildflags_tag: $(QUIET)if [ "$(MDBX_BUILD_FLAGS)" != "$$(cat buildflags.tag 2>&1)" ]; then \ echo -n " CLEAN for build with specified flags..." && \ @@ -264,13 +268,13 @@ check_buildflags_tag: buildflags.tag: check_buildflags_tag -lib-static libmdbx.a: mdbx-static.o mdbx++-static.o +lib-static libmdbx.a: mdbx-static.o $(call select_by,MDBX_BUILD_CXX,mdbx++-static.o) @echo ' AR $@' $(QUIET)$(AR) rcs $@ $? 
$(HUSH) -lib-shared libmdbx.$(SO_SUFFIX): mdbx-dylib.o mdbx++-dylib.o +lib-shared libmdbx.$(SO_SUFFIX): mdbx-dylib.o $(call select_by,MDBX_BUILD_CXX,mdbx++-dylib.o) @echo ' LD $@' - $(QUIET)$(CXX) $(CXXFLAGS) $^ -pthread -shared $(LDFLAGS) $(LIB_STDCXXFS) $(LIBS) -o $@ + $(QUIET)$(call select_by,MDBX_BUILD_CXX,$(CXX) $(CXXFLAGS),$(CC) $(CFLAGS)) $^ -pthread -shared $(LDFLAGS) $(call select_by,MDBX_BUILD_CXX,$(LIB_STDCXXFS)) $(LIBS) -o $@ #> dist-cutoff-begin ifeq ($(wildcard mdbx.c),mdbx.c) @@ -349,9 +353,9 @@ TEST_DB ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.d TEST_LOG ?= $(shell [ -d /dev/shm ] && echo /dev/shm || echo /tmp)/mdbx-test.log TEST_OSAL := $(shell $(uname2osal)) TEST_ITER := $(shell $(uname2titer)) -TEST_SRC := test/osal-$(TEST_OSAL).cc $(filter-out $(wildcard test/osal-*.cc), $(wildcard test/*.cc)) -TEST_INC := $(wildcard test/*.h) -TEST_OBJ := $(patsubst %.cc,%.o,$(TEST_SRC)) +TEST_SRC := test/osal-$(TEST_OSAL).c++ $(filter-out $(wildcard test/osal-*.c++),$(wildcard test/*.c++)) $(call select_by,MDBX_BUILD_CXX,,src/mdbx.c++) +TEST_INC := $(wildcard test/*.h++) +TEST_OBJ := $(patsubst %.c++,%.o,$(TEST_SRC)) TAR ?= $(shell which gnu-tar || echo tar) ZIP ?= $(shell which zip || echo "echo 'Please install zip'") CLANG_FORMAT ?= $(shell (which clang-format-14 || which clang-format-13 || which clang-format) 2>/dev/null) @@ -359,7 +363,7 @@ CLANG_FORMAT ?= $(shell (which clang-format-14 || which clang-format-13 || which reformat: @echo ' RUNNING clang-format...' 
$(QUIET)if [ -n "$(CLANG_FORMAT)" ]; then \ - git ls-files | grep -E '\.(c|cxx|cc|cpp|h|hxx|hpp)(\.in)?$$' | xargs -r $(CLANG_FORMAT) -i --style=file; \ + git ls-files | grep -E '\.(c|c++|h|h++)(\.in)?$$' | xargs -r $(CLANG_FORMAT) -i --style=file; \ else \ echo "clang-format version 13..14 not found for 'reformat'"; \ fi @@ -382,11 +386,11 @@ MDBX_SMOKE_EXTRA ?= check: DESTDIR = $(shell pwd)/@check-install check: test dist install -smoke-assertion: MDBX_BUILD_OPTIONS=-DMDBX_FORCE_ASSERTIONS=1 +smoke-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1) smoke-assertion: smoke -test-assertion: MDBX_BUILD_OPTIONS=-DMDBX_FORCE_ASSERTIONS=1 +test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1) test-assertion: smoke -long-test-assertion: MDBX_BUILD_OPTIONS=-DMDBX_FORCE_ASSERTIONS=1 +long-test-assertion: MDBX_BUILD_OPTIONS:=$(strip $(MDBX_BUILD_OPTIONS) -DMDBX_FORCE_ASSERTIONS=1) long-test-assertion: smoke smoke: build-test @@ -414,7 +418,7 @@ smoke-fault: build-test test: build-test @echo ' RUNNING `test/long_stochastic.sh --loops 2`...' - $(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) + $(QUIET)test/long_stochastic.sh --dont-check-ram-size --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) long-test: build-test @echo ' RUNNING `test/long_stochastic.sh --loops 42`...' @@ -422,12 +426,12 @@ long-test: build-test test-singleprocess: build-test @echo ' RUNNING `test/long_stochastic.sh --single --loops 2`...' 
- $(QUIET)test/long_stochastic.sh --dont-check-ram-size --single --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) + $(QUIET)test/long_stochastic.sh --dont-check-ram-size --single --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) test-valgrind: CFLAGS_EXTRA=-Ofast -DMDBX_USE_VALGRIND test-valgrind: build-test @echo ' RUNNING `test/long_stochastic.sh --with-valgrind --loops 2`...' - $(QUIET)test/long_stochastic.sh --with-valgrind --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) + $(QUIET)test/long_stochastic.sh --with-valgrind --loops 2 --db-upto-mb 256 --skip-make >$(TEST_LOG) || (cat $(TEST_LOG) && false) memcheck: VALGRIND=valgrind --trace-children=yes --log-file=valgrind-%p.log --leak-check=full --track-origins=yes --error-exitcode=42 --suppressions=test/valgrind_suppress.txt memcheck: CFLAGS_EXTRA=-Ofast -DMDBX_USE_VALGRIND @@ -448,7 +452,7 @@ gcc-analyzer: test-ubsan: @echo ' RE-TEST with `-fsanitize=undefined` option...' - $(QUIET)$(MAKE) IOARENA=false CXXSTD=$(CXXSTD) CFLAGS_EXTRA="-Ofast -fsanitize=undefined -fsanitize-undefined-trap-on-error" test + $(QUIET)$(MAKE) IOARENA=false CXXSTD=$(CXXSTD) CFLAGS_EXTRA="-DENABLE_UBSAN -Ofast -fsanitize=undefined -fsanitize-undefined-trap-on-error" test test-asan: @echo ' RE-TEST with `-fsanitize=address` option...' 
@@ -465,7 +469,7 @@ mdbx_example: mdbx.h example/example-mdbx.c libmdbx.$(SO_SUFFIX) build-test: all mdbx_example mdbx_test define test-rule -$(patsubst %.cc,%.o,$(1)): $(1) $(TEST_INC) $(HEADERS) $(lastword $(MAKEFILE_LIST)) +$(patsubst %.c++,%.o,$(1)): $(1) $(TEST_INC) $(HEADERS) $(lastword $(MAKEFILE_LIST)) @echo ' CC $$@' $(QUIET)$$(CXX) $$(CXXFLAGS) $$(MDBX_BUILD_OPTIONS) -c $(1) -o $$@ @@ -711,23 +715,23 @@ endif ################################################################################ # Cross-compilation simple test -CROSS_LIST = mips-linux-gnu-gcc \ +CROSS_LIST = \ + mips64-linux-gnuabi64-gcc mips-linux-gnu-gcc \ + hppa-linux-gnu-gcc s390x-linux-gnu-gcc \ powerpc64-linux-gnu-gcc powerpc-linux-gnu-gcc \ - arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc \ - sh4-linux-gnu-gcc mips64-linux-gnuabi64-gcc \ - hppa-linux-gnu-gcc s390x-linux-gnu-gcc + arm-linux-gnueabihf-gcc aarch64-linux-gnu-gcc -## On Ubuntu Focal (20.04) with QEMU 4.2 (1:4.2-3ubuntu6.6) & GCC 9.3 (9.3.0-17ubuntu1~20.04) -# hppa-linux-gnu-gcc - works (previously: don't supported by qemu) -# s390x-linux-gnu-gcc - works (previously: qemu hang/abort) +## On Ubuntu Focal (22.04) with QEMU 6.2 (1:6.2+dfsg-2ubuntu6.6) & GCC 11.3 (11.3.0-1ubuntu1~22.04) +# sh4-linux-gnu-gcc - coredump (qemu mmap-troubles) # sparc64-linux-gnu-gcc - coredump (qemu mmap-troubles, previously: qemu fails fcntl for F_SETLK/F_GETLK) # alpha-linux-gnu-gcc - coredump (qemu mmap-troubles) -CROSS_LIST_NOQEMU = sparc64-linux-gnu-gcc alpha-linux-gnu-gcc +# risc64-linux-gnu-gcc - coredump (qemu qemu fails fcntl for F_SETLK/F_GETLK) +CROSS_LIST_NOQEMU = sh4-linux-gnu-gcc sparc64-linux-gnu-gcc alpha-linux-gnu-gcc riscv64-linux-gnu-gcc cross-gcc: @echo ' Re-building by cross-compiler for: $(CROSS_LIST_NOQEMU) $(CROSS_LIST)' @echo "CORRESPONDING CROSS-COMPILERs ARE REQUIRED." 
- @echo "FOR INSTANCE: apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" + @echo "FOR INSTANCE: sudo apt install \$$(apt list 'g++-*' | grep 'g++-[a-z0-9]\+-linux-gnu/' | cut -f 1 -d / | sort -u)" $(QUIET)for CC in $(CROSS_LIST_NOQEMU) $(CROSS_LIST); do \ echo "===================== $$CC"; \ $(MAKE) IOARENA=false CXXSTD= clean && CC=$$CC CXX=$$(echo $$CC | sed 's/-gcc/-g++/') EXE_LDFLAGS=-static $(MAKE) IOARENA=false all || exit $$?; \ @@ -739,8 +743,8 @@ cross-qemu: @echo ' Re-building by cross-compiler and re-check by QEMU for: $(CROSS_LIST)' @echo "CORRESPONDING CROSS-COMPILERs AND QEMUs ARE REQUIRED." @echo "FOR INSTANCE: " - @echo " 1) apt install g++-aarch64-linux-gnu g++-alpha-linux-gnu g++-arm-linux-gnueabihf g++-hppa-linux-gnu g++-mips-linux-gnu g++-mips64-linux-gnuabi64 g++-powerpc-linux-gnu g++-powerpc64-linux-gnu g++-s390x-linux-gnu g++-sh4-linux-gnu g++-sparc64-linux-gnu" - @echo " 2) apt install binfmt-support qemu-user-static qemu-user qemu-system-arm qemu-system-mips qemu-system-misc qemu-system-ppc qemu-system-sparc" + @echo " 1) sudo apt install \$$(apt list 'g++-*' | grep 'g++-[a-z0-9]\+-linux-gnu/' | cut -f 1 -d / | sort -u)" + @echo " 2) sudo apt install binfmt-support qemu-user-static qemu-user \$$(apt list 'qemu-system-*' | grep 'qemu-system-[a-z0-9]\+/' | cut -f 1 -d / | sort -u)" $(QUIET)for CC in $(CROSS_LIST); do \ echo "===================== $$CC + qemu"; \ $(MAKE) IOARENA=false CXXSTD= clean && \ @@ -784,7 +788,7 @@ IOARENA := $(shell \ (test -x ../ioarena/@BUILD/src/ioarena && echo ../ioarena/@BUILD/src/ioarena) || \ (test -x ../../@BUILD/src/ioarena && echo ../../@BUILD/src/ioarena) || \ (test -x ../../src/ioarena && echo ../../src/ioarena) || which ioarena 2>&- || \ - (echo false && echo '$(TIP) Clone and build the 
https://github.com/pmwkaa/ioarena.git within a neighbouring directory for availability of benchmarking.' >&2)) + (echo false && echo '$(TIP) Clone and build the https://abf.io/erthink/ioarena.git within a neighbouring directory for availability of benchmarking.' >&2)) endif NN ?= 25000000 BENCH_CRUD_MODE ?= nosync @@ -798,7 +802,7 @@ re-bench: bench-clean bench ifeq ($(or $(IOARENA),false),false) bench bench-quartet bench-triplet bench-couple: $(QUIET)echo 'The `ioarena` benchmark is required.' >&2 && \ - echo 'Please clone and build the https://github.com/pmwkaa/ioarena.git within a neighbouring `ioarena` directory.' >&2 && \ + echo 'Please clone and build the https://abf.io/erthink/ioarena.git within a neighbouring `ioarena` directory.' >&2 && \ false else @@ -809,15 +813,20 @@ define bench-rule bench-$(1)_$(2).txt: $(3) $(IOARENA) $(lastword $(MAKEFILE_LIST)) @echo ' RUNNING ioarena for $1/$2...' $(QUIET)(export LD_LIBRARY_PATH="./:$$$${LD_LIBRARY_PATH}"; \ - ldd $(IOARENA) && \ + ldd $(IOARENA) | grep -i $(1) && \ + $(IOARENA) -D $(1) -B batch -m $(BENCH_CRUD_MODE) -n $(2) \ + | tee $$@ | grep throughput | sed 's/throughput/batch×N/' && \ $(IOARENA) -D $(1) -B crud -m $(BENCH_CRUD_MODE) -n $(2) \ - | tee $$@ | grep throughput && \ + | tee -a $$@ | grep throughput | sed 's/throughput/ crud/' && \ $(IOARENA) -D $(1) -B iterate,get,iterate,get,iterate -m $(BENCH_CRUD_MODE) -r 4 -n $(2) \ - | tee -a $$@ | grep throughput \ - ) || mv -f $$@ $$@.error + | tee -a $$@ | grep throughput | sed '0,/throughput/{s/throughput/iterate/};s/throughput/ get/' && \ + $(IOARENA) -D $(1) -B delete -m $(BENCH_CRUD_MODE) -n $(2) \ + | tee -a $$@ | grep throughput | sed 's/throughput/ delete/' && \ + true) || mv -f $$@ $$@.error endef + $(eval $(call bench-rule,mdbx,$(NN),libmdbx.$(SO_SUFFIX))) $(eval $(call bench-rule,sophia,$(NN))) diff --git a/README.md b/README.md index 5841b041..46e1c549 100644 --- a/README.md +++ b/README.md @@ -277,7 +277,7 @@ the user's point of view. 
> and up to 30% faster when _libmdbx_ compiled with specific build options > which downgrades several runtime checks to be match with LMDB behaviour. > - > These and other results could be easily reproduced with [ioArena](https://github.com/pmwkaa/ioarena) just by `make bench-quartet` command, + > These and other results could be easily reproduced with [ioArena](https://abf.io/erthink/ioarena.git) just by `make bench-quartet` command, > including comparisons with [RockDB](https://en.wikipedia.org/wiki/RocksDB) > and [WiredTiger](https://en.wikipedia.org/wiki/WiredTiger). @@ -382,7 +382,7 @@ named mutexes are used. Historically, _libmdbx_ is a deeply revised and extended descendant of the [Lightning Memory-Mapped Database](https://en.wikipedia.org/wiki/Lightning_Memory-Mapped_Database). At first the development was carried out within the -[ReOpenLDAP](https://web.archive.org/web/20220414235959/https://github.com/erthink/ReOpenLDAP) project. About a +[ReOpenLDAP](https://web.archive.org/web/https://github.com/erthink/ReOpenLDAP) project. About a year later _libmdbx_ was separated into a standalone project, which was [presented at Highload++ 2015 conference](http://www.highload.ru/2015/abstracts/1831.html). @@ -659,7 +659,7 @@ Bindings Performance comparison ====================== -All benchmarks were done in 2015 by [IOArena](https://github.com/pmwkaa/ioarena) +All benchmarks were done in 2015 by [IOArena](https://abf.io/erthink/ioarena.git) and multiple [scripts](https://github.com/pmwkaa/ioarena/tree/HL%2B%2B2015) runs on Lenovo Carbon-2 laptop, i7-4600U 2.1 GHz (2 physical cores, 4 HyperThreading cores), 8 Gb RAM, SSD SAMSUNG MZNTD512HAGL-000L1 (DXT23L0Q) 512 Gb. diff --git a/TODO.md b/TODO.md index b37e20cb..0d9fd46d 100644 --- a/TODO.md +++ b/TODO.md @@ -11,15 +11,19 @@ For the same reason ~~Github~~ is blacklisted forever. So currently most of the links are broken due to noted malicious ~~Github~~ sabotage. 
- - [Engage an "overlapped I/O" on Windows](https://libmdbx.dqdkfa.ru/dead-github/issues/224). - - [Simple careful mode for working with corrupted DB](https://libmdbx.dqdkfa.ru/dead-github/issues/223). - [Move most of `mdbx_chk` functional to the library API](https://libmdbx.dqdkfa.ru/dead-github/issues/204). - [Replace SRW-lock on Windows to allow shrink DB with `MDBX_NOTLS` option](https://libmdbx.dqdkfa.ru/dead-github/issues/210). - [More flexible support of asynchronous runtime/framework(s)](https://libmdbx.dqdkfa.ru/dead-github/issues/200). - [Migration guide from LMDB to MDBX](https://libmdbx.dqdkfa.ru/dead-github/issues/199). - - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://libmdbx.dqdkfa.ru/dead-github/issues/193). - - [Large/Overflow pages accounting for dirty-room](https://libmdbx.dqdkfa.ru/dead-github/issues/192). - [Support for RAW devices](https://libmdbx.dqdkfa.ru/dead-github/issues/124). - [Support MessagePack for Keys & Values](https://libmdbx.dqdkfa.ru/dead-github/issues/115). - [Engage new terminology](https://libmdbx.dqdkfa.ru/dead-github/issues/137). - Packages for [Astra Linux](https://astralinux.ru/), [ALT Linux](https://www.altlinux.org/), [ROSA Linux](https://www.rosalinux.ru/), etc. + +Done +---- + + - [Simple careful mode for working with corrupted DB](https://libmdbx.dqdkfa.ru/dead-github/issues/223). + - [Engage an "overlapped I/O" on Windows](https://libmdbx.dqdkfa.ru/dead-github/issues/224). + - [Large/Overflow pages accounting for dirty-room](https://libmdbx.dqdkfa.ru/dead-github/issues/192). + - [Get rid of dirty-pages list in MDBX_WRITEMAP mode](https://libmdbx.dqdkfa.ru/dead-github/issues/193). diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake index 78a31946..1d805ea0 100644 --- a/cmake/compiler.cmake +++ b/cmake/compiler.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . 
## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. @@ -348,6 +348,8 @@ endif() if(MSVC) check_compiler_flag("/WX" CC_HAS_WERROR) + check_compiler_flag("/fsanitize=address" CC_HAS_ASAN) + check_compiler_flag("/fsanitize=undefined" CC_HAS_UBSAN) else() # # GCC started to warn for unused result starting from 4.2, and @@ -839,19 +841,26 @@ macro(setup_compile_flags) endif() if(ENABLE_ASAN) - add_compile_flags("C;CXX" "-fsanitize=address") + if(NOT MSVC) + add_compile_flags("C;CXX" "-fsanitize=address") + else() + add_compile_flags("C;CXX" "/fsanitize=address") + endif() add_definitions(-DASAN_ENABLED=1) endif() if(ENABLE_UBSAN) - add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") + if(NOT MSVC) + add_compile_flags("C;CXX" "-fsanitize=undefined" "-fsanitize-undefined-trap-on-error") + else() + add_compile_flags("C;CXX" "/fsanitize=undefined") + endif() add_definitions(-DUBSAN_ENABLED=1) endif() if(ENABLE_GCOV) if(NOT HAVE_GCOV) - message(FATAL_ERROR - "ENABLE_GCOV option requested but gcov library is not found") + message(FATAL_ERROR "ENABLE_GCOV option requested but gcov library is not found") endif() add_compile_flags("C;CXX" "-fprofile-arcs" "-ftest-coverage") diff --git a/cmake/profile.cmake b/cmake/profile.cmake index c9b8bed4..f13b6976 100644 --- a/cmake/profile.cmake +++ b/cmake/profile.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . ## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 6a3315e1..aa8aef01 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -1,4 +1,4 @@ -## Copyright (c) 2012-2022 Leonid Yuriev . +## Copyright (c) 2012-2023 Leonid Yuriev . 
## ## Licensed under the Apache License, Version 2.0 (the "License"); ## you may not use this file except in compliance with the License. diff --git a/example/example-mdbx.c b/example/example-mdbx.c index a3735f9a..0e6148d9 100644 --- a/example/example-mdbx.c +++ b/example/example-mdbx.c @@ -4,7 +4,7 @@ */ /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * Copyright 2017 Ilya Shipitsin . * Copyright 2012-2015 Howard Chu, Symas Corp. * All rights reserved. diff --git a/example/sample-bdb.txt b/example/sample-bdb.txt index 503d97cb..d3478a16 100644 --- a/example/sample-bdb.txt +++ b/example/sample-bdb.txt @@ -4,7 +4,7 @@ */ /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * Copyright 2012-2015 Howard Chu, Symas Corp. * Copyright 2015,2016 Peter-Service R&D LLC. * All rights reserved. diff --git a/mdbx.h b/mdbx.h index 17b6e139..08542765 100644 --- a/mdbx.h +++ b/mdbx.h @@ -25,7 +25,7 @@ _The Future will (be) [Positive](https://www.ptsecurity.com). Всё будет \section copyright LICENSE & COPYRIGHT -\authors Copyright (c) 2015-2022, Leonid Yuriev +\authors Copyright (c) 2015-2023, Leonid Yuriev and other _libmdbx_ authors: please see [AUTHORS](./AUTHORS) file. \copyright Redistribution and use in source and binary forms, with or without @@ -77,10 +77,10 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. #if defined(__riscv) || defined(__riscv__) || defined(__RISCV) || \ defined(__RISCV__) -#warning The RISC-V architecture is intentionally insecure by design. \ +#warning "The RISC-V architecture is intentionally insecure by design. \ Please delete this admonition at your own risk, \ if you make such decision informed and consciously. \ - Refer to https://clck.ru/32d9xH for more information. + Refer to https://clck.ru/32d9xH for more information." 
#endif /* RISC-V */ #ifdef _MSC_VER @@ -634,9 +634,9 @@ typedef mode_t mdbx_mode_t; extern "C" { #endif -/* MDBX version 0.11.x */ +/* MDBX version 0.12.x */ #define MDBX_VERSION_MAJOR 0 -#define MDBX_VERSION_MINOR 11 +#define MDBX_VERSION_MINOR 12 #ifndef LIBMDBX_API #if defined(LIBMDBX_EXPORTS) @@ -835,18 +835,48 @@ enum MDBX_constants { #ifndef MDBX_LOCKNAME /** \brief The name of the lock file in the environment * without using \ref MDBX_NOSUBDIR */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCKNAME "/mdbx.lck" -#endif +#else +#define MDBX_LOCKNAME_W L"\\mdbx.lck" +#define MDBX_LOCKNAME_A "\\mdbx.lck" +#ifdef UNICODE +#define MDBX_LOCKNAME MDBX_LOCKNAME_W +#else +#define MDBX_LOCKNAME MDBX_LOCKNAME_A +#endif /* UNICODE */ +#endif /* Windows */ +#endif /* MDBX_LOCKNAME */ #ifndef MDBX_DATANAME /** \brief The name of the data file in the environment * without using \ref MDBX_NOSUBDIR */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_DATANAME "/mdbx.dat" -#endif +#else +#define MDBX_DATANAME_W L"\\mdbx.dat" +#define MDBX_DATANAME_A "\\mdbx.dat" +#ifdef UNICODE +#define MDBX_DATANAME MDBX_DATANAME_W +#else +#define MDBX_DATANAME MDBX_DATANAME_A +#endif /* UNICODE */ +#endif /* Windows */ +#endif /* MDBX_DATANAME */ #ifndef MDBX_LOCK_SUFFIX /** \brief The suffix of the lock file when \ref MDBX_NOSUBDIR is used */ +#if !(defined(_WIN32) || defined(_WIN64)) #define MDBX_LOCK_SUFFIX "-lck" -#endif +#else +#define MDBX_LOCK_SUFFIX_W L"-lck" +#define MDBX_LOCK_SUFFIX_A "-lck" +#ifdef UNICODE +#define MDBX_LOCK_SUFFIX MDBX_LOCK_SUFFIX_W +#else +#define MDBX_LOCK_SUFFIX MDBX_LOCK_SUFFIX_A +#endif /* UNICODE */ +#endif /* Windows */ +#endif /* MDBX_LOCK_SUFFIX */ /* DEBUG & LOGGING ************************************************************/ @@ -1028,12 +1058,15 @@ LIBMDBX_API const char *mdbx_dump_val(const MDBX_val *key, char *const buf, const size_t bufsize); /** \brief Panics with message and causes abnormal process termination. 
*/ -LIBMDBX_API void mdbx_panic(const char *fmt, ...) MDBX_PRINTF_ARGS(1, 2); +MDBX_NORETURN LIBMDBX_API void mdbx_panic(const char *fmt, ...) + MDBX_PRINTF_ARGS(1, 2); /** \brief Panics with asserton failed message and causes abnormal process * termination. */ -LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, - const char *func, unsigned line); +MDBX_NORETURN LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, + const char *msg, + const char *func, + unsigned line); /** end of c_debug @} */ /** \brief Environment flags @@ -1043,6 +1076,13 @@ LIBMDBX_API void mdbx_assert_fail(const MDBX_env *env, const char *msg, enum MDBX_env_flags_t { MDBX_ENV_DEFAULTS = 0, + /** Extra validation of DB structure and pages content. + * + * The `MDBX_VALIDATION` enabled the simple safe/careful mode for working + * with damaged or untrusted DB. However, a notable performance + * degradation should be expected. */ + MDBX_VALIDATION = UINT32_C(0x00002000), + /** No environment directory. * * By default, MDBX creates its environment in a directory whose pathname is @@ -1115,8 +1155,8 @@ enum MDBX_env_flags_t { * while opening the database/environment which is already used by another * process(es) with unknown mode/flags. In such cases, if there is a * difference in the specified flags (\ref MDBX_NOMETASYNC, - * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM, - * \ref MDBX_COALESCE and \ref MDBX_NORDAHEAD), instead of returning an error, + * \ref MDBX_SAFE_NOSYNC, \ref MDBX_UTTERLY_NOSYNC, \ref MDBX_LIFORECLAIM + * and \ref MDBX_NORDAHEAD), instead of returning an error, * the database will be opened in a compatibility with the already used mode. * * `MDBX_ACCEDE` has no effect if the current process is the only one either @@ -1223,6 +1263,7 @@ enum MDBX_env_flags_t { MDBX_NOMEMINIT = UINT32_C(0x1000000), /** Aims to coalesce a Garbage Collection items. 
+ * \note Always enabled since v0.12 * * With `MDBX_COALESCE` flag MDBX will aims to coalesce items while recycling * a Garbage Collection. Technically, when possible short lists of pages @@ -1593,8 +1634,7 @@ enum MDBX_put_flags_t { MDBX_NOOVERWRITE = UINT32_C(0x10), /** Has effect only for \ref MDBX_DUPSORT databases. - * For upsertion: don't write if the key-value pair already exist. - * For deletion: remove all values for key. */ + * For upsertion: don't write if the key-value pair already exist. */ MDBX_NODUPDATA = UINT32_C(0x20), /** For upsertion: overwrite the current key/data pair. @@ -1886,6 +1926,15 @@ enum MDBX_error_t { /** Overlapping read and write transactions for the current thread */ MDBX_TXN_OVERLAPPING = -30415, + /** Внутренняя ошибка возвращаемая в случае нехватки запаса свободных страниц + * при обновлении GC. Используется как вспомогательное средство для отладки. + * \note С точки зрения пользователя семантически + * равнозначна \ref MDBX_PROBLEM. */ + MDBX_BACKLOG_DEPLETED = -30414, + + /** Alternative/Duplicate LCK-file is exists and should be removed manually */ + MDBX_DUPLICATED_CLK = -30413, + /* The last of MDBX-added error codes */ MDBX_LAST_ADDED_ERRCODE = MDBX_TXN_OVERLAPPING, @@ -2011,7 +2060,9 @@ LIBMDBX_API const char *mdbx_strerror_r_ANSI2OEM(int errnum, char *buf, * \returns a non-zero error value on failure and 0 on success. */ LIBMDBX_API int mdbx_env_create(MDBX_env **penv); -/** \brief MDBX environment options. */ +/** \brief MDBX environment extra runtime options. + * \ingroup c_settings + * \see mdbx_env_set_option() \see mdbx_env_get_option() */ enum MDBX_option_t { /** \brief Controls the maximum number of named databases for the environment. * @@ -2180,13 +2231,46 @@ enum MDBX_option_t { * to 50% (half empty) which corresponds to the range from 8192 and to 32768 * in units respectively. 
*/ MDBX_opt_merge_threshold_16dot16_percent, + + /** \brief Controls the choosing between use write-through disk writes and + * usual ones with followed flush by the `fdatasync()` syscall. + * \details Depending on the operating system, storage subsystem + * characteristics and the use case, higher performance can be achieved by + * either using write-through or a serie of usual/lazy writes followed by + * the flush-to-disk. + * + * Basically for N chunks the latency/cost of write-through is: + * latency = N * (emit + round-trip-to-storage + storage-execution); + * And for serie of lazy writes with flush is: + * latency = N * (emit + storage-execution) + flush + round-trip-to-storage. + * + * So, for large N and/or noteable round-trip-to-storage the write+flush + * approach is win. But for small N and/or near-zero NVMe-like latency + * the write-through is better. + * + * To solve this issue libmdbx provide `MDBX_opt_writethrough_threshold`: + * - when N described above less or equal specified threshold, + * a write-through approach will be used; + * - otherwise, when N great than specified threshold, + * a write-and-flush approach will be used. + * + * \note MDBX_opt_writethrough_threshold affects only \ref MDBX_SYNC_DURABLE + * mode without \ref MDBX_WRITEMAP, and not supported on Windows. + * On Windows a write-through is used always but \ref MDBX_NOMETASYNC could + * be used for switching to write-and-flush. */ + MDBX_opt_writethrough_threshold, + + /** \brief Controls prevention of page-faults of reclaimed and allocated pages + * in the \ref MDBX_WRITEMAP mode by clearing ones through file handle before + * touching. */ + MDBX_opt_prefault_write_enable, }; #ifndef __cplusplus /** \ingroup c_settings */ typedef enum MDBX_option_t MDBX_option_t; #endif -/** \brief Sets the value of a runtime options for an environment. +/** \brief Sets the value of a extra runtime options for an environment. 
* \ingroup c_settings * * \param [in] env An environment handle returned by \ref mdbx_env_create(). @@ -2199,7 +2283,7 @@ typedef enum MDBX_option_t MDBX_option_t; LIBMDBX_API int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, uint64_t value); -/** \brief Gets the value of runtime options from an environment. +/** \brief Gets the value of extra runtime options from an environment. * \ingroup c_settings * * \param [in] env An environment handle returned by \ref mdbx_env_create(). @@ -2220,6 +2304,8 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, * be called later to discard the \ref MDBX_env handle and release associated * resources. * + * \note On Windows the \ref mdbx_env_openW() is recommended to use. + * * \param [in] env An environment handle returned * by \ref mdbx_env_create() * @@ -2287,6 +2373,14 @@ LIBMDBX_API int mdbx_env_get_option(const MDBX_env *env, LIBMDBX_API int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_open() + * \note Available only on Windows. + * \see mdbx_env_open() */ +LIBMDBX_API int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode); +#endif /* Windows */ + /** \brief Deletion modes for \ref mdbx_env_delete(). * \ingroup c_extra * \see mdbx_env_delete() */ @@ -2313,6 +2407,8 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; /** \brief Delete the environment's files in a proper and multiprocess-safe way. * \ingroup c_extra * + * \note On Windows the \ref mdbx_env_deleteW() is recommended to use. + * * \param [in] pathname The pathname for the database or the directory in which * the database files reside. 
* @@ -2330,6 +2426,14 @@ typedef enum MDBX_env_delete_mode_t MDBX_env_delete_mode_t; LIBMDBX_API int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_delete() + * \note Available only on Windows. + * \see mdbx_env_delete() */ +LIBMDBX_API int mdbx_env_deleteW(const wchar_t *pathname, + MDBX_env_delete_mode_t mode); +#endif /* Windows */ + /** \brief Copy an MDBX environment to the specified path, with options. * \ingroup c_extra * @@ -2339,6 +2443,8 @@ LIBMDBX_API int mdbx_env_delete(const char *pathname, * parallel with write transactions, because it employs a read-only * transaction. See long-lived transactions under \ref restrictions section. * + * \note On Windows the \ref mdbx_env_copyW() is recommended to use. + * * \param [in] env An environment handle returned by mdbx_env_create(). * It must have already been opened successfully. * \param [in] dest The pathname of a file in which the copy will reside. @@ -2364,6 +2470,14 @@ LIBMDBX_API int mdbx_env_delete(const char *pathname, LIBMDBX_API int mdbx_env_copy(MDBX_env *env, const char *dest, MDBX_copy_flags_t flags); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_copy() + * \note Available only on Windows. + * \see mdbx_env_copy() */ +LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest, + MDBX_copy_flags_t flags); +#endif /* Windows */ + /** \brief Copy an environment to the specified file descriptor, with * options. * \ingroup c_extra @@ -2482,7 +2596,9 @@ struct MDBX_envinfo { uint64_t mi_unsync_volume; /** Current auto-sync threshold, see \ref mdbx_env_set_syncbytes(). */ uint64_t mi_autosync_threshold; - /** Time since the last steady sync in 1/65536 of second */ + /** Time since entering to a "dirty" out-of-sync state in units of 1/65536 of + * second. In other words, this is the time since the last non-steady commit + * or zero if it was steady. 
*/ uint32_t mi_since_sync_seconds16dot16; /** Current auto-sync period in 1/65536 of second, * see \ref mdbx_env_set_syncperiod(). */ @@ -2500,16 +2616,22 @@ struct MDBX_envinfo { * first process opened the database after everyone had previously closed it). */ struct { - uint64_t newly; /**< Quantity of a new pages added */ - uint64_t cow; /**< Quantity of pages copied for update */ - uint64_t clone; /**< Quantity of parent's dirty pages clones - for nested transactions */ - uint64_t split; /**< Page splits */ - uint64_t merge; /**< Page merges */ - uint64_t spill; /**< Quantity of spilled dirty pages */ - uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ - uint64_t wops; /**< Number of explicit write operations (not a pages) - to a disk */ + uint64_t newly; /**< Quantity of a new pages added */ + uint64_t cow; /**< Quantity of pages copied for update */ + uint64_t clone; /**< Quantity of parent's dirty pages clones + for nested transactions */ + uint64_t split; /**< Page splits */ + uint64_t merge; /**< Page merges */ + uint64_t spill; /**< Quantity of spilled dirty pages */ + uint64_t unspill; /**< Quantity of unspilled/reloaded pages */ + uint64_t wops; /**< Number of explicit write operations (not a pages) + to a disk */ + uint64_t prefault; /**< Number of prefault write operations (not a pages) */ + uint64_t mincore; /**< Number of mincore() calls */ + uint64_t + msync; /**< Number of explicit msync-to-disk operations (not a pages) */ + uint64_t + fsync; /**< Number of explicit fsync-to-disk operations (not a pages) */ } mi_pgop_stat; }; #ifndef __cplusplus @@ -2766,6 +2888,94 @@ LIBMDBX_INLINE_API(int, mdbx_env_close, (MDBX_env * env)) { return mdbx_env_close_ex(env, false); } +/** \brief Warming up options + * \ingroup c_settings + * \anchor warmup_flags + * \see mdbx_env_warmup() */ +enum MDBX_warmup_flags_t { + /** By default \ref mdbx_env_warmup() just ask OS kernel to asynchronously + * prefetch database pages. 
 */
+  MDBX_warmup_default = 0,
+
+  /** Peeking all pages of allocated portion of the database
+   * to force ones to be loaded into memory. However, the pages are just peeked
+   * sequentially, so unused pages that are in GC will be loaded in the same
+   * way as those that contain payload. */
+  MDBX_warmup_force = 1,
+
+  /** Using system calls to peek pages instead of directly accessing ones,
+   * which at the cost of additional overhead avoids killing the current
+   * process by OOM-killer in a lack of memory condition.
+   * \note Has effect only on POSIX (non-Windows) systems in conjunction
+   * with the \ref MDBX_warmup_force option. */
+  MDBX_warmup_oomsafe = 2,
+
+  /** Try to lock database pages in memory by `mlock()` on POSIX-systems
+   * or `VirtualLock()` on Windows. Please refer to description of these
+   * functions for reasonability of such locking and the information of
+   * effects, including the system as a whole.
+   *
+   * Such locking in memory requires that the corresponding resource limits
+   * (e.g. `RLIMIT_RSS`, `RLIMIT_MEMLOCK` or process working set size)
+   * and the availability of system RAM are sufficiently high.
+   *
+   * On success, all currently allocated pages, both unused in GC and
+   * containing payload, will be locked in memory until the environment closes,
+   * or explicitly unblocked by using \ref MDBX_warmup_release, or the
+   * database geometry is changed, including its auto-shrinking. */
+  MDBX_warmup_lock = 4,
+
+  /** Alters corresponding current resource limits to be enough for locking
+   * pages by \ref MDBX_warmup_lock. However, this option should be used only
+   * in simpler applications since it takes into account only the current size
+   * of this environment disregarding all other factors. For a real-world
+   * database application you will need full-fledged management of resources
+   * and their limits with respective engineering. */
+  MDBX_warmup_touchlimit = 8,
+
+  /** Release the lock that was performed before by \ref MDBX_warmup_lock.
 */
+  MDBX_warmup_release = 16,
+};
+#ifndef __cplusplus
+typedef enum MDBX_warmup_flags_t MDBX_warmup_flags_t;
+#else
+DEFINE_ENUM_FLAG_OPERATORS(MDBX_warmup_flags_t)
+#endif
+
+/** \brief Warms up the database by loading pages into memory, optionally
+ * locking them. \ingroup c_settings
+ *
+ * Depending on the specified flags, notifies OS kernel about following access,
+ * force loads the database pages, including locking them in memory, or
+ * releases such a lock. However, the function does not analyze the b-tree nor
+ * the GC. Therefore unused pages that are in GC are handled (i.e. will be
+ * loaded) in the same way as those that contain payload.
+ *
+ * At least one of `env` or `txn` argument must be non-null.
+ *
+ * \param [in] env An environment handle returned
+ *                 by \ref mdbx_env_create().
+ * \param [in] txn A transaction handle returned
+ *                 by \ref mdbx_txn_begin().
+ * \param [in] flags The \ref warmup_flags, bitwise OR'ed together.
+ *
+ * \param [in] timeout_seconds_16dot16 Optional timeout which is checked only
+ *                         during explicit peeking of database pages
+ *                         to load them if the \ref MDBX_warmup_force
+ *                         option was specified.
+ *
+ * \returns A non-zero error value on failure and 0 on success.
+ * Some possible errors are:
+ *
+ * \retval MDBX_ENOSYS The system does not support requested
+ *                     operation(s).
+ *
+ * \retval MDBX_RESULT_TRUE The specified timeout was reached while loading
+ *                          data into memory. */
+LIBMDBX_API int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn,
+                                MDBX_warmup_flags_t flags,
+                                unsigned timeout_seconds_16dot16);
+
 /** \brief Set environment flags.
  * \ingroup c_settings
  *
@@ -2804,6 +3014,8 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags);
 /** \brief Return the path that was used in mdbx_env_open().
  * \ingroup c_statinfo
  *
+ * \note On Windows the \ref mdbx_env_get_pathW() is recommended to use.
+ * * \param [in] env An environment handle returned by \ref mdbx_env_create() * \param [out] dest Address of a string pointer to contain the path. * This is the actual string in the environment, not a @@ -2814,6 +3026,13 @@ LIBMDBX_API int mdbx_env_get_flags(const MDBX_env *env, unsigned *flags); * \retval MDBX_EINVAL An invalid parameter was specified. */ LIBMDBX_API int mdbx_env_get_path(const MDBX_env *env, const char **dest); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_get_path() + * \note Available only on Windows. + * \see mdbx_env_get_path() */ +LIBMDBX_API int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **dest); +#endif /* Windows */ + /** \brief Return the file descriptor for the given environment. * \ingroup c_statinfo * @@ -3093,6 +3312,21 @@ mdbx_limits_keysize_max(intptr_t pagesize, MDBX_db_flags_t flags); MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t mdbx_limits_valsize_max(intptr_t pagesize, MDBX_db_flags_t flags); +/** \brief Returns maximal size of key-value pair to fit in a single page with + * the given size and database flags, or -1 if pagesize is invalid. + * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_pairsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags); + +/** \brief Returns maximal data size in bytes to fit in a leaf-page or + * single overflow/large-page with the given page size and database flags, + * or -1 if pagesize is invalid. + * \ingroup c_statinfo + * \see db_flags */ +MDBX_NOTHROW_CONST_FUNCTION LIBMDBX_API intptr_t +mdbx_limits_valsize4page_max(intptr_t pagesize, MDBX_db_flags_t flags); + /** \brief Returns maximal write transaction size (i.e. limit for summary volume * of dirty pages) in bytes for given page size, or -1 if pagesize is invalid. 
* \ingroup c_statinfo */ @@ -3248,6 +3482,32 @@ mdbx_env_get_maxvalsize_ex(const MDBX_env *env, MDBX_db_flags_t flags); MDBX_DEPRECATED MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int mdbx_env_get_maxkeysize(const MDBX_env *env); +/** \brief Returns maximal size of key-value pair to fit in a single page + * for specified database flags. + * \ingroup c_statinfo + * + * \param [in] env An environment handle returned by \ref mdbx_env_create(). + * \param [in] flags Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY + * and so on). \see db_flags + * + * \returns The maximum size of a data can write, + * or -1 if something is wrong. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_env_get_pairsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags); + +/** \brief Returns maximal data size in bytes to fit in a leaf-page or + * single overflow/large-page for specified database flags. + * \ingroup c_statinfo + * + * \param [in] env An environment handle returned by \ref mdbx_env_create(). + * \param [in] flags Database options (\ref MDBX_DUPSORT, \ref MDBX_INTEGERKEY + * and so on). \see db_flags + * + * \returns The maximum size of a data can write, + * or -1 if something is wrong. */ +MDBX_NOTHROW_PURE_FUNCTION LIBMDBX_API int +mdbx_env_get_valsize4page_max(const MDBX_env *env, MDBX_db_flags_t flags); + /** \brief Sets application information (a context pointer) associated with * the environment. * \see mdbx_env_get_userctx() @@ -3526,8 +3786,8 @@ struct MDBX_commit_latency { /** \brief Duration of preparation (commit child transactions, update * sub-databases records and cursors destroying). */ uint32_t preparation; - /** \brief Duration of GC/freeDB handling & updation. */ - uint32_t gc; + /** \brief Duration of GC update by wall clock. */ + uint32_t gc_wallclock; /** \brief Duration of internal audit if enabled. 
*/ uint32_t audit; /** \brief Duration of writing dirty/modified data pages to a filesystem, @@ -3540,6 +3800,74 @@ struct MDBX_commit_latency { uint32_t ending; /** \brief The total duration of a commit. */ uint32_t whole; + /** \brief User-mode CPU time spent on GC update. */ + uint32_t gc_cputime; + + /** \brief Информация для профилирования работы GC. + * \note Статистика является общей для всех процессов работающих с одним + * файлом БД и хранится в LCK-файле. Данные аккумулируются при фиксации всех + * транзакций, но только в сборках libmdbx c установленной опцией + * \ref MDBX_ENABLE_PROFGC. Собранная статистика возвращаются любому процессу + * при использовании \ref mdbx_txn_commit_ex() и одновременно обнуляется + * при завершении транзакций верхнего уровня (не вложенных). */ + struct { + /** \brief Количество итераций обновления GC, + * больше 1 если были повторы/перезапуски. */ + uint32_t wloops; + /** \brief Количество итераций слияния записей GC. */ + uint32_t coalescences; + /** \brief Количество уничтожений предыдущих надежных/устойчивых + * точек фиксации при работе в режиме \ref MDBX_UTTERLY_NOSYNC. */ + uint32_t wipes; + /** \brief Количество принудительных фиксаций на диск + * во избежания приращения БД при работе вне режима + * \ref MDBX_UTTERLY_NOSYNC. */ + uint32_t flushes; + /** \brief Количество обращений к механизму Handle-Slow-Readers + * во избежания приращения БД. + * \see MDBX_hsr_func */ + uint32_t kicks; + + /** \brief Счетчик выполнения по медленному пути (slow path execution count) + * GC ради данных пользователя. */ + uint32_t work_counter; + /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри + * GC ради данных пользователя. */ + uint32_t work_rtime_monotonic; + /** \brief Время ЦПУ в режиме пользователе затраченное + * на подготовку страниц извлекаемых из GC для данных пользователя, + * включая подкачку с диска. 
*/ + uint32_t work_xtime_cpu; + /** \brief Количество итераций поиска внутри GC при выделении страниц + * ради данных пользователя. */ + uint32_t work_rsteps; + /** \brief Количество запросов на выделение последовательностей страниц + * ради данных пользователя. */ + uint32_t work_xpages; + /** \brief Количество страничных промахов (page faults) внутри GC + * при выделении и подготовки страниц для данных пользователя. */ + uint32_t work_majflt; + + /** \brief Счетчик выполнения по медленному пути (slow path execution count) + * GC для целей поддержки и обновления самой GC. */ + uint32_t self_counter; + /** \brief Время "по настенным часам" затраченное на чтение и поиск внутри + * GC для целей поддержки и обновления самой GC. */ + uint32_t self_rtime_monotonic; + /** \brief Время ЦПУ в режиме пользователе затраченное на подготовку + * страниц извлекаемых из GC для целей поддержки и обновления самой GC, + * включая подкачку с диска. */ + uint32_t self_xtime_cpu; + /** \brief Количество итераций поиска внутри GC при выделении страниц + * для целей поддержки и обновления самой GC. */ + uint32_t self_rsteps; + /** \brief Количество запросов на выделение последовательностей страниц + * для самой GC. */ + uint32_t self_xpages; + /** \brief Количество страничных промахов (page faults) внутри GC + * при выделении и подготовки страниц для самой GC. */ + uint32_t self_majflt; + } gc_prof; }; #ifndef __cplusplus /** \ingroup c_statinfo */ @@ -3862,6 +4190,8 @@ typedef int(MDBX_cmp_func)(const MDBX_val *a, * by current thread. 
*/ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi); +LIBMDBX_API int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi); /** \deprecated Please * \ref avoid_custom_comparators "avoid using custom comparators" and use @@ -3881,6 +4211,9 @@ LIBMDBX_API int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_DEPRECATED LIBMDBX_API int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); +MDBX_DEPRECATED LIBMDBX_API int +mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp); /** \defgroup value2key Value-to-Key functions * \brief Value-to-Key functions to @@ -5083,11 +5416,12 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * this value into account to evaluate the impact that * a long-running transaction has. * \param [in] retry A retry number starting from 0. - * If callback has returned 0 at least once, then at end - * of current handling loop the callback function will be - * called additionally with negative value to notify about - * the end of loop. The callback function can use this value - * to implement timeout logic while waiting for readers. + * If callback has returned 0 at least once, then at end of + * current handling loop the callback function will be + * called additionally with negative `retry` value to notify + * about the end of loop. The callback function can use this + * fact to implement timeout reset logic while waiting for + * a readers. * * \returns The RETURN CODE determines the further actions libmdbx and must * match the action which was executed by the callback: @@ -5110,7 +5444,7 @@ LIBMDBX_API int mdbx_thread_unregister(const MDBX_env *env); * \retval 1 Transaction aborted asynchronous and reader slot * should be cleared immediately, i.e. 
read transaction * will not continue but \ref mdbx_txn_abort() - * or \ref mdbx_txn_reset() will be called later. + * nor \ref mdbx_txn_reset() will be called later. * * \retval 2 or great The reader process was terminated or killed, * and libmdbx should entirely reset reader registration. @@ -5177,18 +5511,20 @@ typedef enum MDBX_page_type_t MDBX_page_type_t; #endif /** \brief Pseudo-name for MainDB */ -#define MDBX_PGWALK_MAIN ((const char *)((ptrdiff_t)0)) +#define MDBX_PGWALK_MAIN ((void *)((ptrdiff_t)0)) /** \brief Pseudo-name for GarbageCollectorDB */ -#define MDBX_PGWALK_GC ((const char *)((ptrdiff_t)-1)) +#define MDBX_PGWALK_GC ((void *)((ptrdiff_t)-1)) /** \brief Pseudo-name for MetaPages */ -#define MDBX_PGWALK_META ((const char *)((ptrdiff_t)-2)) +#define MDBX_PGWALK_META ((void *)((ptrdiff_t)-2)) /** \brief Callback function for traverse the b-tree. \see mdbx_env_pgwalk() */ -typedef int MDBX_pgvisitor_func( - const uint64_t pgno, const unsigned number, void *const ctx, const int deep, - const char *const dbi, const size_t page_size, const MDBX_page_type_t type, - const MDBX_error_t err, const size_t nentries, const size_t payload_bytes, - const size_t header_bytes, const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; +typedef int +MDBX_pgvisitor_func(const uint64_t pgno, const unsigned number, void *const ctx, + const int deep, const MDBX_val *dbi_name, + const size_t page_size, const MDBX_page_type_t type, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) MDBX_CXX17_NOEXCEPT; /** \brief B-tree traversal function. */ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, @@ -5199,11 +5535,24 @@ LIBMDBX_API int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, * * This function mostly of internal API for `mdbx_chk` utility and subject to * change at any time. Do not use this function to avoid shooting your own - * leg(s). */ + * leg(s). 
+ * + * \note On Windows the \ref mdbx_env_open_for_recoveryW() is recommended + * to use. */ LIBMDBX_API int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable); +#if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) +/** \copydoc mdbx_env_open_for_recovery() + * \note Available only on Windows. + * \see mdbx_env_open_for_recovery() */ +LIBMDBX_API int mdbx_env_open_for_recoveryW(MDBX_env *env, + const wchar_t *pathname, + unsigned target_meta, + bool writeable); +#endif /* Windows */ + /** \brief Turn database to the specified meta-page. * * This function mostly of internal API for `mdbx_chk` utility and subject to @@ -5213,230 +5562,8 @@ LIBMDBX_API int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target_meta); /** end of btree_traversal @} */ -/**** Attribute support functions for Nexenta (scheduled for removal) - * *****************************************************************/ -#if defined(MDBX_NEXENTA_ATTRS) || defined(DOXYGEN) -/** \defgroup nexenta Attribute support functions for Nexenta - * \ingroup c_crud - * @{ */ -typedef uint_fast64_t mdbx_attr_t; - -/** Store by cursor with attribute. - * - * This function stores key/data pairs into the database. The cursor is - * positioned at the new item, or on failure usually near it. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open() - * \param [in] key The key operated on. - * \param [in] data The data operated on. - * \param [in] attr The attribute. - * \param [in] flags Options for this operation. This parameter must be set - * to 0 or one of the values described here: - * - \ref MDBX_CURRENT - * Replace the item at the current cursor position. The key parameter - * must still be provided, and must match it, otherwise the function - * return \ref MDBX_EKEYMISMATCH. 
- * - * - \ref MDBX_APPEND - * Append the given key/data pair to the end of the database. No key - * comparisons are performed. This option allows fast bulk loading when - * keys are already known to be in the correct order. Loading unsorted - * keys with this flag will cause a \ref MDBX_KEYEXIST error. - * - * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_EKEYMISMATCH - * \retval MDBX_MAP_FULL The database is full, see \ref mdbx_env_set_mapsize(). - * \retval MDBX_TXN_FULL The transaction has too many dirty pages. - * \retval MDBX_EACCES An attempt was made to write in a read-only - * transaction. - * \retval MDBX_EINVAL an invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr, - MDBX_put_flags_t flags); - -/** Store items and attributes into a database. - * - * This function stores key/data pairs in the database. The default behavior - * is to enter the new key/data pair, replacing any previously existing key - * if duplicates are disallowed. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to store in the database. - * \param [in] attr The attribute to store in the database. - * \param [in,out] data The data to store. - * \param [in] flags Special options for this operation. This parameter - * must be set to 0 or by bitwise OR'ing together one or - * more of the values described here: - * - \ref MDBX_NOOVERWRITE - * Enter the new key/data pair only if the key does not already appear - * in the database. 
The function will return \ref MDBX_KEYEXIST if the key - * already appears in the database. The data parameter will be set to - * point to the existing item. - * - * - \ref MDBX_CURRENT - * Update an single existing entry, but not add new ones. The function - * will return \ref MDBX_NOTFOUND if the given key not exist in the - * database. Or the \ref MDBX_EMULTIVAL in case duplicates for the given - * key. - * - * - \ref MDBX_APPEND - * Append the given key/data pair to the end of the database. This option - * allows fast bulk loading when keys are already known to be in the - * correct order. Loading unsorted keys with this flag will cause - * a \ref MDBX_EKEYMISMATCH error. - * - * \see \ref c_crud_hints "Quick reference for Insert/Update/Delete operations" - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_KEYEXIST - * \retval MDBX_MAP_FULL The database is full, see \ref mdbx_env_set_mapsize(). - * \retval MDBX_TXN_FULL The transaction has too many dirty pages. - * \retval MDBX_EACCES An attempt was made to write - * in a read-only transaction. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr, - MDBX_put_flags_t flags); - -/** Set items attribute from a database. - * - * This function stores key/data pairs attribute to the database. - * - * \note Internally based on \ref MDBX_RESERVE feature, - * therefore doesn't support \ref MDBX_DUPSORT. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to search for in the database. - * \param [in] data The data to be stored or NULL to save previous value. - * \param [in] attr The attribute to be stored. 
- * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND The key-value pair was not in the database. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t attr); - -/** Get items attribute from a database cursor. - * - * This function retrieves key/data pairs from the database. The address and - * length of the key are returned in the object to which key refers (except - * for the case of the \ref MDBX_SET option, in which the key object is - * unchanged), and the address and length of the data are returned in the object - * to which data refers. - * \see mdbx_get() - * - * \param [in] cursor A cursor handle returned by \ref mdbx_cursor_open(). - * \param [in,out] key The key for a retrieved item. - * \param [in,out] data The data of a retrieved item. - * \param [out] pattr The pointer to retrieve attribute. - * \param [in] op A cursor operation MDBX_cursor_op. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND No matching key found. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_cursor_get_attr(MDBX_cursor *cursor, MDBX_val *key, - MDBX_val *data, mdbx_attr_t *pattr, - MDBX_cursor_op op); - -/** Get items attribute from a database. - * - * This function retrieves key/data pairs from the database. The address - * and length of the data associated with the specified key are returned - * in the structure to which data refers. - * If the database supports duplicate keys (see \ref MDBX_DUPSORT) then the - * first data item for the key will be returned. Retrieval of other - * items requires the use of \ref mdbx_cursor_get(). - * - * \note The memory pointed to by the returned values is owned by the - * database. 
The caller need not dispose of the memory, and may not - * modify it in any way. For values returned in a read-only transaction - * any modification attempts will cause a `SIGSEGV`. - * - * \note Values returned from the database are valid only until a - * subsequent update operation, or the end of the transaction. - * - * \param [in] txn A transaction handle returned by \ref mdbx_txn_begin(). - * \param [in] dbi A database handle returned by \ref mdbx_dbi_open(). - * \param [in] key The key to search for in the database. - * \param [in,out] data The data corresponding to the key. - * \param [out] pattr The pointer to retrieve attribute. - * - * \returns A non-zero error value on failure and 0 on success, - * some possible errors are: - * \retval MDBX_NOTFOUND The key was not in the database. - * \retval MDBX_EINVAL An invalid parameter was specified. */ -LIBMDBX_API int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, - MDBX_val *data, mdbx_attr_t *pattr); -/** end of nexenta @} */ -#endif /* MDBX_NEXENTA_ATTRS */ - /** end of c_api @} */ -/******************************************************************************* - * Workaround for mmaped-lookahead-cross-page-boundary bug - * in an obsolete versions of Elbrus's libc and kernels. 
*/ -#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ - MDBX_E2K_MLHCPB_WORKAROUND -LIBMDBX_API int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n); -LIBMDBX_API int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2); -LIBMDBX_API int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n); -LIBMDBX_API size_t mdbx_e2k_strlen_bug_workaround(const char *s); -LIBMDBX_API size_t mdbx_e2k_strnlen_bug_workaround(const char *s, - size_t maxlen); -#ifdef __cplusplus -namespace std { -inline int mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n) { - return ::mdbx_e2k_memcmp_bug_workaround(s1, s2, n); -} -inline int mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { - return ::mdbx_e2k_strcmp_bug_workaround(s1, s2); -} -inline int mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n) { - return ::mdbx_e2k_strncmp_bug_workaround(s1, s2, n); -} -inline size_t mdbx_e2k_strlen_bug_workaround(const char *s) { - return ::mdbx_e2k_strlen_bug_workaround(s); -} -inline size_t mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { - return ::mdbx_e2k_strnlen_bug_workaround(s, maxlen); -} -} // namespace std -#endif /* __cplusplus */ - -#include -#include -#undef memcmp -#define memcmp mdbx_e2k_memcmp_bug_workaround -#undef bcmp -#define bcmp mdbx_e2k_memcmp_bug_workaround -#undef strcmp -#define strcmp mdbx_e2k_strcmp_bug_workaround -#undef strncmp -#define strncmp mdbx_e2k_strncmp_bug_workaround -#undef strlen -#define strlen mdbx_e2k_strlen_bug_workaround -#undef strnlen -#define strnlen mdbx_e2k_strnlen_bug_workaround -#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ - #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/mdbx.h++ b/mdbx.h++ index f5c8d1ee..a05f1c63 100644 --- a/mdbx.h++ +++ b/mdbx.h++ @@ -1,7 +1,7 @@ /// \file mdbx.h++ /// \brief The libmdbx C++ API header file. /// -/// \author Copyright (c) 2020-2022, Leonid Yuriev . 
+/// \author Copyright (c) 2020-2023, Leonid Yuriev . /// \copyright SPDX-License-Identifier: Apache-2.0 /// /// Tested with: @@ -84,6 +84,11 @@ #include #endif +#if __cplusplus >= 201103L +#include +#include +#endif + #include "mdbx.h" #if (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L) || \ @@ -386,6 +391,11 @@ using path = ::std::wstring; using path = ::std::string; #endif /* mdbx::path */ +#if __cplusplus >= 201103L || defined(DOXYGEN) +/// \brief Duration in 1/65536 units of second. +using duration = ::std::chrono::duration>; +#endif /* Duration for C++11 */ + /// \defgroup cxx_exceptions exceptions and errors /// @{ @@ -3189,6 +3199,7 @@ public: /// \brief Returns the minimal values size in bytes for specified values /// mode. static inline size_t value_min(value_mode) noexcept; + /// \brief Returns the maximal value size in bytes for specified page size /// and database flags. static inline size_t value_max(intptr_t pagesize, MDBX_db_flags_t flags); @@ -3201,6 +3212,35 @@ public: /// \brief Returns the maximal value size in bytes for specified page size /// and values mode. static inline size_t value_max(const env &, value_mode); + + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for specified size and database flags. + static inline size_t pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags); + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for specified page size and values mode. + static inline size_t pairsize4page_max(intptr_t pagesize, value_mode); + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for given environment and database flags. + static inline size_t pairsize4page_max(const env &, MDBX_db_flags_t flags); + /// \brief Returns maximal size of key-value pair to fit in a single page + /// for specified page size and values mode. 
+ static inline size_t pairsize4page_max(const env &, value_mode); + + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for specified size and database flags. + static inline size_t valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags); + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for specified page size and values mode. + static inline size_t valsize4page_max(intptr_t pagesize, value_mode); + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for given environment and database flags. + static inline size_t valsize4page_max(const env &, MDBX_db_flags_t flags); + /// \brief Returns maximal data size in bytes to fit in a leaf-page or + /// single overflow/large-page for specified page size and values mode. + static inline size_t valsize4page_max(const env &, value_mode); + /// \brief Returns the maximal write transaction size (i.e. limit for /// summary volume of dirty pages) in bytes for specified page size. 
static inline size_t transaction_size_max(intptr_t pagesize); @@ -3237,6 +3277,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env ©(const ::std::wstring &destination, bool compactify, bool force_dynamic_size = false); + env ©(const wchar_t *destination, bool compactify, + bool force_dynamic_size = false); #endif /* Windows */ env ©(const ::std::string &destination, bool compactify, bool force_dynamic_size = false); @@ -3272,6 +3314,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) static bool remove(const ::std::wstring &pathname, const remove_mode mode = just_remove); + static bool remove(const wchar_t *pathname, + const remove_mode mode = just_remove); #endif /* Windows */ static bool remove(const ::std::string &pathname, const remove_mode mode = just_remove); @@ -3312,9 +3356,11 @@ public: /// \brief Returns the maximum number of threads/reader slots for the /// environment. + /// \see extra_runtime_option::max_readers inline unsigned max_readers() const; /// \brief Returns the maximum number of named databases for the environment. + /// \see extra_runtime_option::max_maps inline unsigned max_maps() const; /// \brief Returns the application context associated with the environment. @@ -3326,59 +3372,117 @@ public: /// \brief Sets threshold to force flush the data buffers to disk, for /// non-sync durability modes. /// - /// The threshold value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. The default is 0, than mean no any threshold - /// checked, and no additional flush will be made. 
+ /// \details The threshold value affects all processes which operates with + /// given environment until the last process close environment or a new value + /// will be settled. Data is always written to disk when \ref + /// txn_managed::commit() is called, but the operating system may keep it + /// buffered. MDBX always flushes the OS buffers upon commit as well, unless + /// the environment was opened with \ref whole_fragile, \ref lazy_weak_tail or + /// in part \ref half_synchronous_weak_last. /// + /// The default is 0, than mean no any threshold checked, and no additional + /// flush will be made. + /// \see extra_runtime_option::sync_bytes inline env &set_sync_threshold(size_t bytes); + /// \brief Gets threshold used to force flush the data buffers to disk, for + /// non-sync durability modes. + /// + /// \copydetails set_sync_threshold() + /// \see extra_runtime_option::sync_bytes + inline size_t sync_threshold() const; + +#if __cplusplus >= 201103L || defined(DOXYGEN) /// \brief Sets relative period since the last unsteady commit to force flush /// the data buffers to disk, for non-sync durability modes. /// - /// The relative period value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. Settled period don't checked asynchronously, - /// but only by the \ref txn_managed::commit() and \ref env::sync_to_disk() - /// functions. Therefore, in cases where transactions are committed - /// infrequently and/or irregularly, polling by \ref env::poll_sync_to_disk() - /// may be a reasonable solution to timeout enforcement. 
The default is 0, - /// than mean no any timeout checked, and no additional flush will be made. + /// \details The relative period value affects all processes which operates + /// with given environment until the last process close environment or a new + /// value will be settled. Data is always written to disk when \ref + /// txn_managed::commit() is called, but the operating system may keep it + /// buffered. MDBX always flushes the OS buffers upon commit as well, unless + /// the environment was opened with \ref whole_fragile, \ref lazy_weak_tail or + /// in part \ref half_synchronous_weak_last. Settled period don't checked + /// asynchronously, but only by the \ref txn_managed::commit() and \ref + /// env::sync_to_disk() functions. Therefore, in cases where transactions are + /// committed infrequently and/or irregularly, polling by \ref + /// env::poll_sync_to_disk() may be a reasonable solution to timeout + /// enforcement. /// + /// The default is 0, than mean no any timeout checked, and no additional + /// flush will be made. + /// \see extra_runtime_option::sync_period + inline env &set_sync_period(const duration &period); + + /// \brief Gets relative period since the last unsteady commit that used to + /// force flush the data buffers to disk, for non-sync durability modes. + /// \copydetails set_sync_period(const duration&) + /// \see set_sync_period(const duration&) + /// \see extra_runtime_option::sync_period + inline duration sync_period() const; +#endif + + /// \copydoc set_sync_period(const duration&) /// \param [in] seconds_16dot16 The period in 1/65536 of second when a /// synchronous flush would be made since the last unsteady commit. - inline env &set_sync_period(unsigned seconds_16dot16); + inline env &set_sync_period__seconds_16dot16(unsigned seconds_16dot16); - /// \brief Sets relative period since the last unsteady commit to force flush - /// the data buffers to disk, for non-sync durability modes. 
- /// - /// The relative period value affects all processes which operates with given - /// environment until the last process close environment or a new value will - /// be settled. - /// Data is always written to disk when \ref txn_managed::commit() is called, - /// but the operating system may keep it buffered. MDBX always flushes the OS - /// buffers upon commit as well, unless the environment was opened with \ref - /// whole_fragile, \ref lazy_weak_tail or in part \ref - /// half_synchronous_weak_last. Settled period don't checked asynchronously, - /// but only by the \ref txn_managed::commit() and \ref env::sync_to_disk() - /// functions. Therefore, in cases where transactions are committed - /// infrequently and/or irregularly, polling by \ref env::poll_sync_to_disk() - /// may be a reasonable solution to timeout enforcement. The default is 0, - /// than mean no any timeout checked, and no additional flush will be made. - /// + /// \copydoc sync_period() + /// \see sync_period__seconds_16dot16(unsigned) + inline unsigned sync_period__seconds_16dot16() const; + + /// \copydoc set_sync_period(const duration&) /// \param [in] seconds The period in second when a synchronous flush would /// be made since the last unsteady commit. 
- inline env &set_sync_period(double seconds); + inline env &set_sync_period__seconds_double(double seconds); + + /// \copydoc sync_period() + /// \see set_sync_period__seconds_double(double) + inline double sync_period__seconds_double() const; + + /// \copydoc MDBX_option_t + enum class extra_runtime_option { + /// \copydoc MDBX_opt_max_db + /// \see max_maps() \see env::operate_parameters::max_maps + max_maps = MDBX_opt_max_db, + /// \copydoc MDBX_opt_max_readers + /// \see max_readers() \see env::operate_parameters::max_readers + max_readers = MDBX_opt_max_readers, + /// \copydoc MDBX_opt_sync_bytes + /// \see sync_threshold() \see set_sync_threshold() + sync_bytes = MDBX_opt_sync_bytes, + /// \copydoc MDBX_opt_sync_period + /// \see sync_period() \see set_sync_period() + sync_period = MDBX_opt_sync_period, + /// \copydoc MDBX_opt_rp_augment_limit + rp_augment_limit = MDBX_opt_rp_augment_limit, + /// \copydoc MDBX_opt_loose_limit + loose_limit = MDBX_opt_loose_limit, + /// \copydoc MDBX_opt_dp_reserve_limit + dp_reserve_limit = MDBX_opt_dp_reserve_limit, + /// \copydoc MDBX_opt_txn_dp_limit + dp_limit = MDBX_opt_txn_dp_limit, + /// \copydoc MDBX_opt_txn_dp_initial + dp_initial = MDBX_opt_txn_dp_initial, + /// \copydoc MDBX_opt_spill_max_denominator + spill_max_denominator = MDBX_opt_spill_max_denominator, + /// \copydoc MDBX_opt_spill_min_denominator + spill_min_denominator = MDBX_opt_spill_min_denominator, + /// \copydoc MDBX_opt_spill_parent4child_denominator + spill_parent4child_denominator = MDBX_opt_spill_parent4child_denominator, + /// \copydoc MDBX_opt_merge_threshold_16dot16_percent + merge_threshold_16dot16_percent = MDBX_opt_merge_threshold_16dot16_percent, + /// \copydoc MDBX_opt_writethrough_threshold + writethrough_threshold = MDBX_opt_writethrough_threshold, + /// \copydoc MDBX_opt_prefault_write_enable + prefault_write_enable = MDBX_opt_prefault_write_enable, + }; + + /// \copybrief mdbx_env_set_option() + inline env 
&set_extra_option(extra_runtime_option option, uint64_t value); + + /// \copybrief mdbx_env_get_option() + inline uint64_t extra_option(extra_runtime_option option) const; /// \brief Alter environment flags. inline env &alter_flags(MDBX_env_flags_t flags, bool on_off); @@ -3519,6 +3623,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env_managed(const ::std::wstring &pathname, const operate_parameters &, bool accede = true); + explicit env_managed(const wchar_t *pathname, const operate_parameters &, + bool accede = true); #endif /* Windows */ env_managed(const ::std::string &pathname, const operate_parameters &, bool accede = true); @@ -3543,6 +3649,8 @@ public: #if defined(_WIN32) || defined(_WIN64) || defined(DOXYGEN) env_managed(const ::std::wstring &pathname, const create_parameters &, const operate_parameters &, bool accede = true); + explicit env_managed(const wchar_t *pathname, const create_parameters &, + const operate_parameters &, bool accede = true); #endif /* Windows */ env_managed(const ::std::string &pathname, const create_parameters &, const operate_parameters &, bool accede = true); @@ -3879,12 +3987,31 @@ public: //---------------------------------------------------------------------------- - /// \brief Abandon all the operations of the transaction instead of saving - /// them. + /// \brief Abandon all the operations of the transaction + /// instead of saving ones. void abort(); /// \brief Commit all the operations of a transaction into the database. void commit(); + + using commit_latency = MDBX_commit_latency; + + /// \brief Commit all the operations of a transaction into the database + /// and collect latency information. + void commit(commit_latency *); + + /// \brief Commit all the operations of a transaction into the database + /// and collect latency information. 
+ void commit(commit_latency &latency) { return commit(&latency); } + + /// \brief Commit all the operations of a transaction into the database + /// and return latency information. + /// \returns latency information of commit stages. + commit_latency commit_get_latency() { + commit_latency result; + commit(&result); + return result; + } }; /// \brief Unmanaged cursor. @@ -4867,6 +4994,56 @@ inline size_t env::limits::value_max(const env &env, value_mode mode) { return value_max(env, MDBX_db_flags_t(mode)); } +inline size_t env::limits::pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_limits_pairsize4page_max(pagesize, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::pairsize4page_max(intptr_t pagesize, + value_mode mode) { + return pairsize4page_max(pagesize, MDBX_db_flags_t(mode)); +} + +inline size_t env::limits::pairsize4page_max(const env &env, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_env_get_pairsize4page_max(env, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::pairsize4page_max(const env &env, value_mode mode) { + return pairsize4page_max(env, MDBX_db_flags_t(mode)); +} + +inline size_t env::limits::valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_limits_valsize4page_max(pagesize, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::valsize4page_max(intptr_t pagesize, + value_mode mode) { + return valsize4page_max(pagesize, MDBX_db_flags_t(mode)); +} + +inline size_t env::limits::valsize4page_max(const env &env, + MDBX_db_flags_t flags) { + const intptr_t result = mdbx_env_get_valsize4page_max(env, flags); + if (result < 0) + MDBX_CXX20_UNLIKELY 
error::throw_exception(MDBX_EINVAL); + return static_cast(result); +} + +inline size_t env::limits::valsize4page_max(const env &env, value_mode mode) { + return valsize4page_max(env, MDBX_db_flags_t(mode)); +} + inline size_t env::limits::transaction_size_max(intptr_t pagesize) { const intptr_t result = mdbx_limits_txnsize_max(pagesize); if (result < 0) @@ -4961,13 +5138,53 @@ inline env &env::set_sync_threshold(size_t bytes) { return *this; } -inline env &env::set_sync_period(unsigned seconds_16dot16) { +inline size_t env::sync_threshold() const { + size_t bytes; + error::success_or_throw(::mdbx_env_get_syncbytes(handle_, &bytes)); + return bytes; +} + +inline env &env::set_sync_period__seconds_16dot16(unsigned seconds_16dot16) { error::success_or_throw(::mdbx_env_set_syncperiod(handle_, seconds_16dot16)); return *this; } -inline env &env::set_sync_period(double seconds) { - return set_sync_period(unsigned(seconds * 65536)); +inline unsigned env::sync_period__seconds_16dot16() const { + unsigned seconds_16dot16; + error::success_or_throw(::mdbx_env_get_syncperiod(handle_, &seconds_16dot16)); + return seconds_16dot16; +} + +inline env &env::set_sync_period__seconds_double(double seconds) { + return set_sync_period__seconds_16dot16(unsigned(seconds * 65536)); +} + +inline double env::sync_period__seconds_double() const { + return sync_period__seconds_16dot16() / 65536.0; +} + +#if __cplusplus >= 201103L +inline env &env::set_sync_period(const duration &period) { + return set_sync_period__seconds_16dot16(period.count()); +} + +inline duration env::sync_period() const { + return duration(sync_period__seconds_16dot16()); +} +#endif + +inline env &env::set_extra_option(enum env::extra_runtime_option option, + uint64_t value) { + error::success_or_throw( + ::mdbx_env_set_option(handle_, ::MDBX_option_t(option), value)); + return *this; +} + +inline uint64_t env::extra_option(enum env::extra_runtime_option option) const { + uint64_t value; + error::success_or_throw( + 
::mdbx_env_get_option(handle_, ::MDBX_option_t(option), &value)); + return value; } inline env &env::alter_flags(MDBX_env_flags_t flags, bool on_off) { diff --git a/src/alloy.c b/src/alloy.c index 1e770f23..7d0cf636 100644 --- a/src/alloy.c +++ b/src/alloy.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * diff --git a/src/base.h b/src/base.h index ccd1d7dd..b8a243e8 100644 --- a/src/base.h +++ b/src/base.h @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -63,7 +63,7 @@ #define SSIZE_MAX INTPTR_MAX #endif -#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul +#if UINTPTR_MAX > 0xffffFFFFul || ULONG_MAX > 0xffffFFFFul || defined(_WIN64) #define MDBX_WORDBITS 64 #else #define MDBX_WORDBITS 32 @@ -259,8 +259,10 @@ __extern_C key_t ftok(const char *, int); #include #include #include +#include #include #include +#include #include #endif /*---------------------------------------------------------------------*/ @@ -302,8 +304,9 @@ __extern_C key_t ftok(const char *, int); /* LY: define neutral __ia32__ for x86 and x86-64 */ #define __ia32__ 1 #endif /* __ia32__ */ -#if !defined(__amd64__) && (defined(__x86_64) || defined(__x86_64__) || \ - defined(__amd64) || defined(_M_X64)) +#if !defined(__amd64__) && \ + (defined(__x86_64) || defined(__x86_64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64)) /* LY: define trusty __amd64__ for all AMD64/x86-64 arch */ #define __amd64__ 1 #endif /* __amd64__ */ @@ -371,18 +374,50 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __BYTE_ORDER__ || __ORDER_LITTLE_ENDIAN__ || __ORDER_BIG_ENDIAN__ */ +/*----------------------------------------------------------------------------*/ +/* Availability of CMOV or equivalent */ + +#ifndef MDBX_HAVE_CMOV +#if 
defined(__e2k__) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb2__) || defined(__thumb2) +#define MDBX_HAVE_CMOV 1 +#elif defined(__thumb__) || defined(__thumb) || defined(__TARGET_ARCH_THUMB) +#define MDBX_HAVE_CMOV 0 +#elif defined(_M_ARM) || defined(_M_ARM64) || defined(__aarch64__) || \ + defined(__aarch64) || defined(__arm__) || defined(__arm) || \ + defined(__CC_ARM) +#define MDBX_HAVE_CMOV 1 +#elif (defined(__riscv__) || defined(__riscv64)) && \ + (defined(__riscv_b) || defined(__riscv_bitmanip)) +#define MDBX_HAVE_CMOV 1 +#elif defined(i686) || defined(__i686) || defined(__i686__) || \ + (defined(_M_IX86) && _M_IX86 > 600) || defined(__x86_64) || \ + defined(__x86_64__) || defined(__amd64__) || defined(__amd64) || \ + defined(_M_X64) || defined(_M_AMD64) +#define MDBX_HAVE_CMOV 1 +#else +#define MDBX_HAVE_CMOV 0 +#endif +#endif /* MDBX_HAVE_CMOV */ + /*----------------------------------------------------------------------------*/ /* Compiler's includes for builtins/intrinsics */ #if defined(_MSC_VER) || defined(__INTEL_COMPILER) #include #elif __GNUC_PREREQ(4, 4) || defined(__clang__) -#if defined(__ia32__) || defined(__e2k__) +#if defined(__e2k__) +#include #include -#endif /* __ia32__ */ +#endif /* __e2k__ */ #if defined(__ia32__) #include +#include #endif /* __ia32__ */ +#ifdef __ARM_NEON +#include +#endif #elif defined(__SUNPRO_C) || defined(__sun) || defined(sun) #include #elif (defined(_HPUX_SOURCE) || defined(__hpux) || defined(__HP_aCC)) && \ @@ -603,6 +638,16 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* __anonymous_struct_extension__ */ +#ifndef expect_with_probability +#if defined(__builtin_expect_with_probability) || \ + __has_builtin(__builtin_expect_with_probability) || __GNUC_PREREQ(9, 0) +#define expect_with_probability(expr, value, prob) \ + __builtin_expect_with_probability(expr, value, prob) +#else +#define expect_with_probability(expr, value, prob) (expr) +#endif +#endif /* expect_with_probability */ + #ifndef 
MDBX_WEAK_IMPORT_ATTRIBUTE #ifdef WEAK_IMPORT_ATTRIBUTE #define MDBX_WEAK_IMPORT_ATTRIBUTE WEAK_IMPORT_ATTRIBUTE @@ -616,6 +661,28 @@ __extern_C key_t ftok(const char *, int); #endif #endif /* MDBX_WEAK_IMPORT_ATTRIBUTE */ +#ifndef MDBX_GOOFY_MSVC_STATIC_ANALYZER +#ifdef _PREFAST_ +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 1 +#else +#define MDBX_GOOFY_MSVC_STATIC_ANALYZER 0 +#endif +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + +#if MDBX_GOOFY_MSVC_STATIC_ANALYZER || (defined(_MSC_VER) && _MSC_VER > 1919) +#define MDBX_ANALYSIS_ASSUME(expr) __analysis_assume(expr) +#ifdef _PREFAST_ +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(prefast(suppress : warn_id)) +#else +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) \ + __pragma(warning(suppress : warn_id)) +#endif +#else +#define MDBX_ANALYSIS_ASSUME(expr) assert(expr) +#define MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(warn_id) +#endif /* MDBX_GOOFY_MSVC_STATIC_ANALYZER */ + /*----------------------------------------------------------------------------*/ #if defined(MDBX_USE_VALGRIND) diff --git a/src/bits.md b/src/bits.md index 99cef8e8..e8708f02 100644 --- a/src/bits.md +++ b/src/bits.md @@ -1,11 +1,11 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NODE | PAGE | MRESIZE | --|---------|-----------|--------------|----------|-----------|------------|---------|----------|---------| -0 |0000 0001|ALLOC_CACHE|TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | | -1 |0000 0002|ALLOC_GC |TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | | -2 |0000 0004|ALLOC_NEW |TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | -3 |0000 0008|ALLOC_SLOT |TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | -4 |0000 0010|ALLOC_FAKE |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | -5 |0000 0020| | |INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | +0 |0000 0001|ALLOC_RSRV |TXN_FINISHED | | |DBI_DIRTY |F_BIGDATA|P_BRANCH | | +1 |0000 0002|ALLOC_UNIMP|TXN_ERROR |REVERSEKEY|F_SUBDATA |DBI_STALE |F_SUBDATA|P_LEAF | 
| +2 |0000 0004|ALLOC_COLSC|TXN_DIRTY |DUPSORT | |DBI_FRESH |F_DUPDATA|P_OVERFLOW| | +3 |0000 0008|ALLOC_SSCAN|TXN_SPILLS |INTEGERKEY| |DBI_CREAT | |P_META | | +4 |0000 0010|ALLOC_FIFO |TXN_HAS_CHILD |DUPFIXED |NOOVERWRITE|DBI_VALID | |P_BAD | | +5 |0000 0020| |TXN_DRAINED_GC|INTEGERDUP|NODUPDATA |DBI_USRVALID| |P_LEAF2 | | 6 |0000 0040| | |REVERSEDUP|CURRENT |DBI_DUPDATA | |P_SUBP | | 7 |0000 0080| | | |ALLDUPS |DBI_AUDITED | | | | 8 |0000 0100| _MAY_MOVE | | | | | | | <= | @@ -13,9 +13,9 @@ N | MASK | ENV | TXN | DB | PUT | DBI | NOD 10|0000 0400| | | | | | | | | 11|0000 0800| | | | | | | | | 12|0000 1000| | | | | | | | | -13|0000 2000| | | | | | |P_SPILLED | | +13|0000 2000|VALIDATION | | | | | |P_SPILLED | | 14|0000 4000|NOSUBDIR | | | | | |P_LOOSE | | -15|0000 8000| | |DB_VALID |NOSPILL | | |P_FROZEN | | +15|0000 8000| | |DB_VALID | | | |P_FROZEN | | 16|0001 0000|SAFE_NOSYNC|TXN_NOSYNC | |RESERVE | |RESERVE | | | 17|0002 0000|RDONLY |TXN_RDONLY | |APPEND | |APPEND | | <= | 18|0004 0000|NOMETASYNC |TXN_NOMETASYNC|CREATE |APPENDDUP | | | | | diff --git a/src/config.h.in b/src/config.h.in index 7959699a..05c561b1 100644 --- a/src/config.h.in +++ b/src/config.h.in @@ -26,7 +26,13 @@ #ifndef MDBX_TRUST_RTC_AUTO #cmakedefine01 MDBX_TRUST_RTC #endif -#cmakedefine01 MDBX_DISABLE_PAGECHECKS +#cmakedefine01 MDBX_DISABLE_VALIDATION +#cmakedefine01 MDBX_AVOID_MSYNC +#cmakedefine01 MDBX_ENABLE_REFUND +#cmakedefine01 MDBX_ENABLE_MADVISE +#cmakedefine01 MDBX_ENABLE_BIGFOOT +#cmakedefine01 MDBX_ENABLE_PGOP_STAT +#cmakedefine01 MDBX_ENABLE_PROFGC /* Windows */ #cmakedefine01 MDBX_WITHOUT_MSVC_CRT diff --git a/src/core.c b/src/core.c index c149edd3..0a2d1da3 100644 --- a/src/core.c +++ b/src/core.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev . + * Copyright 2015-2023 Leonid Yuriev . * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -40,22 +40,20 @@ /*------------------------------------------------------------------------------ * Internal inline functions */ -MDBX_NOTHROW_CONST_FUNCTION static unsigned branchless_abs(int value) { +MDBX_NOTHROW_CONST_FUNCTION static size_t branchless_abs(intptr_t value) { assert(value > INT_MIN); - const unsigned expanded_sign = - (unsigned)(value >> (sizeof(value) * CHAR_BIT - 1)); - return ((unsigned)value + expanded_sign) ^ expanded_sign; + const size_t expanded_sign = + (size_t)(value >> (sizeof(value) * CHAR_BIT - 1)); + return ((size_t)value + expanded_sign) ^ expanded_sign; } /* Pack/Unpack 16-bit values for Grow step & Shrink threshold */ -MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(unsigned m, - unsigned e) { +MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t me2v(size_t m, size_t e) { assert(m < 2048 && e < 8); return (pgno_t)(32768 + ((m + 1) << (e + 8))); } -MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, - unsigned e) { +MDBX_NOTHROW_CONST_FUNCTION static __inline uint16_t v2me(size_t v, size_t e) { assert(v > (e ? 
me2v(2047, e - 1) : 32768)); assert(v <= me2v(2047, e)); size_t m = (v - 32768 + ((size_t)1 << (e + 8)) - 1) >> (e + 8); @@ -103,9 +101,9 @@ MDBX_NOTHROW_CONST_FUNCTION static uint16_t pages2pv(size_t pages) { /*------------------------------------------------------------------------------ * Unaligned access */ -MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline unsigned -field_alignment(unsigned alignment_baseline, size_t field_offset) { - unsigned merge = alignment_baseline | (unsigned)field_offset; +MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline size_t +field_alignment(size_t alignment_baseline, size_t field_offset) { + size_t merge = alignment_baseline | (size_t)field_offset; return merge & -(int)merge; } @@ -122,7 +120,7 @@ static __always_inline void poke_u8(uint8_t *const __restrict ptr, } MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint16_t -unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { +unaligned_peek_u16(const size_t expected_alignment, const void *const ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(uint16_t)) == 0) return *(const uint16_t *)ptr; @@ -138,9 +136,9 @@ unaligned_peek_u16(const unsigned expected_alignment, const void *const ptr) { } } -static __always_inline void -unaligned_poke_u16(const unsigned expected_alignment, - void *const __restrict ptr, const uint16_t v) { +static __always_inline void unaligned_poke_u16(const size_t expected_alignment, + void *const __restrict ptr, + const uint16_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 2 || (expected_alignment % sizeof(v)) == 0) *(uint16_t *)ptr = v; @@ -155,7 +153,7 @@ unaligned_poke_u16(const unsigned expected_alignment, } MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( - const unsigned expected_alignment, const void *const __restrict ptr) { + const size_t expected_alignment, 
const void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(uint32_t)) == 0) return *(const uint32_t *)ptr; @@ -177,9 +175,9 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t unaligned_peek_u32( } } -static __always_inline void -unaligned_poke_u32(const unsigned expected_alignment, - void *const __restrict ptr, const uint32_t v) { +static __always_inline void unaligned_poke_u32(const size_t expected_alignment, + void *const __restrict ptr, + const uint32_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 4 || (expected_alignment % sizeof(v)) == 0) *(uint32_t *)ptr = v; @@ -198,7 +196,7 @@ unaligned_poke_u32(const unsigned expected_alignment, } MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( - const unsigned expected_alignment, const void *const __restrict ptr) { + const size_t expected_alignment, const void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) return *(const uint64_t *)ptr; @@ -221,29 +219,29 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t unaligned_peek_u64( } static __always_inline uint64_t -unaligned_peek_u64_volatile(const unsigned expected_alignment, - volatile const void *const __restrict ptr) { +unaligned_peek_u64_volatile(const size_t expected_alignment, + const volatile void *const __restrict ptr) { assert((uintptr_t)ptr % expected_alignment == 0); assert(expected_alignment % sizeof(uint32_t) == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(uint64_t)) == 0) - return *(volatile const uint64_t *)ptr; + return *(const volatile uint64_t *)ptr; else { #if defined(__unaligned) || defined(_M_ARM) || defined(_M_ARM64) || \ defined(_M_X64) || defined(_M_IA64) - return *(volatile const __unaligned uint64_t *)ptr; + return *(const volatile __unaligned uint64_t *)ptr; 
#else - const uint32_t lo = ((volatile const uint32_t *) + const uint32_t lo = ((const volatile uint32_t *) ptr)[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__]; - const uint32_t hi = ((volatile const uint32_t *) + const uint32_t hi = ((const volatile uint32_t *) ptr)[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__]; return lo | (uint64_t)hi << 32; #endif /* _MSC_VER || __unaligned */ } } -static __always_inline void -unaligned_poke_u64(const unsigned expected_alignment, - void *const __restrict ptr, const uint64_t v) { +static __always_inline void unaligned_poke_u64(const size_t expected_alignment, + void *const __restrict ptr, + const uint64_t v) { assert((uintptr_t)ptr % expected_alignment == 0); if (MDBX_UNALIGNED_OK >= 8 || (expected_alignment % sizeof(v)) == 0) *(uint64_t *)ptr = v; @@ -262,24 +260,24 @@ unaligned_poke_u64(const unsigned expected_alignment, } #define UNALIGNED_PEEK_8(ptr, struct, field) \ - peek_u8((const uint8_t *)(ptr) + offsetof(struct, field)) + peek_u8(ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_8(ptr, struct, field, value) \ - poke_u8((uint8_t *)(ptr) + offsetof(struct, field), value) + poke_u8(ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_16(ptr, struct, field) \ - unaligned_peek_u16(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u16(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_16(ptr, struct, field, value) \ - unaligned_poke_u16(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u16(1, ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_32(ptr, struct, field) \ - unaligned_peek_u32(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u32(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_32(ptr, struct, field, value) \ - unaligned_poke_u32(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u32(1, ptr_disp(ptr, offsetof(struct, field)), value) #define UNALIGNED_PEEK_64(ptr, struct, 
field) \ - unaligned_peek_u64(1, (const char *)(ptr) + offsetof(struct, field)) + unaligned_peek_u64(1, ptr_disp(ptr, offsetof(struct, field))) #define UNALIGNED_POKE_64(ptr, struct, field, value) \ - unaligned_poke_u64(1, (char *)(ptr) + offsetof(struct, field), value) + unaligned_poke_u64(1, ptr_disp(ptr, offsetof(struct, field)), value) /* Get the page number pointed to by a branch node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t @@ -343,13 +341,13 @@ static __always_inline void node_set_flags(MDBX_node *const __restrict node, /* Address of the key for the node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * node_key(const MDBX_node *const __restrict node) { - return (char *)node + NODESIZE; + return ptr_disp(node, NODESIZE); } /* Address of the data for a node */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * node_data(const MDBX_node *const __restrict node) { - return (char *)node_key(node) + node_ks(node); + return ptr_disp(node_key(node), node_ks(node)); } /* Size of a node in a leaf page with a given key and data. @@ -418,7 +416,7 @@ node_largedata_pgno(const MDBX_node *const __restrict node) { * and so on up to the root. 
Therefore double-splitting is avoided here and * the maximum node size is half of a leaf page space: * LEAF_NODE_MAX = even_floor(PAGEROOM / 2 - sizeof(indx_t)); - * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - KEYLEN_MAX; + * DATALEN_NO_OVERFLOW = LEAF_NODE_MAX - NODESIZE - KEYLEN_MAX; * * - SubDatabase-node must fit into one leaf-page: * SUBDB_NAME_MAX = LEAF_NODE_MAX - node_hdr_len - sizeof(MDBX_db); @@ -439,7 +437,7 @@ node_largedata_pgno(const MDBX_node *const __restrict node) { (EVEN_FLOOR(PAGEROOM(pagesize) / 2) - sizeof(indx_t)) #define MAX_GC1OVPAGE(pagesize) (PAGEROOM(pagesize) / sizeof(pgno_t) - 1) -static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) { +static __inline size_t keysize_max(size_t pagesize, MDBX_db_flags_t flags) { assert(pagesize >= MIN_PAGESIZE && pagesize <= MAX_PAGESIZE && is_powerof2(pagesize)); STATIC_ASSERT(BRANCH_NODE_MAX(MIN_PAGESIZE) - NODESIZE >= 8); @@ -454,11 +452,10 @@ static __inline unsigned keysize_max(size_t pagesize, MDBX_db_flags_t flags) { (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_REVERSEDUP | MDBX_INTEGERDUP)) { const intptr_t max_dupsort_leaf_key = LEAF_NODE_MAX(pagesize) - NODESIZE - sizeof(MDBX_db); - return (max_branch_key < max_dupsort_leaf_key) - ? (unsigned)max_branch_key - : (unsigned)max_dupsort_leaf_key; + return (max_branch_key < max_dupsort_leaf_key) ? max_branch_key + : max_dupsort_leaf_key; } - return (unsigned)max_branch_key; + return max_branch_key; } static __inline size_t valsize_max(size_t pagesize, MDBX_db_flags_t flags) { @@ -494,8 +491,8 @@ __cold int mdbx_env_get_maxkeysize_ex(const MDBX_env *env, } size_t mdbx_default_pagesize(void) { - size_t pagesize = mdbx_syspagesize(); - mdbx_ensure(nullptr, is_powerof2(pagesize)); + size_t pagesize = osal_syspagesize(); + ENSURE(nullptr, is_powerof2(pagesize)); pagesize = (pagesize >= MIN_PAGESIZE) ? pagesize : MIN_PAGESIZE; pagesize = (pagesize <= MAX_PAGESIZE) ? 
pagesize : MAX_PAGESIZE; return pagesize; @@ -533,10 +530,58 @@ __cold intptr_t mdbx_limits_valsize_max(intptr_t pagesize, return valsize_max(pagesize, flags); } +__cold intptr_t mdbx_limits_pairsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return BRANCH_NODE_MAX(pagesize) - NODESIZE; + + return LEAF_NODE_MAX(pagesize) - NODESIZE; +} + +__cold int mdbx_env_get_pairsize4page_max(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_pairsize4page_max((intptr_t)env->me_psize, flags); +} + +__cold intptr_t mdbx_limits_valsize4page_max(intptr_t pagesize, + MDBX_db_flags_t flags) { + if (pagesize < 1) + pagesize = (intptr_t)mdbx_default_pagesize(); + if (unlikely(pagesize < (intptr_t)MIN_PAGESIZE || + pagesize > (intptr_t)MAX_PAGESIZE || + !is_powerof2((size_t)pagesize))) + return -1; + + if (flags & + (MDBX_DUPSORT | MDBX_DUPFIXED | MDBX_INTEGERDUP | MDBX_REVERSEDUP)) + return valsize_max(pagesize, flags); + + return PAGEROOM(pagesize); +} + +__cold int mdbx_env_get_valsize4page_max(const MDBX_env *env, + MDBX_db_flags_t flags) { + if (unlikely(!env || env->me_signature.weak != MDBX_ME_SIGNATURE)) + return -1; + + return (int)mdbx_limits_valsize4page_max((intptr_t)env->me_psize, flags); +} + /* Calculate the size of a leaf node. * * The size depends on the environment's page size; if a data item - * is too large it will be put onto an overflow page and the node + * is too large it will be put onto an large/overflow page and the node * size will only include the key and not the data. 
Sizes are always * rounded up to an even number of bytes, to guarantee 2-byte alignment * of the MDBX_node headers. */ @@ -544,7 +589,7 @@ MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { size_t node_bytes = node_size(key, data); if (node_bytes > env->me_leaf_nodemax) { - /* put on overflow page */ + /* put on large/overflow page */ node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t); } @@ -554,7 +599,7 @@ leaf_size(const MDBX_env *env, const MDBX_val *key, const MDBX_val *data) { /* Calculate the size of a branch node. * * The size should depend on the environment's page size but since - * we currently don't support spilling large keys onto overflow + * we currently don't support spilling large keys onto large/overflow * pages, it's simply the size of the MDBX_node header plus the * size of the key. Sizes are always rounded up to an even number * of bytes, to guarantee 2-byte alignment of the MDBX_node headers. @@ -568,11 +613,11 @@ branch_size(const MDBX_env *env, const MDBX_val *key) { /* Size of a node in a branch page with a given key. * This is just the node header plus the key, there is no data. 
*/ size_t node_bytes = node_size(key, nullptr); - if (unlikely(node_bytes > env->me_leaf_nodemax)) { - /* put on overflow page */ + if (unlikely(node_bytes > env->me_branch_nodemax)) { + /* put on large/overflow page */ /* not implemented */ - mdbx_assert_fail(env, "INDXSIZE(key) <= env->me_nodemax", __func__, - __LINE__); + mdbx_panic("node_size(key) %zu > %u branch_nodemax", node_bytes, + env->me_branch_nodemax); node_bytes = node_size(key, nullptr) + sizeof(pgno_t); } @@ -601,29 +646,29 @@ flags_db2sub(uint16_t db_flags) { /*----------------------------------------------------------------------------*/ MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t -pgno2bytes(const MDBX_env *env, pgno_t pgno) { - mdbx_assert(env, (1u << env->me_psize2log) == env->me_psize); +pgno2bytes(const MDBX_env *env, size_t pgno) { + eASSERT(env, (1u << env->me_psize2log) == env->me_psize); return ((size_t)pgno) << env->me_psize2log; } MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_page * -pgno2page(const MDBX_env *env, pgno_t pgno) { - return (MDBX_page *)(env->me_map + pgno2bytes(env, pgno)); +pgno2page(const MDBX_env *env, size_t pgno) { + return ptr_disp(env->me_map, pgno2bytes(env, pgno)); } MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t bytes2pgno(const MDBX_env *env, size_t bytes) { - mdbx_assert(env, (env->me_psize >> env->me_psize2log) == 1); + eASSERT(env, (env->me_psize >> env->me_psize2log) == 1); return (pgno_t)(bytes >> env->me_psize2log); } MDBX_NOTHROW_PURE_FUNCTION static size_t -pgno_align2os_bytes(const MDBX_env *env, pgno_t pgno) { +pgno_align2os_bytes(const MDBX_env *env, size_t pgno) { return ceil_powerof2(pgno2bytes(env, pgno), env->me_os_psize); } MDBX_NOTHROW_PURE_FUNCTION static pgno_t pgno_align2os_pgno(const MDBX_env *env, - pgno_t pgno) { + size_t pgno) { return bytes2pgno(env, pgno_align2os_bytes(env, pgno)); } @@ -635,7 +680,7 @@ bytes_align2os_bytes(const MDBX_env *env, size_t bytes) { /* Address of first usable data byte in a 
page, after the header */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * page_data(const MDBX_page *mp) { - return (char *)mp + PAGEHDRSZ; + return ptr_disp(mp, PAGEHDRSZ); } MDBX_NOTHROW_PURE_FUNCTION static __always_inline const MDBX_page * @@ -649,25 +694,25 @@ page_meta(MDBX_page *mp) { } /* Number of nodes on a page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_numkeys(const MDBX_page *mp) { return mp->mp_lower >> 1; } /* The amount of space remaining in the page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_room(const MDBX_page *mp) { return mp->mp_upper - mp->mp_lower; } /* Maximum free space in an empty page */ -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_space(const MDBX_env *env) { STATIC_ASSERT(PAGEHDRSZ % 2 == 0); return env->me_psize - PAGEHDRSZ; } -MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t page_used(const MDBX_env *env, const MDBX_page *mp) { return page_space(env) - page_room(mp); } @@ -678,74 +723,94 @@ page_fill(const MDBX_env *env, const MDBX_page *mp) { return page_used(env, mp) * 100.0 / page_space(env); } -/* The number of overflow pages needed to store the given size. */ +/* The number of large/overflow pages needed to store the given size. 
*/ MDBX_NOTHROW_PURE_FUNCTION static __always_inline pgno_t number_of_ovpages(const MDBX_env *env, size_t bytes) { return bytes2pgno(env, PAGEHDRSZ - 1 + bytes) + 1; } +__cold static const char *pagetype_caption(const uint8_t type, + char buf4unknown[16]) { + switch (type) { + case P_BRANCH: + return "branch"; + case P_LEAF: + return "leaf"; + case P_LEAF | P_SUBP: + return "subleaf"; + case P_LEAF | P_LEAF2: + return "dupfixed-leaf"; + case P_LEAF | P_LEAF2 | P_SUBP: + return "dupfixed-subleaf"; + case P_LEAF | P_LEAF2 | P_SUBP | P_LEGACY_DIRTY: + return "dupfixed-subleaf.legacy-dirty"; + case P_OVERFLOW: + return "large"; + default: + snprintf(buf4unknown, 16, "unknown_0x%x", type); + return buf4unknown; + } +} + __cold static int MDBX_PRINTF_ARGS(2, 3) bad_page(const MDBX_page *mp, const char *fmt, ...) { - if (mdbx_log_enabled(MDBX_LOG_ERROR)) { + if (LOG_ENABLED(MDBX_LOG_ERROR)) { static const MDBX_page *prev; if (prev != mp) { + char buf4unknown[16]; prev = mp; - const char *type; - switch (mp->mp_flags & (P_BRANCH | P_LEAF | P_OVERFLOW | P_META | - P_LEAF2 | P_BAD | P_SUBP)) { - case P_BRANCH: - type = "branch"; - break; - case P_LEAF: - type = "leaf"; - break; - case P_LEAF | P_SUBP: - type = "subleaf"; - break; - case P_LEAF | P_LEAF2: - type = "dupfixed-leaf"; - break; - case P_LEAF | P_LEAF2 | P_SUBP: - type = "dupfixed-subleaf"; - break; - case P_OVERFLOW: - type = "large"; - break; - default: - type = "broken"; - } - mdbx_debug_log(MDBX_LOG_ERROR, "badpage", 0, - "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", type, - mp->mp_pgno, mp->mp_txnid); + debug_log(MDBX_LOG_ERROR, "badpage", 0, + "corrupted %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, + mp->mp_txnid); } va_list args; va_start(args, fmt); - mdbx_debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); + debug_log_va(MDBX_LOG_ERROR, "badpage", 0, fmt, args); va_end(args); } return MDBX_CORRUPTED; } +__cold static void 
MDBX_PRINTF_ARGS(2, 3) + poor_page(const MDBX_page *mp, const char *fmt, ...) { + if (LOG_ENABLED(MDBX_LOG_NOTICE)) { + static const MDBX_page *prev; + if (prev != mp) { + char buf4unknown[16]; + prev = mp; + debug_log(MDBX_LOG_NOTICE, "poorpage", 0, + "suboptimal %s-page #%u, mod-txnid %" PRIaTXN "\n", + pagetype_caption(PAGETYPE_WHOLE(mp), buf4unknown), mp->mp_pgno, + mp->mp_txnid); + } + + va_list args; + va_start(args, fmt); + debug_log_va(MDBX_LOG_NOTICE, "poorpage", 0, fmt, args); + va_end(args); + } +} + /* Address of node i in page p */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline MDBX_node * -page_node(const MDBX_page *mp, unsigned i) { - assert((mp->mp_flags & (P_LEAF2 | P_OVERFLOW | P_META)) == 0); - assert(page_numkeys(mp) > (unsigned)(i)); +page_node(const MDBX_page *mp, size_t i) { + assert(PAGETYPE_COMPAT(mp) == P_LEAF || PAGETYPE_WHOLE(mp) == P_BRANCH); + assert(page_numkeys(mp) > i); assert(mp->mp_ptrs[i] % 2 == 0); - return (MDBX_node *)((char *)mp + mp->mp_ptrs[i] + PAGEHDRSZ); + return ptr_disp(mp, mp->mp_ptrs[i] + PAGEHDRSZ); } /* The address of a key in a LEAF2 page. * LEAF2 pages are used for MDBX_DUPFIXED sorted-duplicate sub-DBs. * There are no node headers, keys are stored contiguously. */ MDBX_NOTHROW_PURE_FUNCTION static __always_inline void * -page_leaf2key(const MDBX_page *mp, unsigned i, size_t keysize) { - assert((mp->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_META)) == - (P_LEAF | P_LEAF2)); +page_leaf2key(const MDBX_page *mp, size_t i, size_t keysize) { + assert(PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); assert(mp->mp_leaf2_ksize == keysize); (void)keysize; - return (char *)mp + PAGEHDRSZ + (i * mp->mp_leaf2_ksize); + return ptr_disp(mp, PAGEHDRSZ + i * mp->mp_leaf2_ksize); } /* Set the node's key into keyptr. 
*/ @@ -761,120 +826,6 @@ get_key_optional(const MDBX_node *node, MDBX_val *keyptr /* __may_null */) { get_key(node, keyptr); } -/*------------------------------------------------------------------------------ - * Workaround for mmaped-lookahead-cross-page-boundary bug - * in an obsolete versions of Elbrus's libc and kernels. */ -#if defined(__e2k__) && defined(MDBX_E2K_MLHCPB_WORKAROUND) && \ - MDBX_E2K_MLHCPB_WORKAROUND -int __hot mdbx_e2k_memcmp_bug_workaround(const void *s1, const void *s2, - size_t n) { - if (unlikely(n > 42 - /* LY: align followed access if reasonable possible */ - && (((uintptr_t)s1) & 7) != 0 && - (((uintptr_t)s1) & 7) == (((uintptr_t)s2) & 7))) { - if (((uintptr_t)s1) & 1) { - const int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (diff) - return diff; - s1 = (char *)s1 + 1; - s2 = (char *)s2 + 1; - n -= 1; - } - - if (((uintptr_t)s1) & 2) { - const uint16_t a = *(uint16_t *)s1; - const uint16_t b = *(uint16_t *)s2; - if (likely(a != b)) - return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; - s1 = (char *)s1 + 2; - s2 = (char *)s2 + 2; - n -= 2; - } - - if (((uintptr_t)s1) & 4) { - const uint32_t a = *(uint32_t *)s1; - const uint32_t b = *(uint32_t *)s2; - if (likely(a != b)) - return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 1 : -1; - s1 = (char *)s1 + 4; - s2 = (char *)s2 + 4; - n -= 4; - } - } - - while (n >= 8) { - const uint64_t a = *(uint64_t *)s1; - const uint64_t b = *(uint64_t *)s2; - if (likely(a != b)) - return (__builtin_bswap64(a) > __builtin_bswap64(b)) ? 1 : -1; - s1 = (char *)s1 + 8; - s2 = (char *)s2 + 8; - n -= 8; - } - - if (n & 4) { - const uint32_t a = *(uint32_t *)s1; - const uint32_t b = *(uint32_t *)s2; - if (likely(a != b)) - return (__builtin_bswap32(a) > __builtin_bswap32(b)) ? 
1 : -1; - s1 = (char *)s1 + 4; - s2 = (char *)s2 + 4; - } - - if (n & 2) { - const uint16_t a = *(uint16_t *)s1; - const uint16_t b = *(uint16_t *)s2; - if (likely(a != b)) - return (__builtin_bswap16(a) > __builtin_bswap16(b)) ? 1 : -1; - s1 = (char *)s1 + 2; - s2 = (char *)s2 + 2; - } - - return (n & 1) ? *(uint8_t *)s1 - *(uint8_t *)s2 : 0; -} - -int __hot mdbx_e2k_strcmp_bug_workaround(const char *s1, const char *s2) { - while (true) { - int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (likely(diff != 0) || *s1 == '\0') - return diff; - s1 += 1; - s2 += 1; - } -} - -int __hot mdbx_e2k_strncmp_bug_workaround(const char *s1, const char *s2, - size_t n) { - while (n > 0) { - int diff = *(uint8_t *)s1 - *(uint8_t *)s2; - if (likely(diff != 0) || *s1 == '\0') - return diff; - s1 += 1; - s2 += 1; - n -= 1; - } - return 0; -} - -size_t __hot mdbx_e2k_strlen_bug_workaround(const char *s) { - size_t n = 0; - while (*s) { - s += 1; - n += 1; - } - return n; -} - -size_t __hot mdbx_e2k_strnlen_bug_workaround(const char *s, size_t maxlen) { - size_t n = 0; - while (maxlen > n && *s) { - s += 1; - n += 1; - } - return n; -} -#endif /* MDBX_E2K_MLHCPB_WORKAROUND */ - /*------------------------------------------------------------------------------ * safe read/write volatile 64-bit fields on 32-bit architectures. 
*/ @@ -889,16 +840,16 @@ atomic_store64(MDBX_atomic_uint64_t *p, const uint64_t value, atomic_store_explicit(MDBX_c11a_rw(uint64_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ #else /* !MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); atomic_store32(&p->low, (uint32_t)value, mo_Relaxed); - mdbx_jitter4testing(true); + jitter4testing(true); atomic_store32(&p->high, (uint32_t)(value >> 32), order); - mdbx_jitter4testing(true); + jitter4testing(true); #endif /* !MDBX_64BIT_ATOMIC */ return value; } @@ -910,7 +861,7 @@ MDBX_MAYBE_UNUSED static __always_inline #endif /* MDBX_64BIT_ATOMIC */ uint64_t - atomic_load64(const MDBX_atomic_uint64_t *p, + atomic_load64(const volatile MDBX_atomic_uint64_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint64_t) == 8); #if MDBX_64BIT_ATOMIC @@ -918,26 +869,26 @@ MDBX_MAYBE_UNUSED static assert(atomic_is_lock_free(MDBX_c11a_ro(uint64_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint64_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint64_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ #else /* !MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); uint64_t value = (uint64_t)atomic_load32(&p->high, order) << 32; - mdbx_jitter4testing(true); + jitter4testing(true); value |= atomic_load32(&p->low, (order == mo_Relaxed) ? 
mo_Relaxed : mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); for (;;) { - mdbx_compiler_barrier(); + osal_compiler_barrier(); uint64_t again = (uint64_t)atomic_load32(&p->high, order) << 32; - mdbx_jitter4testing(true); + jitter4testing(true); again |= atomic_load32(&p->low, (order == mo_Relaxed) ? mo_Relaxed : mo_AcquireRelease); - mdbx_jitter4testing(true); + jitter4testing(true); if (likely(value == again)) return value; value = again; @@ -1046,40 +997,34 @@ static __always_inline uint64_t safe64_txnid_next(uint64_t txnid) { return txnid; } -#if defined(MDBX_HAVE_C11ATOMICS) && defined(__LCC__) -#define safe64_reset(p, single_writer) \ - atomic_store64(p, UINT64_MAX, \ - (single_writer) ? mo_AcquireRelease \ - : mo_SequentialConsistency) -#else +/* Atomically make target value >= SAFE64_INVALID_THRESHOLD */ static __always_inline void safe64_reset(MDBX_atomic_uint64_t *p, bool single_writer) { -#if !MDBX_64BIT_CAS - if (!single_writer) { - STATIC_ASSERT(xMDBX_TXNID_STEP > 1); + if (single_writer) { +#if MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#else + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#endif /* MDBX_64BIT_ATOMIC && MDBX_WORDBITS >= 64 */ + } else { +#if MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ + atomic_store64(p, UINT64_MAX, mo_AcquireRelease); +#elif MDBX_64BIT_CAS + /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ + atomic_store32(&p->high, UINT32_MAX, mo_AcquireRelease); +#else /* it is safe to increment low-part to avoid ABA, since xMDBX_TXNID_STEP > 1 * and overflow was preserved in safe64_txnid_next() */ + STATIC_ASSERT(xMDBX_TXNID_STEP > 1); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - atomic_store32( - &p->high, UINT32_MAX, - mo_Relaxed) /* atomically make >= SAFE64_INVALID_THRESHOLD */; + atomic_store32(&p->high, UINT32_MAX, 
mo_AcquireRelease); atomic_add32(&p->low, 1) /* avoid ABA in safe64_reset_compare() */; - } else -#endif /* !MDBX_64BIT_CAS */ -#if MDBX_64BIT_ATOMIC - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 64-bit operation */ - atomic_store64(p, UINT64_MAX, - single_writer ? mo_AcquireRelease - : mo_SequentialConsistency); -#else - /* atomically make value >= SAFE64_INVALID_THRESHOLD by 32-bit operation */ - atomic_store32(&p->high, UINT32_MAX, - single_writer ? mo_AcquireRelease : mo_SequentialConsistency); -#endif /* MDBX_64BIT_ATOMIC */ +#endif /* MDBX_64BIT_CAS && MDBX_64BIT_ATOMIC */ + } assert(p->weak >= SAFE64_INVALID_THRESHOLD); - mdbx_jitter4testing(true); + jitter4testing(true); } -#endif /* LCC && MDBX_HAVE_C11ATOMICS */ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, txnid_t compare) { @@ -1105,32 +1050,34 @@ static __always_inline bool safe64_reset_compare(MDBX_atomic_uint64_t *p, rc = true; } #endif /* MDBX_64BIT_CAS */ - mdbx_jitter4testing(true); + jitter4testing(true); return rc; } static __always_inline void safe64_write(MDBX_atomic_uint64_t *p, const uint64_t v) { assert(p->weak >= SAFE64_INVALID_THRESHOLD); -#if MDBX_64BIT_ATOMIC +#if MDBX_64BIT_ATOMIC && MDBX_64BIT_CAS atomic_store64(p, v, mo_AcquireRelease); #else /* MDBX_64BIT_ATOMIC */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); /* update low-part but still value >= SAFE64_INVALID_THRESHOLD */ atomic_store32(&p->low, (uint32_t)v, mo_Relaxed); assert(p->weak >= SAFE64_INVALID_THRESHOLD); - mdbx_jitter4testing(true); + jitter4testing(true); /* update high-part from SAFE64_INVALID_THRESHOLD to actual value */ atomic_store32(&p->high, (uint32_t)(v >> 32), mo_AcquireRelease); #endif /* MDBX_64BIT_ATOMIC */ assert(p->weak == v); - mdbx_jitter4testing(true); + jitter4testing(true); } static __always_inline uint64_t safe64_read(const MDBX_atomic_uint64_t *p) { - mdbx_jitter4testing(true); - uint64_t v = atomic_load64(p, mo_AcquireRelease); - 
mdbx_jitter4testing(true); + jitter4testing(true); + uint64_t v; + do + v = atomic_load64(p, mo_AcquireRelease); + while (!MDBX_64BIT_ATOMIC && unlikely(v != p->weak)); return v; } @@ -1172,7 +1119,7 @@ MDBX_MAYBE_UNUSED static void safe64_inc(MDBX_atomic_uint64_t *p, const uint64_t v) { assert(v > 0); - safe64_update(p, atomic_load64(p, mo_Relaxed) + v); + safe64_update(p, safe64_read(p) + v); } /*----------------------------------------------------------------------------*/ @@ -1181,7 +1128,7 @@ MDBX_MAYBE_UNUSED static typedef struct rthc_entry_t { MDBX_reader *begin; MDBX_reader *end; - mdbx_thread_key_t thr_tls_key; + osal_thread_key_t thr_tls_key; } rthc_entry_t; #if MDBX_DEBUG @@ -1200,11 +1147,11 @@ static CRITICAL_SECTION lcklist_critical_section; static pthread_mutex_t lcklist_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t rthc_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_cond_t rthc_cond = PTHREAD_COND_INITIALIZER; -static mdbx_thread_key_t rthc_key; +static osal_thread_key_t rthc_key; static MDBX_atomic_uint32_t rthc_pending; static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { - uint64_t salt = mdbx_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ + uint64_t salt = osal_thread_self() * UINT64_C(0xA2F0EEC059629A17) ^ UINT64_C(0x01E07C6FDB596497) * (uintptr_t)(addr); #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return salt << 8 | kind; @@ -1217,7 +1164,12 @@ static __inline uint64_t rthc_signature(const void *addr, uint8_t kind) { #define MDBX_THREAD_RTHC_REGISTERED(addr) rthc_signature(addr, 0x0D) #define MDBX_THREAD_RTHC_COUNTED(addr) rthc_signature(addr, 0xC0) -static __thread uint64_t rthc_thread_state; +static __thread uint64_t rthc_thread_state +#if __has_attribute(tls_model) && \ + (defined(__PIC__) || defined(__pic__) || MDBX_BUILD_SHARED_LIBRARY) + __attribute__((tls_model("local-dynamic"))) +#endif + ; #if defined(__APPLE__) && defined(__SANITIZE_ADDRESS__) && \ !defined(MDBX_ATTRIBUTE_NO_SANITIZE_ADDRESS) @@ 
-1251,7 +1203,7 @@ static __inline int rthc_atexit(void (*dtor)(void *), void *obj, #ifndef MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL #if defined(LIBCXXABI_HAS_CXA_THREAD_ATEXIT_IMPL) || \ defined(HAVE___CXA_THREAD_ATEXIT_IMPL) || __GLIBC_PREREQ(2, 18) || \ - defined(ANDROID) + defined(BIONIC) #define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 1 #else #define MDBX_HAVE_CXA_THREAD_ATEXIT_IMPL 0 @@ -1319,7 +1271,7 @@ static __inline void rthc_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, mdbx_pthread_mutex_lock(&rthc_mutex) == 0); + ENSURE(nullptr, osal_pthread_mutex_lock(&rthc_mutex) == 0); #endif } @@ -1327,11 +1279,11 @@ static __inline void rthc_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&rthc_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); + ENSURE(nullptr, pthread_mutex_unlock(&rthc_mutex) == 0); #endif } -static __inline int thread_key_create(mdbx_thread_key_t *key) { +static __inline int thread_key_create(osal_thread_key_t *key) { int rc; #if defined(_WIN32) || defined(_WIN64) *key = TlsAlloc(); @@ -1339,22 +1291,22 @@ static __inline int thread_key_create(mdbx_thread_key_t *key) { #else rc = pthread_key_create(key, nullptr); #endif - mdbx_trace("&key = %p, value %" PRIuPTR ", rc %d", - __Wpedantic_format_voidptr(key), (uintptr_t)*key, rc); + TRACE("&key = %p, value %" PRIuPTR ", rc %d", __Wpedantic_format_voidptr(key), + (uintptr_t)*key, rc); return rc; } -static __inline void thread_key_delete(mdbx_thread_key_t key) { - mdbx_trace("key = %" PRIuPTR, (uintptr_t)key); +static __inline void thread_key_delete(osal_thread_key_t key) { + TRACE("key = %" PRIuPTR, (uintptr_t)key); #if defined(_WIN32) || defined(_WIN64) - mdbx_ensure(nullptr, TlsFree(key)); + ENSURE(nullptr, TlsFree(key)); #else - mdbx_ensure(nullptr, pthread_key_delete(key) == 0); + ENSURE(nullptr, pthread_key_delete(key) == 0); workaround_glibc_bug21031(); #endif } 
-static __inline void *thread_rthc_get(mdbx_thread_key_t key) { +static __inline void *thread_rthc_get(osal_thread_key_t key) { #if defined(_WIN32) || defined(_WIN64) return TlsGetValue(key); #else @@ -1362,9 +1314,9 @@ static __inline void *thread_rthc_get(mdbx_thread_key_t key) { #endif } -static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { +static void thread_rthc_set(osal_thread_key_t key, const void *value) { #if defined(_WIN32) || defined(_WIN64) - mdbx_ensure(nullptr, TlsSetValue(key, (void *)value)); + ENSURE(nullptr, TlsSetValue(key, (void *)value)); #else const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(&rthc_thread_state); @@ -1372,102 +1324,59 @@ static void thread_rthc_set(mdbx_thread_key_t key, const void *value) { if (value && unlikely(rthc_thread_state != sign_registered && rthc_thread_state != sign_counted)) { rthc_thread_state = sign_registered; - mdbx_trace("thread registered 0x%" PRIxPTR, mdbx_thread_self()); - if (rthc_atexit(mdbx_rthc_thread_dtor, &rthc_thread_state, + TRACE("thread registered 0x%" PRIxPTR, osal_thread_self()); + if (rthc_atexit(thread_dtor, &rthc_thread_state, (void *)&mdbx_version /* dso_anchor */)) { - mdbx_ensure(nullptr, - pthread_setspecific(rthc_key, &rthc_thread_state) == 0); + ENSURE(nullptr, pthread_setspecific(rthc_key, &rthc_thread_state) == 0); rthc_thread_state = sign_counted; const unsigned count_before = atomic_add32(&rthc_pending, 1); - mdbx_ensure(nullptr, count_before < INT_MAX); - mdbx_notice("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", - (uintptr_t)rthc_key, count_before); + ENSURE(nullptr, count_before < INT_MAX); + NOTICE("fallback to pthreads' tsd, key %" PRIuPTR ", count %u", + (uintptr_t)rthc_key, count_before); (void)count_before; } } - mdbx_ensure(nullptr, pthread_setspecific(key, value) == 0); -#endif -} - -__cold void mdbx_rthc_global_init(void) { - rthc_limit = RTHC_INITIAL_LIMIT; - rthc_table = rthc_table_static; -#if defined(_WIN32) || defined(_WIN64) 
- InitializeCriticalSection(&rthc_critical_section); - InitializeCriticalSection(&lcklist_critical_section); -#else - mdbx_ensure(nullptr, - pthread_key_create(&rthc_key, mdbx_rthc_thread_dtor) == 0); - mdbx_trace("pid %d, &mdbx_rthc_key = %p, value 0x%x", mdbx_getpid(), - __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); -#endif - /* checking time conversion, this also avoids racing on 32-bit architectures - * during writing calculated 64-bit ratio(s) into memory. */ - uint32_t proba = UINT32_MAX; - while (true) { - unsigned time_conversion_checkup = - mdbx_osal_monotime_to_16dot16(mdbx_osal_16dot16_to_monotime(proba)); - unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; - unsigned one_less = (proba > 0) ? proba - 1 : proba; - mdbx_ensure(nullptr, time_conversion_checkup >= one_less && - time_conversion_checkup <= one_more); - if (proba == 0) - break; - proba >>= 1; - } - - bootid = mdbx_osal_bootid(); -#if 0 /* debug */ - for (unsigned i = 0; i < 65536; ++i) { - size_t pages = pv2pages(i); - unsigned x = pages2pv(pages); - size_t xp = pv2pages(x); - if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) - printf("%u => %zu => %u => %zu\n", i, pages, x, xp); - assert(pages == xp); - } - fflush(stdout); + ENSURE(nullptr, pthread_setspecific(key, value) == 0); #endif } /* dtor called for thread, i.e. 
for all mdbx's environment objects */ -__cold void mdbx_rthc_thread_dtor(void *rthc) { +__cold void thread_dtor(void *rthc) { rthc_lock(); - mdbx_trace(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", mdbx_getpid(), - mdbx_thread_self(), rthc); + TRACE(">> pid %d, thread 0x%" PRIxPTR ", rthc %p", osal_getpid(), + osal_thread_self(), rthc); - const uint32_t self_pid = mdbx_getpid(); - for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + const uint32_t self_pid = osal_getpid(); + for (size_t i = 0; i < rthc_count; ++i) { + const osal_thread_key_t key = rthc_table[i].thr_tls_key; MDBX_reader *const reader = thread_rthc_get(key); if (reader < rthc_table[i].begin || reader >= rthc_table[i].end) continue; #if !defined(_WIN32) && !defined(_WIN64) if (pthread_setspecific(key, nullptr) != 0) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p: ignore race with tsd-key deletion", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader)); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p: ignore race with tsd-key deletion", + osal_thread_self(), __Wpedantic_format_voidptr(reader)); continue /* ignore race with tsd-key deletion by mdbx_env_close() */; } #endif - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, [%i], %p ... %p (%+i), rtch-pid %i, " - "current-pid %i", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader), i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, - self_pid); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, [%zi], %p ... 
%p (%+i), rtch-pid %i, " + "current-pid %i", + osal_thread_self(), __Wpedantic_format_voidptr(reader), i, + __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + (int)(reader - rthc_table[i].begin), reader->mr_pid.weak, self_pid); if (atomic_load32(&reader->mr_pid, mo_Relaxed) == self_pid) { - mdbx_trace("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", - mdbx_thread_self(), __Wpedantic_format_voidptr(reader)); - atomic_cas32(&reader->mr_pid, self_pid, 0); + TRACE("==== thread 0x%" PRIxPTR ", rthc %p, cleanup", osal_thread_self(), + __Wpedantic_format_voidptr(reader)); + (void)atomic_cas32(&reader->mr_pid, self_pid, 0); } } #if defined(_WIN32) || defined(_WIN64) - mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), rthc); + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); rthc_unlock(); #else const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); @@ -1475,28 +1384,28 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { const uint64_t state = rthc_read(rthc); if (state == sign_registered && rthc_compare_and_clean(rthc, sign_registered)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "registered", state); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "counted", state); - mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { - mdbx_warning("thread 0x%" 
PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), rthc, mdbx_getpid(), "wrong", state); + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), rthc, osal_getpid(), "wrong", state); } if (atomic_load32(&rthc_pending, mo_AcquireRelease) == 0) { - mdbx_trace("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", - mdbx_thread_self(), rthc, mdbx_getpid()); - mdbx_ensure(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, wake", osal_thread_self(), + rthc, osal_getpid()); + ENSURE(nullptr, pthread_cond_broadcast(&rthc_cond) == 0); } - mdbx_trace("<< thread 0x%" PRIxPTR ", rthc %p", mdbx_thread_self(), rthc); + TRACE("<< thread 0x%" PRIxPTR ", rthc %p", osal_thread_self(), rthc); /* Allow tail call optimization, i.e. gcc should generate the jmp instruction * instead of a call for pthread_mutex_unlock() and therefore CPU could not * return to current DSO's code section, which may be unloaded immediately @@ -1505,44 +1414,45 @@ __cold void mdbx_rthc_thread_dtor(void *rthc) { #endif } -__cold void mdbx_rthc_global_dtor(void) { - mdbx_trace(">> pid %d", mdbx_getpid()); +MDBX_EXCLUDE_FOR_GPROF +__cold void global_dtor(void) { + TRACE(">> pid %d", osal_getpid()); rthc_lock(); #if !defined(_WIN32) && !defined(_WIN64) uint64_t *rthc = pthread_getspecific(rthc_key); - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status 0x%08" PRIx64 ", left %d", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), rthc ? rthc_read(rthc) : ~UINT64_C(0), - atomic_load32(&rthc_pending, mo_Relaxed)); + TRACE("== thread 0x%" PRIxPTR ", rthc %p, pid %d, self-status 0x%08" PRIx64 + ", left %d", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + rthc ? 
rthc_read(rthc) : ~UINT64_C(0), + atomic_load32(&rthc_pending, mo_Relaxed)); if (rthc) { const uint64_t sign_registered = MDBX_THREAD_RTHC_REGISTERED(rthc); const uint64_t sign_counted = MDBX_THREAD_RTHC_COUNTED(rthc); const uint64_t state = rthc_read(rthc); if (state == sign_registered && rthc_compare_and_clean(rthc, sign_registered)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "registered", state); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + "registered", state); } else if (state == sign_counted && rthc_compare_and_clean(rthc, sign_counted)) { - mdbx_trace("== thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "counted", state); - mdbx_ensure(nullptr, atomic_sub32(&rthc_pending, 1) > 0); + TRACE("== thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), osal_getpid(), + "counted", state); + ENSURE(nullptr, atomic_sub32(&rthc_pending, 1) > 0); } else { - mdbx_warning("thread 0x%" PRIxPTR - ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", - mdbx_thread_self(), __Wpedantic_format_voidptr(rthc), - mdbx_getpid(), "wrong", state); + WARNING("thread 0x%" PRIxPTR + ", rthc %p, pid %d, self-status %s (0x%08" PRIx64 ")", + osal_thread_self(), __Wpedantic_format_voidptr(rthc), + osal_getpid(), "wrong", state); } } struct timespec abstime; - mdbx_ensure(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); + ENSURE(nullptr, clock_gettime(CLOCK_REALTIME, &abstime) == 0); abstime.tv_nsec += 1000000000l / 10; if (abstime.tv_nsec >= 1000000000l) { abstime.tv_nsec -= 1000000000l; @@ -1554,8 +1464,7 @@ __cold void mdbx_rthc_global_dtor(void) { for (unsigned 
left; (left = atomic_load32(&rthc_pending, mo_AcquireRelease)) > 0;) { - mdbx_notice("tls-cleanup: pid %d, pending %u, wait for...", mdbx_getpid(), - left); + NOTICE("tls-cleanup: pid %d, pending %u, wait for...", osal_getpid(), left); const int rc = pthread_cond_timedwait(&rthc_cond, &rthc_mutex, &abstime); if (rc && rc != EINTR) break; @@ -1563,29 +1472,28 @@ __cold void mdbx_rthc_global_dtor(void) { thread_key_delete(rthc_key); #endif - const uint32_t self_pid = mdbx_getpid(); - for (unsigned i = 0; i < rthc_count; ++i) { - const mdbx_thread_key_t key = rthc_table[i].thr_tls_key; + const uint32_t self_pid = osal_getpid(); + for (size_t i = 0; i < rthc_count; ++i) { + const osal_thread_key_t key = rthc_table[i].thr_tls_key; thread_key_delete(key); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { - mdbx_trace( - "== [%i] = key %" PRIuPTR ", %p ... %p, rthc %p (%+i), " - "rthc-pid %i, current-pid %i", - i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), - __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), - rthc->mr_pid.weak, self_pid); + TRACE("== [%zi] = key %" PRIuPTR ", %p ... 
%p, rthc %p (%+i), " + "rthc-pid %i, current-pid %i", + i, (uintptr_t)key, __Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), + __Wpedantic_format_voidptr(rthc), (int)(rthc - rthc_table[i].begin), + rthc->mr_pid.weak, self_pid); if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } } rthc_limit = rthc_count = 0; if (rthc_table != rthc_table_static) - mdbx_free(rthc_table); + osal_free(rthc_table); rthc_table = nullptr; rthc_unlock(); @@ -1598,29 +1506,30 @@ __cold void mdbx_rthc_global_dtor(void) { workaround_glibc_bug21031(); #endif - mdbx_trace("<< pid %d\n", mdbx_getpid()); + osal_dtor(); + TRACE("<< pid %d\n", osal_getpid()); } -__cold int mdbx_rthc_alloc(mdbx_thread_key_t *pkey, MDBX_reader *begin, - MDBX_reader *end) { +__cold int rthc_alloc(osal_thread_key_t *pkey, MDBX_reader *begin, + MDBX_reader *end) { assert(pkey != NULL); #ifndef NDEBUG - *pkey = (mdbx_thread_key_t)0xBADBADBAD; + *pkey = (osal_thread_key_t)0xBADBADBAD; #endif /* NDEBUG */ rthc_lock(); - mdbx_trace(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); + TRACE(">> rthc_count %u, rthc_limit %u", rthc_count, rthc_limit); int rc; if (rthc_count == rthc_limit) { rthc_entry_t *new_table = - mdbx_realloc((rthc_table == rthc_table_static) ? nullptr : rthc_table, + osal_realloc((rthc_table == rthc_table_static) ? 
nullptr : rthc_table, sizeof(rthc_entry_t) * rthc_limit * 2); if (new_table == nullptr) { rc = MDBX_ENOMEM; goto bailout; } if (rthc_table == rthc_table_static) - memcpy(new_table, rthc_table_static, sizeof(rthc_table_static)); + memcpy(new_table, rthc_table, sizeof(rthc_entry_t) * rthc_limit); rthc_table = new_table; rthc_limit *= 2; } @@ -1630,15 +1539,14 @@ __cold int mdbx_rthc_alloc(mdbx_thread_key_t *pkey, MDBX_reader *begin, goto bailout; *pkey = rthc_table[rthc_count].thr_tls_key; - mdbx_trace("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, - (uintptr_t)*pkey, __Wpedantic_format_voidptr(begin), - __Wpedantic_format_voidptr(end)); + TRACE("== [%i] = key %" PRIuPTR ", %p ... %p", rthc_count, (uintptr_t)*pkey, + __Wpedantic_format_voidptr(begin), __Wpedantic_format_voidptr(end)); rthc_table[rthc_count].begin = begin; rthc_table[rthc_count].end = end; ++rthc_count; - mdbx_trace("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", - (uintptr_t)*pkey, rthc_count, rthc_limit); + TRACE("<< key %" PRIuPTR ", rthc_count %u, rthc_limit %u", (uintptr_t)*pkey, + rthc_count, rthc_limit); rthc_unlock(); return MDBX_SUCCESS; @@ -1647,30 +1555,30 @@ bailout: return rc; } -__cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { +__cold void rthc_remove(const osal_thread_key_t key) { thread_key_delete(key); rthc_lock(); - mdbx_trace(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, - rthc_count, rthc_limit); + TRACE(">> key %zu, rthc_count %u, rthc_limit %u", (uintptr_t)key, rthc_count, + rthc_limit); - for (unsigned i = 0; i < rthc_count; ++i) { + for (size_t i = 0; i < rthc_count; ++i) { if (key == rthc_table[i].thr_tls_key) { - const uint32_t self_pid = mdbx_getpid(); - mdbx_trace("== [%i], %p ...%p, current-pid %d", i, - __Wpedantic_format_voidptr(rthc_table[i].begin), - __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); + const uint32_t self_pid = osal_getpid(); + TRACE("== [%zi], %p ...%p, current-pid %d", i, + 
__Wpedantic_format_voidptr(rthc_table[i].begin), + __Wpedantic_format_voidptr(rthc_table[i].end), self_pid); for (MDBX_reader *rthc = rthc_table[i].begin; rthc < rthc_table[i].end; ++rthc) { if (atomic_load32(&rthc->mr_pid, mo_Relaxed) == self_pid) { atomic_store32(&rthc->mr_pid, 0, mo_AcquireRelease); - mdbx_trace("== cleanup %p", __Wpedantic_format_voidptr(rthc)); + TRACE("== cleanup %p", __Wpedantic_format_voidptr(rthc)); } } if (--rthc_count > 0) rthc_table[i] = rthc_table[rthc_count]; else if (rthc_table != rthc_table_static) { - mdbx_free(rthc_table); + osal_free(rthc_table); rthc_table = rthc_table_static; rthc_limit = RTHC_INITIAL_LIMIT; } @@ -1678,8 +1586,8 @@ __cold void mdbx_rthc_remove(const mdbx_thread_key_t key) { } } - mdbx_trace("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, - rthc_count, rthc_limit); + TRACE("<< key %zu, rthc_count %u, rthc_limit %u", (size_t)key, rthc_count, + rthc_limit); rthc_unlock(); } @@ -1692,7 +1600,7 @@ static __inline void lcklist_lock(void) { #if defined(_WIN32) || defined(_WIN64) EnterCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, mdbx_pthread_mutex_lock(&lcklist_mutex) == 0); + ENSURE(nullptr, osal_pthread_mutex_lock(&lcklist_mutex) == 0); #endif } @@ -1700,7 +1608,7 @@ static __inline void lcklist_unlock(void) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(&lcklist_critical_section); #else - mdbx_ensure(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); + ENSURE(nullptr, pthread_mutex_unlock(&lcklist_mutex) == 0); #endif } @@ -1713,7 +1621,7 @@ MDBX_NOTHROW_CONST_FUNCTION static uint64_t rrxmrrxmsx_0(uint64_t v) { return v ^ v >> 28; } -static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { +static int uniq_peek(const osal_mmap_t *pending, osal_mmap_t *scan) { int rc; uint64_t bait; MDBX_lockinfo *const pending_lck = pending->lck; @@ -1723,42 +1631,40 @@ static int uniq_peek(const mdbx_mmap_t *pending, mdbx_mmap_t *scan) { rc = MDBX_SUCCESS; } else { 
bait = 0 /* hush MSVC warning */; - rc = mdbx_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); + rc = osal_msync(scan, 0, sizeof(MDBX_lockinfo), MDBX_SYNC_DATA); if (rc == MDBX_SUCCESS) - rc = mdbx_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), + rc = osal_pread(pending->fd, &bait, sizeof(scan_lck->mti_bait_uniqueness), offsetof(MDBX_lockinfo, mti_bait_uniqueness)); } if (likely(rc == MDBX_SUCCESS) && bait == atomic_load64(&scan_lck->mti_bait_uniqueness, mo_AcquireRelease)) rc = MDBX_RESULT_TRUE; - mdbx_trace("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", - pending_lck ? "mem" : "file", bait, - (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); + TRACE("uniq-peek: %s, bait 0x%016" PRIx64 ",%s rc %d", + pending_lck ? "mem" : "file", bait, + (rc == MDBX_RESULT_TRUE) ? " found," : (rc ? " FAILED," : ""), rc); return rc; } -static int uniq_poke(const mdbx_mmap_t *pending, mdbx_mmap_t *scan, +static int uniq_poke(const osal_mmap_t *pending, osal_mmap_t *scan, uint64_t *abra) { if (*abra == 0) { - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); uintptr_t uit = 0; memcpy(&uit, &tid, (sizeof(tid) < sizeof(uit)) ? 
sizeof(tid) : sizeof(uit)); - *abra = - rrxmrrxmsx_0(mdbx_osal_monotime() + UINT64_C(5873865991930747) * uit); + *abra = rrxmrrxmsx_0(osal_monotime() + UINT64_C(5873865991930747) * uit); } const uint64_t cadabra = - rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)mdbx_getpid()) + rrxmrrxmsx_0(*abra + UINT64_C(7680760450171793) * (unsigned)osal_getpid()) << 24 | *abra >> 40; MDBX_lockinfo *const scan_lck = scan->lck; - atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, - mo_SequentialConsistency); + atomic_store64(&scan_lck->mti_bait_uniqueness, cadabra, mo_AcquireRelease); *abra = *abra * UINT64_C(6364136223846793005) + 1; return uniq_peek(pending, scan); } -__cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { +__cold static int uniq_check(const osal_mmap_t *pending, MDBX_env **found) { *found = nullptr; uint64_t salt = 0; for (MDBX_env *scan = inprocess_lcklist_head; scan != RTHC_ENVLIST_END; @@ -1769,33 +1675,33 @@ __cold static int uniq_check(const mdbx_mmap_t *pending, MDBX_env **found) { : uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_ENODATA) { uint64_t length = 0; - if (likely(mdbx_filesize(pending->fd, &length) == MDBX_SUCCESS && + if (likely(osal_filesize(pending->fd, &length) == MDBX_SUCCESS && length == 0)) { /* LY: skip checking since LCK-file is empty, i.e. just created. 
*/ - mdbx_debug("uniq-probe: %s", "unique (new/empty lck)"); + DEBUG("uniq-probe: %s", "unique (new/empty lck)"); return MDBX_RESULT_TRUE; } } if (err == MDBX_RESULT_TRUE) err = uniq_poke(pending, &scan->me_lck_mmap, &salt); if (err == MDBX_RESULT_TRUE) { - (void)mdbx_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), - MDBX_SYNC_NONE); + (void)osal_msync(&scan->me_lck_mmap, 0, sizeof(MDBX_lockinfo), + MDBX_SYNC_KICK); err = uniq_poke(pending, &scan->me_lck_mmap, &salt); } if (err == MDBX_RESULT_TRUE) { err = uniq_poke(pending, &scan->me_lck_mmap, &salt); *found = scan; - mdbx_debug("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); + DEBUG("uniq-probe: found %p", __Wpedantic_format_voidptr(*found)); return MDBX_RESULT_FALSE; } if (unlikely(err != MDBX_SUCCESS)) { - mdbx_debug("uniq-probe: failed rc %d", err); + DEBUG("uniq-probe: failed rc %d", err); return err; } } - mdbx_debug("uniq-probe: %s", "unique"); + DEBUG("uniq-probe: %s", "unique"); return MDBX_RESULT_TRUE; } @@ -1803,8 +1709,8 @@ static int lcklist_detach_locked(MDBX_env *env) { MDBX_env *inprocess_neighbor = nullptr; int rc = MDBX_SUCCESS; if (env->me_lcklist_next != nullptr) { - mdbx_ensure(env, env->me_lcklist_next != nullptr); - mdbx_ensure(env, inprocess_lcklist_head != RTHC_ENVLIST_END); + ENSURE(env, env->me_lcklist_next != nullptr); + ENSURE(env, inprocess_lcklist_head != RTHC_ENVLIST_END); for (MDBX_env **ptr = &inprocess_lcklist_head; *ptr != RTHC_ENVLIST_END; ptr = &(*ptr)->me_lcklist_next) { if (*ptr == env) { @@ -1813,16 +1719,16 @@ static int lcklist_detach_locked(MDBX_env *env) { break; } } - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); } - rc = likely(mdbx_getpid() == env->me_pid) + rc = likely(osal_getpid() == env->me_pid) ? 
uniq_check(&env->me_lck_mmap, &inprocess_neighbor) : MDBX_PANIC; if (!inprocess_neighbor && env->me_live_reader) - (void)mdbx_rpid_clear(env); + (void)osal_rpid_clear(env); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_lck_destroy(env, inprocess_neighbor); + rc = osal_lck_destroy(env, inprocess_neighbor); return rc; } @@ -1831,13 +1737,24 @@ static int lcklist_detach_locked(MDBX_env *env) { * and network-sort for small chunks. * Thanks to John M. Gamble for the http://pages.ripco.net/~jgamble/nw.html */ +#if MDBX_HAVE_CMOV #define SORT_CMP_SWAP(TYPE, CMP, a, b) \ do { \ const TYPE swap_tmp = (a); \ - const bool swap_cmp = CMP(swap_tmp, b); \ + const bool swap_cmp = expect_with_probability(CMP(swap_tmp, b), 0, .5); \ (a) = swap_cmp ? swap_tmp : b; \ (b) = swap_cmp ? b : swap_tmp; \ } while (0) +#else +#define SORT_CMP_SWAP(TYPE, CMP, a, b) \ + do \ + if (expect_with_probability(!CMP(a, b), 0, .5)) { \ + const TYPE swap_tmp = (a); \ + (a) = (b); \ + (b) = swap_tmp; \ + } \ + while (0) +#endif // 3 comparators, 3 parallel operations // o-----^--^--o @@ -2028,686 +1945,10 @@ static int lcklist_detach_locked(MDBX_env *env) { SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ } while (0) -// 25 comparators, 9 parallel operations -// o--^-----^--^-----^-----------------------------------o -// | | | | -// o--v--^--v--|-----|--^-----^-----------^--------------o -// | | | | | | -// o-----v-----|-----|--|-----|--^-----^--|--^-----^--^--o -// | | | | | | | | | | -// o--^-----^--v--^--v--|-----|--|-----|--v--|-----|--v--o -// | | | | | | | | | -// o--v--^--v-----|-----v--^--v--|-----|-----|--^--v-----o -// | | | | | | | -// o-----v--------|--------|-----v--^--v--^--|--|--^-----o -// | | | | | | | -// o--^-----^-----v--------|--------|-----|--v--v--v-----o -// | | | | | -// o--v--^--v--------------v--------|-----v--------------o -// | | -// o-----v--------------------------v--------------------o -// -// [[0,1],[3,4],[6,7]] -// [[1,2],[4,5],[7,8]] -// [[0,1],[3,4],[6,7],[2,5]] -// 
[[0,3],[1,4],[5,8]] -// [[3,6],[4,7],[2,5]] -// [[0,3],[1,4],[5,7],[2,6]] -// [[1,3],[4,6]] -// [[2,4],[5,6]] -// [[2,3]] -#define SORT_NETWORK_9(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - } while (0) - -// 29 comparators, 9 parallel operations -// o--------------^-----^--^--^-----------------------o -// | | | | -// o-----------^--|--^--|--|--v--^--------^-----------o -// | | | | | | | -// o--------^--|--|--|--|--v--^--v-----^--|--^--------o -// | | | | | | | | | -// o-----^--|--|--|--|--v--^--|-----^--|--v--v--^-----o -// | | | | | | | | | | -// o--^--|--|--|--|--v-----|--v--^--|--|--^-----v--^--o -// | | | | | | | | | | | -// o--|--|--|--|--v--^-----|--^--|--v--v--|-----^--v--o -// | | | | | | | | | | -// 
o--|--|--|--v--^--|-----v--|--v--^-----|--^--v-----o -// | | | | | | | | | -// o--|--|--v-----|--|--^-----v--^--|-----v--v--------o -// | | | | | | | -// o--|--v--------|--v--|--^-----v--v-----------------o -// | | | | -// o--v-----------v-----v--v--------------------------o -// -// [[4,9],[3,8],[2,7],[1,6],[0,5]] -// [[1,4],[6,9],[0,3],[5,8]] -// [[0,2],[3,6],[7,9]] -// [[0,1],[2,4],[5,7],[8,9]] -// [[1,2],[4,6],[7,8],[3,5]] -// [[2,5],[6,8],[1,3],[4,7]] -// [[2,3],[6,7]] -// [[3,4],[5,6]] -// [[4,5]] -#define SORT_NETWORK_10(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], 
begin[5]); \ - } while (0) - -// 35 comparators, 9 parallel operations -// o--^-----^-----------------^--------^--------------------o -// | | | | -// o--v--^--|--^--^--------^--|--------|--^-----------------o -// | | | | | | | | -// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o -// | | | | | | | | | | -// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o -// | | | | | | | | | | | -// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o -// | | | | | | | | | -// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o -// | | | | | | | | | -// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o -// | | | | | | | | | -// o--v--v--------|--|-----|-----v--|--^-----|-----^--|--^--o -// | | | | | | | | | -// o--^--^--------|--|-----|--------v--|-----v--^--|--v--v--o -// | | | | | | | | -// o--v--|--^-----|--v-----|-----------|--------v--v--------o -// | | | | | -// o-----v--v-----v--------v-----------v--------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9]] -// [[1,3],[5,7],[0,2],[4,6],[8,10]] -// [[1,2],[5,6],[9,10],[0,4],[3,7]] -// [[1,5],[6,10],[4,8]] -// [[5,9],[2,6],[0,4],[3,8]] -// [[1,5],[6,10],[2,3],[8,9]] -// [[1,4],[7,10],[3,5],[6,8]] -// [[2,4],[7,9],[5,6]] -// [[3,4],[7,8]] -#define SORT_NETWORK_11(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - } while (0) - -// 39 comparators, parallel operations -// o--^-----^-----------------^--------^--------------------o -// | | | | -// o--v--^--|--^--^--------^--|--------|--^-----------------o -// | | | | | | | | -// o--^--|--v--v--|-----^--|--|--------|--|-----^--^--------o -// | | | | | | | | | | -// o--v--v--------|-----|--|--|--^-----|--|--^--v--|--^--^--o -// | | | | | | | | | | | -// o--^-----^-----|-----|--|--v--|--^--v--v--|-----v--|--v--o -// | | | | | | | | | -// o--v--^--|--^--v--^--|--v-----|--|--------|--------v--^--o -// | | | | | | | | | -// o--^--|--v--v--^--|--v--^-----|--|--------|--------^--v--o -// | | | | | | | | | -// o--v--v--------|--|-----|--^--v--|--^--^--|-----^--|--^--o -// | | | | | | | | | | | -// o--^-----^-----|--|-----|--|-----v--|--|--v--^--|--v--v--o -// | | | | | | | | | | -// o--v--^--|--^--|--v-----|--|--------|--|-----v--v--------o -// | | | | | | | | -// 
o--^--|--v--v--v--------v--|--------|--v-----------------o -// | | | | -// o--v--v--------------------v--------v--------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11]] -// [[1,3],[5,7],[9,11],[0,2],[4,6],[8,10]] -// [[1,2],[5,6],[9,10],[0,4],[7,11]] -// [[1,5],[6,10],[3,7],[4,8]] -// [[5,9],[2,6],[0,4],[7,11],[3,8]] -// [[1,5],[6,10],[2,3],[8,9]] -// [[1,4],[7,10],[3,5],[6,8]] -// [[2,4],[7,9],[5,6]] -// [[3,4],[7,8]] -#define SORT_NETWORK_12(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ 
- SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - } while (0) - -// 45 comparators, 10 parallel operations -// o--------^--^-----^-----------------------------^-----------------o -// | | | | -// o--^-----|--v-----|-----^--------------^-----^--|-----^-----------o -// | | | | | | | | -// o--|-----|--^--^--v-----|--------------|--^--|--|--^--v--^--------o -// | | | | | | | | | | | -// o--|--^--|--|--v-----^--|--------^-----|--|--v--|--|--^--v-----^--o -// | | | | | | | | | | | | | -// o--|--v--|--|--^-----|--v-----^--v-----|--|--^--|--|--|--^--^--v--o -// | | | | | | | | | | | | | | -// o--|--^--|--|--|--^--|--------|-----^--|--|--|--v--v--v--|--v--^--o -// | | | | | | | | | | | | | | -// o--|--|--|--v--v--|--|--^-----|--^--v--|--v--|--^--------v--^--v--o -// | | | | | | | | | | | | -// o--v--|--|-----^--|--v--|--^--|--|-----v-----v--|--^--------v-----o -// | | | | | | | | | | -// o-----v--|--^--|--|-----|--v--|--|--^-----^-----v--v--^-----------o -// | | | | | | | | | | -// o--^-----|--|--|--v-----|-----v--|--v--^--|--^--------v-----------o -// | | | | | | | | | -// o--|-----|--|--|--^-----|--------v--^--|--v--v--------------------o -// | | | | | | | | -// o--v-----|--v--|--v-----|--^--------v--v--------------------------o -// | | | | -// o--------v-----v--------v--v--------------------------------------o -// -// [[1,7],[9,11],[3,4],[5,8],[0,12],[2,6]] -// [[0,1],[2,3],[4,6],[8,11],[7,12],[5,9]] -// [[0,2],[3,7],[10,11],[1,4],[6,12]] -// [[7,8],[11,12],[4,9],[6,10]] -// [[3,4],[5,6],[8,9],[10,11],[1,7]] -// [[2,6],[9,11],[1,3],[4,7],[8,10],[0,5]] -// [[2,5],[6,8],[9,10]] 
-// [[1,2],[3,5],[7,8],[4,6]] -// [[2,3],[4,5],[6,7],[8,9]] -// [[3,4],[5,6]] -#define SORT_NETWORK_13(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - } while (0) - -/* *INDENT-OFF* */ -/* clang-format off */ - -// 51 comparators, 10 parallel operations -// o--^--^-----^-----------^-----------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^-----------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v-----------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^--------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^-----------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^--^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | -// o--^--v--|--|--|--------------v--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | -// o--v-----v--|--|-----------------v--|--|--------|--v-----|--^--------|--|--^--------o -// | | | | | | | | | | -// 
o--^--------v--|--------------------v--|--------v--------|--|--------v--v--v--------o -// | | | | | -// o--v-----------v-----------------------v-----------------v--v-----------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] -// [[0,2],[4,6],[8,10],[1,3],[5,7],[9,11]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[3,7]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13]] -// [[5,10],[6,9],[3,12],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - -/* *INDENT-ON* */ -/* clang-format on */ - -#define SORT_NETWORK_14(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - -/* *INDENT-OFF* */ -/* clang-format off */ - -// 56 comparators, 10 parallel operations -// o--^--^-----^-----------^--------------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^--------------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v--------------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^-----------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^--------------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// 
o--v--|--^-----v--|--|--|--|--|--|--|--^-----^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--^--|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | | -// o--^--v--|--|--|--^-----------v--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | | | -// o--v-----v--|--|--|--------------v--|--|--|--------|--v-----|--^--^-----|--|--^--------o -// | | | | | | | | | | | | | -// o--^--^-----v--|--|-----------------v--|--|--------v--------|--|--|-----v--v--v--------o -// | | | | | | | | | -// o--v--|--------v--|--------------------v--|--^--------------v--|--v--------------------o -// | | | | | -// o-----v-----------v-----------------------v--v-----------------v-----------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] -// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14]] -// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - -/* *INDENT-ON* */ -/* clang-format on */ - -#define SORT_NETWORK_15(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); 
\ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - 
SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - -/* *INDENT-OFF* */ -/* clang-format off */ - -// 60 comparators, 10 parallel operations -// o--^--^-----^-----------^-----------------------------------------------------------------o -// | | | | -// o--v--|--^--|--^--------|--^-----^-----------------------------^--------------------------o -// | | | | | | | | -// o--^--v--|--|--|--^-----|--|--^--v-----------------------------|--^--^--------------------o -// | | | | | | | | | | | -// o--v-----v--|--|--|--^--|--|--|--^--------------------^--------|--|--|--^--^--^-----------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--|--|--|--|--^-----------------|-----^--v--|--v--|--|--v-----------o -// | | | | | | | | | | | | | | | -// o--v--|--^-----v--|--|--|--|--|--|--|--^--------^-----|-----|-----|--^--|--v-----^--------o -// | | | | | | | | | | | | | | | | | -// o--^--v--|--------v--|--|--|--|--|--|--|--^-----|--^--|-----|-----|--v--|-----^--v-----^--o -// | | | | | | | | | | | | | | | | | | -// o--v-----v-----------v--|--|--|--|--|--|--|--^--|--|--|--^--|--^--|-----|--^--|--^--^--v--o -// | | | | | | | | | | | | | | | | | | | | -// o--^--^-----^-----------v--|--|--|--|--|--|--|--|--|--|--|--v--|--v-----v--|--v--|--v--^--o -// | | | | | | | | | | | | | | | | | | -// o--v--|--^--|--^-----------v--|--|--|--|--|--|--|--v--|--|-----|--^--------|-----v--^--v--o -// | | | | | | | | | | | | | | | | | -// 
o--^--v--|--|--|--^-----------v--|--|--|--|--|--v-----|--|-----|--v--------|--^-----v-----o -// | | | | | | | | | | | | | | | -// o--v-----v--|--|--|--^-----------v--|--|--|--|--------|--v-----|--^--^-----|--|--^--------o -// | | | | | | | | | | | | | | | -// o--^--^-----v--|--|--|--------------v--|--|--|--------v--------|--|--|-----v--v--v--------o -// | | | | | | | | | | | -// o--v--|--^-----v--|--|-----------------v--|--|--^--------------v--|--v--------------------o -// | | | | | | | | -// o--^--v--|--------v--|--------------------v--|--v-----------------v-----------------------o -// | | | | -// o--v-----v-----------v-----------------------v--------------------------------------------o -// -// [[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13],[14,15]] -// [[0,2],[4,6],[8,10],[12,14],[1,3],[5,7],[9,11],[13,15]] -// [[0,4],[8,12],[1,5],[9,13],[2,6],[10,14],[3,7],[11,15]] -// [[0,8],[1,9],[2,10],[3,11],[4,12],[5,13],[6,14],[7,15]] -// [[5,10],[6,9],[3,12],[13,14],[7,11],[1,2],[4,8]] -// [[1,4],[7,13],[2,8],[11,14],[5,6],[9,10]] -// [[2,4],[11,13],[3,8],[7,12]] -// [[6,8],[10,12],[3,5],[7,9]] -// [[3,4],[5,6],[7,8],[9,10],[11,12]] -// [[6,7],[8,9]] - -/* *INDENT-ON* */ -/* clang-format on */ - -#define SORT_NETWORK_16(TYPE, CMP, begin) \ - do { \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[1]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[14], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[12], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[3]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, 
begin[9], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[0], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[15]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[13], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[11]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[2]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[4], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[1], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[14]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[2], begin[4]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[13]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[10], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[5]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[9]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[3], begin[4]); 
\ - SORT_CMP_SWAP(TYPE, CMP, begin[5], begin[6]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[7], begin[8]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[9], begin[10]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[11], begin[12]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[6], begin[7]); \ - SORT_CMP_SWAP(TYPE, CMP, begin[8], begin[9]); \ - } while (0) - #define SORT_INNER(TYPE, CMP, begin, end, len) \ switch (len) { \ default: \ + assert(false); \ __unreachable(); \ case 0: \ case 1: \ @@ -2733,30 +1974,6 @@ static int lcklist_detach_locked(MDBX_env *env) { case 8: \ SORT_NETWORK_8(TYPE, CMP, begin); \ break; \ - case 9: \ - SORT_NETWORK_9(TYPE, CMP, begin); \ - break; \ - case 10: \ - SORT_NETWORK_10(TYPE, CMP, begin); \ - break; \ - case 11: \ - SORT_NETWORK_11(TYPE, CMP, begin); \ - break; \ - case 12: \ - SORT_NETWORK_12(TYPE, CMP, begin); \ - break; \ - case 13: \ - SORT_NETWORK_13(TYPE, CMP, begin); \ - break; \ - case 14: \ - SORT_NETWORK_14(TYPE, CMP, begin); \ - break; \ - case 15: \ - SORT_NETWORK_15(TYPE, CMP, begin); \ - break; \ - case 16: \ - SORT_NETWORK_16(TYPE, CMP, begin); \ - break; \ } #define SORT_SWAP(TYPE, a, b) \ @@ -2784,7 +2001,7 @@ static int lcklist_detach_locked(MDBX_env *env) { \ static __inline bool NAME##_is_sorted(const TYPE *first, const TYPE *last) { \ while (++first <= last) \ - if (CMP(first[0], first[-1])) \ + if (expect_with_probability(CMP(first[0], first[-1]), 1, .1)) \ return false; \ return true; \ } \ @@ -2793,14 +2010,15 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *lo, *hi; \ } NAME##_stack; \ \ - static __hot void NAME(TYPE *const begin, TYPE *const end) { \ - NAME##_stack stack[sizeof(unsigned) * CHAR_BIT], *top = stack; \ + __hot static void NAME(TYPE *const __restrict begin, \ + TYPE *const __restrict end) { \ + NAME##_stack stack[sizeof(size_t) * CHAR_BIT], *__restrict top = stack; \ \ - TYPE *hi = end - 1; \ - TYPE *lo = begin; \ + TYPE *__restrict hi = end - 1; \ + TYPE *__restrict lo = begin; \ while (true) { \ const ptrdiff_t len = 
hi - lo; \ - if (len < 16) { \ + if (len < 8) { \ SORT_INNER(TYPE, CMP, lo, hi + 1, len + 1); \ if (unlikely(top == stack)) \ break; \ @@ -2808,7 +2026,7 @@ static int lcklist_detach_locked(MDBX_env *env) { continue; \ } \ \ - TYPE *mid = lo + (len >> 1); \ + TYPE *__restrict mid = lo + (len >> 1); \ SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ SORT_CMP_SWAP(TYPE, CMP, *mid, *hi); \ SORT_CMP_SWAP(TYPE, CMP, *lo, *mid); \ @@ -2816,9 +2034,9 @@ static int lcklist_detach_locked(MDBX_env *env) { TYPE *right = hi - 1; \ TYPE *left = lo + 1; \ while (1) { \ - while (CMP(*left, *mid)) \ + while (expect_with_probability(CMP(*left, *mid), 0, .5)) \ ++left; \ - while (CMP(*mid, *right)) \ + while (expect_with_probability(CMP(*mid, *right), 0, .5)) \ --right; \ if (unlikely(left > right)) { \ if (EXPECT_LOW_CARDINALITY_OR_PRESORTED) { \ @@ -2844,7 +2062,7 @@ static int lcklist_detach_locked(MDBX_env *env) { } \ } \ \ - if (mdbx_audit_enabled()) { \ + if (AUDIT_ENABLED()) { \ for (TYPE *scan = begin + 1; scan < end; ++scan) \ assert(CMP(scan[-1], scan[0])); \ } \ @@ -2855,59 +2073,58 @@ static int lcklist_detach_locked(MDBX_env *env) { #define RADIXSORT_IMPL(NAME, TYPE, EXTRACT_KEY, BUFFER_PREALLOCATED, END_GAP) \ \ - __hot static bool NAME##_radixsort(TYPE *const begin, \ - const unsigned length) { \ + __hot static bool NAME##_radixsort(TYPE *const begin, const size_t length) { \ TYPE *tmp; \ if (BUFFER_PREALLOCATED) { \ tmp = begin + length + END_GAP; \ /* memset(tmp, 0xDeadBeef, sizeof(TYPE) * length); */ \ } else { \ - tmp = mdbx_malloc(sizeof(TYPE) * length); \ + tmp = osal_malloc(sizeof(TYPE) * length); \ if (unlikely(!tmp)) \ return false; \ } \ \ - unsigned key_shift = 0, key_diff_mask; \ + size_t key_shift = 0, key_diff_mask; \ do { \ struct { \ - unsigned a[256], b[256]; \ + pgno_t a[256], b[256]; \ } counters; \ memset(&counters, 0, sizeof(counters)); \ \ key_diff_mask = 0; \ - unsigned prev_key = EXTRACT_KEY(begin) >> key_shift; \ + size_t prev_key = EXTRACT_KEY(begin) 
>> key_shift; \ TYPE *r = begin, *end = begin + length; \ do { \ - const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ counters.a[key & 255]++; \ counters.b[(key >> 8) & 255]++; \ key_diff_mask |= prev_key ^ key; \ prev_key = key; \ } while (++r != end); \ \ - unsigned ta = 0, tb = 0; \ - for (unsigned i = 0; i < 256; ++i) { \ - const unsigned ia = counters.a[i]; \ + pgno_t ta = 0, tb = 0; \ + for (size_t i = 0; i < 256; ++i) { \ + const pgno_t ia = counters.a[i]; \ counters.a[i] = ta; \ ta += ia; \ - const unsigned ib = counters.b[i]; \ + const pgno_t ib = counters.b[i]; \ counters.b[i] = tb; \ tb += ib; \ } \ \ r = begin; \ do { \ - const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ tmp[counters.a[key & 255]++] = *r; \ } while (++r != end); \ \ if (unlikely(key_diff_mask < 256)) { \ - memcpy(begin, tmp, (char *)end - (char *)begin); \ + memcpy(begin, tmp, ptr_dist(end, begin)); \ break; \ } \ end = (r = tmp) + length; \ do { \ - const unsigned key = EXTRACT_KEY(r) >> key_shift; \ + const size_t key = EXTRACT_KEY(r) >> key_shift; \ begin[counters.b[(key >> 8) & 255]++] = *r; \ } while (++r != end); \ \ @@ -2915,154 +2132,181 @@ static int lcklist_detach_locked(MDBX_env *env) { } while (key_diff_mask >> 16); \ \ if (!(BUFFER_PREALLOCATED)) \ - mdbx_free(tmp); \ + osal_free(tmp); \ return true; \ } /*------------------------------------------------------------------------------ * LY: Binary search */ +#if defined(__clang__) && __clang_major__ > 4 && defined(__ia32__) +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do \ + __asm __volatile("" \ + : "+r"(size) \ + : "r" /* the `b` constraint is more suitable here, but \ + cause CLANG to allocate and push/pop an one more \ + register, so using the `r` which avoids this. 
*/ \ + (flag)); \ + while (0) +#else +#define WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(size, flag) \ + do { \ + /* nope for non-clang or non-x86 */; \ + } while (0) +#endif /* Workaround for CLANG */ + +#define BINARY_SEARCH_STEP(TYPE_LIST, CMP, it, size, key) \ + do { \ + } while (0) + +/* *INDENT-OFF* */ +/* clang-format off */ #define SEARCH_IMPL(NAME, TYPE_LIST, TYPE_ARG, CMP) \ static __always_inline const TYPE_LIST *NAME( \ - const TYPE_LIST *first, unsigned length, const TYPE_ARG item) { \ - const TYPE_LIST *const begin = first, *const end = begin + length; \ + const TYPE_LIST *it, size_t length, const TYPE_ARG item) { \ + const TYPE_LIST *const begin = it, *const end = begin + length; \ \ - while (length > 3) { \ - const unsigned whole = length; \ - length >>= 1; \ - const TYPE_LIST *const middle = first + length; \ - const unsigned left = whole - length - 1; \ - const bool cmp = CMP(*middle, item); \ - length = cmp ? left : length; \ - first = cmp ? middle + 1 : first; \ - } \ + if (MDBX_HAVE_CMOV) \ + do { \ + /* Адаптивно-упрощенный шаг двоичного поиска: \ + * - без переходов при наличии cmov или аналога; \ + * - допускает лишние итерации; \ + * - но ищет пока size > 2, что требует дозавершения поиска \ + * среди остающихся 0-1-2 элементов. */ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + WORKAROUND_FOR_CLANG_OPTIMIZER_BUG(length, flag); \ + it = flag ? middle : it; \ + } while (length > 2); \ + else \ + while (length > 2) { \ + /* Вариант с использованием условного перехода. Основное отличие в \ + * том, что при "не равно" (true от компаратора) переход делается на 1 \ + * ближе к концу массива. Алгоритмически это верно и обеспечивает \ + * чуть-чуть более быструю сходимость, но зато требует больше \ + * вычислений при true от компаратора. Также ВАЖНО(!) не допускается \ + * спекулятивное выполнение при size == 0. 
*/ \ + const TYPE_LIST *const middle = it + (length >> 1); \ + length = (length + 1) >> 1; \ + const bool flag = expect_with_probability(CMP(*middle, item), 0, .5); \ + if (flag) { \ + it = middle + 1; \ + length -= 1; \ + } \ + } \ + it += length > 1 && expect_with_probability(CMP(*it, item), 0, .5); \ + it += length > 0 && expect_with_probability(CMP(*it, item), 0, .5); \ \ - switch (length) { \ - case 3: \ - if (!CMP(*first, item)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 2: \ - if (!CMP(*first, item)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 1: \ - if (!CMP(*first, item)) \ - break; \ - ++first; \ - __fallthrough /* fall through */; \ - case 0: \ - break; \ - default: \ - __unreachable(); \ - } \ - \ - if (mdbx_audit_enabled()) { \ - for (const TYPE_LIST *scan = begin; scan < first; ++scan) \ + if (AUDIT_ENABLED()) { \ + for (const TYPE_LIST *scan = begin; scan < it; ++scan) \ assert(CMP(*scan, item)); \ - for (const TYPE_LIST *scan = first; scan < end; ++scan) \ + for (const TYPE_LIST *scan = it; scan < end; ++scan) \ assert(!CMP(*scan, item)); \ (void)begin, (void)end; \ } \ \ - return first; \ + return it; \ } +/* *INDENT-ON* */ +/* clang-format on */ /*----------------------------------------------------------------------------*/ -static __always_inline size_t pnl2bytes(size_t size) { +static __always_inline size_t pnl_size2bytes(size_t size) { assert(size > 0 && size <= MDBX_PGL_LIMIT); #if MDBX_PNL_PREALLOC_FOR_RADIXSORT size += size; #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ STATIC_ASSERT(MDBX_ASSUME_MALLOC_OVERHEAD + (MDBX_PGL_LIMIT * (MDBX_PNL_PREALLOC_FOR_RADIXSORT + 1) + - MDBX_PNL_GRANULATE + 2) * + MDBX_PNL_GRANULATE + 3) * sizeof(pgno_t) < SIZE_MAX / 4 * 3); size_t bytes = - ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 2), + ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(pgno_t) * (size + 3), MDBX_PNL_GRANULATE * sizeof(pgno_t)) - 
MDBX_ASSUME_MALLOC_OVERHEAD; return bytes; } -static __always_inline pgno_t bytes2pnl(const size_t bytes) { +static __always_inline pgno_t pnl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(pgno_t); - assert(size > 2 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); - size -= 2; + assert(size > 3 && size <= MDBX_PGL_LIMIT + /* alignment gap */ 65536); + size -= 3; #if MDBX_PNL_PREALLOC_FOR_RADIXSORT size >>= 1; #endif /* MDBX_PNL_PREALLOC_FOR_RADIXSORT */ return (pgno_t)size; } -static MDBX_PNL mdbx_pnl_alloc(size_t size) { - size_t bytes = pnl2bytes(size); - MDBX_PNL pl = mdbx_malloc(bytes); +static MDBX_PNL pnl_alloc(size_t size) { + size_t bytes = pnl_size2bytes(size); + MDBX_PNL pl = osal_malloc(bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - pl[0] = bytes2pnl(bytes); + pl[0] = pnl_bytes2size(bytes); assert(pl[0] >= size); - pl[1] = 0; pl += 1; + *pl = 0; } return pl; } -static void mdbx_pnl_free(MDBX_PNL pl) { +static void pnl_free(MDBX_PNL pl) { if (likely(pl)) - mdbx_free(pl - 1); + osal_free(pl - 1); } /* Shrink the PNL to the default size if it has grown larger */ -static void mdbx_pnl_shrink(MDBX_PNL *ppl) { - assert(bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && - bytes2pnl(pnl2bytes(MDBX_PNL_INITIAL)) < MDBX_PNL_INITIAL * 3 / 2); - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); - MDBX_PNL_SIZE(*ppl) = 0; +static void pnl_shrink(MDBX_PNL *ppl) { + assert(pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) >= MDBX_PNL_INITIAL && + pnl_bytes2size(pnl_size2bytes(MDBX_PNL_INITIAL)) < + MDBX_PNL_INITIAL * 3 / 2); + assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); + MDBX_PNL_SETSIZE(*ppl, 0); if (unlikely(MDBX_PNL_ALLOCLEN(*ppl) > - MDBX_PNL_INITIAL * 2 - MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { - 
size_t bytes = pnl2bytes(MDBX_PNL_INITIAL); - MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + MDBX_PNL_INITIAL * (MDBX_PNL_PREALLOC_FOR_RADIXSORT ? 8 : 4) - + MDBX_CACHELINE_SIZE / sizeof(pgno_t))) { + size_t bytes = pnl_size2bytes(MDBX_PNL_INITIAL * 2); + MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - *pl = bytes2pnl(bytes); + *pl = pnl_bytes2size(bytes); *ppl = pl + 1; } } } /* Grow the PNL to the size growed to at least given size */ -static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { +static int pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { const size_t allocated = MDBX_PNL_ALLOCLEN(*ppl); - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); + assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); if (likely(allocated >= wanna)) return MDBX_SUCCESS; if (unlikely(wanna > /* paranoia */ MDBX_PGL_LIMIT)) { - mdbx_error("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); + ERROR("PNL too long (%zu > %zu)", wanna, (size_t)MDBX_PGL_LIMIT); return MDBX_TXN_FULL; } const size_t size = (wanna + wanna - allocated < MDBX_PGL_LIMIT) ? 
wanna + wanna - allocated : MDBX_PGL_LIMIT; - size_t bytes = pnl2bytes(size); - MDBX_PNL pl = mdbx_realloc(*ppl - 1, bytes); + size_t bytes = pnl_size2bytes(size); + MDBX_PNL pl = osal_realloc(*ppl - 1, bytes); if (likely(pl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(pl); #endif /* malloc_usable_size */ - *pl = bytes2pnl(bytes); + *pl = pnl_bytes2size(bytes); assert(*pl >= wanna); *ppl = pl + 1; return MDBX_SUCCESS; @@ -3071,46 +2315,47 @@ static int mdbx_pnl_reserve(MDBX_PNL *ppl, const size_t wanna) { } /* Make room for num additional elements in an PNL */ -static __always_inline int __must_check_result mdbx_pnl_need(MDBX_PNL *ppl, - size_t num) { - assert(MDBX_PNL_SIZE(*ppl) <= MDBX_PGL_LIMIT && - MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_SIZE(*ppl)); +static __always_inline int __must_check_result pnl_need(MDBX_PNL *ppl, + size_t num) { + assert(MDBX_PNL_GETSIZE(*ppl) <= MDBX_PGL_LIMIT && + MDBX_PNL_ALLOCLEN(*ppl) >= MDBX_PNL_GETSIZE(*ppl)); assert(num <= MDBX_PGL_LIMIT); - const size_t wanna = MDBX_PNL_SIZE(*ppl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) - ? MDBX_SUCCESS - : mdbx_pnl_reserve(ppl, wanna); + const size_t wanna = MDBX_PNL_GETSIZE(*ppl) + num; + return likely(MDBX_PNL_ALLOCLEN(*ppl) >= wanna) ? 
MDBX_SUCCESS + : pnl_reserve(ppl, wanna); } -static __always_inline void mdbx_pnl_xappend(MDBX_PNL pl, pgno_t pgno) { - assert(MDBX_PNL_SIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); - if (mdbx_audit_enabled()) { - for (unsigned i = MDBX_PNL_SIZE(pl); i > 0; --i) +static __always_inline void pnl_xappend(MDBX_PNL pl, pgno_t pgno) { + assert(MDBX_PNL_GETSIZE(pl) < MDBX_PNL_ALLOCLEN(pl)); + if (AUDIT_ENABLED()) { + for (size_t i = MDBX_PNL_GETSIZE(pl); i > 0; --i) assert(pgno != pl[i]); } - MDBX_PNL_SIZE(pl) += 1; + *pl += 1; MDBX_PNL_LAST(pl) = pgno; } /* Append an pgno range onto an unsorted PNL */ -__always_inline static int __must_check_result -mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { +__always_inline static int __must_check_result pnl_append_range(bool spilled, + MDBX_PNL *ppl, + pgno_t pgno, + size_t n) { assert(n > 0); - int rc = mdbx_pnl_need(ppl, n); + int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; const MDBX_PNL pnl = *ppl; #if MDBX_PNL_ASCENDING - unsigned w = MDBX_PNL_SIZE(pnl); + size_t w = MDBX_PNL_GETSIZE(pnl); do { pnl[++w] = pgno; pgno += spilled ? 2 : 1; } while (--n); - MDBX_PNL_SIZE(pnl) = w; + MDBX_PNL_SETSIZE(pnl, w); #else - unsigned w = MDBX_PNL_SIZE(pnl) + n; - MDBX_PNL_SIZE(pnl) = w; + size_t w = MDBX_PNL_GETSIZE(pnl) + n; + MDBX_PNL_SETSIZE(pnl, w); do { pnl[w--] = pgno; pgno += spilled ? 
2 : 1; @@ -3121,17 +2366,16 @@ mdbx_pnl_append_range(bool spilled, MDBX_PNL *ppl, pgno_t pgno, unsigned n) { } /* Append an pgno range into the sorted PNL */ -static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, - pgno_t pgno, - unsigned n) { +__hot static int __must_check_result pnl_insert_range(MDBX_PNL *ppl, + pgno_t pgno, size_t n) { assert(n > 0); - int rc = mdbx_pnl_need(ppl, n); + int rc = pnl_need(ppl, n); if (unlikely(rc != MDBX_SUCCESS)) return rc; const MDBX_PNL pnl = *ppl; - unsigned r = MDBX_PNL_SIZE(pnl), w = r + n; - MDBX_PNL_SIZE(pnl) = w; + size_t r = MDBX_PNL_GETSIZE(pnl), w = r + n; + MDBX_PNL_SETSIZE(pnl, w); while (r && MDBX_PNL_DISORDERED(pnl[r], pgno)) pnl[w--] = pnl[r--]; @@ -3141,103 +2385,147 @@ static __hot int __must_check_result mdbx_pnl_insert_range(MDBX_PNL *ppl, return MDBX_SUCCESS; } -static bool mdbx_pnl_check(const MDBX_PNL pl, const size_t limit) { +__hot static bool pnl_check(const pgno_t *pl, const size_t limit) { assert(limit >= MIN_PAGENO - MDBX_ENABLE_REFUND); - if (likely(MDBX_PNL_SIZE(pl))) { - assert(MDBX_PNL_LEAST(pl) >= MIN_PAGENO); - assert(MDBX_PNL_MOST(pl) < limit); - assert(MDBX_PNL_SIZE(pl) <= MDBX_PGL_LIMIT); - if (unlikely(MDBX_PNL_SIZE(pl) > MDBX_PGL_LIMIT)) + if (likely(MDBX_PNL_GETSIZE(pl))) { + if (unlikely(MDBX_PNL_GETSIZE(pl) > MDBX_PGL_LIMIT)) return false; if (unlikely(MDBX_PNL_LEAST(pl) < MIN_PAGENO)) return false; if (unlikely(MDBX_PNL_MOST(pl) >= limit)) return false; - if (mdbx_audit_enabled()) { - for (const pgno_t *scan = &MDBX_PNL_LAST(pl); --scan > pl;) { - assert(MDBX_PNL_ORDERED(scan[0], scan[1])); - if (unlikely(!MDBX_PNL_ORDERED(scan[0], scan[1]))) + + if ((!MDBX_DISABLE_VALIDATION || AUDIT_ENABLED()) && + likely(MDBX_PNL_GETSIZE(pl) > 1)) { + const pgno_t *scan = MDBX_PNL_BEGIN(pl); + const pgno_t *const end = MDBX_PNL_END(pl); + pgno_t prev = *scan++; + do { + if (unlikely(!MDBX_PNL_ORDERED(prev, *scan))) return false; - } + prev = *scan; + } while (likely(++scan != 
end)); } } return true; } -static __always_inline bool mdbx_pnl_check4assert(const MDBX_PNL pl, - const size_t limit) { - if (unlikely(pl == nullptr)) - return true; - assert(MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_SIZE(pl)); - if (unlikely(MDBX_PNL_ALLOCLEN(pl) < MDBX_PNL_SIZE(pl))) - return false; - return mdbx_pnl_check(pl, limit); +static __always_inline bool pnl_check_allocated(const pgno_t *pl, + const size_t limit) { + return pl == nullptr || (MDBX_PNL_ALLOCLEN(pl) >= MDBX_PNL_GETSIZE(pl) && + pnl_check(pl, limit)); } -/* Merge an PNL onto an PNL. The destination PNL must be big enough */ -static void __hot mdbx_pnl_xmerge(MDBX_PNL dst, const MDBX_PNL src) { - assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); - assert(mdbx_pnl_check(src, MAX_PAGENO + 1)); - const size_t total = MDBX_PNL_SIZE(dst) + MDBX_PNL_SIZE(src); +static __always_inline void +pnl_merge_inner(pgno_t *__restrict dst, const pgno_t *__restrict src_a, + const pgno_t *__restrict src_b, + const pgno_t *__restrict const src_b_detent) { + do { +#if MDBX_HAVE_CMOV + const bool flag = MDBX_PNL_ORDERED(*src_b, *src_a); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) + // lcc 1.26: 13ШК (подготовка и первая итерация) + 7ШК (цикл), БЕЗ loop-mode + // gcc>=7: cmp+jmp с возвратом в тело цикла (WTF?) + // gcc<=6: cmov×3 + // clang<=12: cmov×3 + // clang>=13: cmov, set+add/sub + *dst = flag ? *src_a-- : *src_b--; +#else + // gcc: cmov, cmp+set+add/sub + // clang<=5: cmov×2, set+add/sub + // clang>=6: cmov, set+add/sub + *dst = flag ? *src_a : *src_b; + src_b += (ptrdiff_t)flag - 1; + src_a -= flag; +#endif + --dst; +#else /* MDBX_HAVE_CMOV */ + while (MDBX_PNL_ORDERED(*src_b, *src_a)) + *dst-- = *src_a--; + *dst-- = *src_b--; +#endif /* !MDBX_HAVE_CMOV */ + } while (likely(src_b > src_b_detent)); +} + +/* Merge a PNL onto a PNL. 
The destination PNL must be big enough */ +__hot static size_t pnl_merge(MDBX_PNL dst, const MDBX_PNL src) { + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + assert(pnl_check(src, MAX_PAGENO + 1)); + const size_t src_len = MDBX_PNL_GETSIZE(src); + const size_t dst_len = MDBX_PNL_GETSIZE(dst); + size_t total = dst_len; assert(MDBX_PNL_ALLOCLEN(dst) >= total); - pgno_t *w = dst + total; - pgno_t *d = dst + MDBX_PNL_SIZE(dst); - const pgno_t *s = src + MDBX_PNL_SIZE(src); - dst[0] = /* detent for scan below */ (MDBX_PNL_ASCENDING ? 0 : ~(pgno_t)0); - while (s > src) { - while (MDBX_PNL_ORDERED(*s, *d)) - *w-- = *d--; - *w-- = *s--; + if (likely(src_len > 0)) { + total += src_len; + if (!MDBX_DEBUG && total < (MDBX_HAVE_CMOV ? 21 : 12)) + goto avoid_call_libc_for_short_cases; + if (dst_len == 0 || + MDBX_PNL_ORDERED(MDBX_PNL_LAST(dst), MDBX_PNL_FIRST(src))) + memcpy(MDBX_PNL_END(dst), MDBX_PNL_BEGIN(src), src_len * sizeof(pgno_t)); + else if (MDBX_PNL_ORDERED(MDBX_PNL_LAST(src), MDBX_PNL_FIRST(dst))) { + memmove(MDBX_PNL_BEGIN(dst) + src_len, MDBX_PNL_BEGIN(dst), + dst_len * sizeof(pgno_t)); + memcpy(MDBX_PNL_BEGIN(dst), MDBX_PNL_BEGIN(src), + src_len * sizeof(pgno_t)); + } else { + avoid_call_libc_for_short_cases: + dst[0] = /* the detent */ (MDBX_PNL_ASCENDING ? 0 : P_INVALID); + pnl_merge_inner(dst + total, dst + dst_len, src + src_len, src); + } + MDBX_PNL_SETSIZE(dst, total); } - MDBX_PNL_SIZE(dst) = (pgno_t)total; - assert(mdbx_pnl_check4assert(dst, MAX_PAGENO + 1)); + assert(pnl_check_allocated(dst, MAX_PAGENO + 1)); + return total; } -static void mdbx_spill_remove(MDBX_txn *txn, unsigned idx, unsigned npages) { - mdbx_tassert(txn, idx > 0 && idx <= MDBX_PNL_SIZE(txn->tw.spill_pages) && - txn->tw.spill_least_removed > 0); - txn->tw.spill_least_removed = - (idx < txn->tw.spill_least_removed) ? 
idx : txn->tw.spill_least_removed; - txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SIZE(txn->tw.spill_pages) -= - (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); +static void spill_remove(MDBX_txn *txn, size_t idx, size_t npages) { + tASSERT(txn, idx > 0 && idx <= MDBX_PNL_GETSIZE(txn->tw.spilled.list) && + txn->tw.spilled.least_removed > 0); + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? idx + : txn->tw.spilled.least_removed; + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); while (unlikely(npages > 1)) { - const pgno_t pgno = (txn->tw.spill_pages[idx] >> 1) + 1; + const pgno_t pgno = (txn->tw.spilled.list[idx] >> 1) + 1; if (MDBX_PNL_ASCENDING) { - if (++idx > MDBX_PNL_SIZE(txn->tw.spill_pages) || - (txn->tw.spill_pages[idx] >> 1) != pgno) + if (++idx > MDBX_PNL_GETSIZE(txn->tw.spilled.list) || + (txn->tw.spilled.list[idx] >> 1) != pgno) return; } else { - if (--idx < 1 || (txn->tw.spill_pages[idx] >> 1) != pgno) + if (--idx < 1 || (txn->tw.spilled.list[idx] >> 1) != pgno) return; - txn->tw.spill_least_removed = (idx < txn->tw.spill_least_removed) - ? idx - : txn->tw.spill_least_removed; + txn->tw.spilled.least_removed = (idx < txn->tw.spilled.least_removed) + ? 
idx + : txn->tw.spilled.least_removed; } - txn->tw.spill_pages[idx] |= 1; - MDBX_PNL_SIZE(txn->tw.spill_pages) -= - (idx == MDBX_PNL_SIZE(txn->tw.spill_pages)); + txn->tw.spilled.list[idx] |= 1; + MDBX_PNL_SETSIZE(txn->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list) - + (idx == MDBX_PNL_GETSIZE(txn->tw.spilled.list))); --npages; } } -static MDBX_PNL mdbx_spill_purge(MDBX_txn *txn) { - mdbx_tassert(txn, txn->tw.spill_least_removed > 0); - const MDBX_PNL sl = txn->tw.spill_pages; - if (txn->tw.spill_least_removed != INT_MAX) { - unsigned len = MDBX_PNL_SIZE(sl), r, w; - for (w = r = txn->tw.spill_least_removed; r <= len; ++r) { +static MDBX_PNL spill_purge(MDBX_txn *txn) { + tASSERT(txn, txn->tw.spilled.least_removed > 0); + const MDBX_PNL sl = txn->tw.spilled.list; + if (txn->tw.spilled.least_removed != INT_MAX) { + size_t len = MDBX_PNL_GETSIZE(sl), r, w; + for (w = r = txn->tw.spilled.least_removed; r <= len; ++r) { sl[w] = sl[r]; w += 1 - (sl[r] & 1); } for (size_t i = 1; i < w; ++i) - mdbx_tassert(txn, (sl[i] & 1) == 0); - MDBX_PNL_SIZE(sl) = w - 1; - txn->tw.spill_least_removed = INT_MAX; + tASSERT(txn, (sl[i] & 1) == 0); + MDBX_PNL_SETSIZE(sl, w - 1); + txn->tw.spilled.least_removed = INT_MAX; } else { - for (size_t i = 1; i <= MDBX_PNL_SIZE(sl); ++i) - mdbx_tassert(txn, (sl[i] & 1) == 0); + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(sl); ++i) + tASSERT(txn, (sl[i] & 1) == 0); } return sl; } @@ -3252,15 +2540,15 @@ RADIXSORT_IMPL(pgno, pgno_t, MDBX_PNL_EXTRACT_KEY, SORT_IMPL(pgno_sort, false, pgno_t, MDBX_PNL_ORDERED) -static __hot void mdbx_pnl_sort_nochk(MDBX_PNL pnl) { - if (likely(MDBX_PNL_SIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || - unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_SIZE(pnl)))) +__hot __noinline static void pnl_sort_nochk(MDBX_PNL pnl) { + if (likely(MDBX_PNL_GETSIZE(pnl) < MDBX_RADIXSORT_THRESHOLD) || + unlikely(!pgno_radixsort(&MDBX_PNL_FIRST(pnl), MDBX_PNL_GETSIZE(pnl)))) pgno_sort(MDBX_PNL_BEGIN(pnl), MDBX_PNL_END(pnl)); } 
-static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { - mdbx_pnl_sort_nochk(pnl); - assert(mdbx_pnl_check(pnl, limit4check)); +static __inline void pnl_sort(MDBX_PNL pnl, size_t limit4check) { + pnl_sort_nochk(pnl); + assert(pnl_check(pnl, limit4check)); (void)limit4check; } @@ -3268,65 +2556,78 @@ static __inline void mdbx_pnl_sort(MDBX_PNL pnl, size_t limit4check) { * Returns The index of the first item greater than or equal to pgno. */ SEARCH_IMPL(pgno_bsearch, pgno_t, pgno_t, MDBX_PNL_ORDERED) -static __hot unsigned mdbx_pnl_search_nochk(const MDBX_PNL pnl, pgno_t pgno) { +__hot __noinline static size_t pnl_search_nochk(const MDBX_PNL pnl, + pgno_t pgno) { const pgno_t *begin = MDBX_PNL_BEGIN(pnl); - const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_SIZE(pnl), pgno); - const pgno_t *end = begin + MDBX_PNL_SIZE(pnl); + const pgno_t *it = pgno_bsearch(begin, MDBX_PNL_GETSIZE(pnl), pgno); + const pgno_t *end = begin + MDBX_PNL_GETSIZE(pnl); assert(it >= begin && it <= end); if (it != begin) assert(MDBX_PNL_ORDERED(it[-1], pgno)); if (it != end) assert(!MDBX_PNL_ORDERED(it[0], pgno)); - return (unsigned)(it - begin + 1); + return it - begin + 1; } -static __inline unsigned mdbx_pnl_search(const MDBX_PNL pnl, pgno_t pgno, - size_t limit) { - assert(mdbx_pnl_check4assert(pnl, limit)); +static __inline size_t pnl_search(const MDBX_PNL pnl, pgno_t pgno, + size_t limit) { + assert(pnl_check_allocated(pnl, limit)); + if (MDBX_HAVE_CMOV) { + /* cmov-ускоренный бинарный поиск может читать (но не использовать) один + * элемент за концом данных, этот элемент в пределах выделенного участка + * памяти, но не инициализирован. 
*/ + VALGRIND_MAKE_MEM_DEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } assert(pgno < limit); (void)limit; - return mdbx_pnl_search_nochk(pnl, pgno); + size_t n = pnl_search_nochk(pnl, pgno); + if (MDBX_HAVE_CMOV) { + VALGRIND_MAKE_MEM_UNDEFINED(MDBX_PNL_END(pnl), sizeof(pgno_t)); + } + return n; } -static __inline unsigned mdbx_search_spilled(const MDBX_txn *txn, pgno_t pgno) { - const MDBX_PNL pnl = txn->tw.spill_pages; +static __inline size_t search_spilled(const MDBX_txn *txn, pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return 0; pgno <<= 1; - unsigned n = mdbx_pnl_search(pnl, pgno, (size_t)(MAX_PAGENO + 1) << 1); - return (n <= MDBX_PNL_SIZE(pnl) && pnl[n] == pgno) ? n : 0; + size_t n = pnl_search(pnl, pgno, (size_t)MAX_PAGENO + MAX_PAGENO + 1); + return (n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] == pgno) ? n : 0; } -static __inline bool mdbx_intersect_spilled(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { - const MDBX_PNL pnl = txn->tw.spill_pages; +static __inline bool intersect_spilled(const MDBX_txn *txn, pgno_t pgno, + size_t npages) { + const MDBX_PNL pnl = txn->tw.spilled.list; if (likely(!pnl)) return false; - const unsigned len = MDBX_PNL_SIZE(pnl); - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - mdbx_debug_extra("PNL len %u [", len); - for (unsigned i = 1; i <= len; ++i) - mdbx_debug_extra_print(" %li", (pnl[i] & 1) ? -(long)(pnl[i] >> 1) - : (long)(pnl[i] >> 1)); - mdbx_debug_extra_print("%s\n", "]"); + const size_t len = MDBX_PNL_GETSIZE(pnl); + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("PNL len %zu [", len); + for (size_t i = 1; i <= len; ++i) + DEBUG_EXTRA_PRINT(" %li", (pnl[i] & 1) ? 
-(long)(pnl[i] >> 1) + : (long)(pnl[i] >> 1)); + DEBUG_EXTRA_PRINT("%s\n", "]"); } const pgno_t spilled_range_begin = pgno << 1; - const pgno_t spilled_range_last = ((pgno + npages) << 1) - 1; + const pgno_t spilled_range_last = ((pgno + (pgno_t)npages) << 1) - 1; #if MDBX_PNL_ASCENDING - const unsigned n = - mdbx_pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); - assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); - const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] <= spilled_range_last; + const size_t n = + pnl_search(pnl, spilled_range_begin, (size_t)(MAX_PAGENO + 1) << 1); + assert(n && + (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_begin <= pnl[n])); + const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] <= spilled_range_last; #else - const unsigned n = - mdbx_pnl_search(pnl, spilled_range_last, (size_t)(MAX_PAGENO + 1) << 1); - assert(n && (n == MDBX_PNL_SIZE(pnl) + 1 || spilled_range_last >= pnl[n])); - const bool rc = n <= MDBX_PNL_SIZE(pnl) && pnl[n] >= spilled_range_begin; + const size_t n = + pnl_search(pnl, spilled_range_last, (size_t)MAX_PAGENO + MAX_PAGENO + 1); + assert(n && (n == MDBX_PNL_GETSIZE(pnl) + 1 || spilled_range_last >= pnl[n])); + const bool rc = n <= MDBX_PNL_GETSIZE(pnl) && pnl[n] >= spilled_range_begin; #endif - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { bool check = false; - for (unsigned i = 0; i < npages; ++i) - check |= mdbx_search_spilled(txn, pgno + i) != 0; + for (size_t i = 0; i < npages; ++i) + check |= search_spilled(txn, (pgno_t)(pgno + i)) != 0; assert(check == rc); } return rc; @@ -3334,7 +2635,7 @@ static __inline bool mdbx_intersect_spilled(const MDBX_txn *txn, pgno_t pgno, /*----------------------------------------------------------------------------*/ -static __always_inline size_t txl2bytes(const size_t size) { +static __always_inline size_t txl_size2bytes(const size_t size) { assert(size > 0 && size <= MDBX_TXL_MAX * 2); size_t bytes = 
ceil_powerof2(MDBX_ASSUME_MALLOC_OVERHEAD + sizeof(txnid_t) * (size + 2), @@ -3343,54 +2644,54 @@ static __always_inline size_t txl2bytes(const size_t size) { return bytes; } -static __always_inline size_t bytes2txl(const size_t bytes) { +static __always_inline size_t txl_bytes2size(const size_t bytes) { size_t size = bytes / sizeof(txnid_t); assert(size > 2 && size <= MDBX_TXL_MAX * 2); return size - 2; } -static MDBX_TXL mdbx_txl_alloc(void) { - size_t bytes = txl2bytes(MDBX_TXL_INITIAL); - MDBX_TXL tl = mdbx_malloc(bytes); +static MDBX_TXL txl_alloc(void) { + size_t bytes = txl_size2bytes(MDBX_TXL_INITIAL); + MDBX_TXL tl = osal_malloc(bytes); if (likely(tl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(tl); #endif /* malloc_usable_size */ - tl[0] = bytes2txl(bytes); + tl[0] = txl_bytes2size(bytes); assert(tl[0] >= MDBX_TXL_INITIAL); - tl[1] = 0; tl += 1; + *tl = 0; } return tl; } -static void mdbx_txl_free(MDBX_TXL tl) { +static void txl_free(MDBX_TXL tl) { if (likely(tl)) - mdbx_free(tl - 1); + osal_free(tl - 1); } -static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { +static int txl_reserve(MDBX_TXL *ptl, const size_t wanna) { const size_t allocated = (size_t)MDBX_PNL_ALLOCLEN(*ptl); - assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && - MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); + assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && + MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); if (likely(allocated >= wanna)) return MDBX_SUCCESS; if (unlikely(wanna > /* paranoia */ MDBX_TXL_MAX)) { - mdbx_error("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); + ERROR("TXL too long (%zu > %zu)", wanna, (size_t)MDBX_TXL_MAX); return MDBX_TXN_FULL; } const size_t size = (wanna + wanna - allocated < MDBX_TXL_MAX) ? 
wanna + wanna - allocated : MDBX_TXL_MAX; - size_t bytes = txl2bytes(size); - MDBX_TXL tl = mdbx_realloc(*ptl - 1, bytes); + size_t bytes = txl_size2bytes(size); + MDBX_TXL tl = osal_realloc(*ptl - 1, bytes); if (likely(tl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(tl); #endif /* malloc_usable_size */ - *tl = bytes2txl(bytes); + *tl = txl_bytes2size(bytes); assert(*tl >= wanna); *ptl = tl + 1; return MDBX_SUCCESS; @@ -3398,48 +2699,45 @@ static int mdbx_txl_reserve(MDBX_TXL *ptl, const size_t wanna) { return MDBX_ENOMEM; } -static __always_inline int __must_check_result mdbx_txl_need(MDBX_TXL *ptl, - size_t num) { - assert(MDBX_PNL_SIZE(*ptl) <= MDBX_TXL_MAX && - MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_SIZE(*ptl)); +static __always_inline int __must_check_result txl_need(MDBX_TXL *ptl, + size_t num) { + assert(MDBX_PNL_GETSIZE(*ptl) <= MDBX_TXL_MAX && + MDBX_PNL_ALLOCLEN(*ptl) >= MDBX_PNL_GETSIZE(*ptl)); assert(num <= MDBX_PGL_LIMIT); - const size_t wanna = (size_t)MDBX_PNL_SIZE(*ptl) + num; - return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) - ? MDBX_SUCCESS - : mdbx_txl_reserve(ptl, wanna); + const size_t wanna = (size_t)MDBX_PNL_GETSIZE(*ptl) + num; + return likely(MDBX_PNL_ALLOCLEN(*ptl) >= wanna) ? 
MDBX_SUCCESS + : txl_reserve(ptl, wanna); } -static __always_inline void mdbx_txl_xappend(MDBX_TXL tl, txnid_t id) { - assert(MDBX_PNL_SIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); - MDBX_PNL_SIZE(tl) += 1; +static __always_inline void txl_xappend(MDBX_TXL tl, txnid_t id) { + assert(MDBX_PNL_GETSIZE(tl) < MDBX_PNL_ALLOCLEN(tl)); + tl[0] += 1; MDBX_PNL_LAST(tl) = id; } #define TXNID_SORT_CMP(first, last) ((first) > (last)) SORT_IMPL(txnid_sort, false, txnid_t, TXNID_SORT_CMP) -static void mdbx_txl_sort(MDBX_TXL tl) { +static void txl_sort(MDBX_TXL tl) { txnid_sort(MDBX_PNL_BEGIN(tl), MDBX_PNL_END(tl)); } -static int __must_check_result mdbx_txl_append(MDBX_TXL *ptl, txnid_t id) { - if (unlikely(MDBX_PNL_SIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { - int rc = mdbx_txl_need(ptl, MDBX_TXL_GRANULATE); +static int __must_check_result txl_append(MDBX_TXL *ptl, txnid_t id) { + if (unlikely(MDBX_PNL_GETSIZE(*ptl) == MDBX_PNL_ALLOCLEN(*ptl))) { + int rc = txl_need(ptl, MDBX_TXL_GRANULATE); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_txl_xappend(*ptl, id); + txl_xappend(*ptl, id); return MDBX_SUCCESS; } /*----------------------------------------------------------------------------*/ -#define MDBX_DPL_UNSORTED_BACKLOG 16 -#define MDBX_DPL_GAP_FOR_MERGESORT MDBX_DPL_UNSORTED_BACKLOG -#define MDBX_DPL_GAP_FOR_EDGING 2 -#define MDBX_DPL_RESERVE_GAP \ - (MDBX_DPL_GAP_FOR_MERGESORT + MDBX_DPL_GAP_FOR_EDGING) +#define MDBX_DPL_GAP_MERGESORT 16 +#define MDBX_DPL_GAP_EDGING 2 +#define MDBX_DPL_RESERVE_GAP (MDBX_DPL_GAP_MERGESORT + MDBX_DPL_GAP_EDGING) -static __always_inline size_t dpl2bytes(ptrdiff_t size) { +static __always_inline size_t dpl_size2bytes(ptrdiff_t size) { assert(size > CURSOR_STACK && (size_t)size <= MDBX_PGL_LIMIT); #if MDBX_DPL_PREALLOC_FOR_RADIXSORT size += size; @@ -3458,7 +2756,7 @@ static __always_inline size_t dpl2bytes(ptrdiff_t size) { return bytes; } -static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) { +static __always_inline size_t 
dpl_bytes2size(const ptrdiff_t bytes) { size_t size = (bytes - sizeof(MDBX_dpl)) / sizeof(MDBX_dp); assert(size > CURSOR_STACK + MDBX_DPL_RESERVE_GAP && size <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); @@ -3466,67 +2764,81 @@ static __always_inline unsigned bytes2dpl(const ptrdiff_t bytes) { #if MDBX_DPL_PREALLOC_FOR_RADIXSORT size >>= 1; #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ - return (unsigned)size; + return size; } -static __always_inline unsigned dpl_setlen(MDBX_dpl *dl, unsigned len) { - static const MDBX_page dpl_stub_pageE = { - {0}, 0, P_BAD, {0}, /* pgno */ ~(pgno_t)0}; +static __always_inline size_t dpl_setlen(MDBX_dpl *dl, size_t len) { + static const MDBX_page dpl_stub_pageE = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ ~(pgno_t)0}; assert(dpl_stub_pageE.mp_flags == P_BAD && dpl_stub_pageE.mp_pgno == P_INVALID); dl->length = len; dl->items[len + 1].ptr = (MDBX_page *)&dpl_stub_pageE; dl->items[len + 1].pgno = P_INVALID; - dl->items[len + 1].extra = 0; + dl->items[len + 1].npages = 1; return len; } static __always_inline void dpl_clear(MDBX_dpl *dl) { - static const MDBX_page dpl_stub_pageB = {{0}, 0, P_BAD, {0}, /* pgno */ 0}; + static const MDBX_page dpl_stub_pageB = {INVALID_TXNID, + 0, + P_BAD, + {0}, + /* pgno */ 0}; assert(dpl_stub_pageB.mp_flags == P_BAD && dpl_stub_pageB.mp_pgno == 0); dl->sorted = dpl_setlen(dl, 0); + dl->pages_including_loose = 0; dl->items[0].ptr = (MDBX_page *)&dpl_stub_pageB; dl->items[0].pgno = 0; - dl->items[0].extra = 0; + dl->items[0].npages = 1; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -static void mdbx_dpl_free(MDBX_txn *txn) { +static void dpl_free(MDBX_txn *txn) { if (likely(txn->tw.dirtylist)) { - mdbx_free(txn->tw.dirtylist); + osal_free(txn->tw.dirtylist); txn->tw.dirtylist = NULL; } } -static MDBX_dpl *mdbx_dpl_reserve(MDBX_txn *txn, size_t size) { - size_t bytes = dpl2bytes((size < MDBX_PGL_LIMIT) ? 
size : MDBX_PGL_LIMIT); - MDBX_dpl *const dl = mdbx_realloc(txn->tw.dirtylist, bytes); +static MDBX_dpl *dpl_reserve(MDBX_txn *txn, size_t size) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + + size_t bytes = + dpl_size2bytes((size < MDBX_PGL_LIMIT) ? size : MDBX_PGL_LIMIT); + MDBX_dpl *const dl = osal_realloc(txn->tw.dirtylist, bytes); if (likely(dl)) { #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) bytes = malloc_usable_size(dl); #endif /* malloc_usable_size */ - dl->detent = bytes2dpl(bytes); - mdbx_tassert(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); + dl->detent = dpl_bytes2size(bytes); + tASSERT(txn, txn->tw.dirtylist == NULL || dl->length <= dl->detent); txn->tw.dirtylist = dl; } return dl; } -static int mdbx_dpl_alloc(MDBX_txn *txn) { - mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - const int wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) - ? txn->mt_env->me_options.dp_initial - : txn->mt_geo.upper; - if (txn->tw.dirtylist) { - dpl_clear(txn->tw.dirtylist); - const int realloc_threshold = 64; - if (likely( - !((int)(txn->tw.dirtylist->detent - wanna) > realloc_threshold || - (int)(txn->tw.dirtylist->detent - wanna) < -realloc_threshold))) - return MDBX_SUCCESS; - } - if (unlikely(!mdbx_dpl_reserve(txn, wanna))) +static int dpl_alloc(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + + const size_t wanna = (txn->mt_env->me_options.dp_initial < txn->mt_geo.upper) + ? 
txn->mt_env->me_options.dp_initial + : txn->mt_geo.upper; +#if MDBX_FORCE_ASSERTIONS || MDBX_DEBUG + if (txn->tw.dirtylist) + /* обнуляем чтобы не сработал ассерт внутри dpl_reserve() */ + txn->tw.dirtylist->sorted = txn->tw.dirtylist->length = 0; +#endif /* asertions enabled */ + if (unlikely(!txn->tw.dirtylist || txn->tw.dirtylist->detent < wanna || + txn->tw.dirtylist->detent > wanna + wanna) && + unlikely(!dpl_reserve(txn, wanna))) return MDBX_ENOMEM; + dpl_clear(txn->tw.dirtylist); return MDBX_SUCCESS; } @@ -3538,15 +2850,18 @@ RADIXSORT_IMPL(dpl, MDBX_dp, MDBX_DPL_EXTRACT_KEY, #define DP_SORT_CMP(first, last) ((first).pgno < (last).pgno) SORT_IMPL(dp_sort, false, MDBX_dp, DP_SORT_CMP) -__hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { +__hot __noinline static MDBX_dpl *dpl_sort_slowpath(const MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - const unsigned unsorted = dl->length - dl->sorted; + const size_t unsorted = dl->length - dl->sorted; if (likely(unsorted < MDBX_RADIXSORT_THRESHOLD) || unlikely(!dpl_radixsort(dl->items + 1, dl->length))) { if (dl->sorted > unsorted / 4 + 4 && (MDBX_DPL_PREALLOC_FOR_RADIXSORT || - dl->length + unsorted < dl->detent + MDBX_DPL_GAP_FOR_MERGESORT)) { + dl->length + unsorted < dl->detent + MDBX_DPL_GAP_MERGESORT)) { MDBX_dp *const sorted_begin = dl->items + 1; MDBX_dp *const sorted_end = sorted_begin + dl->sorted; MDBX_dp *const end = @@ -3559,20 +2874,24 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { memcpy(tmp, sorted_end, unsorted * sizeof(MDBX_dp)); dp_sort(tmp, tmp + unsorted); /* merge two parts from end to begin */ - MDBX_dp *w = dl->items + dl->length; - MDBX_dp *l = dl->items + dl->sorted; - MDBX_dp *r = end - 1; + MDBX_dp *__restrict w = 
dl->items + dl->length; + MDBX_dp *__restrict l = dl->items + dl->sorted; + MDBX_dp *__restrict r = end - 1; do { - const bool cmp = l->pgno > r->pgno; + const bool cmp = expect_with_probability(l->pgno > r->pgno, 0, .5); +#if defined(__LCC__) || __CLANG_PREREQ(13, 0) || !MDBX_HAVE_CMOV + *w = cmp ? *l-- : *r--; +#else *w = cmp ? *l : *r; l -= cmp; - r += cmp - 1; + r += (ptrdiff_t)cmp - 1; +#endif } while (likely(--w > l)); assert(r == tmp - 1); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_assert_enabled()) - for (unsigned i = 0; i <= dl->length; ++i) + if (ASSERT_ENABLED()) + for (size_t i = 0; i <= dl->length; ++i) assert(dl->items[i].pgno < dl->items[i + 1].pgno); } else { dp_sort(dl->items + 1, dl->items + dl->length + 1); @@ -3587,12 +2906,15 @@ __hot __noinline static MDBX_dpl *mdbx_dpl_sort_slowpath(const MDBX_txn *txn) { return dl; } -static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) { +static __always_inline MDBX_dpl *dpl_sort(const MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->length <= MDBX_PGL_LIMIT); assert(dl->sorted <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - return likely(dl->sorted == dl->length) ? dl : mdbx_dpl_sort_slowpath(txn); + return likely(dl->sorted == dl->length) ? 
dl : dpl_sort_slowpath(txn); } /* Returns the index of the first dirty-page whose pgno @@ -3600,10 +2922,13 @@ static __always_inline MDBX_dpl *mdbx_dpl_sort(const MDBX_txn *txn) { #define DP_SEARCH_CMP(dp, id) ((dp).pgno < (id)) SEARCH_IMPL(dp_bsearch, MDBX_dp, pgno_t, DP_SEARCH_CMP) -static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { +__hot __noinline static size_t dpl_search(const MDBX_txn *txn, pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_audit_enabled()) { + if (AUDIT_ENABLED()) { for (const MDBX_dp *ptr = dl->items + dl->sorted; --ptr > dl->items;) { assert(ptr[0].pgno < ptr[1].pgno); assert(ptr[0].pgno >= NUM_METAS); @@ -3613,7 +2938,7 @@ static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { switch (dl->length - dl->sorted) { default: /* sort a whole */ - mdbx_dpl_sort_slowpath(txn); + dpl_sort_slowpath(txn); break; case 0: /* whole sorted cases */ @@ -3625,59 +2950,53 @@ static unsigned __hot mdbx_dpl_search(const MDBX_txn *txn, pgno_t pgno) { return dl->length - N + 1; \ __fallthrough - /* try linear search until the threshold */ - LINEAR_SEARCH_CASE(16); /* fall through */ - LINEAR_SEARCH_CASE(15); /* fall through */ - LINEAR_SEARCH_CASE(14); /* fall through */ - LINEAR_SEARCH_CASE(13); /* fall through */ - LINEAR_SEARCH_CASE(12); /* fall through */ - LINEAR_SEARCH_CASE(11); /* fall through */ - LINEAR_SEARCH_CASE(10); /* fall through */ - LINEAR_SEARCH_CASE(9); /* fall through */ - LINEAR_SEARCH_CASE(8); /* fall through */ - LINEAR_SEARCH_CASE(7); /* fall through */ - LINEAR_SEARCH_CASE(6); /* fall through */ - LINEAR_SEARCH_CASE(5); /* fall through */ - LINEAR_SEARCH_CASE(4); /* fall through */ - LINEAR_SEARCH_CASE(3); /* fall through */ - LINEAR_SEARCH_CASE(2); /* fall through */ + /* use 
linear scan until the threshold */ + LINEAR_SEARCH_CASE(7); /* fall through */ + LINEAR_SEARCH_CASE(6); /* fall through */ + LINEAR_SEARCH_CASE(5); /* fall through */ + LINEAR_SEARCH_CASE(4); /* fall through */ + LINEAR_SEARCH_CASE(3); /* fall through */ + LINEAR_SEARCH_CASE(2); /* fall through */ case 1: if (dl->items[dl->length].pgno == pgno) return dl->length; /* continue bsearch on the sorted part */ break; } - return (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); + return dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; } MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned -dpl_npages(const MDBX_dpl *dl, unsigned i) { - assert(0 <= (int)i && i <= dl->length); - unsigned n = likely(!dl->items[i].multi) ? 1 : dl->items[i].ptr->mp_pages; +dpl_npages(const MDBX_dpl *dl, size_t i) { + assert(0 <= (intptr_t)i && i <= dl->length); + unsigned n = dl->items[i].npages; assert(n == (IS_OVERFLOW(dl->items[i].ptr) ? dl->items[i].ptr->mp_pages : 1)); return n; } -MDBX_NOTHROW_PURE_FUNCTION static __inline unsigned -dpl_endpgno(const MDBX_dpl *dl, unsigned i) { +MDBX_NOTHROW_PURE_FUNCTION static __inline pgno_t +dpl_endpgno(const MDBX_dpl *dl, size_t i) { return dpl_npages(dl, i) + dl->items[i].pgno; } -static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, - unsigned npages) { +static __inline bool dpl_intersect(const MDBX_txn *txn, pgno_t pgno, + size_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; assert(dl->sorted == dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - unsigned const n = mdbx_dpl_search(txn, pgno); + size_t const n = dpl_search(txn, pgno); assert(n >= 1 && n <= dl->length + 1); assert(pgno <= dl->items[n].pgno); assert(pgno > dl->items[n - 1].pgno); const bool rc = /* intersection with founded */ pgno + npages > dl->items[n].pgno || /* 
intersection with prev */ dpl_endpgno(dl, n - 1) > pgno; - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { bool check = false; - for (unsigned i = 1; i <= dl->length; ++i) { + for (size_t i = 1; i <= dl->length; ++i) { const MDBX_page *const dp = dl->items[i].ptr; if (!(dp->mp_pgno /* begin */ >= /* end */ pgno + npages || dpl_endpgno(dl, i) /* end */ <= /* begin */ pgno)) @@ -3688,34 +3007,46 @@ static __inline bool mdbx_dpl_intersect(const MDBX_txn *txn, pgno_t pgno, return rc; } -static __always_inline unsigned mdbx_dpl_exist(MDBX_txn *txn, pgno_t pgno) { +MDBX_NOTHROW_PURE_FUNCTION static __always_inline size_t +dpl_exist(const MDBX_txn *txn, pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); MDBX_dpl *dl = txn->tw.dirtylist; - unsigned i = mdbx_dpl_search(txn, pgno); + size_t i = dpl_search(txn, pgno); assert((int)i > 0); return (dl->items[i].pgno == pgno) ? i : 0; } MDBX_MAYBE_UNUSED static const MDBX_page *debug_dpl_find(const MDBX_txn *txn, const pgno_t pgno) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); const MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - for (unsigned i = dl->length; i > dl->sorted; --i) - if (dl->items[i].pgno == pgno) - return dl->items[i].ptr; + if (dl) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + assert(dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + for (size_t i = dl->length; i > dl->sorted; --i) + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; - if (dl->sorted) { - const unsigned i = - (unsigned)(dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items); - if (dl->items[i].pgno == pgno) - return dl->items[i].ptr; + if (dl->sorted) { + const size_t i = dp_bsearch(dl->items + 1, dl->sorted, pgno) - dl->items; + if (dl->items[i].pgno == pgno) + return dl->items[i].ptr; + } + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && 
!MDBX_AVOID_MSYNC); } return nullptr; } -static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { +static void dpl_remove_ex(const MDBX_txn *txn, size_t i, size_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *dl = txn->tw.dirtylist; - assert((int)i > 0 && i <= dl->length); + assert((intptr_t)i > 0 && i <= dl->length); assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + dl->pages_including_loose -= npages; dl->sorted -= dl->sorted >= i; dl->length -= 1; memmove(dl->items + i, dl->items + i + 1, @@ -3723,87 +3054,180 @@ static void mdbx_dpl_remove(const MDBX_txn *txn, unsigned i) { assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); } -static __always_inline int __must_check_result -mdbx_dpl_append(MDBX_txn *txn, pgno_t pgno, MDBX_page *page, unsigned npages) { +static void dpl_remove(const MDBX_txn *txn, size_t i) { + dpl_remove_ex(txn, i, dpl_npages(txn->tw.dirtylist, i)); +} + +static __noinline void txn_lru_reduce(MDBX_txn *txn) { + NOTICE("lru-reduce %u -> %u", txn->tw.dirtylru, txn->tw.dirtylru >> 1); + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + do { + txn->tw.dirtylru >>= 1; + MDBX_dpl *dl = txn->tw.dirtylist; + for (size_t i = 1; i <= dl->length; ++i) { + size_t *const ptr = + ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr >>= 1; + } + txn = txn->mt_parent; + } while (txn); +} + +MDBX_NOTHROW_PURE_FUNCTION static __inline uint32_t dpl_age(const MDBX_txn *txn, + size_t i) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + const MDBX_dpl *dl = txn->tw.dirtylist; + assert((intptr_t)i > 0 && i <= dl->length); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + return txn->tw.dirtylru - (uint32_t)*ptr; +} + +static __inline uint32_t txn_lru_turn(MDBX_txn *txn) { + txn->tw.dirtylru += 1; + if 
(unlikely(txn->tw.dirtylru > UINT32_MAX / 3) && + (txn->mt_flags & MDBX_WRITEMAP) == 0) + txn_lru_reduce(txn); + return txn->tw.dirtylru; +} + +static __always_inline int __must_check_result dpl_append(MDBX_txn *txn, + pgno_t pgno, + MDBX_page *page, + size_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const MDBX_dp dp = {page, pgno, (pgno_t)npages}; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = ptr_disp(page, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } + MDBX_dpl *dl = txn->tw.dirtylist; - assert(dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - if (mdbx_audit_enabled()) { - for (unsigned i = dl->length; i > 0; --i) { - assert(dl->items[i].pgno != pgno); - if (unlikely(dl->items[i].pgno == pgno)) { - mdbx_error("Page %u already exist in the DPL at %u", pgno, i); + tASSERT(txn, dl->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + tASSERT(txn, dl->items[0].pgno == 0 && + dl->items[dl->length + 1].pgno == P_INVALID); + if (AUDIT_ENABLED()) { + for (size_t i = dl->length; i > 0; --i) { + assert(dl->items[i].pgno != dp.pgno); + if (unlikely(dl->items[i].pgno == dp.pgno)) { + ERROR("Page %u already exist in the DPL at %zu", dp.pgno, i); return MDBX_PROBLEM; } } } - const unsigned length = dl->length + 1; - const unsigned sorted = - (dl->sorted == dl->length && dl->items[dl->length].pgno < pgno) - ? length - : dl->sorted; - if (unlikely(dl->length == dl->detent)) { if (unlikely(dl->detent >= MDBX_PGL_LIMIT)) { - mdbx_error("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); + ERROR("DPL is full (MDBX_PGL_LIMIT %zu)", MDBX_PGL_LIMIT); return MDBX_TXN_FULL; } const size_t size = (dl->detent < MDBX_PNL_INITIAL * 42) ? 
dl->detent + dl->detent : dl->detent + dl->detent / 2; - dl = mdbx_dpl_reserve(txn, size); + dl = dpl_reserve(txn, size); if (unlikely(!dl)) return MDBX_ENOMEM; - mdbx_tassert(txn, dl->length < dl->detent); + tASSERT(txn, dl->length < dl->detent); } - /* copy the stub beyond the end */ - dl->items[length + 1] = dl->items[length]; - /* append page */ - dl->items[length].ptr = page; - dl->items[length].pgno = pgno; - dl->items[length].multi = npages > 1; - dl->items[length].lru = txn->tw.dirtylru++; - dl->length = length; - dl->sorted = sorted; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - return MDBX_SUCCESS; -} + /* Сортировка нужна для быстрого поиска, используем несколько тактик: + * 1) Сохраняем упорядоченность при естественной вставке в нужном порядке. + * 2) Добавляем в не-сортированный хвост, который сортируем и сливаем + * с отсортированной головой по необходимости, а пока хвост короткий + * ищем в нём сканированием, избегая большой пересортировки. + * 3) Если не-сортированный хвост короткий, а добавляемый элемент близок + * к концу отсортированной головы, то выгоднее сразу вставить элемент + * в нужное место. + * + * Алгоритмически: + * - добавлять в не-сортированный хвост следует только если вставка сильно + * дорогая, т.е. если целевая позиция элемента сильно далека от конца; + * - для быстрой проверки достаточно сравнить добавляемый элемент с отстоящим + * от конца на максимально-приемлемое расстояние; + * - если список короче, либо элемент в этой позиции меньше вставляемого, + * то следует перемещать элементы и вставлять в отсортированную голову; + * - если не-сортированный хвост длиннее, либо элемент в этой позиции больше, + * то следует добавлять в не-сортированный хвост. 
*/ -static __inline uint32_t mdbx_dpl_age(const MDBX_txn *txn, unsigned i) { - const MDBX_dpl *dl = txn->tw.dirtylist; - assert((int)i > 0 && i <= dl->length); - /* overflow could be here */ - return (txn->tw.dirtylru - dl->items[i].lru) & UINT32_C(0x7fffFFFF); + dl->pages_including_loose += npages; + MDBX_dp *i = dl->items + dl->length; + +#define MDBX_DPL_INSERTION_THRESHOLD 42 + const ptrdiff_t pivot = (ptrdiff_t)dl->length - MDBX_DPL_INSERTION_THRESHOLD; +#if MDBX_HAVE_CMOV + const pgno_t pivot_pgno = + dl->items[(dl->length < MDBX_DPL_INSERTION_THRESHOLD) + ? 0 + : dl->length - MDBX_DPL_INSERTION_THRESHOLD] + .pgno; +#endif /* MDBX_HAVE_CMOV */ + + /* copy the stub beyond the end */ + i[2] = i[1]; + dl->length += 1; + + if (likely(pivot <= (ptrdiff_t)dl->sorted) && +#if MDBX_HAVE_CMOV + pivot_pgno < dp.pgno) { +#else + (pivot <= 0 || dl->items[pivot].pgno < dp.pgno)) { +#endif /* MDBX_HAVE_CMOV */ + dl->sorted += 1; + + /* сдвигаем несортированный хвост */ + while (i >= dl->items + dl->sorted) { +#if !defined(__GNUC__) /* пытаемся избежать вызова memmove() */ + i[1] = *i; +#elif MDBX_WORDBITS == 64 && \ + (defined(__SIZEOF_INT128__) || \ + (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128)) + STATIC_ASSERT(sizeof(MDBX_dp) == sizeof(__uint128_t)); + ((__uint128_t *)i)[1] = *(volatile __uint128_t *)i; +#else + i[1].ptr = i->ptr; + i[1].pgno = i->pgno; + i[1].npages = i->npages; +#endif + --i; + } + /* ищем нужную позицию сдвигая отсортированные элементы */ + while (i->pgno > pgno) { + tASSERT(txn, i > dl->items); + i[1] = *i; + --i; + } + tASSERT(txn, i->pgno < dp.pgno); + } + + i[1] = dp; + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + assert(dl->sorted <= dl->length); + return MDBX_SUCCESS; } /*----------------------------------------------------------------------------*/ -uint8_t mdbx_runtime_flags = MDBX_RUNTIME_FLAGS_INIT; -uint8_t mdbx_loglevel = MDBX_LOG_FATAL; -MDBX_debug_func *mdbx_debug_logger; +uint8_t 
runtime_flags = MDBX_RUNTIME_FLAGS_INIT; +uint8_t loglevel = MDBX_LOG_FATAL; +MDBX_debug_func *debug_logger; -static __must_check_result __inline int mdbx_page_retire(MDBX_cursor *mc, - MDBX_page *mp); +static __must_check_result __inline int page_retire(MDBX_cursor *mc, + MDBX_page *mp); -static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages); -struct page_result { +static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, + size_t npages); +typedef struct page_result { MDBX_page *page; int err; -}; +} pgr_t; -static struct page_result mdbx_page_alloc(MDBX_cursor *mc, const pgno_t num, - int flags); -static txnid_t mdbx_kick_longlived_readers(MDBX_env *env, - const txnid_t laggard); +static txnid_t kick_longlived_readers(MDBX_env *env, const txnid_t laggard); -static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, - const unsigned npages); -static int mdbx_page_touch(MDBX_cursor *mc); -static int mdbx_cursor_touch(MDBX_cursor *mc); -static int mdbx_touch_dbi(MDBX_cursor *mc); +static pgr_t page_new(MDBX_cursor *mc, const unsigned flags); +static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages); +static int page_touch(MDBX_cursor *mc); +static int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data); #define MDBX_END_NAMES \ { \ @@ -3811,7 +3235,7 @@ static int mdbx_touch_dbi(MDBX_cursor *mc); "fail-beginchild" \ } enum { - /* mdbx_txn_end operation number, for logging */ + /* txn_end operation number, for logging */ MDBX_END_COMMITTED, MDBX_END_PURE_COMMIT, MDBX_END_ABORT, @@ -3820,151 +3244,162 @@ enum { MDBX_END_FAIL_BEGIN, MDBX_END_FAIL_BEGINCHILD }; -#define MDBX_END_OPMASK 0x0F /* mask for mdbx_txn_end() operation number */ +#define MDBX_END_OPMASK 0x0F /* mask for txn_end() operation number */ #define MDBX_END_UPDATE 0x10 /* update env state (DBIs) */ #define MDBX_END_FREE 0x20 /* free txn unless it is MDBX_env.me_txn0 */ #define 
MDBX_END_EOTDONE 0x40 /* txn's cursors already closed */ #define MDBX_END_SLOT 0x80 /* release any reader slot if MDBX_NOTLS */ -static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode); +static int txn_end(MDBX_txn *txn, const unsigned mode); -__hot static struct page_result __must_check_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, txnid_t front); -static __always_inline int __must_check_result mdbx_page_get(MDBX_cursor *mc, - pgno_t pgno, - MDBX_page **mp, - txnid_t front) { +static __always_inline pgr_t page_get_inline(const uint16_t ILL, + const MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front); - struct page_result ret = mdbx_page_get_ex(mc, pgno, front); +static pgr_t page_get_any(const MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS, mc, pgno, front); +} + +__hot static pgr_t page_get_three(const MDBX_cursor *const mc, + const pgno_t pgno, const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_OVERFLOW, mc, pgno, front); +} + +static pgr_t page_get_large(const MDBX_cursor *const mc, const pgno_t pgno, + const txnid_t front) { + return page_get_inline(P_ILL_BITS | P_BRANCH | P_LEAF | P_LEAF2, mc, pgno, + front); +} + +static __always_inline int __must_check_result page_get(const MDBX_cursor *mc, + const pgno_t pgno, + MDBX_page **mp, + const txnid_t front) { + pgr_t ret = page_get_three(mc, pgno, front); *mp = ret.page; return ret.err; } -static int __must_check_result mdbx_page_search_root(MDBX_cursor *mc, - const MDBX_val *key, - int flags); +static int __must_check_result page_search_root(MDBX_cursor *mc, + const MDBX_val *key, int flags); #define MDBX_PS_MODIFY 1 #define MDBX_PS_ROOTONLY 2 #define MDBX_PS_FIRST 4 #define MDBX_PS_LAST 8 -static int __must_check_result mdbx_page_search(MDBX_cursor *mc, - const MDBX_val *key, int flags); -static int __must_check_result mdbx_page_merge(MDBX_cursor *csrc, - MDBX_cursor *cdst); +static int 
__must_check_result page_search(MDBX_cursor *mc, const MDBX_val *key, + int flags); +static int __must_check_result page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst); #define MDBX_SPLIT_REPLACE MDBX_APPENDDUP /* newkey is not new */ -static int __must_check_result mdbx_page_split(MDBX_cursor *mc, - const MDBX_val *const newkey, - MDBX_val *const newdata, - pgno_t newpgno, unsigned nflags); +static int __must_check_result page_split(MDBX_cursor *mc, + const MDBX_val *const newkey, + MDBX_val *const newdata, + pgno_t newpgno, const unsigned naf); -static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, - bool report); -static int __must_check_result mdbx_validate_meta_copy(MDBX_env *env, - const MDBX_meta *meta, - MDBX_meta *dest); -static int __must_check_result mdbx_override_meta(MDBX_env *env, - unsigned target, - txnid_t txnid, - const MDBX_meta *shape); -static int __must_check_result mdbx_read_header(MDBX_env *env, MDBX_meta *meta, - const int lck_exclusive, - const mdbx_mode_t mode_bits); -static int __must_check_result mdbx_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending); -static int mdbx_env_close0(MDBX_env *env); +static int coherency_timeout(uint64_t *timestamp, pgno_t pgno); +static bool coherency_check_meta(const MDBX_env *env, + const volatile MDBX_meta *meta, bool report); +static int __must_check_result validate_meta_copy(MDBX_env *env, + const MDBX_meta *meta, + MDBX_meta *dest); +static int __must_check_result override_meta(MDBX_env *env, size_t target, + txnid_t txnid, + const MDBX_meta *shape); +static int __must_check_result read_header(MDBX_env *env, MDBX_meta *meta, + const int lck_exclusive, + const mdbx_mode_t mode_bits); +static int __must_check_result sync_locked(MDBX_env *env, unsigned flags, + MDBX_meta *const pending, + meta_troika_t *const troika); +static int env_close(MDBX_env *env); struct node_result { MDBX_node *node; bool exact; }; -static struct node_result mdbx_node_search(MDBX_cursor *mc, - 
const MDBX_val *key); +static struct node_result node_search(MDBX_cursor *mc, const MDBX_val *key); -static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - pgno_t pgno); -static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags); -static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key); +static int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, + const MDBX_val *key, + pgno_t pgno); +static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, + const MDBX_val *key, + MDBX_val *data, unsigned flags); +static int __must_check_result node_add_leaf2(MDBX_cursor *mc, size_t indx, + const MDBX_val *key); -static void mdbx_node_del(MDBX_cursor *mc, size_t ksize); -static void mdbx_node_shrink(MDBX_page *mp, unsigned indx); -static int __must_check_result mdbx_node_move(MDBX_cursor *csrc, - MDBX_cursor *cdst, bool fromleft); -static int __must_check_result mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *leaf, - MDBX_val *data, - const txnid_t front); -static int __must_check_result mdbx_rebalance(MDBX_cursor *mc); -static int __must_check_result mdbx_update_key(MDBX_cursor *mc, - const MDBX_val *key); +static void node_del(MDBX_cursor *mc, size_t ksize); +static void node_shrink(MDBX_page *mp, size_t indx); +static int __must_check_result node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, + bool fromleft); +static int __must_check_result node_read(MDBX_cursor *mc, const MDBX_node *leaf, + MDBX_val *data, const MDBX_page *mp); +static int __must_check_result rebalance(MDBX_cursor *mc); +static int __must_check_result update_key(MDBX_cursor *mc, const MDBX_val *key); -static void mdbx_cursor_pop(MDBX_cursor *mc); -static int __must_check_result mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp); +static void cursor_pop(MDBX_cursor *mc); +static int 
__must_check_result cursor_push(MDBX_cursor *mc, MDBX_page *mp); -static int __must_check_result mdbx_audit_ex(MDBX_txn *txn, - unsigned retired_stored, - bool dont_filter_gc); +static int __must_check_result audit_ex(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc); -static int __must_check_result mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp, - unsigned options); -static int __must_check_result mdbx_cursor_check(MDBX_cursor *mc, - unsigned options); -static int __must_check_result mdbx_cursor_del0(MDBX_cursor *mc); -static int __must_check_result mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, - const MDBX_val *key, - const MDBX_val *data, unsigned flags); +static int __must_check_result page_check(const MDBX_cursor *const mc, + const MDBX_page *const mp); +static int __must_check_result cursor_check(const MDBX_cursor *mc); +static int __must_check_result cursor_get(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_put_checklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); +static int __must_check_result cursor_put_nochecklen(MDBX_cursor *mc, + const MDBX_val *key, + MDBX_val *data, + unsigned flags); +static int __must_check_result cursor_check_updating(MDBX_cursor *mc); +static int __must_check_result cursor_del(MDBX_cursor *mc, + MDBX_put_flags_t flags); +static int __must_check_result delete(MDBX_txn *txn, MDBX_dbi dbi, + const MDBX_val *key, const MDBX_val *data, + unsigned flags); #define SIBLING_LEFT 0 #define SIBLING_RIGHT 2 -static int __must_check_result mdbx_cursor_sibling(MDBX_cursor *mc, int dir); -static int __must_check_result mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); -static int __must_check_result mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); +static int __must_check_result cursor_sibling(MDBX_cursor *mc, int dir); +static int __must_check_result 
cursor_next(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_prev(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); struct cursor_set_result { int err; bool exact; }; -static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op); -static int __must_check_result mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); -static int __must_check_result mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data); +static struct cursor_set_result cursor_set(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data, MDBX_cursor_op op); +static int __must_check_result cursor_first(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); +static int __must_check_result cursor_last(MDBX_cursor *mc, MDBX_val *key, + MDBX_val *data); -static int __must_check_result mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, - MDBX_dbi dbi); -static int __must_check_result mdbx_xcursor_init0(MDBX_cursor *mc); -static int __must_check_result mdbx_xcursor_init1(MDBX_cursor *mc, - MDBX_node *node, - const MDBX_page *mp); -static int __must_check_result mdbx_xcursor_init2(MDBX_cursor *mc, - MDBX_xcursor *src_mx, - bool new_dupdata); +static int __must_check_result cursor_init(MDBX_cursor *mc, MDBX_txn *txn, + size_t dbi); +static int __must_check_result cursor_xinit0(MDBX_cursor *mc); +static int __must_check_result cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, + const MDBX_page *mp); +static int __must_check_result cursor_xinit2(MDBX_cursor *mc, + MDBX_xcursor *src_mx, + bool new_dupdata); static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst); -static int __must_check_result mdbx_drop_tree(MDBX_cursor *mc, - const bool may_have_subDBs); -static int __must_check_result mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi); -static int __must_check_result mdbx_setup_dbx(MDBX_dbx *const dbx, - const MDBX_db *const db, - const unsigned pagesize); - 
-static MDBX_cmp_func cmp_lexical, cmp_reverse, cmp_int_align4, cmp_int_align2, - cmp_int_unaligned, cmp_lenfast; +static int __must_check_result drop_tree(MDBX_cursor *mc, + const bool may_have_subDBs); +static int __must_check_result fetch_sdb(MDBX_txn *txn, size_t dbi); +static int __must_check_result setup_dbx(MDBX_dbx *const dbx, + const MDBX_db *const db, + const unsigned pagesize); static __inline MDBX_cmp_func *get_default_keycmp(unsigned flags); static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags); @@ -4036,6 +3471,9 @@ __cold const char *mdbx_liberr2str(int errnum) { case MDBX_TXN_OVERLAPPING: return "MDBX_TXN_OVERLAPPING: Overlapping read and write transactions for" " the current thread"; + case MDBX_DUPLICATED_CLK: + return "MDBX_DUPLICATED_CLK: Alternative/Duplicate LCK-file is exists, " + "please keep one and remove unused other"; default: return NULL; } @@ -4119,30 +3557,30 @@ const char *mdbx_strerror_ANSI2OEM(int errnum) { } #endif /* Bit of madness for Windows */ -__cold void mdbx_debug_log_va(int level, const char *function, int line, - const char *fmt, va_list args) { - if (mdbx_debug_logger) - mdbx_debug_logger(level, function, line, fmt, args); +__cold void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args) { + if (debug_logger) + debug_logger(level, function, line, fmt, args); else { #if defined(_WIN32) || defined(_WIN64) if (IsDebuggerPresent()) { int prefix_len = 0; char *prefix = nullptr; if (function && line > 0) - prefix_len = mdbx_asprintf(&prefix, "%s:%d ", function, line); + prefix_len = osal_asprintf(&prefix, "%s:%d ", function, line); else if (function) - prefix_len = mdbx_asprintf(&prefix, "%s: ", function); + prefix_len = osal_asprintf(&prefix, "%s: ", function); else if (line > 0) - prefix_len = mdbx_asprintf(&prefix, "%d: ", line); + prefix_len = osal_asprintf(&prefix, "%d: ", line); if (prefix_len > 0 && prefix) { OutputDebugStringA(prefix); - mdbx_free(prefix); + 
osal_free(prefix); } char *msg = nullptr; - int msg_len = mdbx_vasprintf(&msg, fmt, args); + int msg_len = osal_vasprintf(&msg, fmt, args); if (msg_len > 0 && msg) { OutputDebugStringA(msg); - mdbx_free(msg); + osal_free(msg); } } #else @@ -4158,11 +3596,11 @@ __cold void mdbx_debug_log_va(int level, const char *function, int line, } } -__cold void mdbx_debug_log(int level, const char *function, int line, - const char *fmt, ...) { +__cold void debug_log(int level, const char *function, int line, + const char *fmt, ...) { va_list args; va_start(args, fmt); - mdbx_debug_log_va(level, function, line, fmt, args); + debug_log_va(level, function, line, fmt, args); va_end(args); } @@ -4178,7 +3616,7 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, bool is_ascii = true; const uint8_t *const data = key->iov_base; - for (unsigned i = 0; i < key->iov_len; i++) + for (size_t i = 0; i < key->iov_len; i++) if (data[i] < ' ' || data[i] > '~') { is_ascii = false; break; @@ -4188,13 +3626,13 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, int len = snprintf(buf, bufsize, "%.*s", (key->iov_len > INT_MAX) ? INT_MAX : (int)key->iov_len, data); - assert(len > 0 && (unsigned)len < bufsize); + assert(len > 0 && (size_t)len < bufsize); (void)len; } else { char *const detent = buf + bufsize - 2; char *ptr = buf; *ptr++ = '<'; - for (unsigned i = 0; i < key->iov_len; i++) { + for (size_t i = 0; i < key->iov_len; i++) { const ptrdiff_t left = detent - ptr; assert(left > 0); int len = snprintf(ptr, left, "%02x", data[i]); @@ -4213,26 +3651,24 @@ const char *mdbx_dump_val(const MDBX_val *key, char *const buf, /*------------------------------------------------------------------------------ LY: debug stuff */ -static const char *mdbx_leafnode_type(MDBX_node *n) { +static const char *leafnode_type(MDBX_node *n) { static const char *const tp[2][2] = {{"", ": DB"}, {": sub-page", ": sub-DB"}}; - return F_ISSET(node_flags(n), F_BIGDATA) - ? 
": overflow page" - : tp[F_ISSET(node_flags(n), F_DUPDATA)] - [F_ISSET(node_flags(n), F_SUBDATA)]; + return (node_flags(n) & F_BIGDATA) + ? ": large page" + : tp[!!(node_flags(n) & F_DUPDATA)][!!(node_flags(n) & F_SUBDATA)]; } /* Display all the keys in the page. */ -MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { +MDBX_MAYBE_UNUSED static void page_list(MDBX_page *mp) { pgno_t pgno = mp->mp_pgno; const char *type; MDBX_node *node; - unsigned i, nkeys, nsize, total = 0; + size_t i, nkeys, nsize, total = 0; MDBX_val key; DKBUF; - switch (mp->mp_flags & - (P_BRANCH | P_LEAF | P_LEAF2 | P_META | P_OVERFLOW | P_SUBP)) { + switch (PAGETYPE_WHOLE(mp)) { case P_BRANCH: type = "Branch page"; break; @@ -4249,51 +3685,51 @@ MDBX_MAYBE_UNUSED static void mdbx_page_list(MDBX_page *mp) { type = "Leaf2 sub-page"; break; case P_OVERFLOW: - mdbx_verbose("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); + VERBOSE("Overflow page %" PRIaPGNO " pages %u\n", pgno, mp->mp_pages); return; case P_META: - mdbx_verbose("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, - unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); + VERBOSE("Meta-page %" PRIaPGNO " txnid %" PRIu64 "\n", pgno, + unaligned_peek_u64(4, page_meta(mp)->mm_txnid_a)); return; default: - mdbx_verbose("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); + VERBOSE("Bad page %" PRIaPGNO " flags 0x%X\n", pgno, mp->mp_flags); return; } nkeys = page_numkeys(mp); - mdbx_verbose("%s %" PRIaPGNO " numkeys %u\n", type, pgno, nkeys); + VERBOSE("%s %" PRIaPGNO " numkeys %zu\n", type, pgno, nkeys); for (i = 0; i < nkeys; i++) { if (IS_LEAF2(mp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ key.iov_len = nsize = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, i, nsize); total += nsize; - mdbx_verbose("key %u: nsize %u, %s\n", i, nsize, DKEY(&key)); + VERBOSE("key %zu: nsize %zu, %s\n", i, nsize, DKEY(&key)); continue; } node = page_node(mp, i); key.iov_len = node_ks(node); key.iov_base = 
node->mn_data; - nsize = (unsigned)(NODESIZE + key.iov_len); + nsize = NODESIZE + key.iov_len; if (IS_BRANCH(mp)) { - mdbx_verbose("key %u: page %" PRIaPGNO ", %s\n", i, node_pgno(node), - DKEY(&key)); + VERBOSE("key %zu: page %" PRIaPGNO ", %s\n", i, node_pgno(node), + DKEY(&key)); total += nsize; } else { - if (F_ISSET(node_flags(node), F_BIGDATA)) + if (node_flags(node) & F_BIGDATA) nsize += sizeof(pgno_t); else - nsize += (unsigned)node_ds(node); + nsize += node_ds(node); total += nsize; nsize += sizeof(indx_t); - mdbx_verbose("key %u: nsize %u, %s%s\n", i, nsize, DKEY(&key), - mdbx_leafnode_type(node)); + VERBOSE("key %zu: nsize %zu, %s%s\n", i, nsize, DKEY(&key), + leafnode_type(node)); } total = EVEN(total); } - mdbx_verbose("Total: header %u + contents %u + unused %u\n", - IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, - page_room(mp)); + VERBOSE("Total: header %zu + contents %zu + unused %zu\n", + IS_LEAF2(mp) ? PAGEHDRSZ : PAGEHDRSZ + mp->mp_lower, total, + page_room(mp)); } /*----------------------------------------------------------------------------*/ @@ -4324,9 +3760,9 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { /* Perform act while tracking temporary cursor mn */ #define WITH_CURSOR_TRACKING(mn, act) \ do { \ - mdbx_cassert(&(mn), \ - mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ - mdbx_cassert(&(mn), !cursor_is_tracked(&(mn))); \ + cASSERT(&(mn), \ + mn.mc_txn->mt_cursors != NULL /* must be not rdonly txt */); \ + cASSERT(&(mn), !cursor_is_tracked(&(mn))); \ MDBX_cursor mc_dummy; \ MDBX_cursor **tracking_head = &(mn).mc_txn->mt_cursors[mn.mc_dbi]; \ MDBX_cursor *tracked = &(mn); \ @@ -4345,38 +3781,40 @@ MDBX_MAYBE_UNUSED static bool cursor_is_tracked(const MDBX_cursor *mc) { int mdbx_cmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return 
txn->mt_dbxs[dbi].md_cmp(a, b); } int mdbx_dcmp(const MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(NULL, txn->mt_signature == MDBX_MT_SIGNATURE); return txn->mt_dbxs[dbi].md_dcmp(a, b); } /* Allocate memory for a page. * Re-use old malloc'ed pages first for singletons, otherwise just malloc. * Set MDBX_TXN_ERROR on failure. */ -static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { +static MDBX_page *page_malloc(MDBX_txn *txn, size_t num) { MDBX_env *env = txn->mt_env; MDBX_page *np = env->me_dp_reserve; size_t size = env->me_psize; if (likely(num == 1 && np)) { - mdbx_assert(env, env->me_dp_reserve_len > 0); + eASSERT(env, env->me_dp_reserve_len > 0); MDBX_ASAN_UNPOISON_MEMORY_REGION(np, size); - VALGRIND_MEMPOOL_ALLOC(env, np, size); - VALGRIND_MAKE_MEM_DEFINED(&np->mp_next, sizeof(np->mp_next)); - env->me_dp_reserve = np->mp_next; + VALGRIND_MEMPOOL_ALLOC(env, ptr_disp(np, -(ptrdiff_t)sizeof(size_t)), + size + sizeof(size_t)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(np), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(np); env->me_dp_reserve_len -= 1; } else { size = pgno2bytes(env, num); - np = mdbx_malloc(size); - if (unlikely(!np)) { + void *const ptr = osal_malloc(size + sizeof(size_t)); + if (unlikely(!ptr)) { txn->mt_flags |= MDBX_TXN_ERROR; - return np; + return nullptr; } - VALGRIND_MEMPOOL_ALLOC(env, np, size); + VALGRIND_MEMPOOL_ALLOC(env, ptr, size + sizeof(size_t)); + np = ptr_disp(ptr, sizeof(size_t)); } if ((env->me_flags & MDBX_NOMEMINIT) == 0) { @@ -4386,126 +3824,139 @@ static MDBX_page *mdbx_page_malloc(MDBX_txn *txn, unsigned num) { size_t skip = PAGEHDRSZ; if (num > 1) skip += pgno2bytes(env, num - 1); - memset((char *)np + skip, 0, size - skip); + memset(ptr_disp(np, skip), 0, size - skip); } #if MDBX_DEBUG np->mp_pgno = 0; #endif VALGRIND_MAKE_MEM_UNDEFINED(np, size); np->mp_flags = 0; - np->mp_pages = num; + np->mp_pages = 
(pgno_t)num; return np; } /* Free a shadow dirty page */ -static void mdbx_dpage_free(MDBX_env *env, MDBX_page *dp, unsigned npages) { +static void dpage_free(MDBX_env *env, MDBX_page *dp, size_t npages) { VALGRIND_MAKE_MEM_UNDEFINED(dp, pgno2bytes(env, npages)); MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, pgno2bytes(env, npages)); - if (MDBX_DEBUG != 0 || unlikely(env->me_flags & MDBX_PAGEPERTURB)) + if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) memset(dp, -1, pgno2bytes(env, npages)); if (npages == 1 && env->me_dp_reserve_len < env->me_options.dp_reserve_limit) { - MDBX_ASAN_POISON_MEMORY_REGION((char *)dp + sizeof(dp->mp_next), - pgno2bytes(env, npages) - - sizeof(dp->mp_next)); - dp->mp_next = env->me_dp_reserve; - VALGRIND_MEMPOOL_FREE(env, dp); + MDBX_ASAN_POISON_MEMORY_REGION(dp, env->me_psize); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); + mp_next(dp) = env->me_dp_reserve; + VALGRIND_MEMPOOL_FREE(env, ptr_disp(dp, -(ptrdiff_t)sizeof(size_t))); env->me_dp_reserve = dp; env->me_dp_reserve_len += 1; } else { /* large pages just get freed directly */ - VALGRIND_MEMPOOL_FREE(env, dp); - mdbx_free(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + VALGRIND_MEMPOOL_FREE(env, ptr); + osal_free(ptr); } } /* Return all dirty pages to dpage list */ -static void mdbx_dlist_free(MDBX_txn *txn) { +static void dlist_free(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); MDBX_env *env = txn->mt_env; MDBX_dpl *const dl = txn->tw.dirtylist; - for (unsigned i = 1; i <= dl->length; i++) { - MDBX_page *dp = dl->items[i].ptr; - mdbx_dpage_free(env, dp, dpl_npages(dl, i)); - } + for (size_t i = 1; i <= dl->length; i++) + dpage_free(env, dl->items[i].ptr, dpl_npages(dl, i)); dpl_clear(dl); } -static __always_inline MDBX_db *mdbx_outer_db(MDBX_cursor *mc) { - mdbx_cassert(mc, (mc->mc_flags & C_SUB) != 0); +static __always_inline MDBX_db *outer_db(MDBX_cursor *mc) { + cASSERT(mc, (mc->mc_flags & C_SUB) 
!= 0); MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - mdbx_cassert(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - mdbx_cassert(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + cASSERT(mc, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + cASSERT(mc, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); return couple->outer.mc_db; } -MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { +MDBX_MAYBE_UNUSED __cold static bool dirtylist_check(MDBX_txn *txn) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); const MDBX_dpl *const dl = txn->tw.dirtylist; - assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); - mdbx_tassert(txn, txn->tw.dirtyroom + dl->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + if (!dl) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + return true; + } + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); - if (!mdbx_audit_enabled()) + assert(dl->items[0].pgno == 0 && dl->items[dl->length + 1].pgno == P_INVALID); + tASSERT(txn, txn->tw.dirtyroom + dl->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + + if (!AUDIT_ENABLED()) return true; - unsigned loose = 0; - for (unsigned i = dl->length; i > 0; --i) { + size_t loose = 0, pages = 0; + for (size_t i = dl->length; i > 0; --i) { const MDBX_page *const dp = dl->items[i].ptr; if (!dp) continue; - mdbx_tassert(txn, dp->mp_pgno == dl->items[i].pgno); + tASSERT(txn, dp->mp_pgno == dl->items[i].pgno); if (unlikely(dp->mp_pgno != dl->items[i].pgno)) return false; - const uint32_t age = mdbx_dpl_age(txn, i); - mdbx_tassert(txn, age < UINT32_MAX / 3); - if (unlikely(age > UINT32_MAX / 3)) - return false; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { + const uint32_t age = dpl_age(txn, i); + tASSERT(txn, age < UINT32_MAX / 3); + if (unlikely(age > UINT32_MAX / 3)) + return false; + } - mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); if (dp->mp_flags == P_LOOSE) { loose += 1; } else if (unlikely(!IS_MODIFIABLE(txn, dp))) return false; const unsigned num = dpl_npages(dl, i); - mdbx_tassert(txn, txn->mt_next_pgno >= dp->mp_pgno + num); + pages += num; + tASSERT(txn, txn->mt_next_pgno >= dp->mp_pgno + num); if (unlikely(txn->mt_next_pgno < dp->mp_pgno + num)) return false; if (i < dl->sorted) { - mdbx_tassert(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); + tASSERT(txn, dl->items[i + 1].pgno >= dp->mp_pgno + num); if (unlikely(dl->items[i + 1].pgno < dp->mp_pgno + num)) return false; } - const unsigned rpa = mdbx_pnl_search(txn->tw.reclaimed_pglist, dp->mp_pgno, - txn->mt_next_pgno); - mdbx_tassert(txn, rpa > MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) || - txn->tw.reclaimed_pglist[rpa] != dp->mp_pgno); - if (rpa <= MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && - unlikely(txn->tw.reclaimed_pglist[rpa] == dp->mp_pgno)) + const size_t rpa = + pnl_search(txn->tw.relist, dp->mp_pgno, txn->mt_next_pgno); + tASSERT(txn, rpa > MDBX_PNL_GETSIZE(txn->tw.relist) || + 
txn->tw.relist[rpa] != dp->mp_pgno); + if (rpa <= MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(txn->tw.relist[rpa] == dp->mp_pgno)) return false; if (num > 1) { - const unsigned rpb = mdbx_pnl_search( - txn->tw.reclaimed_pglist, dp->mp_pgno + num - 1, txn->mt_next_pgno); - mdbx_tassert(txn, rpa == rpb); + const size_t rpb = + pnl_search(txn->tw.relist, dp->mp_pgno + num - 1, txn->mt_next_pgno); + tASSERT(txn, rpa == rpb); if (unlikely(rpa != rpb)) return false; } } - mdbx_tassert(txn, loose == txn->tw.loose_count); + tASSERT(txn, loose == txn->tw.loose_count); if (unlikely(loose != txn->tw.loose_count)) return false; - for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.retired_pages); ++i) { + tASSERT(txn, pages == dl->pages_including_loose); + if (unlikely(pages != dl->pages_including_loose)) + return false; + + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.retired_pages); ++i) { const MDBX_page *const dp = debug_dpl_find(txn, txn->tw.retired_pages[i]); - mdbx_tassert(txn, !dp); + tASSERT(txn, !dp); if (unlikely(dp)) return false; } @@ -4514,146 +3965,155 @@ MDBX_MAYBE_UNUSED __cold static bool mdbx_dirtylist_check(MDBX_txn *txn) { } #if MDBX_ENABLE_REFUND -static void mdbx_refund_reclaimed(MDBX_txn *txn) { +static void refund_reclaimed(MDBX_txn *txn) { /* Scanning in descend order */ pgno_t next_pgno = txn->mt_next_pgno; - const MDBX_PNL pnl = txn->tw.reclaimed_pglist; - mdbx_tassert(txn, MDBX_PNL_SIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); + const MDBX_PNL pnl = txn->tw.relist; + tASSERT(txn, MDBX_PNL_GETSIZE(pnl) && MDBX_PNL_MOST(pnl) == next_pgno - 1); #if MDBX_PNL_ASCENDING - unsigned i = MDBX_PNL_SIZE(pnl); - mdbx_tassert(txn, pnl[i] == next_pgno - 1); + size_t i = MDBX_PNL_GETSIZE(pnl); + tASSERT(txn, pnl[i] == next_pgno - 1); while (--next_pgno, --i > 0 && pnl[i] == next_pgno - 1) ; - MDBX_PNL_SIZE(pnl) = i; + MDBX_PNL_SETSIZE(pnl, i); #else - unsigned i = 1; - mdbx_tassert(txn, pnl[i] == next_pgno - 1); - unsigned len = MDBX_PNL_SIZE(pnl); + 
size_t i = 1; + tASSERT(txn, pnl[i] == next_pgno - 1); + size_t len = MDBX_PNL_GETSIZE(pnl); while (--next_pgno, ++i <= len && pnl[i] == next_pgno - 1) ; - MDBX_PNL_SIZE(pnl) = len -= i - 1; - for (unsigned move = 0; move < len; ++move) + MDBX_PNL_SETSIZE(pnl, len -= i - 1); + for (size_t move = 0; move < len; ++move) pnl[1 + move] = pnl[i + move]; #endif - mdbx_verbose("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, - txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); + VERBOSE("refunded %" PRIaPGNO " pages: %" PRIaPGNO " -> %" PRIaPGNO, + txn->mt_next_pgno - next_pgno, txn->mt_next_pgno, next_pgno); txn->mt_next_pgno = next_pgno; - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - 1)); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno - 1)); } -static void mdbx_refund_loose(MDBX_txn *txn) { - mdbx_tassert(txn, txn->tw.loose_pages != nullptr); - mdbx_tassert(txn, txn->tw.loose_count > 0); +static void refund_loose(MDBX_txn *txn) { + tASSERT(txn, txn->tw.loose_pages != nullptr); + tASSERT(txn, txn->tw.loose_count > 0); MDBX_dpl *const dl = txn->tw.dirtylist; - mdbx_tassert(txn, dl->length >= txn->tw.loose_count); + if (dl) { + tASSERT(txn, dl->length >= txn->tw.loose_count); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } pgno_t onstack[MDBX_CACHELINE_SIZE * 8 / sizeof(pgno_t)]; MDBX_PNL suitable = onstack; - if (dl->length - dl->sorted > txn->tw.loose_count) { + if (!dl || dl->length - dl->sorted > txn->tw.loose_count) { /* Dirty list is useless since unsorted. 
*/ - if (bytes2pnl(sizeof(onstack)) < txn->tw.loose_count) { - suitable = mdbx_pnl_alloc(txn->tw.loose_count); + if (pnl_bytes2size(sizeof(onstack)) < txn->tw.loose_count) { + suitable = pnl_alloc(txn->tw.loose_count); if (unlikely(!suitable)) return /* this is not a reason for transaction fail */; } /* Collect loose-pages which may be refunded. */ - mdbx_tassert(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); + tASSERT(txn, txn->mt_next_pgno >= MIN_PAGENO + txn->tw.loose_count); pgno_t most = MIN_PAGENO; - unsigned w = 0; - for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = lp->mp_next) { - mdbx_tassert(txn, lp->mp_flags == P_LOOSE); - mdbx_tassert(txn, txn->mt_next_pgno > lp->mp_pgno); + size_t w = 0; + for (const MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); + tASSERT(txn, txn->mt_next_pgno > lp->mp_pgno); if (likely(txn->mt_next_pgno - txn->tw.loose_count <= lp->mp_pgno)) { - mdbx_tassert(txn, - w < ((suitable == onstack) ? bytes2pnl(sizeof(onstack)) - : MDBX_PNL_ALLOCLEN(suitable))); + tASSERT(txn, + w < ((suitable == onstack) ? pnl_bytes2size(sizeof(onstack)) + : MDBX_PNL_ALLOCLEN(suitable))); suitable[++w] = lp->mp_pgno; most = (lp->mp_pgno > most) ? lp->mp_pgno : most; } + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } if (most + 1 == txn->mt_next_pgno) { /* Sort suitable list and refund pages at the tail. */ - MDBX_PNL_SIZE(suitable) = w; - mdbx_pnl_sort(suitable, MAX_PAGENO + 1); + MDBX_PNL_SETSIZE(suitable, w); + pnl_sort(suitable, MAX_PAGENO + 1); /* Scanning in descend order */ - const int step = MDBX_PNL_ASCENDING ? -1 : 1; - const int begin = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(suitable) : 1; - const int end = MDBX_PNL_ASCENDING ? 
0 : MDBX_PNL_SIZE(suitable) + 1; - mdbx_tassert(txn, suitable[begin] >= suitable[end - step]); - mdbx_tassert(txn, most == suitable[begin]); + const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; + const intptr_t begin = + MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(suitable) : 1; + const intptr_t end = + MDBX_PNL_ASCENDING ? 0 : MDBX_PNL_GETSIZE(suitable) + 1; + tASSERT(txn, suitable[begin] >= suitable[end - step]); + tASSERT(txn, most == suitable[begin]); - for (int i = begin + step; i != end; i += step) { + for (intptr_t i = begin + step; i != end; i += step) { if (suitable[i] != most - 1) break; most -= 1; } - const unsigned refunded = txn->mt_next_pgno - most; - mdbx_debug("refund-suitable %u pages %" PRIaPGNO " -> %" PRIaPGNO, - refunded, most, txn->mt_next_pgno); - txn->tw.loose_count -= refunded; - txn->tw.dirtyroom += refunded; - assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); + const size_t refunded = txn->mt_next_pgno - most; + DEBUG("refund-suitable %zu pages %" PRIaPGNO " -> %" PRIaPGNO, refunded, + most, txn->mt_next_pgno); txn->mt_next_pgno = most; + txn->tw.loose_count -= refunded; + if (dl) { + txn->tw.dirtyroom += refunded; + dl->pages_including_loose -= refunded; + assert(txn->tw.dirtyroom <= txn->mt_env->me_options.dp_limit); - /* Filter-out dirty list */ - unsigned r = 0; - w = 0; - if (dl->sorted) { - do { + /* Filter-out dirty list */ + size_t r = 0; + w = 0; + if (dl->sorted) { + do { + if (dl->items[++r].pgno < most) { + if (++w != r) + dl->items[w] = dl->items[r]; + } + } while (r < dl->sorted); + dl->sorted = w; + } + while (r < dl->length) { if (dl->items[++r].pgno < most) { if (++w != r) dl->items[w] = dl->items[r]; } - } while (r < dl->sorted); - dl->sorted = w; - } - while (r < dl->length) { - if (dl->items[++r].pgno < most) { - if (++w != r) - dl->items[w] = dl->items[r]; } + dpl_setlen(dl, w); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); } - dpl_setlen(dl, w); - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - goto unlink_loose; } } else { /* Dirtylist is mostly sorted, just refund loose pages at the end. */ - mdbx_dpl_sort(txn); - mdbx_tassert(txn, dl->length < 2 || - dl->items[1].pgno < dl->items[dl->length].pgno); - mdbx_tassert(txn, dl->sorted == dl->length); + dpl_sort(txn); + tASSERT(txn, + dl->length < 2 || dl->items[1].pgno < dl->items[dl->length].pgno); + tASSERT(txn, dl->sorted == dl->length); /* Scan dirtylist tail-forward and cutoff suitable pages. */ - unsigned n; + size_t n; for (n = dl->length; dl->items[n].pgno == txn->mt_next_pgno - 1 && dl->items[n].ptr->mp_flags == P_LOOSE; --n) { - mdbx_tassert(txn, n > 0); + tASSERT(txn, n > 0); MDBX_page *dp = dl->items[n].ptr; - mdbx_debug("refund-sorted page %" PRIaPGNO, dp->mp_pgno); - mdbx_tassert(txn, dp->mp_pgno == dl->items[n].pgno); + DEBUG("refund-sorted page %" PRIaPGNO, dp->mp_pgno); + tASSERT(txn, dp->mp_pgno == dl->items[n].pgno); txn->mt_next_pgno -= 1; } dpl_setlen(dl, n); if (dl->sorted != dl->length) { - const unsigned refunded = dl->sorted - dl->length; + const size_t refunded = dl->sorted - dl->length; dl->sorted = dl->length; txn->tw.loose_count -= refunded; txn->tw.dirtyroom += refunded; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + dl->pages_including_loose -= refunded; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); @@ -4661,41 +4121,43 @@ static void mdbx_refund_loose(MDBX_txn *txn) { unlink_loose: for (MDBX_page **link = &txn->tw.loose_pages; *link;) { MDBX_page *dp = *link; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE); + tASSERT(txn, dp->mp_flags == P_LOOSE); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(dp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); if (txn->mt_next_pgno > dp->mp_pgno) { - link = &dp->mp_next; + link = &mp_next(dp); } else { - *link = dp->mp_next; + *link = mp_next(dp); if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, dp, 1); + dpage_free(txn->mt_env, dp, 1); } } } } - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); if (suitable != onstack) - mdbx_pnl_free(suitable); + pnl_free(suitable); txn->tw.loose_refund_wl = txn->mt_next_pgno; } -static bool mdbx_refund(MDBX_txn *txn) { +static bool txn_refund(MDBX_txn *txn) { const pgno_t before = txn->mt_next_pgno; if (txn->tw.loose_pages && txn->tw.loose_refund_wl > txn->mt_next_pgno) - mdbx_refund_loose(txn); + refund_loose(txn); while (true) { - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) == 0 || - MDBX_PNL_MOST(txn->tw.reclaimed_pglist) != txn->mt_next_pgno - 1) + if (MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) != txn->mt_next_pgno - 1) break; - mdbx_refund_reclaimed(txn); + refund_reclaimed(txn); if (!txn->tw.loose_pages || txn->tw.loose_refund_wl <= txn->mt_next_pgno) break; const pgno_t memo = txn->mt_next_pgno; - mdbx_refund_loose(txn); + refund_loose(txn); if (memo == txn->mt_next_pgno) break; } @@ -4703,81 +4165,94 @@ static bool mdbx_refund(MDBX_txn *txn) { if (before == txn->mt_next_pgno) return false; - if (txn->tw.spill_pages) + if (txn->tw.spilled.list) /* Squash deleted pagenums if we refunded any */ - mdbx_spill_purge(txn); + spill_purge(txn); return true; } #else /* MDBX_ENABLE_REFUND */ -static 
__inline bool mdbx_refund(MDBX_txn *txn) { +static __inline bool txn_refund(MDBX_txn *txn) { (void)txn; /* No online auto-compactification. */ return false; } #endif /* MDBX_ENABLE_REFUND */ -__cold static void mdbx_kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, - unsigned npages) { +__cold static void kill_page(MDBX_txn *txn, MDBX_page *mp, pgno_t pgno, + size_t npages) { MDBX_env *const env = txn->mt_env; - mdbx_debug("kill %u page(s) %" PRIaPGNO, npages, pgno); - mdbx_assert(env, pgno >= NUM_METAS && npages); + DEBUG("kill %zu page(s) %" PRIaPGNO, npages, pgno); + eASSERT(env, pgno >= NUM_METAS && npages); if (!IS_FROZEN(txn, mp)) { const size_t bytes = pgno2bytes(env, npages); memset(mp, -1, bytes); mp->mp_pgno = pgno; - if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) + osal_pwrite(env->me_lazy_fd, mp, bytes, pgno2bytes(env, pgno)); } else { - struct iovec iov[MDBX_COMMIT_PAGES]; + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; iov[0].iov_len = env->me_psize; - iov[0].iov_base = (char *)env->me_pbuf + env->me_psize; - size_t iov_off = pgno2bytes(env, pgno); - unsigned n = 1; + iov[0].iov_base = ptr_disp(env->me_pbuf, env->me_psize); + size_t iov_off = pgno2bytes(env, pgno), n = 1; while (--npages) { iov[n] = iov[0]; - if (++n == MDBX_COMMIT_PAGES) { - mdbx_pwritev(env->me_lazy_fd, iov, MDBX_COMMIT_PAGES, iov_off, - pgno2bytes(env, MDBX_COMMIT_PAGES)); - iov_off += pgno2bytes(env, MDBX_COMMIT_PAGES); + if (++n == MDBX_AUXILARY_IOV_MAX) { + osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, iov_off); + iov_off += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); n = 0; } } - mdbx_pwritev(env->me_lazy_fd, iov, n, iov_off, pgno2bytes(env, n)); + osal_pwritev(env->me_lazy_fd, iov, n, iov_off); } } -/* Remove page from dirty list */ -static __inline void mdbx_page_wash(MDBX_txn *txn, const unsigned di, - MDBX_page *const mp, - const unsigned npages) { - 
mdbx_tassert(txn, di && di <= txn->tw.dirtylist->length && - txn->tw.dirtylist->items[di].ptr == mp); - mdbx_dpl_remove(txn, di); - txn->tw.dirtyroom++; - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); +/* Remove page from dirty list, etc */ +static __inline void page_wash(MDBX_txn *txn, size_t di, MDBX_page *const mp, + const size_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); mp->mp_txnid = INVALID_TXNID; - mp->mp_flags = 0xFFFF; + mp->mp_flags = P_BAD; + + if (txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, + MDBX_AVOID_MSYNC || (di && txn->tw.dirtylist->items[di].ptr == mp)); + if (!MDBX_AVOID_MSYNC || di) { + dpl_remove_ex(txn, di, npages); + txn->tw.dirtyroom++; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) { + dpage_free(txn->mt_env, mp, npages); + return; + } + } + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC && !di); + txn->tw.writemap_dirty_npages -= (txn->tw.writemap_dirty_npages > npages) + ? 
npages + : txn->tw.writemap_dirty_npages; + } VALGRIND_MAKE_MEM_UNDEFINED(mp, PAGEHDRSZ); - if (txn->mt_flags & MDBX_WRITEMAP) { - VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), - pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); - } else - mdbx_dpage_free(txn->mt_env, mp, npages); + VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); + MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), + pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); } -static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) { - (void)txn; -#if MDBX_DISABLE_PAGECHECKS - (void)mp; - return 0; -#else - return /* maybe zero in legacy DB */ mp->mp_txnid; -#endif /* !MDBX_DISABLE_PAGECHECKS */ +static __inline bool suitable4loose(const MDBX_txn *txn, pgno_t pgno) { + /* TODO: + * 1) при включенной "экономии последовательностей" проверить, что + * страница не примыкает к какой-либо из уже находящийся в reclaimed. + * 2) стоит подумать над тем, чтобы при большом loose-списке отбрасывать + половину в reclaimed. */ + return txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && + (!MDBX_ENABLE_REFUND || + /* skip pages near to the end in favor of compactification */ + txn->mt_next_pgno > pgno + txn->mt_env->me_options.dp_loose_limit || + txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit); } /* Retire, loosen or free a single page. @@ -4788,12 +4263,12 @@ static __inline txnid_t pp_txnid4chk(const MDBX_page *mp, const MDBX_txn *txn) { * * If the page wasn't dirtied in this txn, just add it * to this txn's free list. 
*/ -static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, - MDBX_page *mp /* maybe null */, - int pagetype /* maybe unknown/zero */) { +static int page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, + MDBX_page *mp /* maybe null */, + unsigned pageflags /* maybe unknown/zero */) { int rc; MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, !mp || (mp->mp_pgno == pgno && PAGETYPE(mp) == pagetype)); + tASSERT(txn, !mp || (mp->mp_pgno == pgno && mp->mp_flags == pageflags)); /* During deleting entire subtrees, it is reasonable and possible to avoid * reading leaf pages, i.e. significantly reduce hard page-faults & IOPs: @@ -4806,108 +4281,118 @@ static int mdbx_page_retire_ex(MDBX_cursor *mc, const pgno_t pgno, * requires support the list of dirty pages and avoid explicit spilling. * So for flexibility and avoid extra internal dependencies we just * fallback to reading if dirty list was not allocated yet. */ - unsigned di = 0, si = 0, npages = 1; - bool is_frozen = false, is_spilled = false, is_shadowed = false; + size_t di = 0, si = 0, npages = 1; + enum page_status { + unknown, + frozen, + spilled, + shadowed, + modifable + } status = unknown; + if (unlikely(!mp)) { - if (mdbx_assert_enabled() && pagetype) { - MDBX_page *check; - rc = mdbx_page_get(mc, pgno, &check, txn->mt_front); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_tassert(txn, (PAGETYPE(check) & ~(P_LEAF2 | P_SPILLED)) == - (pagetype & ~(P_LEAF2 | P_FROZEN))); - mdbx_tassert(txn, !(pagetype & P_FROZEN) || IS_FROZEN(txn, check)); + if (ASSERT_ENABLED() && pageflags) { + pgr_t check; + check = page_get_any(mc, pgno, txn->mt_front); + if (unlikely(check.err != MDBX_SUCCESS)) + return check.err; + tASSERT(txn, + (check.page->mp_flags & ~P_SPILLED) == (pageflags & ~P_FROZEN)); + tASSERT(txn, !(pageflags & P_FROZEN) || IS_FROZEN(txn, check.page)); } - if (pagetype & P_FROZEN) { - is_frozen = true; - if (mdbx_assert_enabled()) { + if (pageflags & P_FROZEN) { + status = frozen; + if 
(ASSERT_ENABLED()) { for (MDBX_txn *scan = txn; scan; scan = scan->mt_parent) { - mdbx_tassert(txn, !mdbx_search_spilled(scan, pgno)); - mdbx_tassert(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(scan, pgno)); + tASSERT(txn, !scan->tw.dirtylist || !debug_dpl_find(scan, pgno)); } } goto status_done; - } else if (pagetype && txn->tw.dirtylist) { - if ((di = mdbx_dpl_exist(txn, pgno)) != 0) { + } else if (pageflags && txn->tw.dirtylist) { + if ((di = dpl_exist(txn, pgno)) != 0) { mp = txn->tw.dirtylist->items[di].ptr; - mdbx_tassert(txn, IS_MODIFIABLE(txn, mp)); + tASSERT(txn, IS_MODIFIABLE(txn, mp)); + status = modifable; goto status_done; } - if ((si = mdbx_search_spilled(txn, pgno)) != 0) { - is_spilled = true; + if ((si = search_spilled(txn, pgno)) != 0) { + status = spilled; goto status_done; } for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_dpl_exist(parent, pgno)) { - is_shadowed = true; + if (dpl_exist(parent, pgno)) { + status = shadowed; goto status_done; } - if (mdbx_search_spilled(parent, pgno)) { - is_spilled = true; + if (search_spilled(parent, pgno)) { + status = spilled; goto status_done; } } - is_frozen = true; + status = frozen; goto status_done; } - rc = mdbx_page_get(mc, pgno, &mp, txn->mt_front); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - mdbx_tassert(txn, !pagetype || PAGETYPE(mp) == pagetype); - pagetype = PAGETYPE(mp); + pgr_t pg = page_get_any(mc, pgno, txn->mt_front); + if (unlikely(pg.err != MDBX_SUCCESS)) + return pg.err; + mp = pg.page; + tASSERT(txn, !pageflags || mp->mp_flags == pageflags); + pageflags = mp->mp_flags; } - is_frozen = IS_FROZEN(txn, mp); - if (!is_frozen) { - const bool is_dirty = IS_MODIFIABLE(txn, mp); - is_spilled = IS_SPILLED(txn, mp) && !(txn->mt_flags & MDBX_WRITEMAP); - is_shadowed = IS_SHADOWED(txn, mp); - if (is_dirty) { - mdbx_tassert(txn, !is_spilled); - mdbx_tassert(txn, 
!mdbx_search_spilled(txn, pgno)); - mdbx_tassert(txn, debug_dpl_find(txn, pgno) == mp || txn->mt_parent || - (txn->mt_flags & MDBX_WRITEMAP)); - } else { - mdbx_tassert(txn, !debug_dpl_find(txn, pgno)); - } - - di = is_dirty ? mdbx_dpl_exist(txn, pgno) : 0; - si = is_spilled ? mdbx_search_spilled(txn, pgno) : 0; - mdbx_tassert(txn, !is_dirty || di || (txn->mt_flags & MDBX_WRITEMAP)); + if (IS_FROZEN(txn, mp)) { + status = frozen; + tASSERT(txn, !IS_MODIFIABLE(txn, mp)); + tASSERT(txn, !IS_SPILLED(txn, mp)); + tASSERT(txn, !IS_SHADOWED(txn, mp)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + } else if (IS_MODIFIABLE(txn, mp)) { + status = modifable; + if (txn->tw.dirtylist) + di = dpl_exist(txn, pgno); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) || !IS_SPILLED(txn, mp)); + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + } else if (IS_SHADOWED(txn, mp)) { + status = shadowed; + tASSERT(txn, !txn->tw.spilled.list || !search_spilled(txn, pgno)); + tASSERT(txn, !debug_dpl_find(txn, pgno)); } else { - mdbx_tassert(txn, !IS_MODIFIABLE(txn, mp)); - mdbx_tassert(txn, !IS_SPILLED(txn, mp)); - mdbx_tassert(txn, !IS_SHADOWED(txn, mp)); + tASSERT(txn, IS_SPILLED(txn, mp)); + status = spilled; + si = search_spilled(txn, pgno); + tASSERT(txn, !debug_dpl_find(txn, pgno)); } status_done: - if (likely((pagetype & P_OVERFLOW) == 0)) { + if (likely((pageflags & P_OVERFLOW) == 0)) { STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = pagetype & P_BRANCH; + const bool is_branch = pageflags & P_BRANCH; if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - mdbx_cassert(mc, !is_branch || outer->md_branch_pages > 0); + MDBX_db *outer = outer_db(mc); + cASSERT(mc, !is_branch || outer->md_branch_pages > 0); outer->md_branch_pages -= is_branch; - mdbx_cassert(mc, is_branch || outer->md_leaf_pages > 0); + cASSERT(mc, is_branch || outer->md_leaf_pages > 0); outer->md_leaf_pages 
-= 1 - is_branch; } - mdbx_cassert(mc, !is_branch || mc->mc_db->md_branch_pages > 0); + cASSERT(mc, !is_branch || mc->mc_db->md_branch_pages > 0); mc->mc_db->md_branch_pages -= is_branch; - mdbx_cassert(mc, (pagetype & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); - mc->mc_db->md_leaf_pages -= (pagetype & P_LEAF) != 0; + cASSERT(mc, (pageflags & P_LEAF) == 0 || mc->mc_db->md_leaf_pages > 0); + mc->mc_db->md_leaf_pages -= (pageflags & P_LEAF) != 0; } else { npages = mp->mp_pages; - mdbx_cassert(mc, mc->mc_db->md_overflow_pages >= npages); - mc->mc_db->md_overflow_pages -= npages; + cASSERT(mc, mc->mc_db->md_overflow_pages >= npages); + mc->mc_db->md_overflow_pages -= (pgno_t)npages; } - if (is_frozen) { + if (status == frozen) { retire: - mdbx_debug("retire %u page %" PRIaPGNO, npages, pgno); - rc = mdbx_pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + DEBUG("retire %zu page %" PRIaPGNO, npages, pgno); + rc = pnl_append_range(false, &txn->tw.retired_pages, pgno, npages); + tASSERT(txn, dirtylist_check(txn)); return rc; } @@ -4916,66 +4401,61 @@ status_done: * нераспределенного "хвоста" БД сдвигается только при их коммите. */ if (MDBX_ENABLE_REFUND && unlikely(pgno + npages == txn->mt_next_pgno)) { const char *kind = nullptr; - if (di) { + if (status == modifable) { /* Страница испачкана в этой транзакции, но до этого могла быть * аллоцирована, испачкана и пролита в одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "dirty"; /* Remove from dirty list */ - mdbx_page_wash(txn, di, mp, npages); + page_wash(txn, di, mp, npages); } else if (si) { /* Страница пролита в этой транзакции, т.е. она аллоцирована * и запачкана в этой или одной из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. 
*/ kind = "spilled"; - mdbx_spill_remove(txn, si, npages); - } else if ((txn->mt_flags & MDBX_WRITEMAP)) { - kind = "writemap"; - mdbx_tassert(txn, mp && IS_MODIFIABLE(txn, mp)); + tASSERT(txn, status == spilled); + spill_remove(txn, si, npages); } else { /* Страница аллоцирована, запачкана и возможно пролита в одной * из родительских транзакций. * Её МОЖНО вытолкнуть в нераспределенный хвост. */ kind = "parent's"; - if (mdbx_assert_enabled() && mp) { + if (ASSERT_ENABLED() && mp) { kind = nullptr; for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_search_spilled(parent, pgno)) { + if (search_spilled(parent, pgno)) { kind = "parent-spilled"; - mdbx_tassert(txn, is_spilled); + tASSERT(txn, status == spilled); break; } if (mp == debug_dpl_find(parent, pgno)) { kind = "parent-dirty"; - mdbx_tassert(txn, !is_spilled); + tASSERT(txn, status == shadowed); break; } } - mdbx_tassert(txn, kind != nullptr); + tASSERT(txn, kind != nullptr); } - mdbx_tassert(txn, - is_spilled || is_shadowed || (mp && IS_SHADOWED(txn, mp))); + tASSERT(txn, status == spilled || status == shadowed); } - mdbx_debug("refunded %u %s page %" PRIaPGNO, npages, kind, pgno); + DEBUG("refunded %zu %s page %" PRIaPGNO, npages, kind, pgno); txn->mt_next_pgno = pgno; - mdbx_refund(txn); + txn_refund(txn); return MDBX_SUCCESS; } - if (di) { + if (status == modifable) { /* Dirty page from this transaction */ /* If suitable we can reuse it through loose list */ - if (likely(npages == 1 && - txn->tw.loose_count < txn->mt_env->me_options.dp_loose_limit && - (!MDBX_ENABLE_REFUND || - /* skip pages near to the end in favor of compactification */ - txn->mt_next_pgno > - pgno + txn->mt_env->me_options.dp_loose_limit || - txn->mt_next_pgno <= txn->mt_env->me_options.dp_loose_limit))) { - mdbx_debug("loosen dirty page %" PRIaPGNO, pgno); + if (likely(npages == 1 && suitable4loose(txn, pgno)) && + (di || !txn->tw.dirtylist)) { + DEBUG("loosen dirty page %" PRIaPGNO, pgno); + if 
(MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) + memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); + mp->mp_txnid = INVALID_TXNID; mp->mp_flags = P_LOOSE; - mp->mp_next = txn->tw.loose_pages; + mp_next(mp) = txn->tw.loose_pages; txn->tw.loose_pages = mp; txn->tw.loose_count++; #if MDBX_ENABLE_REFUND @@ -4983,8 +4463,6 @@ status_done: ? pgno + 2 : txn->tw.loose_refund_wl; #endif /* MDBX_ENABLE_REFUND */ - if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) - memset(page_data(mp), -1, txn->mt_env->me_psize - PAGEHDRSZ); VALGRIND_MAKE_MEM_NOACCESS(page_data(mp), txn->mt_env->me_psize - PAGEHDRSZ); MDBX_ASAN_POISON_MEMORY_REGION(page_data(mp), @@ -5003,17 +4481,17 @@ status_done: for (MDBX_txn *parent = txn->mt_parent; parent && (parent->mt_flags & MDBX_TXN_SPILLS); parent = parent->mt_parent) { - if (mdbx_intersect_spilled(parent, pgno, npages)) + if (intersect_spilled(parent, pgno, npages)) goto skip_invalidate; - if (mdbx_dpl_intersect(parent, pgno, npages)) + if (dpl_intersect(parent, pgno, npages)) goto skip_invalidate; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (MDBX_DEBUG != 0 || unlikely(txn->mt_env->me_flags & MDBX_PAGEPERTURB)) #endif - mdbx_kill_page(txn, mp, pgno, npages); - if (!(txn->mt_flags & MDBX_WRITEMAP)) { + kill_page(txn, mp, pgno, npages); + if ((txn->mt_flags & MDBX_WRITEMAP) == 0) { VALGRIND_MAKE_MEM_NOACCESS(page_data(pgno2page(txn->mt_env, pgno)), pgno2bytes(txn->mt_env, npages) - PAGEHDRSZ); MDBX_ASAN_POISON_MEMORY_REGION(page_data(pgno2page(txn->mt_env, pgno)), @@ -5022,22 +4500,22 @@ status_done: } } skip_invalidate: - /* Remove from dirty list */ - mdbx_page_wash(txn, di, mp, npages); + + /* wash dirty page */ + page_wash(txn, di, mp, npages); reclaim: - mdbx_debug("reclaim %u %s page %" PRIaPGNO, npages, "dirty", pgno); - rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, pgno, npages); - mdbx_tassert(txn, - 
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + DEBUG("reclaim %zu %s page %" PRIaPGNO, npages, "dirty", pgno); + rc = pnl_insert_range(&txn->tw.relist, pgno, npages); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); return rc; } if (si) { /* Page ws spilled in this txn */ - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); /* Страница могла быть выделена и затем пролита в этой транзакции, * тогда её необходимо поместить в reclaimed-список. * Либо она могла быть выделена в одной из родительских транзакций и затем @@ -5045,7 +4523,7 @@ status_done: * retired-список для последующей фильтрации при коммите. */ for (MDBX_txn *parent = txn->mt_parent; parent; parent = parent->mt_parent) { - if (mdbx_dpl_exist(parent, pgno)) + if (dpl_exist(parent, pgno)) goto retire; } /* Страница точно была выделена в этой транзакции @@ -5053,17 +4531,17 @@ status_done: goto reclaim; } - if (is_shadowed) { + if (status == shadowed) { /* Dirty page MUST BE a clone from (one of) parent transaction(s). */ - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const MDBX_page *parent_dp = nullptr; /* Check parent(s)'s dirty lists. */ for (MDBX_txn *parent = txn->mt_parent; parent && !parent_dp; parent = parent->mt_parent) { - mdbx_tassert(txn, !mdbx_search_spilled(parent, pgno)); + tASSERT(txn, !search_spilled(parent, pgno)); parent_dp = debug_dpl_find(parent, pgno); } - mdbx_tassert(txn, parent_dp && (!mp || parent_dp == mp)); + tASSERT(txn, parent_dp && (!mp || parent_dp == mp)); } /* Страница была выделена в родительской транзакции и теперь может быть * использована повторно, но только внутри этой транзакции, либо дочерних. 
@@ -5080,206 +4558,284 @@ status_done: goto retire; } -static __inline int mdbx_page_retire(MDBX_cursor *mc, MDBX_page *mp) { - return mdbx_page_retire_ex(mc, mp->mp_pgno, mp, PAGETYPE(mp)); +static __inline int page_retire(MDBX_cursor *mc, MDBX_page *mp) { + return page_retire_ex(mc, mp->mp_pgno, mp, mp->mp_flags); } -struct mdbx_iov_ctx { - unsigned iov_items; - size_t iov_bytes; - size_t iov_off; +typedef struct iov_ctx { + MDBX_env *env; + osal_ioring_t *ior; + mdbx_filehandle_t fd; + int err; +#ifndef MDBX_NEED_WRITTEN_RANGE +#define MDBX_NEED_WRITTEN_RANGE 1 +#endif /* MDBX_NEED_WRITTEN_RANGE */ +#if MDBX_NEED_WRITTEN_RANGE pgno_t flush_begin; pgno_t flush_end; - struct iovec iov[MDBX_COMMIT_PAGES]; -}; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + uint64_t coherency_timestamp; +} iov_ctx_t; -static __inline void mdbx_iov_init(MDBX_txn *const txn, - struct mdbx_iov_ctx *ctx) { - ctx->flush_begin = MAX_PAGENO; - ctx->flush_end = MIN_PAGENO; - ctx->iov_items = 0; - ctx->iov_bytes = 0; - ctx->iov_off = 0; - (void)txn; +__must_check_result static int iov_init(MDBX_txn *const txn, iov_ctx_t *ctx, + size_t items, size_t npages, + mdbx_filehandle_t fd, + bool check_coherence) { + ctx->env = txn->mt_env; + ctx->ior = &txn->mt_env->me_ioring; + ctx->fd = fd; + ctx->coherency_timestamp = + (check_coherence || txn->mt_env->me_lck->mti_pgop_stat.incoherence.weak) + ? 
0 + : UINT64_MAX /* не выполнять сверку */; + ctx->err = osal_ioring_prepare(ctx->ior, items, + pgno_align2os_bytes(txn->mt_env, npages)); + if (likely(ctx->err == MDBX_SUCCESS)) { +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = MAX_PAGENO; + ctx->flush_end = MIN_PAGENO; +#endif /* MDBX_NEED_WRITTEN_RANGE */ + osal_ioring_reset(ctx->ior); + } + return ctx->err; } -static __inline void mdbx_iov_done(MDBX_txn *const txn, - struct mdbx_iov_ctx *ctx) { - mdbx_tassert(txn, ctx->iov_items == 0); -#if defined(__linux__) || defined(__gnu_linux__) - MDBX_env *const env = txn->mt_env; - if (!(txn->mt_flags & MDBX_WRITEMAP) && - mdbx_linux_kernel_version < 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. */ - mdbx_flush_incoherent_mmap( - env->me_map + pgno2bytes(env, ctx->flush_begin), - pgno2bytes(env, ctx->flush_end - ctx->flush_begin), env->me_os_psize); -#endif /* Linux */ +static inline bool iov_empty(const iov_ctx_t *ctx) { + return osal_ioring_used(ctx->ior) == 0; } -static int mdbx_iov_write(MDBX_txn *const txn, struct mdbx_iov_ctx *ctx) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - mdbx_tassert(txn, ctx->iov_items > 0); +static void iov_callback4dirtypages(iov_ctx_t *ctx, size_t offset, void *data, + size_t bytes) { + MDBX_env *const env = ctx->env; + eASSERT(env, (env->me_flags & MDBX_WRITEMAP) == 0); - MDBX_env *const env = txn->mt_env; - int rc; - if (likely(ctx->iov_items == 1)) { - mdbx_assert(env, ctx->iov_bytes == (size_t)ctx->iov[0].iov_len); - rc = mdbx_pwrite(env->me_lazy_fd, ctx->iov[0].iov_base, ctx->iov[0].iov_len, - ctx->iov_off); - } else { - rc = mdbx_pwritev(env->me_lazy_fd, ctx->iov, ctx->iov_items, ctx->iov_off, - ctx->iov_bytes); - } + MDBX_page *wp = (MDBX_page *)data; + eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); + eASSERT(env, bytes2pgno(env, bytes) >= (IS_OVERFLOW(wp) ? 
wp->mp_pages : 1u)); + eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); - if (unlikely(rc != MDBX_SUCCESS)) - mdbx_error("Write error: %s", mdbx_strerror(rc)); - else { - VALGRIND_MAKE_MEM_DEFINED(txn->mt_env->me_map + ctx->iov_off, - ctx->iov_bytes); - MDBX_ASAN_UNPOISON_MEMORY_REGION(txn->mt_env->me_map + ctx->iov_off, - ctx->iov_bytes); - } - - unsigned iov_items = ctx->iov_items; -#if MDBX_ENABLE_PGOP_STAT - txn->mt_env->me_lck->mti_pgop_stat.wops.weak += iov_items; -#endif /* MDBX_ENABLE_PGOP_STAT */ - ctx->iov_items = 0; - ctx->iov_bytes = 0; - - uint64_t timestamp = 0; - for (unsigned i = 0; i < iov_items; i++) { - MDBX_page *wp = (MDBX_page *)ctx->iov[i].iov_base; - const MDBX_page *rp = pgno2page(txn->mt_env, wp->mp_pgno); + if (likely(ctx->err == MDBX_SUCCESS)) { + const MDBX_page *const rp = ptr_disp(env->me_map, offset); + VALGRIND_MAKE_MEM_DEFINED(rp, bytes); + MDBX_ASAN_UNPOISON_MEMORY_REGION(rp, bytes); + osal_flush_incoherent_mmap(rp, bytes, env->me_os_psize); /* check with timeout as the workaround - * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ - while (likely(rc == MDBX_SUCCESS) && - unlikely(memcmp(wp, rp, ctx->iov[i].iov_len) != 0)) { - if (!timestamp) { - timestamp = mdbx_osal_monotime(); - mdbx_iov_done(txn, ctx); - mdbx_warning( - "catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - } else if (unlikely(mdbx_osal_monotime() - timestamp > 65536 / 10)) { - mdbx_error( - "bailout waiting for %" PRIaPGNO " page arrival %s", wp->mp_pgno, - "(workaround for incoherent flaw of unified page/buffer cache)"); - rc = MDBX_CORRUPTED; - } -#if defined(_WIN32) || defined(_WIN64) - SwitchToThread(); -#elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) - sched_yield(); -#elif (defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 1)) || defined(_OPEN_THREADS) - pthread_yield(); -#else - usleep(42); -#endif + * for 
https://libmdbx.dqdkfa.ru/dead-github/issues/269 + * + * Проблема проявляется только при неупорядоченности: если записанная + * последней мета-страница "обгоняет" ранее записанные, т.е. когда + * записанное в файл позже становится видимым в отображении раньше, + * чем записанное ранее. + * + * Исходно здесь всегда выполнялась полная сверка. Это давало полную + * гарантию защиты от проявления проблемы, но порождало накладные расходы. + * В некоторых сценариях наблюдалось снижение производительности до 10-15%, + * а в синтетических тестах до 30%. Конечно никто не вникал в причины, + * а просто останавливался на мнении "libmdbx не быстрее LMDB", + * например: https://clck.ru/3386er + * + * Поэтому после серии экспериментов и тестов реализовано следующее: + * 0. Посредством опции сборки MDBX_FORCE_CHECK_MMAP_COHERENCY=1 + * можно включить полную сверку после записи. + * Остальные пункты являются взвешенным компромиссом между полной + * гарантией обнаружения проблемы и бесполезными затратами на системах + * без этого недостатка. + * 1. При старте транзакций проверяется соответствие выбранной мета-страницы + * корневым страницам b-tree проверяется. Эта проверка показала себя + * достаточной без сверки после записи. При обнаружении "некогерентности" + * эти случаи подсчитываются, а при их ненулевом счетчике выполняется + * полная сверка. Таким образом, произойдет переключение в режим полной + * сверки, если показавшая себя достаточной проверка заметит проявление + * проблемы хоты-бы раз. + * 2. 
Сверка не выполняется при фиксации транзакции, так как: + * - при наличии проблемы "не-когерентности" (при отложенном копировании + * или обновлении PTE, после возврата из write-syscall), проверка + * в этом процессе не гарантирует актуальность данных в другом + * процессе, который может запустить транзакцию сразу после коммита; + * - сверка только последнего блока позволяет почти восстановить + * производительность в больших транзакциях, но одновременно размывает + * уверенность в отсутствии сбоев, чем обесценивает всю затею; + * - после записи данных будет записана мета-страница, соответствие + * которой корневым страницам b-tree проверяется при старте + * транзакций, и только эта проверка показала себя достаточной; + * 3. При спиллинге производится полная сверка записанных страниц. Тут был + * соблазн сверять не полностью, а например начало и конец каждого блока. + * Но при спиллинге возможна ситуация повторного вытеснения страниц, в + * том числе large/overflow. При этом возникает риск прочитать в текущей + * транзакции старую версию страницы, до повторной записи. В этом случае + * могут возникать крайне редкие невоспроизводимые ошибки. С учетом того + * что спиллинг выполняет крайне редко, решено отказаться от экономии + * в пользу надежности. */ +#ifndef MDBX_FORCE_CHECK_MMAP_COHERENCY +#define MDBX_FORCE_CHECK_MMAP_COHERENCY 0 +#endif /* MDBX_FORCE_CHECK_MMAP_COHERENCY */ + if ((MDBX_FORCE_CHECK_MMAP_COHERENCY || + ctx->coherency_timestamp != UINT64_MAX) && + unlikely(memcmp(wp, rp, bytes))) { + ctx->coherency_timestamp = 0; + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? 
INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; + WARNING("catch delayed/non-arrived page %" PRIaPGNO " %s", wp->mp_pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + do + if (coherency_timeout(&ctx->coherency_timestamp, wp->mp_pgno) != + MDBX_RESULT_TRUE) { + ctx->err = MDBX_PROBLEM; + break; + } + while (unlikely(memcmp(wp, rp, bytes))); } - mdbx_dpage_free(env, wp, bytes2pgno(env, ctx->iov[i].iov_len)); } - return rc; + + if (likely(bytes == env->me_psize)) + dpage_free(env, wp, 1); + else { + do { + eASSERT(env, wp->mp_pgno == bytes2pgno(env, offset)); + eASSERT(env, (wp->mp_flags & P_ILL_BITS) == 0); + size_t npages = IS_OVERFLOW(wp) ? wp->mp_pages : 1u; + size_t chunk = pgno2bytes(env, npages); + eASSERT(env, bytes >= chunk); + MDBX_page *next = ptr_disp(wp, chunk); + dpage_free(env, wp, npages); + wp = next; + offset += chunk; + bytes -= chunk; + } while (bytes); + } } -static int iov_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, - unsigned npages) { - MDBX_env *const env = txn->mt_env; - mdbx_tassert(txn, - dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); - mdbx_tassert(txn, IS_MODIFIABLE(txn, dp)); - mdbx_tassert(txn, - !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); +static void iov_complete(iov_ctx_t *ctx) { + if ((ctx->env->me_flags & MDBX_WRITEMAP) == 0) + osal_ioring_walk(ctx->ior, ctx, iov_callback4dirtypages); + osal_ioring_reset(ctx->ior); +} - ctx->flush_begin = - (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; - ctx->flush_end = (ctx->flush_end > dp->mp_pgno + npages) - ? 
ctx->flush_end - : dp->mp_pgno + npages; - env->me_lck->mti_unsynced_pages.weak += npages; +__must_check_result static int iov_write(iov_ctx_t *ctx) { + eASSERT(ctx->env, !iov_empty(ctx)); + osal_ioring_write_result_t r = osal_ioring_write(ctx->ior, ctx->fd); +#if MDBX_ENABLE_PGOP_STAT + ctx->env->me_lck->mti_pgop_stat.wops.weak += r.wops; +#endif /* MDBX_ENABLE_PGOP_STAT */ + ctx->err = r.err; + if (unlikely(ctx->err != MDBX_SUCCESS)) + ERROR("Write error: %s", mdbx_strerror(ctx->err)); + iov_complete(ctx); + return ctx->err; +} + +__must_check_result static int iov_page(MDBX_txn *txn, iov_ctx_t *ctx, + MDBX_page *dp, size_t npages) { + MDBX_env *const env = txn->mt_env; + tASSERT(txn, ctx->err == MDBX_SUCCESS); + tASSERT(txn, dp->mp_pgno >= MIN_PAGENO && dp->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, IS_MODIFIABLE(txn, dp)); + tASSERT(txn, !(dp->mp_flags & ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW))); if (IS_SHADOWED(txn, dp)) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); dp->mp_txnid = txn->mt_txnid; - mdbx_tassert(txn, IS_SPILLED(txn, dp)); - const size_t size = pgno2bytes(env, npages); - if (ctx->iov_off + ctx->iov_bytes != pgno2bytes(env, dp->mp_pgno) || - ctx->iov_items == ARRAY_LENGTH(ctx->iov) || - ctx->iov_bytes + size > MAX_WRITE) { - if (ctx->iov_items) { - int err = mdbx_iov_write(txn, ctx); - if (unlikely(err != MDBX_SUCCESS)) - return err; -#if defined(__linux__) || defined(__gnu_linux__) - if (mdbx_linux_kernel_version >= 0x02060b00) - /* Linux kernels older than version 2.6.11 ignore the addr and nbytes - * arguments, making this function fairly expensive. Therefore, the - * whole cache is always flushed. 
*/ -#endif /* Linux */ - mdbx_flush_incoherent_mmap(env->me_map + ctx->iov_off, ctx->iov_bytes, - env->me_os_psize); + tASSERT(txn, IS_SPILLED(txn, dp)); +#if MDBX_AVOID_MSYNC + doit:; +#endif /* MDBX_AVOID_MSYNC */ + int err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + ctx->err = err; + if (unlikely(err != MDBX_RESULT_TRUE)) { + iov_complete(ctx); + return err; } - ctx->iov_off = pgno2bytes(env, dp->mp_pgno); + err = iov_write(ctx); + tASSERT(txn, iov_empty(ctx)); + if (likely(err == MDBX_SUCCESS)) { + err = osal_ioring_add(ctx->ior, pgno2bytes(env, dp->mp_pgno), dp, + pgno2bytes(env, npages)); + if (unlikely(err != MDBX_SUCCESS)) { + iov_complete(ctx); + return ctx->err = err; + } + } + tASSERT(txn, ctx->err == MDBX_SUCCESS); } - ctx->iov[ctx->iov_items].iov_base = (void *)dp; - ctx->iov[ctx->iov_items].iov_len = size; - ctx->iov_items += 1; - ctx->iov_bytes += size; } else { - mdbx_tassert(txn, txn->mt_flags & MDBX_WRITEMAP); + tASSERT(txn, txn->mt_flags & MDBX_WRITEMAP); +#if MDBX_AVOID_MSYNC + goto doit; +#endif /* MDBX_AVOID_MSYNC */ } + +#if MDBX_NEED_WRITTEN_RANGE + ctx->flush_begin = + (ctx->flush_begin < dp->mp_pgno) ? ctx->flush_begin : dp->mp_pgno; + ctx->flush_end = (ctx->flush_end > dp->mp_pgno + (pgno_t)npages) + ? 
ctx->flush_end + : dp->mp_pgno + (pgno_t)npages; +#endif /* MDBX_NEED_WRITTEN_RANGE */ return MDBX_SUCCESS; } -static int spill_page(MDBX_txn *txn, struct mdbx_iov_ctx *ctx, MDBX_page *dp, - unsigned npages) { - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - pgno_t pgno = dp->mp_pgno; - int err = iov_page(txn, ctx, dp, npages); - if (likely(err == MDBX_SUCCESS)) { - err = mdbx_pnl_append_range(true, &txn->tw.spill_pages, pgno << 1, npages); +static int spill_page(MDBX_txn *txn, iov_ctx_t *ctx, MDBX_page *dp, + const size_t npages) { + tASSERT(txn, !(txn->mt_flags & MDBX_WRITEMAP)); #if MDBX_ENABLE_PGOP_STAT - if (likely(err == MDBX_SUCCESS)) - txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; + txn->mt_env->me_lck->mti_pgop_stat.spill.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - } + const pgno_t pgno = dp->mp_pgno; + int err = iov_page(txn, ctx, dp, npages); + if (likely(err == MDBX_SUCCESS)) + err = pnl_append_range(true, &txn->tw.spilled.list, pgno << 1, npages); return err; } /* Set unspillable LRU-label for dirty pages watched by txn. * Returns the number of pages marked as unspillable. 
*/ -static unsigned mdbx_cursor_keep(MDBX_txn *txn, MDBX_cursor *mc) { - unsigned keep = 0; - while (mc->mc_flags & C_INITIALIZED) { - for (unsigned i = 0; i < mc->mc_snum; ++i) { - const MDBX_page *mp = mc->mc_pg[i]; - if (IS_MODIFIABLE(txn, mp) && !IS_SUBP(mp)) { - unsigned const n = mdbx_dpl_search(txn, mp->mp_pgno); +static size_t cursor_keep(const MDBX_txn *const txn, const MDBX_cursor *mc) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + size_t keep = 0; + while ((mc->mc_flags & C_INITIALIZED) && mc->mc_snum) { + tASSERT(txn, mc->mc_top == mc->mc_snum - 1); + const MDBX_page *mp; + size_t i = 0; + do { + mp = mc->mc_pg[i]; + tASSERT(txn, !IS_SUBP(mp)); + if (IS_MODIFIABLE(txn, mp)) { + size_t const n = dpl_search(txn, mp->mp_pgno); if (txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && - mdbx_dpl_age(txn, n)) { - txn->tw.dirtylist->items[n].lru = txn->tw.dirtylru; + /* не считаем дважды */ dpl_age(txn, n)) { + size_t *const ptr = ptr_disp(txn->tw.dirtylist->items[n].ptr, + -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + tASSERT(txn, dpl_age(txn, n) == 0); ++keep; } } - } - if (!mc->mc_xcursor) + } while (++i < mc->mc_snum); + + tASSERT(txn, IS_LEAF(mp)); + if (!mc->mc_xcursor || mc->mc_ki[mc->mc_top] >= page_numkeys(mp)) + break; + if (!(node_flags(page_node(mp, mc->mc_ki[mc->mc_top])) & F_SUBDATA)) break; mc = &mc->mc_xcursor->mx_cursor; } return keep; } -static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { - unsigned keep = m0 ? mdbx_cursor_keep(txn, m0) : 0; - for (unsigned i = FREE_DBI; i < txn->mt_numdbs; ++i) +static size_t txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { + tASSERT(txn, (txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0); + txn_lru_turn(txn); + size_t keep = m0 ? 
cursor_keep(txn, m0) : 0; + for (size_t i = FREE_DBI; i < txn->mt_numdbs; ++i) if (F_ISSET(txn->mt_dbistate[i], DBI_DIRTY | DBI_VALID) && txn->mt_dbs[i].md_root != P_INVALID) for (MDBX_cursor *mc = txn->mt_cursors[i]; mc; mc = mc->mc_next) if (mc != m0) - keep += mdbx_cursor_keep(txn, mc); + keep += cursor_keep(txn, mc); return keep; } @@ -5287,24 +4843,21 @@ static unsigned mdbx_txn_keep(MDBX_txn *txn, MDBX_cursor *m0) { * 0 = should be spilled; * ... * > 255 = must not be spilled. */ -static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, - const uint32_t reciprocal) { +MDBX_NOTHROW_PURE_FUNCTION static unsigned +spill_prio(const MDBX_txn *txn, const size_t i, const uint32_t reciprocal) { MDBX_dpl *const dl = txn->tw.dirtylist; - const uint32_t age = mdbx_dpl_age(txn, i); - const unsigned npages = dpl_npages(dl, i); + const uint32_t age = dpl_age(txn, i); + const size_t npages = dpl_npages(dl, i); const pgno_t pgno = dl->items[i].pgno; if (age == 0) { - mdbx_debug("skip %s %u page %" PRIaPGNO, "keep", npages, pgno); + DEBUG("skip %s %zu page %" PRIaPGNO, "keep", npages, pgno); return 256; } MDBX_page *const dp = dl->items[i].ptr; if (dp->mp_flags & (P_LOOSE | P_SPILLED)) { - mdbx_debug("skip %s %u page %" PRIaPGNO, - (dp->mp_flags & P_LOOSE) ? "loose" - : (dp->mp_flags & P_LOOSE) ? "loose" - : "parent-spilled", - npages, pgno); + DEBUG("skip %s %zu page %" PRIaPGNO, + (dp->mp_flags & P_LOOSE) ? 
"loose" : "parent-spilled", npages, pgno); return 256; } @@ -5313,30 +4866,30 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, MDBX_txn *parent = txn->mt_parent; if (parent && (parent->mt_flags & MDBX_TXN_SPILLS)) { do - if (mdbx_intersect_spilled(parent, pgno, npages)) { - mdbx_debug("skip-2 parent-spilled %u page %" PRIaPGNO, npages, pgno); + if (intersect_spilled(parent, pgno, npages)) { + DEBUG("skip-2 parent-spilled %zu page %" PRIaPGNO, npages, pgno); dp->mp_flags |= P_SPILLED; return 256; } while ((parent = parent->mt_parent) != nullptr); } - mdbx_tassert(txn, age * (uint64_t)reciprocal < UINT32_MAX); + tASSERT(txn, age * (uint64_t)reciprocal < UINT32_MAX); unsigned prio = age * reciprocal >> 24; - mdbx_tassert(txn, prio < 256); + tASSERT(txn, prio < 256); if (likely(npages == 1)) return prio = 256 - prio; /* make a large/overflow pages be likely to spill */ - uint32_t factor = npages | npages >> 1; + size_t factor = npages | npages >> 1; factor |= factor >> 2; factor |= factor >> 4; factor |= factor >> 8; factor |= factor >> 16; - factor = prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; + factor = (size_t)prio * log2n_powerof2(factor + 1) + /* golden ratio */ 157; factor = (factor < 256) ? 255 - factor : 0; - mdbx_tassert(txn, factor < 256 && factor < (256 - prio)); - return prio = factor; + tASSERT(txn, factor < 256 && factor < (256 - prio)); + return prio = (unsigned)factor; } /* Spill pages from the dirty list back to disk. @@ -5357,8 +4910,8 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * If the txn never references them again, they can be left alone. * If the txn only reads them, they can be used without any fuss. * If the txn writes them again, they can be dirtied immediately without - * going thru all of the work of mdbx_page_touch(). Such references are - * handled by mdbx_page_unspill(). + * going thru all of the work of page_touch(). Such references are + * handled by page_unspill(). 
* * Also note, we never spill DB root pages, nor pages of active cursors, * because we'll need these back again soon anyway. And in nested txns, @@ -5366,78 +4919,128 @@ static unsigned spill_prio(const MDBX_txn *txn, const unsigned i, * parent txn. That would alter the parent txns' data even though * the child hasn't committed yet, and we'd have no way to undo it if * the child aborted. */ -static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, - const unsigned need) { -#if xMDBX_DEBUG_SPILLING != 1 +__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, + const intptr_t wanna_spill_entries, + const intptr_t wanna_spill_npages, + const size_t need); + +static __inline int txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, + const size_t need) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, !m0 || cursor_is_tracked(m0)); + + const intptr_t wanna_spill_entries = + txn->tw.dirtylist ? (need - txn->tw.dirtyroom - txn->tw.loose_count) : 0; + const intptr_t wanna_spill_npages = + need + + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count - txn->mt_env->me_options.dp_limit; + /* production mode */ - if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) + if (likely(wanna_spill_npages < 1 && wanna_spill_entries < 1) +#if xMDBX_DEBUG_SPILLING == 1 + /* debug mode: always try to spill if xMDBX_DEBUG_SPILLING == 1 */ + && txn->mt_txnid % 23 > 11 +#endif + ) return MDBX_SUCCESS; - unsigned wanna_spill = need - txn->tw.dirtyroom; -#else - /* debug mode: spill at least one page if xMDBX_DEBUG_SPILLING == 1 */ - unsigned wanna_spill = - (need > txn->tw.dirtyroom) ? need - txn->tw.dirtyroom : 1; -#endif /* xMDBX_DEBUG_SPILLING */ - const unsigned dirty = txn->tw.dirtylist->length; - const unsigned spill_min = - txn->mt_env->me_options.spill_min_denominator - ? 
dirty / txn->mt_env->me_options.spill_min_denominator - : 0; - const unsigned spill_max = - dirty - (txn->mt_env->me_options.spill_max_denominator - ? dirty / txn->mt_env->me_options.spill_max_denominator + return txn_spill_slowpath(txn, m0, wanna_spill_entries, wanna_spill_npages, + need); +} + +static size_t spill_gate(const MDBX_env *env, intptr_t part, + const size_t total) { + const intptr_t spill_min = + env->me_options.spill_min_denominator + ? (total + env->me_options.spill_min_denominator - 1) / + env->me_options.spill_min_denominator + : 1; + const intptr_t spill_max = + total - (env->me_options.spill_max_denominator + ? total / env->me_options.spill_max_denominator : 0); - wanna_spill = (wanna_spill > spill_min) ? wanna_spill : spill_min; - wanna_spill = (wanna_spill < spill_max) ? wanna_spill : spill_max; - if (!wanna_spill) - return MDBX_SUCCESS; + part = (part < spill_max) ? part : spill_max; + part = (part > spill_min) ? part : spill_min; + eASSERT(env, part >= 0 && (size_t)part <= total); + return (size_t)part; +} - mdbx_notice("spilling %u dirty-entries (have %u dirty-room, need %u)", - wanna_spill, txn->tw.dirtyroom, need); - mdbx_tassert(txn, txn->tw.dirtylist->length >= wanna_spill); +__cold static int txn_spill_slowpath(MDBX_txn *const txn, MDBX_cursor *const m0, + const intptr_t wanna_spill_entries, + const intptr_t wanna_spill_npages, + const size_t need) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - struct mdbx_iov_ctx ctx; - mdbx_iov_init(txn, &ctx); int rc = MDBX_SUCCESS; + if (unlikely(txn->tw.loose_count >= + (txn->tw.dirtylist ? txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages))) + goto done; + + const size_t dirty_entries = + txn->tw.dirtylist ? (txn->tw.dirtylist->length - txn->tw.loose_count) : 1; + const size_t dirty_npages = + (txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : txn->tw.writemap_dirty_npages) - + txn->tw.loose_count; + const size_t need_spill_entries = + spill_gate(txn->mt_env, wanna_spill_entries, dirty_entries); + const size_t need_spill_npages = + spill_gate(txn->mt_env, wanna_spill_npages, dirty_npages); + + const size_t need_spill = (need_spill_entries > need_spill_npages) + ? need_spill_entries + : need_spill_npages; + if (!need_spill) + goto done; + if (txn->mt_flags & MDBX_WRITEMAP) { - MDBX_dpl *const dl = txn->tw.dirtylist; - const unsigned span = dl->length - txn->tw.loose_count; - txn->tw.dirtyroom += span; - unsigned r, w; - for (w = 0, r = 1; r <= dl->length; ++r) { - MDBX_page *dp = dl->items[r].ptr; - if (dp->mp_flags & P_LOOSE) - dl->items[++w] = dl->items[r]; - else if (!MDBX_FAKE_SPILL_WRITEMAP) { - rc = iov_page(txn, &ctx, dp, dpl_npages(dl, r)); - mdbx_tassert(txn, rc == MDBX_SUCCESS); - } + NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "msync", + dirty_entries, dirty_npages); + const MDBX_env *env = txn->mt_env; + tASSERT(txn, txn->tw.spilled.list == nullptr); + rc = + osal_msync(&txn->mt_env->me_dxb_mmap, 0, + pgno_align2os_bytes(env, txn->mt_next_pgno), MDBX_SYNC_KICK); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; +#if MDBX_AVOID_MSYNC + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); + tASSERT(txn, dirtylist_check(txn)); + env->me_lck->mti_unsynced_pages.weak += + txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count; + dpl_clear(txn->tw.dirtylist); + txn->tw.dirtyroom = env->me_options.dp_limit - txn->tw.loose_count; + for (MDBX_page *lp = txn->tw.loose_pages; lp != nullptr; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); + rc = dpl_append(txn, lp->mp_pgno, lp, 1); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } - - mdbx_tassert(txn, span == r - 1 - w && w == 
txn->tw.loose_count); - dl->sorted = (dl->sorted == dl->length) ? w : 0; - dpl_setlen(dl, w); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - - if (!MDBX_FAKE_SPILL_WRITEMAP && ctx.flush_end > ctx.flush_begin) { - MDBX_env *const env = txn->mt_env; -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_msync(&env->me_dxb_mmap, - pgno_align2os_bytes(env, ctx.flush_begin), - pgno_align2os_bytes(env, ctx.flush_end - ctx.flush_begin), - MDBX_SYNC_NONE); - } - return rc; + tASSERT(txn, dirtylist_check(txn)); +#else + tASSERT(txn, txn->tw.dirtylist == nullptr); + env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; + txn->tw.writemap_spilled_npages += txn->tw.writemap_dirty_npages; + txn->tw.writemap_dirty_npages = 0; +#endif /* MDBX_AVOID_MSYNC */ + goto done; } - mdbx_tassert(txn, !(txn->mt_flags & MDBX_WRITEMAP)); - if (!txn->tw.spill_pages) { - txn->tw.spill_least_removed = INT_MAX; - txn->tw.spill_pages = mdbx_pnl_alloc(wanna_spill); - if (unlikely(!txn->tw.spill_pages)) { + NOTICE("%s-spilling %zu dirty-entries, %zu dirty-npages", "write", + need_spill_entries, need_spill_npages); + MDBX_ANALYSIS_ASSUME(txn->tw.dirtylist != nullptr); + tASSERT(txn, txn->tw.dirtylist->length - txn->tw.loose_count >= 1); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose - txn->tw.loose_count >= + need_spill_npages); + if (!txn->tw.spilled.list) { + txn->tw.spilled.least_removed = INT_MAX; + txn->tw.spilled.list = pnl_alloc(need_spill); + if (unlikely(!txn->tw.spilled.list)) { rc = MDBX_ENOMEM; bailout: txn->mt_flags |= MDBX_TXN_ERROR; @@ -5445,27 +5048,27 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, } } else { /* purge deleted slots */ - mdbx_spill_purge(txn); - rc = mdbx_pnl_reserve(&txn->tw.spill_pages, wanna_spill); + spill_purge(txn); + rc = pnl_reserve(&txn->tw.spilled.list, need_spill); (void)rc /* ignore since the resulting list may be shorter - and 
mdbx_pnl_append() will increase pnl on demand */ + and pnl_append() will increase pnl on demand */ ; } /* Сортируем чтобы запись на диск была полее последовательна */ - MDBX_dpl *const dl = mdbx_dpl_sort(txn); + MDBX_dpl *const dl = dpl_sort(txn); /* Preserve pages which may soon be dirtied again */ - const unsigned unspillable = mdbx_txn_keep(txn, m0); + const size_t unspillable = txn_keep(txn, m0); if (unspillable + txn->tw.loose_count >= dl->length) { #if xMDBX_DEBUG_SPILLING == 1 /* avoid false failure in debug mode */ if (likely(txn->tw.dirtyroom + txn->tw.loose_count >= need)) return MDBX_SUCCESS; #endif /* xMDBX_DEBUG_SPILLING */ - mdbx_error("all %u dirty pages are unspillable since referenced " - "by a cursor(s), use fewer cursors or increase " - "MDBX_opt_txn_dp_limit", - unspillable); + ERROR("all %zu dirty pages are unspillable since referenced " + "by a cursor(s), use fewer cursors or increase " + "MDBX_opt_txn_dp_limit", + unspillable); goto done; } @@ -5494,140 +5097,158 @@ static int mdbx_txn_spill(MDBX_txn *const txn, MDBX_cursor *const m0, /* get min/max of LRU-labels */ uint32_t age_max = 0; - for (unsigned i = 1; i <= dl->length; ++i) { - const uint32_t age = mdbx_dpl_age(txn, i); + for (size_t i = 1; i <= dl->length; ++i) { + const uint32_t age = dpl_age(txn, i); age_max = (age_max >= age) ? 
age_max : age; } - mdbx_verbose("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); + VERBOSE("lru-head %u, age-max %u", txn->tw.dirtylru, age_max); /* half of 8-bit radix-sort */ - unsigned radix_counters[256], spillable = 0, spilled = 0; - memset(&radix_counters, 0, sizeof(radix_counters)); + pgno_t radix_entries[256], radix_npages[256]; + memset(&radix_entries, 0, sizeof(radix_entries)); + memset(&radix_npages, 0, sizeof(radix_npages)); + size_t spillable_entries = 0, spillable_npages = 0; const uint32_t reciprocal = (UINT32_C(255) << 24) / (age_max + 1); - for (unsigned i = 1; i <= dl->length; ++i) { - unsigned prio = spill_prio(txn, i, reciprocal); + for (size_t i = 1; i <= dl->length; ++i) { + const unsigned prio = spill_prio(txn, i, reciprocal); + size_t *const ptr = ptr_disp(dl->items[i].ptr, -(ptrdiff_t)sizeof(size_t)); + TRACE("page %" PRIaPGNO + ", lru %zu, is_multi %c, npages %u, age %u of %u, prio %u", + dl->items[i].pgno, *ptr, (dl->items[i].npages > 1) ? 'Y' : 'N', + dpl_npages(dl, i), dpl_age(txn, i), age_max, prio); if (prio < 256) { - radix_counters[prio] += 1; - spillable += 1; + radix_entries[prio] += 1; + spillable_entries += 1; + const pgno_t npages = dpl_npages(dl, i); + radix_npages[prio] += npages; + spillable_npages += npages; } } - if (likely(spillable > 0)) { - unsigned prio2spill = 0, prio2adjacent = 128, amount = radix_counters[0]; - for (unsigned i = 1; i < 256; i++) { - if (amount < wanna_spill) { + tASSERT(txn, spillable_npages >= spillable_entries); + pgno_t spilled_entries = 0, spilled_npages = 0; + if (likely(spillable_entries > 0)) { + size_t prio2spill = 0, prio2adjacent = 128, + amount_entries = radix_entries[0], amount_npages = radix_npages[0]; + for (size_t i = 1; i < 256; i++) { + if (amount_entries < need_spill_entries || + amount_npages < need_spill_npages) { prio2spill = i; prio2adjacent = i + (257 - i) / 2; - amount += radix_counters[i]; - } else if (amount + amount < spillable + wanna_spill - /* РАВНОЗНАЧНО: amount 
- wanna_spill < spillable - amount */) { + amount_entries += radix_entries[i]; + amount_npages += radix_npages[i]; + } else if (amount_entries + amount_entries < + spillable_entries + need_spill_entries + /* РАВНОЗНАЧНО: amount - need_spill < spillable - amount */ + || amount_npages + amount_npages < + spillable_npages + need_spill_npages) { prio2adjacent = i; - amount += radix_counters[i]; + amount_entries += radix_entries[i]; + amount_npages += radix_npages[i]; } else break; } - mdbx_verbose("prio2spill %u, prio2adjacent %u, amount %u, spillable %u, " - "wanna_spill %u", - prio2spill, prio2adjacent, amount, spillable, wanna_spill); - mdbx_tassert(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); - - unsigned prev_prio = 256; - unsigned r, w, prio; - for (w = 0, r = 1; r <= dl->length && spilled < wanna_spill; - prev_prio = prio, ++r) { - prio = spill_prio(txn, r, reciprocal); - MDBX_page *const dp = dl->items[r].ptr; - if (prio < prio2adjacent) { - const pgno_t pgno = dl->items[r].pgno; - const unsigned npages = dpl_npages(dl, r); - if (prio <= prio2spill) { - if (prev_prio < prio2adjacent && prev_prio > prio2spill && - dpl_endpgno(dl, r - 1) == pgno) { - mdbx_debug("co-spill %u prev-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - dpl_npages(dl, w), dl->items[r - 1].pgno, - mdbx_dpl_age(txn, r - 1), prev_prio); - --w; - rc = spill_page(txn, &ctx, dl->items[r - 1].ptr, - dpl_npages(dl, r - 1)); - if (unlikely(rc != MDBX_SUCCESS)) - break; - ++spilled; - } - - mdbx_debug("spill %u page %" PRIaPGNO " (age %d, prio %u)", npages, - dp->mp_pgno, mdbx_dpl_age(txn, r), prio); - rc = spill_page(txn, &ctx, dp, npages); - if (unlikely(rc != MDBX_SUCCESS)) - break; - ++spilled; - continue; - } - - if (prev_prio <= prio2spill && dpl_endpgno(dl, r - 1) == pgno) { - mdbx_debug("co-spill %u next-adjacent page %" PRIaPGNO - " (age %d, prio %u)", - npages, dp->mp_pgno, mdbx_dpl_age(txn, r), prio); - rc = spill_page(txn, &ctx, dp, npages); - if (unlikely(rc != 
MDBX_SUCCESS)) - break; - prio = prev_prio /* to continue co-spilling next adjacent pages */; - ++spilled; - continue; - } - } - dl->items[++w] = dl->items[r]; - } - - mdbx_tassert(txn, spillable == 0 || spilled > 0); - - while (r <= dl->length) - dl->items[++w] = dl->items[r++]; - mdbx_tassert(txn, r - 1 - w == spilled); - - dl->sorted = dpl_setlen(dl, w); - txn->tw.dirtyroom += spilled; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - - if (ctx.iov_items) { - /* iov_page() frees dirty-pages and reset iov_items in case of failure. */ - mdbx_tassert(txn, rc == MDBX_SUCCESS); - rc = mdbx_iov_write(txn, &ctx); - } + VERBOSE("prio2spill %zu, prio2adjacent %zu, spillable %zu/%zu," + " wanna-spill %zu/%zu, amount %zu/%zu", + prio2spill, prio2adjacent, spillable_entries, spillable_npages, + need_spill_entries, need_spill_npages, amount_entries, + amount_npages); + tASSERT(txn, prio2spill < prio2adjacent && prio2adjacent <= 256); + iov_ctx_t ctx; + rc = + iov_init(txn, &ctx, amount_entries, amount_npages, +#if defined(_WIN32) || defined(_WIN64) + txn->mt_env->me_overlapped_fd ? txn->mt_env->me_overlapped_fd : +#endif + txn->mt_env->me_lazy_fd, + true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - mdbx_pnl_sort(txn->tw.spill_pages, (size_t)txn->mt_next_pgno << 1); + size_t r = 0, w = 0; + pgno_t last = 0; + while (r < dl->length && (spilled_entries < need_spill_entries || + spilled_npages < need_spill_npages)) { + dl->items[++w] = dl->items[++r]; + unsigned prio = spill_prio(txn, w, reciprocal); + if (prio > prio2spill && + (prio >= prio2adjacent || last != dl->items[w].pgno)) + continue; + + const size_t e = w; + last = dpl_endpgno(dl, w); + while (--w && dpl_endpgno(dl, w) == dl->items[w + 1].pgno && + spill_prio(txn, w, reciprocal) < prio2adjacent) + ; + + for (size_t i = w; ++i <= e;) { + const unsigned npages = dpl_npages(dl, i); + prio = spill_prio(txn, i, reciprocal); + DEBUG("%sspill[%zu] %u page %" PRIaPGNO " (age %d, prio %u)", + (prio > prio2spill) ? 
"co-" : "", i, npages, dl->items[i].pgno, + dpl_age(txn, i), prio); + tASSERT(txn, prio < 256); + ++spilled_entries; + spilled_npages += npages; + rc = spill_page(txn, &ctx, dl->items[i].ptr, npages); + if (unlikely(rc != MDBX_SUCCESS)) + goto failed; + } + } + + VERBOSE("spilled entries %u, spilled npages %u", spilled_entries, + spilled_npages); + tASSERT(txn, spillable_entries == 0 || spilled_entries > 0); + tASSERT(txn, spilled_npages >= spilled_entries); + + failed: + while (r < dl->length) + dl->items[++w] = dl->items[++r]; + tASSERT(txn, r - w == spilled_entries || rc != MDBX_SUCCESS); + + dl->sorted = dpl_setlen(dl, w); + txn->tw.dirtyroom += spilled_entries; + txn->tw.dirtylist->pages_including_loose -= spilled_npages; + tASSERT(txn, dirtylist_check(txn)); + + if (!iov_empty(&ctx)) { + tASSERT(txn, rc == MDBX_SUCCESS); + rc = iov_write(&ctx); + } + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + txn->mt_env->me_lck->mti_unsynced_pages.weak += spilled_npages; + pnl_sort(txn->tw.spilled.list, (size_t)txn->mt_next_pgno << 1); txn->mt_flags |= MDBX_TXN_SPILLS; - mdbx_notice("spilled %u dirty-entries, now have %u dirty-room", spilled, - txn->tw.dirtyroom); - mdbx_iov_done(txn, &ctx); + NOTICE("spilled %u dirty-entries, %u dirty-npages, now have %zu dirty-room", + spilled_entries, spilled_npages, txn->tw.dirtyroom); } else { - mdbx_tassert(txn, ctx.iov_items == 0 && rc == MDBX_SUCCESS); - for (unsigned i = 1; i <= dl->length; ++i) { + tASSERT(txn, rc == MDBX_SUCCESS); + for (size_t i = 1; i <= dl->length; ++i) { MDBX_page *dp = dl->items[i].ptr; - mdbx_notice( - "dirtylist[%u]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", i, - dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, mdbx_dpl_age(txn, i), + VERBOSE( + "unspillable[%zu]: pgno %u, npages %u, flags 0x%04X, age %u, prio %u", + i, dp->mp_pgno, dpl_npages(dl, i), dp->mp_flags, dpl_age(txn, i), spill_prio(txn, i, reciprocal)); } } #if xMDBX_DEBUG_SPILLING == 2 if (txn->tw.loose_count + 
txn->tw.dirtyroom <= need / 2 + 1) - mdbx_error("dirty-list length: before %u, after %u, parent %i, loose %u; " - "needed %u, spillable %u; " - "spilled %u dirty-entries, now have %u dirty-room", - dl->length + spilled, dl->length, - (txn->mt_parent && txn->mt_parent->tw.dirtylist) - ? (int)txn->mt_parent->tw.dirtylist->length - : -1, - txn->tw.loose_count, need, spillable, spilled, - txn->tw.dirtyroom); - mdbx_ensure(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); + ERROR("dirty-list length: before %zu, after %zu, parent %zi, loose %zu; " + "needed %zu, spillable %zu; " + "spilled %u dirty-entries, now have %zu dirty-room", + dl->length + spilled_entries, dl->length, + (txn->mt_parent && txn->mt_parent->tw.dirtylist) + ? (intptr_t)txn->mt_parent->tw.dirtylist->length + : -1, + txn->tw.loose_count, need, spillable_entries, spilled_entries, + txn->tw.dirtyroom); + ENSURE(txn->mt_env, txn->tw.loose_count + txn->tw.dirtyroom > need / 2); #endif /* xMDBX_DEBUG_SPILLING */ done: @@ -5637,37 +5258,6 @@ done: : MDBX_TXN_FULL; } -static int mdbx_cursor_spill(MDBX_cursor *mc, const MDBX_val *key, - const MDBX_val *data) { - MDBX_txn *txn = mc->mc_txn; - /* Estimate how much space this operation will take: */ - /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ - unsigned need = CURSOR_STACK + 3; - /* 2) GC/FreeDB for any payload */ - if (mc->mc_dbi > FREE_DBI) { - need += txn->mt_dbs[FREE_DBI].md_depth + 3; - /* 3) Named DBs also dirty the main DB */ - if (mc->mc_dbi > MAIN_DBI) - need += txn->mt_dbs[MAIN_DBI].md_depth + 3; - } -#if xMDBX_DEBUG_SPILLING != 2 - /* production mode */ - /* 4) Double the page chain estimation - * for extensively splitting, rebalance and merging */ - need += need; - /* 5) Factor the key+data which to be put in */ - need += bytes2pgno(txn->mt_env, node_size(key, data)) + 1; -#else - /* debug mode */ - (void)key; - (void)data; - mc->mc_txn->mt_env->debug_dirtied_est = ++need; - 
mc->mc_txn->mt_env->debug_dirtied_act = 0; -#endif /* xMDBX_DEBUG_SPILLING == 2 */ - - return mdbx_txn_spill(txn, mc, need); -} - /*----------------------------------------------------------------------------*/ static bool meta_bootid_match(const MDBX_meta *meta) { @@ -5686,52 +5276,96 @@ static bool meta_weak_acceptable(const MDBX_env *env, const MDBX_meta *meta, #define METAPAGE(env, n) page_meta(pgno2page(env, n)) #define METAPAGE_END(env) METAPAGE(env, NUM_METAS) -MDBX_NOTHROW_PURE_FUNCTION static __inline txnid_t -constmeta_txnid(const MDBX_env *env, const MDBX_meta *meta) { - mdbx_memory_fence(mo_AcquireRelease, false); - txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); - txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); - mdbx_assert(env, a == b); - (void)env; - return (a == b) ? a : 0; +MDBX_NOTHROW_PURE_FUNCTION static txnid_t +constmeta_txnid(const MDBX_meta *meta) { + const txnid_t a = unaligned_peek_u64(4, &meta->mm_txnid_a); + const txnid_t b = unaligned_peek_u64(4, &meta->mm_txnid_b); + return likely(a == b) ? a : 0; } -static __inline txnid_t meta_txnid(const MDBX_env *env, - volatile const MDBX_meta *meta) { - (void)env; - mdbx_memory_fence(mo_AcquireRelease, false); - txnid_t a = unaligned_peek_u64_volatile(4, &meta->mm_txnid_a); - txnid_t b = unaligned_peek_u64_volatile(4, &meta->mm_txnid_b); - return (a == b) ? 
a : 0; +typedef struct { + uint64_t txnid; + size_t is_steady; +} meta_snap_t; + +static __always_inline txnid_t +atomic_load_txnid(const volatile MDBX_atomic_uint32_t *ptr) { +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + return atomic_load64((const volatile MDBX_atomic_uint64_t *)ptr, + mo_AcquireRelease); +#else + const uint32_t l = atomic_load32( + &ptr[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + const uint32_t h = atomic_load32( + &ptr[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], mo_AcquireRelease); + return (uint64_t)h << 32 | l; +#endif +} + +static __inline meta_snap_t meta_snap(const volatile MDBX_meta *meta) { + txnid_t txnid = atomic_load_txnid(meta->mm_txnid_a); + jitter4testing(true); + size_t is_steady = META_IS_STEADY(meta) && txnid >= MIN_TXNID; + jitter4testing(true); + if (unlikely(txnid != atomic_load_txnid(meta->mm_txnid_b))) + txnid = is_steady = 0; + meta_snap_t r = {txnid, is_steady}; + return r; +} + +static __inline txnid_t meta_txnid(const volatile MDBX_meta *meta) { + return meta_snap(meta).txnid; } static __inline void meta_update_begin(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && - unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) < txnid && + unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; - unaligned_poke_u64(4, meta->mm_txnid_b, 0); - mdbx_memory_fence(mo_AcquireRelease, true); - unaligned_poke_u64(4, meta->mm_txnid_a, txnid); +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, 0, + mo_AcquireRelease); + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_a, 
txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + 0, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + 0, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_a[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } static __inline void meta_update_end(const MDBX_env *env, MDBX_meta *meta, txnid_t txnid) { - mdbx_assert(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); - mdbx_assert(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); + eASSERT(env, meta >= METAPAGE(env, 0) && meta < METAPAGE_END(env)); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_a) == txnid); + eASSERT(env, unaligned_peek_u64(4, meta->mm_txnid_b) < txnid); (void)env; - mdbx_jitter4testing(true); + jitter4testing(true); memcpy(&meta->mm_bootid, &bootid, 16); - unaligned_poke_u64(4, meta->mm_txnid_b, txnid); - mdbx_memory_fence(mo_AcquireRelease, true); +#if (defined(__amd64__) || defined(__e2k__)) && !defined(ENABLE_UBSAN) && \ + MDBX_UNALIGNED_OK >= 8 + atomic_store64((MDBX_atomic_uint64_t *)&meta->mm_txnid_b, txnid, + mo_AcquireRelease); +#else + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__], + (uint32_t)txnid, mo_AcquireRelease); + atomic_store32(&meta->mm_txnid_b[__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__], + (uint32_t)(txnid >> 32), mo_AcquireRelease); +#endif } static __inline void meta_set_txnid(const MDBX_env *env, MDBX_meta *meta, const txnid_t txnid) { - mdbx_assert(env, !env->me_map || meta < METAPAGE(env, 0) || - meta >= METAPAGE_END(env)); + eASSERT(env, + !env->me_map || meta < METAPAGE(env, 0) || meta >= METAPAGE_END(env)); (void)env; /* update inconsistently since this function used ONLY 
for filling meta-image * for writing, but not the actual meta-page */ @@ -5751,198 +5385,288 @@ static __inline uint64_t meta_sign(const MDBX_meta *meta) { return (sign > MDBX_DATASIGN_WEAK) ? sign : ~sign; } -enum meta_choise_mode { prefer_last, prefer_steady }; +typedef struct { + txnid_t txnid; + union { + const volatile MDBX_meta *ptr_v; + const MDBX_meta *ptr_c; + }; + size_t is_steady; +} meta_ptr_t; -static __inline bool meta_ot(const enum meta_choise_mode mode, - const MDBX_env *env, volatile const MDBX_meta *a, - volatile const MDBX_meta *b) { - mdbx_jitter4testing(true); - const txnid_t txnid_a = meta_txnid(env, a); - mdbx_jitter4testing(true); - const txnid_t txnid_b = meta_txnid(env, b); - mdbx_jitter4testing(true); - const bool is_stead_b = META_IS_STEADY(b); - - if (mode == prefer_steady) { - mdbx_jitter4testing(true); - const bool is_stead_a = META_IS_STEADY(a); - if (is_stead_a != is_stead_b) - return is_stead_b; - } else { - mdbx_assert(env, mode == prefer_last); - } - if (txnid_a == txnid_b) - return is_stead_b; - return txnid_a < txnid_b; +static meta_ptr_t meta_ptr(const MDBX_env *env, unsigned n) { + eASSERT(env, n < NUM_METAS); + meta_ptr_t r; + meta_snap_t snap = meta_snap(r.ptr_v = METAPAGE(env, n)); + r.txnid = snap.txnid; + r.is_steady = snap.is_steady; + return r; } -static bool meta_eq(const MDBX_env *env, volatile const MDBX_meta *a, - volatile const MDBX_meta *b) { - mdbx_jitter4testing(true); - const txnid_t txnid = meta_txnid(env, a); - if (!txnid || txnid != meta_txnid(env, b)) - return false; - - mdbx_jitter4testing(true); - if (META_IS_STEADY(a) != META_IS_STEADY(b)) - return false; - - mdbx_jitter4testing(true); - return true; +static __always_inline uint8_t meta_cmp2int(txnid_t a, txnid_t b, uint8_t s) { + return unlikely(a == b) ? 1 * s : (a > b) ? 
2 * s : 0 * s; } -static int meta_eq_mask(const MDBX_env *env) { - volatile const MDBX_meta *m0 = METAPAGE(env, 0); - volatile const MDBX_meta *m1 = METAPAGE(env, 1); - volatile const MDBX_meta *m2 = METAPAGE(env, 2); - - int rc = meta_eq(env, m0, m1) ? 1 : 0; - if (meta_eq(env, m1, m2)) - rc += 2; - if (meta_eq(env, m2, m0)) - rc += 4; - return rc; +static __always_inline uint8_t meta_cmp2recent(uint8_t ab_cmp2int, + bool a_steady, bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return ab_cmp2int > 1 || (ab_cmp2int == 1 && a_steady > b_steady); } -static __inline volatile const MDBX_meta * -meta_recent(const enum meta_choise_mode mode, const MDBX_env *env, - volatile const MDBX_meta *a, volatile const MDBX_meta *b) { - const bool a_older_that_b = meta_ot(mode, env, a, b); - mdbx_assert(env, !meta_eq(env, a, b)); - return a_older_that_b ? b : a; +static __always_inline uint8_t meta_cmp2steady(uint8_t ab_cmp2int, + bool a_steady, bool b_steady) { + assert(ab_cmp2int < 3 /* && a_steady< 2 && b_steady < 2 */); + return a_steady > b_steady || (a_steady == b_steady && ab_cmp2int > 1); } -static const MDBX_meta *meta_ancient_prefer_weak(const MDBX_env *env, - const MDBX_meta *a, - const MDBX_meta *b) { - const bool a_older_that_b = meta_ot(prefer_steady, env, a, b); - mdbx_assert(env, !meta_eq(env, a, b)); - return a_older_that_b ? 
a : b; +static __inline bool meta_choice_recent(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2recent(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -static __inline volatile const MDBX_meta * -meta_mostrecent(const enum meta_choise_mode mode, const MDBX_env *env) { - volatile const MDBX_meta *m0 = METAPAGE(env, 0); - volatile const MDBX_meta *m1 = METAPAGE(env, 1); - volatile const MDBX_meta *m2 = METAPAGE(env, 2); - - volatile const MDBX_meta *head = meta_recent(mode, env, m0, m1); - head = meta_recent(mode, env, head, m2); - return head; +static __inline bool meta_choice_steady(txnid_t a_txnid, bool a_steady, + txnid_t b_txnid, bool b_steady) { + return meta_cmp2steady(meta_cmp2int(a_txnid, b_txnid, 1), a_steady, b_steady); } -static volatile const MDBX_meta *meta_prefer_steady(const MDBX_env *env) { - return meta_mostrecent(prefer_steady, env); +MDBX_MAYBE_UNUSED static uint8_t meta_cmp2pack(uint8_t c01, uint8_t c02, + uint8_t c12, bool s0, bool s1, + bool s2) { + assert(c01 < 3 && c02 < 3 && c12 < 3); + /* assert(s0 < 2 && s1 < 2 && s2 < 2); */ + const uint8_t recent = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady = meta_cmp2steady(c01, s0, s1) + ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); + + uint8_t tail; + if (recent == 0) + tail = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent == 1) + tail = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail = meta_cmp2steady(c01, s0, s1) ? 
1 : 0; + + const bool valid = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + return tail | recent << 2 | prefer_steady << 4 | strict << 6 | valid << 7; } -MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * -constmeta_prefer_steady(const MDBX_env *env) { - return (const MDBX_meta *)meta_mostrecent(prefer_steady, env); +static __inline void meta_troika_unpack(meta_troika_t *troika, + const uint8_t packed) { + troika->recent = (packed >> 2) & 3; + troika->prefer_steady = (packed >> 4) & 3; + troika->tail_and_flags = packed & 0xC3; +#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ + troika->unused_pad = 0; +#endif } -static volatile const MDBX_meta *meta_prefer_last(const MDBX_env *env) { - return meta_mostrecent(prefer_last, env); +static const uint8_t troika_fsm_map[2 * 2 * 2 * 3 * 3 * 3] = { + 232, 201, 216, 216, 232, 233, 232, 232, 168, 201, 216, 152, 168, 233, 232, + 168, 233, 201, 216, 201, 233, 233, 232, 233, 168, 201, 152, 216, 232, 169, + 232, 168, 168, 193, 152, 152, 168, 169, 232, 168, 169, 193, 152, 194, 233, + 169, 232, 169, 232, 201, 216, 216, 232, 201, 232, 232, 168, 193, 216, 152, + 168, 193, 232, 168, 193, 193, 210, 194, 225, 193, 225, 193, 168, 137, 212, + 214, 232, 233, 168, 168, 168, 137, 212, 150, 168, 233, 168, 168, 169, 137, + 216, 201, 233, 233, 168, 169, 168, 137, 148, 214, 232, 169, 168, 168, 40, + 129, 148, 150, 168, 169, 168, 40, 169, 129, 152, 194, 233, 169, 168, 169, + 168, 137, 214, 214, 232, 201, 168, 168, 168, 129, 214, 150, 168, 193, 168, + 168, 129, 129, 210, 194, 225, 193, 161, 129, 212, 198, 212, 214, 228, 228, + 212, 212, 148, 201, 212, 150, 164, 233, 212, 148, 233, 201, 216, 201, 233, + 233, 216, 233, 148, 198, 148, 214, 228, 164, 212, 148, 148, 194, 148, 150, + 164, 169, 212, 148, 169, 194, 152, 194, 233, 169, 216, 169, 214, 198, 214, + 214, 228, 198, 212, 214, 150, 194, 214, 
150, 164, 193, 212, 150, 194, 194, + 210, 194, 225, 193, 210, 194}; + +__hot static meta_troika_t meta_tap(const MDBX_env *env) { + meta_snap_t snap; + meta_troika_t troika; + snap = meta_snap(METAPAGE(env, 0)); + troika.txnid[0] = snap.txnid; + troika.fsm = (uint8_t)snap.is_steady << 0; + snap = meta_snap(METAPAGE(env, 1)); + troika.txnid[1] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 1; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[1], 8); + snap = meta_snap(METAPAGE(env, 2)); + troika.txnid[2] = snap.txnid; + troika.fsm += (uint8_t)snap.is_steady << 2; + troika.fsm += meta_cmp2int(troika.txnid[0], troika.txnid[2], 8 * 3); + troika.fsm += meta_cmp2int(troika.txnid[1], troika.txnid[2], 8 * 3 * 3); + + meta_troika_unpack(&troika, troika_fsm_map[troika.fsm]); + return troika; } -MDBX_NOTHROW_PURE_FUNCTION static const MDBX_meta * -constmeta_prefer_last(const MDBX_env *env) { - return (const MDBX_meta *)meta_mostrecent(prefer_last, env); +static txnid_t recent_committed_txnid(const MDBX_env *env) { + const txnid_t m0 = meta_txnid(METAPAGE(env, 0)); + const txnid_t m1 = meta_txnid(METAPAGE(env, 1)); + const txnid_t m2 = meta_txnid(METAPAGE(env, 2)); + return (m0 > m1) ? ((m0 > m2) ? m0 : m2) : ((m1 > m2) ? 
m1 : m2); } -static txnid_t mdbx_recent_committed_txnid(const MDBX_env *env) { - while (true) { - volatile const MDBX_meta *head = meta_prefer_last(env); - const txnid_t recent = meta_txnid(env, head); - mdbx_compiler_barrier(); - if (likely(head == meta_prefer_last(env) && - recent == meta_txnid(env, head))) - return recent; - } +static __inline bool meta_eq(const meta_troika_t *troika, size_t a, size_t b) { + assert(a < NUM_METAS && b < NUM_METAS); + return troika->txnid[a] == troika->txnid[b] && + (((troika->fsm >> a) ^ (troika->fsm >> b)) & 1) == 0 && + troika->txnid[a]; } -static txnid_t mdbx_recent_steady_txnid(const MDBX_env *env) { - while (true) { - volatile const MDBX_meta *head = meta_prefer_steady(env); - const txnid_t recent = meta_txnid(env, head); - mdbx_compiler_barrier(); - if (likely(head == meta_prefer_steady(env) && - recent == meta_txnid(env, head))) - return recent; - } +static unsigned meta_eq_mask(const meta_troika_t *troika) { + return meta_eq(troika, 0, 1) | meta_eq(troika, 1, 2) << 1 | + meta_eq(troika, 2, 0) << 2; } -static const char *mdbx_durable_str(volatile const MDBX_meta *const meta) { +__hot static bool meta_should_retry(const MDBX_env *env, + meta_troika_t *troika) { + const meta_troika_t prev = *troika; + *troika = meta_tap(env); + return prev.fsm != troika->fsm || prev.txnid[0] != troika->txnid[0] || + prev.txnid[1] != troika->txnid[1] || prev.txnid[2] != troika->txnid[2]; +} + +static __always_inline meta_ptr_t meta_recent(const MDBX_env *env, + const meta_troika_t *troika) { + meta_ptr_t r; + r.txnid = troika->txnid[troika->recent]; + r.ptr_v = METAPAGE(env, troika->recent); + r.is_steady = (troika->fsm >> troika->recent) & 1; + return r; +} + +static __always_inline meta_ptr_t +meta_prefer_steady(const MDBX_env *env, const meta_troika_t *troika) { + meta_ptr_t r; + r.txnid = troika->txnid[troika->prefer_steady]; + r.ptr_v = METAPAGE(env, troika->prefer_steady); + r.is_steady = (troika->fsm >> troika->prefer_steady) & 1; + 
return r; +} + +static __always_inline meta_ptr_t meta_tail(const MDBX_env *env, + const meta_troika_t *troika) { + const uint8_t tail = troika->tail_and_flags & 3; + MDBX_ANALYSIS_ASSUME(tail < NUM_METAS); + meta_ptr_t r; + r.txnid = troika->txnid[tail]; + r.ptr_v = METAPAGE(env, tail); + r.is_steady = (troika->fsm >> tail) & 1; + return r; +} + +static const char *durable_caption(const volatile MDBX_meta *const meta) { if (META_IS_STEADY(meta)) - return (unaligned_peek_u64_volatile(4, meta->mm_datasync_sign) == + return (unaligned_peek_u64_volatile(4, meta->mm_sign) == meta_sign((const MDBX_meta *)meta)) ? "Steady" : "Tainted"; return "Weak"; } +__cold static void meta_troika_dump(const MDBX_env *env, + const meta_troika_t *troika) { + const meta_ptr_t recent = meta_recent(env, troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, troika); + const meta_ptr_t tail = meta_tail(env, troika); + NOTICE("%" PRIaTXN ".%c:%" PRIaTXN ".%c:%" PRIaTXN ".%c, fsm=0x%02x, " + "head=%d-%" PRIaTXN ".%c, " + "base=%d-%" PRIaTXN ".%c, " + "tail=%d-%" PRIaTXN ".%c, " + "valid %c, strict %c", + troika->txnid[0], (troika->fsm & 1) ? 's' : 'w', troika->txnid[1], + (troika->fsm & 2) ? 's' : 'w', troika->txnid[2], + (troika->fsm & 4) ? 's' : 'w', troika->fsm, troika->recent, + recent.txnid, recent.is_steady ? 's' : 'w', troika->prefer_steady, + prefer_steady.txnid, prefer_steady.is_steady ? 's' : 'w', + troika->tail_and_flags % NUM_METAS, tail.txnid, + tail.is_steady ? 's' : 'w', TROIKA_VALID(troika) ? 'Y' : 'N', + TROIKA_STRICT_VALID(troika) ? 'Y' : 'N'); +} + /*----------------------------------------------------------------------------*/ /* Find oldest txnid still referenced. 
*/ -static txnid_t mdbx_find_oldest(const MDBX_txn *txn) { - mdbx_tassert(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); - MDBX_env *env = txn->mt_env; - const txnid_t edge = mdbx_recent_steady_txnid(env); - mdbx_tassert(txn, edge <= txn->mt_txnid); +static txnid_t find_oldest_reader(MDBX_env *const env, const txnid_t steady) { + const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); + eASSERT(env, steady <= env->me_txn0->mt_txnid); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (unlikely(lck == NULL /* exclusive mode */)) { - mdbx_assert(env, env->me_lck == (void *)&env->x_lckless_stub); - return env->me_lck->mti_oldest_reader.weak = edge; + if (unlikely(lck == NULL /* exclusive without-lck mode */)) { + eASSERT(env, env->me_lck == (void *)&env->x_lckless_stub); + env->me_lck->mti_readers_refresh_flag.weak = nothing_changed; + return env->me_lck->mti_oldest_reader.weak = steady; } - const txnid_t last_oldest = + const txnid_t prev_oldest = atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease); - mdbx_tassert(txn, edge >= last_oldest); - if (likely(last_oldest == edge)) - return edge; + eASSERT(env, steady >= prev_oldest); - const uint32_t nothing_changed = MDBX_STRING_TETRAD("None"); - const uint32_t snap_readers_refresh_flag = - atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease); - mdbx_jitter4testing(false); - if (snap_readers_refresh_flag == nothing_changed) - return last_oldest; + txnid_t new_oldest = prev_oldest; + while (nothing_changed != + atomic_load32(&lck->mti_readers_refresh_flag, mo_AcquireRelease)) { + lck->mti_readers_refresh_flag.weak = nothing_changed; + jitter4testing(false); + const size_t snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + new_oldest = steady; - txnid_t oldest = edge; - atomic_store32(&lck->mti_readers_refresh_flag, nothing_changed, mo_Relaxed); - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < 
snap_nreaders; ++i) { - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ - const txnid_t snap = safe64_read(&lck->mti_readers[i].mr_txnid); - if (oldest > snap && last_oldest <= /* ignore pending updates */ snap) { - oldest = snap; - if (oldest == last_oldest) - return oldest; + for (size_t i = 0; i < snap_nreaders; ++i) { + const uint32_t pid = + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); + if (!pid) + continue; + jitter4testing(true); + + const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); + if (unlikely(rtxn < prev_oldest)) { + if (unlikely(nothing_changed == + atomic_load32(&lck->mti_readers_refresh_flag, + mo_AcquireRelease)) && + safe64_reset_compare(&lck->mti_readers[i].mr_txnid, rtxn)) { + NOTICE("kick stuck reader[%zu of %zu].pid_%u %" PRIaTXN + " < prev-oldest %" PRIaTXN ", steady-txn %" PRIaTXN, + i, snap_nreaders, pid, rtxn, prev_oldest, steady); + } + continue; + } + + if (rtxn < new_oldest) { + new_oldest = rtxn; + if (!MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS && new_oldest == prev_oldest) + break; } } } - if (oldest != last_oldest) { - mdbx_verbose("update oldest %" PRIaTXN " -> %" PRIaTXN, last_oldest, - oldest); - mdbx_tassert(txn, oldest >= lck->mti_oldest_reader.weak); - atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); + if (new_oldest != prev_oldest) { + VERBOSE("update oldest %" PRIaTXN " -> %" PRIaTXN, prev_oldest, new_oldest); + eASSERT(env, new_oldest >= lck->mti_oldest_reader.weak); + atomic_store64(&lck->mti_oldest_reader, new_oldest, mo_Relaxed); } - return oldest; + return new_oldest; +} + +static txnid_t txn_oldest_reader(const MDBX_txn *const txn) { + return find_oldest_reader(txn->mt_env, + txn->tw.troika.txnid[txn->tw.troika.prefer_steady]); } /* Find largest mvcc-snapshot still referenced. 
*/ -__cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { +static pgno_t find_largest_snapshot(const MDBX_env *env, + pgno_t last_used_page) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (likely(lck != NULL /* exclusive mode */)) { - const unsigned snap_nreaders = + if (likely(lck != NULL /* check for exclusive without-lck mode */)) { + retry:; + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { - retry: + for (size_t i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ + /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -5952,65 +5676,77 @@ __cold static pgno_t mdbx_find_largest(MDBX_env *env, pgno_t largest) { mo_AcquireRelease) || snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) goto retry; - if (largest < snap_pages && - atomic_load64(&lck->mti_oldest_reader, mo_AcquireRelease) <= - /* ignore pending updates */ snap_txnid && - snap_txnid <= env->me_txn0->mt_txnid) - largest = snap_pages; + if (last_used_page < snap_pages && snap_txnid <= env->me_txn0->mt_txnid) + last_used_page = snap_pages; } } } - return largest; + return last_used_page; } /* Add a page to the txn's dirty list */ -static int __must_check_result mdbx_page_dirty(MDBX_txn *txn, MDBX_page *mp, - unsigned npages) { +__hot static int __must_check_result page_dirty(MDBX_txn *txn, MDBX_page *mp, + size_t npages) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + mp->mp_txnid = txn->mt_front; + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + txn->tw.writemap_dirty_npages += npages; + tASSERT(txn, txn->tw.spilled.list == nullptr); + return MDBX_SUCCESS; + } + tASSERT(txn, (txn->mt_flags & 
MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + #if xMDBX_DEBUG_SPILLING == 2 txn->mt_env->debug_dirtied_act += 1; - mdbx_ensure(txn->mt_env, - txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); - mdbx_ensure(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); + ENSURE(txn->mt_env, + txn->mt_env->debug_dirtied_act < txn->mt_env->debug_dirtied_est); + ENSURE(txn->mt_env, txn->tw.dirtyroom + txn->tw.loose_count > 0); #endif /* xMDBX_DEBUG_SPILLING == 2 */ int rc; - mp->mp_txnid = txn->mt_front; if (unlikely(txn->tw.dirtyroom == 0)) { if (txn->tw.loose_count) { - MDBX_page *loose = txn->tw.loose_pages; - mdbx_debug("purge-and-reclaim loose page %" PRIaPGNO, loose->mp_pgno); - rc = mdbx_pnl_insert_range(&txn->tw.reclaimed_pglist, loose->mp_pgno, 1); + MDBX_page *lp = txn->tw.loose_pages; + DEBUG("purge-and-reclaim loose page %" PRIaPGNO, lp->mp_pgno); + rc = pnl_insert_range(&txn->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - unsigned di = mdbx_dpl_search(txn, loose->mp_pgno); - mdbx_tassert(txn, txn->tw.dirtylist->items[di].ptr == loose); - mdbx_dpl_remove(txn, di); - txn->tw.loose_pages = loose->mp_next; + size_t di = dpl_search(txn, lp->mp_pgno); + tASSERT(txn, txn->tw.dirtylist->items[di].ptr == lp); + dpl_remove(txn, di); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + txn->tw.loose_pages = mp_next(lp); txn->tw.loose_count--; txn->tw.dirtyroom++; - if (!(txn->mt_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(txn->mt_env, loose, 1); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) + dpage_free(txn->mt_env, lp, 1); } else { - mdbx_error("Dirtyroom is depleted, DPL length %u", - txn->tw.dirtylist->length); - if (!(txn->mt_flags & MDBX_WRITEMAP)) - mdbx_dpage_free(txn->mt_env, mp, npages); + ERROR("Dirtyroom is depleted, DPL length %zu", txn->tw.dirtylist->length); + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) 
+ dpage_free(txn->mt_env, mp, npages); return MDBX_TXN_FULL; } } - rc = mdbx_dpl_append(txn, mp->mp_pgno, mp, npages); + rc = dpl_append(txn, mp->mp_pgno, mp, npages); if (unlikely(rc != MDBX_SUCCESS)) { bailout: txn->mt_flags |= MDBX_TXN_ERROR; return rc; } txn->tw.dirtyroom--; - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); return MDBX_SUCCESS; } +static void mincore_clean_cache(const MDBX_env *const env) { + memset(env->me_lck->mti_mincore_cache.begin, -1, + sizeof(env->me_lck->mti_mincore_cache.begin)); +} + #if !(defined(_WIN32) || defined(_WIN64)) MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #ifdef ENOSYS @@ -6041,11 +5777,10 @@ MDBX_MAYBE_UNUSED static __always_inline int ignore_enosys(int err) { #if MDBX_ENABLE_MADVISE /* Turn on/off readahead. It's harmful when the DB is larger than RAM. */ -__cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, - const bool enable, - const bool force_whole) { - mdbx_assert(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); - mdbx_assert(env, (enable & 1) == (enable != 0)); +__cold static int set_readahead(const MDBX_env *env, const pgno_t edge, + const bool enable, const bool force_whole) { + eASSERT(env, edge >= NUM_METAS && edge <= MAX_PAGENO + 1); + eASSERT(env, (enable & 1) == (enable != 0)); const bool toggle = force_whole || ((enable ^ env->me_lck->mti_readahead_anchor) & 1) || !env->me_lck->mti_readahead_anchor; @@ -6061,12 +5796,12 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, length = (length < limit) ? length : limit; length -= offset; - mdbx_assert(env, 0 <= (intptr_t)length); + eASSERT(env, 0 <= (intptr_t)length); if (length == 0) return MDBX_SUCCESS; - mdbx_notice("readahead %s %u..%u", enable ? "ON" : "OFF", - bytes2pgno(env, offset), bytes2pgno(env, offset + length)); + NOTICE("readahead %s %u..%u", enable ? 
"ON" : "OFF", bytes2pgno(env, offset), + bytes2pgno(env, offset + length)); #if defined(F_RDAHEAD) if (toggle && unlikely(fcntl(env->me_lazy_fd, F_RDAHEAD, enable) == -1)) @@ -6074,16 +5809,15 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, #endif /* F_RDAHEAD */ int err; + void *const ptr = ptr_disp(env->me_map, offset); if (enable) { #if defined(MADV_NORMAL) - err = madvise(env->me_map + offset, length, MADV_NORMAL) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = + madvise(ptr, length, MADV_NORMAL) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_NORMAL) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_NORMAL)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_NORMAL)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_FADV_NORMAL) && defined(POSIX_FADV_WILLNEED) @@ -6112,20 +5846,18 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, (void)/* Ignore ENOTTY for DB on the ram-disk and so on */ fcntl( env->me_lazy_fd, F_RDADVISE, &hint); #elif defined(MADV_WILLNEED) - err = madvise(env->me_map + offset, length, MADV_WILLNEED) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = madvise(ptr, length, MADV_WILLNEED) ? 
ignore_enosys(errno) + : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_WILLNEED) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_WILLNEED)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_WILLNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(_WIN32) || defined(_WIN64) if (mdbx_PrefetchVirtualMemory) { WIN32_MEMORY_RANGE_ENTRY hint; - hint.VirtualAddress = env->me_map + offset; + hint.VirtualAddress = ptr; hint.NumberOfBytes = length; (void)mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0); } @@ -6139,15 +5871,14 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, #endif } } else { + mincore_clean_cache(env); #if defined(MADV_RANDOM) - err = madvise(env->me_map + offset, length, MADV_RANDOM) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + err = + madvise(ptr, length, MADV_RANDOM) ? ignore_enosys(errno) : MDBX_SUCCESS; if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_MADV_RANDOM) - err = ignore_enosys( - posix_madvise(env->me_map + offset, length, POSIX_MADV_RANDOM)); + err = ignore_enosys(posix_madvise(ptr, length, POSIX_MADV_RANDOM)); if (unlikely(MDBX_IS_ERROR(err))) return err; #elif defined(POSIX_FADV_RANDOM) @@ -6168,42 +5899,178 @@ __cold static int mdbx_set_readahead(MDBX_env *env, const pgno_t edge, } #endif /* MDBX_ENABLE_MADVISE */ -__cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno, const bool implicit) { - const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); - const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); +__cold static void update_mlcnt(const MDBX_env *env, + const pgno_t new_aligned_mlocked_pgno, + const bool lock_not_release) { + for (;;) { + const pgno_t mlock_pgno_before = + atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease); + eASSERT(env, + pgno_align2os_pgno(env, mlock_pgno_before) == 
mlock_pgno_before); + eASSERT(env, pgno_align2os_pgno(env, new_aligned_mlocked_pgno) == + new_aligned_mlocked_pgno); + if (lock_not_release ? (mlock_pgno_before >= new_aligned_mlocked_pgno) + : (mlock_pgno_before <= new_aligned_mlocked_pgno)) + break; + if (likely(atomic_cas32(&((MDBX_env *)env)->me_mlocked_pgno, + mlock_pgno_before, new_aligned_mlocked_pgno))) + for (;;) { + MDBX_atomic_uint32_t *const mlcnt = env->me_lck->mti_mlcnt; + const int32_t snap_locked = atomic_load32(mlcnt + 0, mo_Relaxed); + const int32_t snap_unlocked = atomic_load32(mlcnt + 1, mo_Relaxed); + if (mlock_pgno_before == 0 && (snap_locked - snap_unlocked) < INT_MAX) { + eASSERT(env, lock_not_release); + if (unlikely(!atomic_cas32(mlcnt + 0, snap_locked, snap_locked + 1))) + continue; + } + if (new_aligned_mlocked_pgno == 0 && + (snap_locked - snap_unlocked) > 0) { + eASSERT(env, !lock_not_release); + if (unlikely( + !atomic_cas32(mlcnt + 1, snap_unlocked, snap_unlocked + 1))) + continue; + } + NOTICE("%s-pages %u..%u, mlocked-process(es) %u -> %u", + lock_not_release ? "lock" : "unlock", + lock_not_release ? mlock_pgno_before : new_aligned_mlocked_pgno, + lock_not_release ? new_aligned_mlocked_pgno : mlock_pgno_before, + snap_locked - snap_unlocked, + atomic_load32(mlcnt + 0, mo_Relaxed) - + atomic_load32(mlcnt + 1, mo_Relaxed)); + return; + } + } +} + +__cold static void munlock_after(const MDBX_env *env, const pgno_t aligned_pgno, + const size_t end_bytes) { + if (atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) > aligned_pgno) { + int err = MDBX_ENOSYS; + const size_t munlock_begin = pgno2bytes(env, aligned_pgno); + const size_t munlock_size = end_bytes - munlock_begin; + eASSERT(env, end_bytes % env->me_os_psize == 0 && + munlock_begin % env->me_os_psize == 0 && + munlock_size % env->me_os_psize == 0); +#if defined(_WIN32) || defined(_WIN64) + err = VirtualUnlock(ptr_disp(env->me_map, munlock_begin), munlock_size) + ? 
MDBX_SUCCESS + : (int)GetLastError(); + if (err == ERROR_NOT_LOCKED) + err = MDBX_SUCCESS; +#elif defined(_POSIX_MEMLOCK_RANGE) + err = munlock(ptr_disp(env->me_map, munlock_begin), munlock_size) + ? errno + : MDBX_SUCCESS; +#endif + if (likely(err == MDBX_SUCCESS)) + update_mlcnt(env, aligned_pgno, false); + else { +#if defined(_WIN32) || defined(_WIN64) + WARNING("VirtualUnlock(%zu, %zu) error %d", munlock_begin, munlock_size, + err); +#else + WARNING("munlock(%zu, %zu) error %d", munlock_begin, munlock_size, err); +#endif + } + } +} + +__cold static void munlock_all(const MDBX_env *env) { + munlock_after(env, 0, bytes_align2os_bytes(env, env->me_dxb_mmap.current)); +} + +__cold static unsigned default_rp_augment_limit(const MDBX_env *env) { + /* default rp_augment_limit = ceil(npages / gold_ratio) */ + const size_t augment = (env->me_dbgeo.now >> (env->me_psize2log + 10)) * 633u; + eASSERT(env, augment < MDBX_PGL_LIMIT); + return pnl_bytes2size(pnl_size2bytes( + (augment > MDBX_PNL_INITIAL) ? augment : MDBX_PNL_INITIAL)); +} + +static bool default_prefault_write(const MDBX_env *env) { + return !MDBX_MMAP_INCOHERENT_FILE_WRITE && !env->me_incore && + (env->me_flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == MDBX_WRITEMAP; +} + +static void adjust_defaults(MDBX_env *env) { + if (!env->me_options.flags.non_auto.rp_augment_limit) + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + if (!env->me_options.flags.non_auto.prefault_write) + env->me_options.prefault_write = default_prefault_write(env); + + const size_t basis = env->me_dbgeo.now; + /* TODO: use options? */ + const unsigned factor = 9; + size_t threshold = (basis < ((size_t)65536 << factor)) + ? 65536 /* minimal threshold */ + : (basis > (MEGABYTE * 4 << factor)) + ? MEGABYTE * 4 /* maximal threshold */ + : basis >> factor; + threshold = (threshold < env->me_dbgeo.shrink || !env->me_dbgeo.shrink) + ? 
threshold + : env->me_dbgeo.shrink; + + env->me_madv_threshold = + bytes2pgno(env, bytes_align2os_bytes(env, threshold)); +} + +enum resize_mode { implicit_grow, impilict_shrink, explicit_resize }; + +__cold static int dxb_resize(MDBX_env *const env, const pgno_t used_pgno, + const pgno_t size_pgno, pgno_t limit_pgno, + const enum resize_mode mode) { + /* Acquire guard to avoid collision between read and write txns + * around me_dbgeo and me_dxb_mmap */ +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_AcquireExclusive(&env->me_remap_guard); + int rc = MDBX_SUCCESS; + mdbx_handle_array_t *suspended = NULL; + mdbx_handle_array_t array_onstack; +#else + int rc = osal_fastmutex_acquire(&env->me_remap_guard); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; +#endif + const size_t prev_size = env->me_dxb_mmap.current; const size_t prev_limit = env->me_dxb_mmap.limit; + const pgno_t prev_limit_pgno = bytes2pgno(env, prev_limit); + eASSERT(env, limit_pgno >= size_pgno); + eASSERT(env, size_pgno >= used_pgno); + if (mode < explicit_resize && size_pgno <= prev_limit_pgno) { + /* The actual mapsize may be less since the geo.upper may be changed + * by another process. Avoids remapping until it is necessary. 
*/ + limit_pgno = prev_limit_pgno; + } + const size_t limit_bytes = pgno_align2os_bytes(env, limit_pgno); + const size_t size_bytes = pgno_align2os_bytes(env, size_pgno); #if MDBX_ENABLE_MADVISE || defined(MDBX_USE_VALGRIND) - const void *const prev_addr = env->me_map; + const void *const prev_map = env->me_dxb_mmap.base; #endif /* MDBX_ENABLE_MADVISE || MDBX_USE_VALGRIND */ - mdbx_verbose("resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR, - prev_size, size_bytes, prev_limit, limit_bytes); + VERBOSE("resize/%d datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR, + mode, prev_size, size_bytes, prev_limit, limit_bytes); - mdbx_assert(env, limit_bytes >= size_bytes); - mdbx_assert(env, bytes2pgno(env, size_bytes) >= size_pgno); - mdbx_assert(env, bytes2pgno(env, limit_bytes) >= limit_pgno); + eASSERT(env, limit_bytes >= size_bytes); + eASSERT(env, bytes2pgno(env, size_bytes) >= size_pgno); + eASSERT(env, bytes2pgno(env, limit_bytes) >= limit_pgno); unsigned mresize_flags = env->me_flags & (MDBX_RDONLY | MDBX_WRITEMAP | MDBX_UTTERLY_NOSYNC); -#if defined(_WIN32) || defined(_WIN64) - /* Acquire guard in exclusive mode for: - * - to avoid collision between read and write txns around env->me_dbgeo; - * - to avoid attachment of new reading threads (see mdbx_rdt_lock); */ - mdbx_srwlock_AcquireExclusive(&env->me_remap_guard); - mdbx_handle_array_t *suspended = NULL; - mdbx_handle_array_t array_onstack; - int rc = MDBX_SUCCESS; + if (mode >= impilict_shrink) + mresize_flags |= MDBX_SHRINK_ALLOWED; + if (limit_bytes == env->me_dxb_mmap.limit && size_bytes == env->me_dxb_mmap.current && size_bytes == env->me_dxb_mmap.filesize) goto bailout; - if ((env->me_flags & MDBX_NOTLS) == 0) { +#if defined(_WIN32) || defined(_WIN64) + if ((env->me_flags & MDBX_NOTLS) == 0 && + ((size_bytes < env->me_dxb_mmap.current && mode > implicit_grow) || + limit_bytes != env->me_dxb_mmap.limit)) 
{ /* 1) Windows allows only extending a read-write section, but not a * corresponding mapped view. Therefore in other cases we must suspend * the local threads for safe remap. @@ -6216,83 +6083,88 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, array_onstack.limit = ARRAY_LENGTH(array_onstack.handles); array_onstack.count = 0; suspended = &array_onstack; - rc = mdbx_suspend_threads_before_remap(env, &suspended); + rc = osal_suspend_threads_before_remap(env, &suspended); if (rc != MDBX_SUCCESS) { - mdbx_error("failed suspend-for-remap: errcode %d", rc); + ERROR("failed suspend-for-remap: errcode %d", rc); goto bailout; } - mresize_flags |= implicit ? MDBX_MRESIZE_MAY_UNMAP - : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; + mresize_flags |= (mode < explicit_resize) + ? MDBX_MRESIZE_MAY_UNMAP + : MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; } #else /* Windows */ - /* Acquire guard to avoid collision between read and write txns - * around env->me_dbgeo */ - int rc = mdbx_fastmutex_acquire(&env->me_remap_guard); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - if (limit_bytes == env->me_dxb_mmap.limit && - size_bytes == env->me_dxb_mmap.current) - goto bailout; - MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (limit_bytes != env->me_dxb_mmap.limit && !(env->me_flags & MDBX_NOTLS) && - lck && !implicit) { - int err = mdbx_rdt_lock(env) /* lock readers table until remap done */; - if (unlikely(MDBX_IS_ERROR(err))) { - rc = err; - goto bailout; - } - - /* looking for readers from this process */ - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - mdbx_assert(env, !implicit); + if (mode == explicit_resize && limit_bytes != env->me_dxb_mmap.limit && + !(env->me_flags & MDBX_NOTLS)) { mresize_flags |= MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE; - for (unsigned i = 0; i < snap_nreaders; ++i) { - if (lck->mti_readers[i].mr_pid.weak == env->me_pid && - lck->mti_readers[i].mr_tid.weak != 
mdbx_thread_self()) { - /* the base address of the mapping can't be changed since - * the other reader thread from this process exists. */ - mdbx_rdt_unlock(env); - mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); - break; + if (lck) { + int err = osal_rdt_lock(env) /* lock readers table until remap done */; + if (unlikely(MDBX_IS_ERROR(err))) { + rc = err; + goto bailout; + } + + /* looking for readers from this process */ + const size_t snap_nreaders = + atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); + eASSERT(env, mode == explicit_resize); + for (size_t i = 0; i < snap_nreaders; ++i) { + if (lck->mti_readers[i].mr_pid.weak == env->me_pid && + lck->mti_readers[i].mr_tid.weak != osal_thread_self()) { + /* the base address of the mapping can't be changed since + * the other reader thread from this process exists. */ + osal_rdt_unlock(env); + mresize_flags &= ~(MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE); + break; + } } } } #endif /* ! Windows */ - if ((env->me_flags & MDBX_WRITEMAP) && env->me_lck->mti_unsynced_pages.weak) { + const pgno_t aligned_munlock_pgno = + (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) + ? 
0 + : bytes2pgno(env, size_bytes); + if (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) { + mincore_clean_cache(env); + if ((env->me_flags & MDBX_WRITEMAP) && + env->me_lck->mti_unsynced_pages.weak) { #if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), - MDBX_SYNC_NONE); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, used_pgno), + MDBX_SYNC_NONE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } } + munlock_after(env, aligned_munlock_pgno, size_bytes); #if MDBX_ENABLE_MADVISE - if (size_bytes < prev_size) { - mdbx_notice("resize-MADV_%s %u..%u", - (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", - size_pgno, bytes2pgno(env, prev_size)); + if (size_bytes < prev_size && mode > implicit_grow) { + NOTICE("resize-MADV_%s %u..%u", + (env->me_flags & MDBX_WRITEMAP) ? "REMOVE" : "DONTNEED", size_pgno, + bytes2pgno(env, prev_size)); + const uint32_t munlocks_before = + atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); rc = MDBX_RESULT_TRUE; #if defined(MADV_REMOVE) if (env->me_flags & MDBX_WRITEMAP) - rc = - madvise(env->me_map + size_bytes, prev_size - size_bytes, MADV_REMOVE) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, + MADV_REMOVE) + ? ignore_enosys(errno) + : MDBX_SUCCESS; #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = madvise(env->me_map + size_bytes, prev_size - size_bytes, + rc = madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, MADV_DONTNEED) ? 
ignore_enosys(errno) : MDBX_SUCCESS; #elif defined(POSIX_MADV_DONTNEED) if (rc == MDBX_RESULT_TRUE) - rc = ignore_enosys(posix_madvise(env->me_map + size_bytes, + rc = ignore_enosys(posix_madvise(ptr_disp(env->me_map, size_bytes), prev_size - size_bytes, POSIX_MADV_DONTNEED)); #elif defined(POSIX_FADV_DONTNEED) @@ -6301,37 +6173,66 @@ __cold static int mdbx_mapresize(MDBX_env *env, const pgno_t used_pgno, prev_size - size_bytes, POSIX_FADV_DONTNEED)); #endif /* MADV_DONTNEED */ - if (unlikely(MDBX_IS_ERROR(rc))) - goto bailout; - if (env->me_lck->mti_discarded_tail.weak > size_pgno) + if (unlikely(MDBX_IS_ERROR(rc))) { + const uint32_t mlocks_after = + atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); + if (rc == MDBX_EINVAL) { + const int severity = + (mlocks_after - munlocks_before) ? MDBX_LOG_NOTICE : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log(severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "resize", rc, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "mresize", "DONTNEED", size_bytes, prev_size - size_bytes, + mlocks_after, munlocks_before, rc); + goto bailout; + } + } else env->me_lck->mti_discarded_tail.weak = size_pgno; } #endif /* MDBX_ENABLE_MADVISE */ - rc = mdbx_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); + rc = osal_mresize(mresize_flags, &env->me_dxb_mmap, size_bytes, limit_bytes); #if MDBX_ENABLE_MADVISE if (rc == MDBX_SUCCESS) { + eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->me_dxb_mmap.current); env->me_lck->mti_discarded_tail.weak = size_pgno; const bool readahead = !(env->me_flags & MDBX_NORDAHEAD) && mdbx_is_readahead_reasonable(size_bytes, -(intptr_t)prev_size); const bool 
force = limit_bytes != prev_limit || - env->me_dxb_mmap.address != prev_addr + env->me_dxb_mmap.base != prev_map #if defined(_WIN32) || defined(_WIN64) || prev_size > size_bytes #endif /* Windows */ ; - rc = mdbx_set_readahead(env, size_pgno, readahead, force); + rc = set_readahead(env, size_pgno, readahead, force); } #endif /* MDBX_ENABLE_MADVISE */ bailout: if (rc == MDBX_SUCCESS) { - mdbx_assert(env, size_bytes <= env->me_dxb_mmap.filesize); - mdbx_assert(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, limit_bytes == env->me_dxb_mmap.limit); + eASSERT(env, size_bytes <= env->me_dxb_mmap.filesize); + if (mode == explicit_resize) + eASSERT(env, size_bytes == env->me_dxb_mmap.current); + else + eASSERT(env, size_bytes <= env->me_dxb_mmap.current); + /* update env-geo to avoid influences */ + env->me_dbgeo.now = env->me_dxb_mmap.current; + env->me_dbgeo.upper = env->me_dxb_mmap.limit; + adjust_defaults(env); #ifdef MDBX_USE_VALGRIND - if (prev_limit != env->me_dxb_mmap.limit || prev_addr != env->me_map) { + if (prev_limit != env->me_dxb_mmap.limit || prev_map != env->me_map) { VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = 0; if (env->me_dxb_mmap.limit) @@ -6341,17 +6242,17 @@ bailout: #endif /* MDBX_USE_VALGRIND */ } else { if (rc != MDBX_UNABLE_EXTEND_MAPSIZE && rc != MDBX_EPERM) { - mdbx_error("failed resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); + ERROR("failed resize datafile/mapping: " + "present %" PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); } else { - mdbx_warning("unable resize datafile/mapping: " - "present %" PRIuPTR " -> %" PRIuPTR ", " - "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", - prev_size, size_bytes, prev_limit, limit_bytes, rc); + WARNING("unable resize datafile/mapping: " + "present %" 
PRIuPTR " -> %" PRIuPTR ", " + "limit %" PRIuPTR " -> %" PRIuPTR ", errcode %d", + prev_size, size_bytes, prev_limit, limit_bytes, rc); } - if (!env->me_dxb_mmap.address) { + if (!env->me_dxb_mmap.base) { env->me_flags |= MDBX_FATAL_ERROR; if (env->me_txn) env->me_txn->mt_flags |= MDBX_TXN_ERROR; @@ -6361,115 +6262,578 @@ bailout: #if defined(_WIN32) || defined(_WIN64) int err = MDBX_SUCCESS; - mdbx_srwlock_ReleaseExclusive(&env->me_remap_guard); + osal_srwlock_ReleaseExclusive(&env->me_remap_guard); if (suspended) { - err = mdbx_resume_threads_after_remap(suspended); + err = osal_resume_threads_after_remap(suspended); if (suspended != &array_onstack) - mdbx_free(suspended); + osal_free(suspended); } #else if (env->me_lck_mmap.lck && (mresize_flags & (MDBX_MRESIZE_MAY_UNMAP | MDBX_MRESIZE_MAY_MOVE)) != 0) - mdbx_rdt_unlock(env); - int err = mdbx_fastmutex_release(&env->me_remap_guard); + osal_rdt_unlock(env); + int err = osal_fastmutex_release(&env->me_remap_guard); #endif /* Windows */ if (err != MDBX_SUCCESS) { - mdbx_fatal("failed resume-after-remap: errcode %d", err); + FATAL("failed resume-after-remap: errcode %d", err); return MDBX_PANIC; } return rc; } -__cold static int mdbx_mapresize_implicit(MDBX_env *env, const pgno_t used_pgno, - const pgno_t size_pgno, - const pgno_t limit_pgno) { - const pgno_t mapped_pgno = bytes2pgno(env, env->me_dxb_mmap.limit); - mdbx_assert(env, mapped_pgno >= used_pgno); - return mdbx_mapresize( - env, used_pgno, size_pgno, - (size_pgno > mapped_pgno) - ? limit_pgno - : /* The actual mapsize may be less since the geo.upper may be changed - by other process. So, avoids remapping until it necessary. 
*/ - mapped_pgno, - true); -} +static int meta_unsteady(int err, MDBX_env *env, const txnid_t early_than, + const pgno_t pgno) { + MDBX_meta *const meta = METAPAGE(env, pgno); + const txnid_t txnid = constmeta_txnid(meta); + if (unlikely(err != MDBX_SUCCESS) || !META_IS_STEADY(meta) || + !(txnid < early_than)) + return err; -static int mdbx_meta_unsteady(MDBX_env *env, const txnid_t last_steady, - MDBX_meta *const meta, mdbx_filehandle_t fd) { + WARNING("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, txnid, pgno); const uint64_t wipe = MDBX_DATASIGN_NONE; - if (unlikely(META_IS_STEADY(meta)) && - constmeta_txnid(env, meta) <= last_steady) { - mdbx_warning("wipe txn #%" PRIaTXN ", meta %" PRIaPGNO, last_steady, - data_page(meta)->mp_pgno); - if (env->me_flags & MDBX_WRITEMAP) - unaligned_poke_u64(4, meta->mm_datasync_sign, wipe); - else - return mdbx_pwrite(fd, &wipe, sizeof(meta->mm_datasync_sign), - (uint8_t *)&meta->mm_datasync_sign - env->me_map); - if (constmeta_txnid(env, meta) == last_steady) - mdbx_assert(env, meta_checktxnid(env, meta, true)); + const void *ptr = &wipe; + size_t bytes = sizeof(meta->mm_sign), + offset = ptr_dist(&meta->mm_sign, env->me_map); + if (env->me_flags & MDBX_WRITEMAP) { + unaligned_poke_u64(4, meta->mm_sign, wipe); + osal_flush_incoherent_cpu_writeback(); + if (!MDBX_AVOID_MSYNC) { + err = + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + return err; + } + ptr = data_page(meta); + offset = ptr_dist(ptr, env->me_map); + bytes = env->me_psize; } - return MDBX_SUCCESS; -} -__cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? 
env->me_dsync_fd - : env->me_lazy_fd; - int err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 0), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 1), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - err = mdbx_meta_unsteady(env, last_steady, METAPAGE(env, 2), fd); - if (unlikely(err != MDBX_SUCCESS)) - return err; - - if (env->me_flags & MDBX_WRITEMAP) { - mdbx_flush_incoherent_cpu_writeback(); - err = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } else { - if (fd == env->me_lazy_fd) { -#if MDBX_USE_SYNCFILERANGE - static bool syncfilerange_unavailable; - if (!syncfilerange_unavailable && - sync_file_range(env->me_lazy_fd, 0, pgno2bytes(env, NUM_METAS), - SYNC_FILE_RANGE_WRITE | SYNC_FILE_RANGE_WAIT_AFTER)) { - err = errno; - if (ignore_enosys(err) == MDBX_RESULT_TRUE) - syncfilerange_unavailable = true; - } - if (syncfilerange_unavailable) -#endif /* MDBX_USE_SYNCFILERANGE */ - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); + err = osal_pwrite(env->me_fd4meta, ptr, bytes, offset); + if (likely(err == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ } + return err; +} + +__cold static int wipe_steady(MDBX_txn *txn, txnid_t last_steady) { + MDBX_env *const env = txn->mt_env; + int err = MDBX_SUCCESS; + + /* early than last_steady */ + err = meta_unsteady(err, env, last_steady, 0); + err = meta_unsteady(err, env, last_steady, 1); + err = meta_unsteady(err, env, last_steady, 2); + + /* the last_steady */ + err = meta_unsteady(err, env, last_steady + 1, 0); + err 
= meta_unsteady(err, env, last_steady + 1, 1); + err = meta_unsteady(err, env, last_steady + 1, 2); + + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); /* force oldest refresh */ atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); - return MDBX_SUCCESS; + + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + txn->tw.troika = meta_tap(env); + for (MDBX_txn *scan = txn->mt_env->me_txn0; scan; scan = scan->mt_child) + if (scan != txn) + scan->tw.troika = txn->tw.troika; + return err; } +//------------------------------------------------------------------------------ + +MDBX_MAYBE_UNUSED __hot static pgno_t * +scan4seq_fallback(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING + assert(range[-1] == len); + const pgno_t *const detent = range + len - seq; + const ptrdiff_t offset = (ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[offset + 0] - range[0]; + const pgno_t diff1 = range[offset + 1] - range[1]; + const pgno_t diff2 = range[offset + 2] - range[2]; + const pgno_t diff3 = range[offset + 3] - range[3]; + if (diff0 == target) + return range + 0; + if (diff1 == target) + return range + 1; + if (diff2 == target) + return range + 2; + if (diff3 == target) + return range + 3; + range += 4; + } while (range + 3 < detent); + if (range == detent) + return nullptr; + } + do + if (range[offset] - *range == target) + return range; + while (++range < detent); +#else + assert(range[-(ptrdiff_t)len] == len); + const pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + if (likely(len > seq + 3)) { + do { + const pgno_t diff0 = range[-0] - range[offset - 0]; + const pgno_t diff1 = range[-1] - range[offset - 1]; + const pgno_t diff2 = range[-2] - range[offset - 2]; + const pgno_t diff3 = range[-3] - 
range[offset - 3]; + /* Смысл вычислений до ветвлений в том, чтобы позволить компилятору + * загружать и вычислять все значения параллельно. */ + if (diff0 == target) + return range - 0; + if (diff1 == target) + return range - 1; + if (diff2 == target) + return range - 2; + if (diff3 == target) + return range - 3; + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + do + if (*range - range[offset] == target) + return range; + while (--range > detent); +#endif /* MDBX_PNL sort-order */ + return nullptr; +} + +MDBX_MAYBE_UNUSED static const pgno_t *scan4range_checker(const MDBX_PNL pnl, + const size_t seq) { + size_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pnl); +#if MDBX_PNL_ASCENDING + while (seq <= MDBX_PNL_GETSIZE(pnl) - begin) { + if (pnl[begin + seq] - pnl[begin] == seq) + return pnl + begin; + ++begin; + } +#else + while (begin > seq) { + if (pnl[begin - seq] - pnl[begin] == seq) + return pnl + begin; + --begin; + } +#endif /* MDBX_PNL sort-order */ + return nullptr; +} + +#if defined(_MSC_VER) && !defined(__builtin_clz) && \ + !__has_builtin(__builtin_clz) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clz(uint32_t value) { + unsigned long index; + _BitScanReverse(&index, value); + return 31 - index; +} +#endif /* _MSC_VER */ + +#if defined(_MSC_VER) && !defined(__builtin_clzl) && \ + !__has_builtin(__builtin_clzl) +MDBX_MAYBE_UNUSED static __always_inline size_t __builtin_clzl(size_t value) { + unsigned long index; +#ifdef _WIN64 + assert(sizeof(value) == 8); + _BitScanReverse64(&index, value); + return 63 - index; +#else + assert(sizeof(value) == 4); + _BitScanReverse(&index, value); + return 31 - index; +#endif +} +#endif /* _MSC_VER */ + +#if !MDBX_PNL_ASCENDING + +#if !defined(MDBX_ATTRIBUTE_TARGET) && \ + (__has_attribute(__target__) || __GNUC_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET(target) __attribute__((__target__(target))) +#endif /* MDBX_ATTRIBUTE_TARGET */ + +#if defined(__SSE2__) 
+#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(__amd64__) +#define __SSE2__ +#define MDBX_ATTRIBUTE_TARGET_SSE2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_SSE2 MDBX_ATTRIBUTE_TARGET("sse2") +#endif /* __SSE2__ */ + +#if defined(__AVX2__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) +#define MDBX_ATTRIBUTE_TARGET_AVX2 MDBX_ATTRIBUTE_TARGET("avx2") +#endif /* __AVX2__ */ + +#if defined(__AVX512BW__) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW /* nope */ +#elif defined(MDBX_ATTRIBUTE_TARGET) && defined(__ia32__) && \ + (__GNUC_PREREQ(6, 0) || __CLANG_PREREQ(5, 0)) +#define MDBX_ATTRIBUTE_TARGET_AVX512BW MDBX_ATTRIBUTE_TARGET("avx512bw") +#endif /* __AVX512BW__ */ + +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 +MDBX_ATTRIBUTE_TARGET_SSE2 static __always_inline unsigned +diffcmp2mask_sse2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m128i pattern) { + const __m128i f = _mm_loadu_si128((const __m128i *)ptr); + const __m128i l = _mm_loadu_si128((const __m128i *)(ptr + offset)); + const __m128i cmp = _mm_cmpeq_epi32(_mm_sub_epi32(f, l), pattern); + return _mm_movemask_ps(*(const __m128 *)&cmp); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_SSE2 static pgno_t * +scan4seq_sse2(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m128i pattern = _mm_set1_epi32(target); + uint8_t mask; + if (likely(len > seq + 3)) { + do { + mask = (uint8_t)diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return range + 28 - 
__builtin_clz(mask); + } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = 0xF << extra; + mask &= diffcmp2mask_sse2(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 +MDBX_ATTRIBUTE_TARGET_AVX2 static __always_inline unsigned +diffcmp2mask_avx2(const pgno_t *const ptr, const ptrdiff_t offset, + const __m256i pattern) { + const __m256i f = _mm256_loadu_si256((const __m256i *)ptr); + const __m256i l = _mm256_loadu_si256((const __m256i *)(ptr + offset)); + const __m256i cmp = _mm256_cmpeq_epi32(_mm256_sub_epi32(f, l), pattern); + return _mm256_movemask_ps(*(const __m256 *)&cmp); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX2 static pgno_t * +scan4seq_avx2(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m256i pattern = _mm256_set1_epi32(target); + uint8_t mask; + if 
(likely(len > seq + 7)) { + do { + mask = (uint8_t)diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return range + 24 - __builtin_clz(mask); + } + range -= 8; + } while (range > detent + 7); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 28 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xfe0 /* enough for '-31' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 8 - range); + assert(extra > 0 && extra < 8); + mask = 0xFF << extra; + mask &= diffcmp2mask_avx2(range - 7, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + if (range - 3 > detent) { + mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; + } + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ + +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW +MDBX_ATTRIBUTE_TARGET_AVX512BW static __always_inline unsigned +diffcmp2mask_avx512bw(const pgno_t *const ptr, const ptrdiff_t offset, + const __m512i pattern) { + const __m512i f = _mm512_loadu_si512((const __m512i *)ptr); + const __m512i l = _mm512_loadu_si512((const __m512i *)(ptr + offset)); + return _mm512_cmpeq_epi32_mask(_mm512_sub_epi32(f, l), pattern); +} + +MDBX_MAYBE_UNUSED __hot MDBX_ATTRIBUTE_TARGET_AVX512BW static pgno_t * +scan4seq_avx512bw(pgno_t *range, const size_t len, const size_t seq) { + assert(seq > 0 && 
len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const __m512i pattern = _mm512_set1_epi32(target); + unsigned mask; + if (likely(len > seq + 15)) { + do { + mask = diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return range + 16 - __builtin_clz(mask); + } + range -= 16; + } while (range > detent + 15); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 60 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. */ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xfc0 /* enough for '-63' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 16 - range); + assert(extra > 0 && extra < 16); + mask = 0xFFFF << extra; + mask &= diffcmp2mask_avx512bw(range - 15, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + if (range - 7 > detent) { + mask = diffcmp2mask_avx2(range - 7, offset, *(const __m256i *)&pattern); + if (mask) + return range + 24 - __builtin_clz(mask); + range -= 8; + } + if (range - 3 > detent) { + mask = diffcmp2mask_sse2(range - 3, offset, *(const __m128i *)&pattern); + if (mask) + return range + 28 - __builtin_clz(mask); + range -= 4; + } + while (range > detent) { + if (*range - range[offset] == target) + return range; + --range; + } + return nullptr; +} +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + +#if (defined(__ARM_NEON) || 
defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +static __always_inline size_t diffcmp2mask_neon(const pgno_t *const ptr, + const ptrdiff_t offset, + const uint32x4_t pattern) { + const uint32x4_t f = vld1q_u32(ptr); + const uint32x4_t l = vld1q_u32(ptr + offset); + const uint16x4_t cmp = vmovn_u32(vceqq_u32(vsubq_u32(f, l), pattern)); + if (sizeof(size_t) > 7) + return vget_lane_u64(vreinterpret_u64_u16(cmp), 0); + else + return vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(cmp, cmp))), + 0); +} + +__hot static pgno_t *scan4seq_neon(pgno_t *range, const size_t len, + const size_t seq) { + assert(seq > 0 && len > seq); +#if MDBX_PNL_ASCENDING +#error "FIXME: Not implemented" +#endif /* MDBX_PNL_ASCENDING */ + assert(range[-(ptrdiff_t)len] == len); + pgno_t *const detent = range - len + seq; + const ptrdiff_t offset = -(ptrdiff_t)seq; + const pgno_t target = (pgno_t)offset; + const uint32x4_t pattern = vmovq_n_u32(target); + size_t mask; + if (likely(len > seq + 3)) { + do { + mask = diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) { +#ifndef __SANITIZE_ADDRESS__ + found: +#endif /* __SANITIZE_ADDRESS__ */ + return ptr_disp(range, -(__builtin_clzl(mask) >> sizeof(size_t) / 4)); + } + range -= 4; + } while (range > detent + 3); + if (range == detent) + return nullptr; + } + + /* Далее происходит чтение от 4 до 12 лишних байт, которые могут быть не + * только за пределами региона выделенного под PNL, но и пересекать границу + * страницы памяти. Что может приводить как к ошибкам ASAN, так и к падению. + * Поэтому проверяем смещение на странице, а с ASAN всегда страхуемся. 
*/ +#ifndef __SANITIZE_ADDRESS__ + const unsigned on_page_safe_mask = 0xff0 /* enough for '-15' bytes offset */; + if (likely(on_page_safe_mask & (uintptr_t)(range + offset)) && + !RUNNING_ON_VALGRIND) { + const unsigned extra = (unsigned)(detent + 4 - range); + assert(extra > 0 && extra < 4); + mask = (~(size_t)0) << (extra * sizeof(size_t) * 2); + mask &= diffcmp2mask_neon(range - 3, offset, pattern); + if (mask) + goto found; + return nullptr; + } +#endif /* __SANITIZE_ADDRESS__ */ + do + if (*range - range[offset] == target) + return range; + while (--range != detent); + return nullptr; +} +#endif /* __ARM_NEON || __ARM_NEON__ */ + +#if defined(__AVX512BW__) && defined(MDBX_ATTRIBUTE_TARGET_AVX512BW) +#define scan4seq_default scan4seq_avx512bw +#define scan4seq_impl scan4seq_default +#elif defined(__AVX2__) && defined(MDBX_ATTRIBUTE_TARGET_AVX2) +#define scan4seq_default scan4seq_avx2 +#elif defined(__SSE2__) && defined(MDBX_ATTRIBUTE_TARGET_SSE2) +#define scan4seq_default scan4seq_sse2 +#elif (defined(__ARM_NEON) || defined(__ARM_NEON__)) && \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define scan4seq_default scan4seq_neon +/* Choosing of another variants should be added here. */ +#endif /* scan4seq_default */ + +#endif /* MDBX_PNL_ASCENDING */ + +#ifndef scan4seq_default +#define scan4seq_default scan4seq_fallback +#endif /* scan4seq_default */ + +#ifdef scan4seq_impl +/* The scan4seq_impl() is the best or no alternatives */ +#elif !MDBX_HAVE_BUILTIN_CPU_SUPPORTS +/* The scan4seq_default() will be used since no cpu-features detection support + * from compiler. Please don't ask to implement cpuid-based detection and don't + * make such PRs. */ +#define scan4seq_impl scan4seq_default +#else +/* Selecting the most appropriate implementation at runtime, + * depending on the available CPU features. 
*/ +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const size_t seq); +static pgno_t *(*scan4seq_impl)(pgno_t *range, const size_t len, + const size_t seq) = scan4seq_resolver; + +static pgno_t *scan4seq_resolver(pgno_t *range, const size_t len, + const size_t seq) { + pgno_t *(*choice)(pgno_t * range, const size_t len, const size_t seq) = + nullptr; +#if __has_builtin(__builtin_cpu_init) || defined(__BUILTIN_CPU_INIT__) || \ + __GNUC_PREREQ(4, 8) + __builtin_cpu_init(); +#endif /* __builtin_cpu_init() */ +#ifdef MDBX_ATTRIBUTE_TARGET_SSE2 + if (__builtin_cpu_supports("sse2")) + choice = scan4seq_sse2; +#endif /* MDBX_ATTRIBUTE_TARGET_SSE2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX2 + if (__builtin_cpu_supports("avx2")) + choice = scan4seq_avx2; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX2 */ +#ifdef MDBX_ATTRIBUTE_TARGET_AVX512BW + if (__builtin_cpu_supports("avx512bw")) + choice = scan4seq_avx512bw; +#endif /* MDBX_ATTRIBUTE_TARGET_AVX512BW */ + /* Choosing of another variants should be added here. */ + scan4seq_impl = choice ? choice : scan4seq_default; + return scan4seq_impl(range, len, seq); +} +#endif /* scan4seq_impl */ + +//------------------------------------------------------------------------------ + /* Allocate page numbers and memory for writing. Maintain mt_last_reclaimed, - * mt_reclaimed_pglist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. + * mt_relist and mt_next_pgno. Set MDBX_TXN_ERROR on failure. * * If there are free pages available from older transactions, they * are re-used first. Otherwise allocate a new page at mt_next_pgno. - * Do not modify the GC, just merge GC records into mt_reclaimed_pglist + * Do not modify the GC, just merge GC records into mt_relist * and move mt_last_reclaimed to say which records were consumed. Only this - * function can create mt_reclaimed_pglist and move + * function can create mt_relist and move * mt_last_reclaimed/mt_next_pgno. 
* * [in] mc cursor A cursor handle identifying the transaction and @@ -6478,582 +6842,1004 @@ __cold static int mdbx_wipe_steady(MDBX_env *env, const txnid_t last_steady) { * * Returns 0 on success, non-zero on failure.*/ -#define MDBX_ALLOC_CACHE 1 -#define MDBX_ALLOC_GC 2 -#define MDBX_ALLOC_NEW 4 -#define MDBX_ALLOC_SLOT 8 -#define MDBX_ALLOC_FAKE 16 -#define MDBX_ALLOC_NOLOG 32 -#define MDBX_ALLOC_ALL (MDBX_ALLOC_CACHE | MDBX_ALLOC_GC | MDBX_ALLOC_NEW) +#define MDBX_ALLOC_DEFAULT 0 +#define MDBX_ALLOC_RESERVE 1 +#define MDBX_ALLOC_UNIMPORTANT 2 +#define MDBX_ALLOC_COALESCE 4 /* внутреннее состояние */ +#define MDBX_ALLOC_SHOULD_SCAN 8 /* внутреннее состояние */ +#define MDBX_ALLOC_LIFO 16 /* внутреннее состояние */ -__hot static struct page_result mdbx_page_alloc(MDBX_cursor *mc, - const pgno_t num, int flags) { - struct page_result ret; - MDBX_txn *const txn = mc->mc_txn; - MDBX_env *const env = txn->mt_env; - mdbx_assert(env, num == 0 || !(flags & MDBX_ALLOC_SLOT)); - mdbx_assert(env, num > 0 || !(flags & MDBX_ALLOC_NEW)); +static __inline bool is_gc_usable(MDBX_txn *txn, const MDBX_cursor *mc, + const uint8_t flags) { + /* If txn is updating the GC, then the retired-list cannot play catch-up with + * itself by growing while trying to save it. */ + if (mc->mc_dbi == FREE_DBI && !(flags & MDBX_ALLOC_RESERVE) && + !(mc->mc_flags & C_GCU)) + return false; - const unsigned coalesce_threshold = - env->me_maxgc_ov1page - env->me_maxgc_ov1page / 4; - if (likely(flags & MDBX_ALLOC_GC)) { - flags |= env->me_flags & (MDBX_COALESCE | MDBX_LIFORECLAIM); - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) > coalesce_threshold) - flags &= ~MDBX_COALESCE; - if (unlikely( - /* If mc is updating the GC, then the retired-list cannot play - catch-up with itself by growing while trying to save it. 
*/ - (mc->mc_flags & C_RECLAIMING) || - /* avoid (recursive) search inside empty tree and while tree is - updating, https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ - txn->mt_dbs[FREE_DBI].md_entries == 0 || - /* If our dirty list is already full, we can't touch GC */ - (txn->tw.dirtyroom < txn->mt_dbs[FREE_DBI].md_depth && - !(txn->mt_dbistate[FREE_DBI] & DBI_DIRTY)))) - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + /* avoid search inside empty tree and while tree is updating, + https://libmdbx.dqdkfa.ru/dead-github/issues/31 */ + if (unlikely(txn->mt_dbs[FREE_DBI].md_entries == 0)) { + txn->mt_flags |= MDBX_TXN_DRAINED_GC; + return false; } - if (likely(num == 1 && (flags & MDBX_ALLOC_CACHE) != 0)) { - /* If there are any loose pages, just use them */ - mdbx_assert(env, (flags & MDBX_ALLOC_SLOT) == 0); - if (likely(txn->tw.loose_pages)) { -#if MDBX_ENABLE_REFUND - if (txn->tw.loose_refund_wl > txn->mt_next_pgno) { - mdbx_refund(txn); - if (unlikely(!txn->tw.loose_pages)) - goto no_loose; - } -#endif /* MDBX_ENABLE_REFUND */ + return true; +} - ret.page = txn->tw.loose_pages; - txn->tw.loose_pages = ret.page->mp_next; - txn->tw.loose_count--; - mdbx_debug_extra("db %d use loose page %" PRIaPGNO, DDBI(mc), - ret.page->mp_pgno); - mdbx_tassert(txn, ret.page->mp_pgno < txn->mt_next_pgno); - mdbx_ensure(env, ret.page->mp_pgno >= NUM_METAS); - VALGRIND_MAKE_MEM_UNDEFINED(page_data(ret.page), page_space(txn->mt_env)); - MDBX_ASAN_UNPOISON_MEMORY_REGION(page_data(ret.page), - page_space(txn->mt_env)); - ret.page->mp_txnid = txn->mt_front; - ret.err = MDBX_SUCCESS; - return ret; - } - } -#if MDBX_ENABLE_REFUND -no_loose: -#endif /* MDBX_ENABLE_REFUND */ +__hot static bool is_already_reclaimed(const MDBX_txn *txn, txnid_t id) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); + for (size_t i = 1; i <= len; ++i) + if (txn->tw.lifo_reclaimed[i] == id) + return true; + return false; +} - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - 
txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - pgno_t pgno, *re_list = txn->tw.reclaimed_pglist; - unsigned range_begin = 0, re_len = MDBX_PNL_SIZE(re_list); - txnid_t oldest = 0, last = 0; +__hot static pgno_t relist_get_single(MDBX_txn *txn) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + assert(len > 0); + pgno_t *target = MDBX_PNL_EDGE(txn->tw.relist); + const ptrdiff_t dir = MDBX_PNL_ASCENDING ? 1 : -1; - while (true) { /* hsr-kick retry loop */ - MDBX_cursor_couple recur; - for (MDBX_cursor_op op = MDBX_FIRST;; - op = (flags & MDBX_LIFORECLAIM) ? MDBX_PREV : MDBX_NEXT) { - MDBX_val key, data; + /* Есть ТРИ потенциально выигрышные, но противо-направленные тактики: + * + * 1. Стараться использовать страницы с наименьшими номерами. Так обмен с + * диском будет более кучным, а у страниц ближе к концу БД будет больше шансов + * попасть под авто-компактификацию. Частично эта тактика уже реализована, но + * для её эффективности требуется явно приоритезировать выделение страниц: + * - поддерживать для relist, для ближних и для дальних страниц; + * - использовать страницы из дальнего списка, если первый пуст, + * а второй слишком большой, либо при пустой GC. + * + * 2. Стараться выделять страницы последовательно. Так записываемые на диск + * регионы будут линейными, что принципиально ускоряет запись на HDD. + * Одновременно, в среднем это не повлияет на чтение, точнее говоря, если + * порядок чтения не совпадает с порядком изменения (иначе говоря, если + * чтение не коррелирует с обновлениями и/или вставками) то не повлияет, иначе + * может ускорить. Однако, последовательности в среднем достаточно редки. + * Поэтому для эффективности требуется аккумулировать и поддерживать в ОЗУ + * огромные списки страниц, а затем сохранять их обратно в БД. Текущий формат + * БД (без битовых карт) для этого крайне не удачен. Поэтому эта тактика не + * имеет шансов быть успешной без смены формата БД (Mithril). + * + * 3. Стараться экономить последовательности страниц.
Это позволяет избегать + лишнего чтения/поиска в GC при более-менее постоянном размещении и/или + обновлении данных требующих более одной страницы. Проблема в том, что без + информации от приложения библиотека не может знать насколько + востребованными будут последовательности в ближайшей перспективе, а + экономия последовательностей "на всякий случай" не только затратна + сама-по-себе, но и работает во вред. + * + * Поэтому: + - в TODO добавляется разделение relist на «ближние» и «дальние» страницы, + с последующей реализацией первой тактики; + - преимущественное использование последовательностей отправляется + в MithrilDB как составляющая "HDD friendly" feature; + - реализованная в 3757eb72f7c6b46862f8f17881ac88e8cecc1979 экономия + последовательностей отключается через MDBX_ENABLE_SAVING_SEQUENCES=0. + * + * В качестве альтернативы для безусловной «экономии» последовательностей, + в следующих версиях libmdbx, вероятно, будет предложено + API для взаимодействия с GC: + - получение размера GC, включая гистограммы размеров последовательностей + и близости к концу БД; + - включение формирования "линейного запаса" для последующего использования + в рамках текущей транзакции; + - намеренная загрузка GC в память для коагуляции и "выпрямления"; + - намеренное копирование данных из страниц в конце БД для последующего + их освобождения, т.е. контролируемая компактификация по запросу. */ - /* Seek a big enough contiguous page range. - * Prefer pages with lower pgno. */ - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno)); - if (!(flags & (MDBX_COALESCE | MDBX_ALLOC_SLOT)) && re_len >= num) { - mdbx_tassert(txn, MDBX_PNL_LAST(re_list) < txn->mt_next_pgno && - MDBX_PNL_FIRST(re_list) < txn->mt_next_pgno); - range_begin = MDBX_PNL_ASCENDING ?
1 : re_len; - pgno = MDBX_PNL_LEAST(re_list); - if (likely(num == 1)) - goto done; - - const unsigned wanna_range = num - 1; +#ifndef MDBX_ENABLE_SAVING_SEQUENCES +#define MDBX_ENABLE_SAVING_SEQUENCES 0 +#endif + if (MDBX_ENABLE_SAVING_SEQUENCES && unlikely(target[dir] == *target + 1) && + len > 2) { + /* Пытаемся пропускать последовательности при наличии одиночных элементов. + * TODO: необходимо кэшировать пропускаемые последовательности + * чтобы не сканировать список сначала при каждом выделении. */ + pgno_t *scan = target + dir + dir; + size_t left = len; + do { + if (likely(scan[-dir] != *scan - 1 && *scan + 1 != scan[dir])) { #if MDBX_PNL_ASCENDING - mdbx_tassert(txn, pgno == re_list[1] && range_begin == 1); - while (true) { - unsigned range_end = range_begin + wanna_range; - if (re_list[range_end] - pgno == wanna_range) - goto done; - if (range_end == re_len) - break; - pgno = re_list[++range_begin]; - } -#else - mdbx_tassert(txn, pgno == re_list[re_len] && range_begin == re_len); - while (true) { - if (re_list[range_begin - wanna_range] - pgno == wanna_range) - goto done; - if (--range_begin == wanna_range) - break; - pgno = re_list[range_begin]; - } -#endif /* MDBX_PNL sort-order */ - } - - if (op == MDBX_FIRST) { /* 1st iteration, setup cursor, etc */ - if (unlikely(!(flags & MDBX_ALLOC_GC))) - break /* reclaiming is prohibited for now */; - - /* Prepare to fetch more and coalesce */ - oldest = (flags & MDBX_LIFORECLAIM) - ? 
mdbx_find_oldest(txn) - : atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease); - ret.err = mdbx_cursor_init(&recur.outer, txn, FREE_DBI); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - if (flags & MDBX_LIFORECLAIM) { - /* Begin from oldest reader if any */ - if (oldest > MIN_TXNID) { - last = oldest - 1; - op = MDBX_SET_RANGE; - } - } else if (txn->tw.last_reclaimed) { - /* Continue lookup from txn->tw.last_reclaimed to oldest reader */ - last = txn->tw.last_reclaimed; - op = MDBX_SET_RANGE; - } - - key.iov_base = &last; - key.iov_len = sizeof(last); - } - - if (!(flags & MDBX_LIFORECLAIM)) { - /* Do not try fetch more if the record will be too recent */ - if (op != MDBX_FIRST && ++last >= oldest) { - oldest = mdbx_find_oldest(txn); - if (oldest <= last) - break; - } - } - - ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); - if (ret.err == MDBX_NOTFOUND && (flags & MDBX_LIFORECLAIM)) { - if (op == MDBX_SET_RANGE) - continue; - txnid_t snap = mdbx_find_oldest(txn); - if (oldest < snap) { - oldest = snap; - last = oldest - 1; - key.iov_base = &last; - key.iov_len = sizeof(last); - op = MDBX_SET_RANGE; - ret.err = mdbx_cursor_get(&recur.outer, &key, NULL, op); - } - } - if (unlikely(ret.err)) { - if (ret.err == MDBX_NOTFOUND) - break; - goto fail; - } - - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(key.iov_len != sizeof(txnid_t))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - last = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(last < MIN_TXNID || last > MAX_TXNID)) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - if (oldest <= last) { - oldest = mdbx_find_oldest(txn); - if (oldest <= last) { - if (flags & MDBX_LIFORECLAIM) - continue; - break; - } - } - - if (flags & MDBX_LIFORECLAIM) { - /* skip IDs of records that already reclaimed */ - if (txn->tw.lifo_reclaimed) { - size_t i; - for (i = (size_t)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); i > 0; --i) - if (txn->tw.lifo_reclaimed[i] == last) - break; 
- if (i) - continue; - } - } - - /* Reading next GC record */ - MDBX_page *const mp = recur.outer.mc_pg[recur.outer.mc_top]; - if (unlikely((ret.err = mdbx_node_read( - &recur.outer, - page_node(mp, recur.outer.mc_ki[recur.outer.mc_top]), - &data, pp_txnid4chk(mp, txn))) != MDBX_SUCCESS)) - goto fail; - - if ((flags & MDBX_LIFORECLAIM) && !txn->tw.lifo_reclaimed) { - txn->tw.lifo_reclaimed = mdbx_txl_alloc(); - if (unlikely(!txn->tw.lifo_reclaimed)) { - ret.err = MDBX_ENOMEM; - goto fail; - } - } - - /* Append PNL from GC record to tw.reclaimed_pglist */ - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); - pgno_t *gc_pnl = (pgno_t *)data.iov_base; - mdbx_tassert(txn, data.iov_len >= MDBX_PNL_SIZEOF(gc_pnl)); - if (unlikely(data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || - !mdbx_pnl_check(gc_pnl, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - const unsigned gc_len = MDBX_PNL_SIZE(gc_pnl); - if (unlikely(/* list is too long already */ MDBX_PNL_SIZE( - txn->tw.reclaimed_pglist) >= - env->me_options.rp_augment_limit) && - ((/* not a slot-request from gc-update */ - (flags & MDBX_ALLOC_SLOT) == 0 && - /* have enough unallocated space */ txn->mt_geo.upper >= - txn->mt_next_pgno + (size_t)num) || - gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) >= - MDBX_PGL_LIMIT)) { - /* Stop reclaiming to avoid overflow the page list. - * This is a rare case while search for a continuously multi-page region - * in a large database. 
- * https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ - mdbx_notice("stop reclaiming to avoid PNL overflow: %u (current) + %u " - "(chunk) -> %u", - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), gc_len, - gc_len + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); - flags &= ~(MDBX_ALLOC_GC | MDBX_COALESCE); + target = scan; break; +#else + /* вырезаем элемент с перемещением хвоста */ + const pgno_t pgno = *scan; + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + while (++scan <= target) + scan[-1] = *scan; + return pgno; +#endif } - ret.err = mdbx_pnl_need(&txn->tw.reclaimed_pglist, gc_len); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - re_list = txn->tw.reclaimed_pglist; - - /* Remember ID of GC record */ - if (flags & MDBX_LIFORECLAIM) { - ret.err = mdbx_txl_append(&txn->tw.lifo_reclaimed, last); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - } - txn->tw.last_reclaimed = last; - - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - mdbx_debug_extra("PNL read txn %" PRIaTXN " root %" PRIaPGNO - " num %u, PNL", - last, txn->mt_dbs[FREE_DBI].md_root, gc_len); - for (unsigned i = gc_len; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, gc_pnl[i]); - mdbx_debug_extra_print("%s\n", "."); - } - - /* Merge in descending sorted order */ - const unsigned prev_re_len = MDBX_PNL_SIZE(re_list); - mdbx_pnl_xmerge(re_list, gc_pnl); - /* re-check to avoid duplicates */ - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(!mdbx_pnl_check(re_list, txn->mt_next_pgno))) { - ret.err = MDBX_CORRUPTED; - goto fail; - } - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - - re_len = MDBX_PNL_SIZE(re_list); - mdbx_tassert(txn, re_len == 0 || re_list[re_len] < txn->mt_next_pgno); - if (MDBX_ENABLE_REFUND && re_len && - unlikely(MDBX_PNL_MOST(re_list) == txn->mt_next_pgno - 1)) { - /* Refund suitable pages into "unallocated" space */ - mdbx_refund(txn); - re_list = txn->tw.reclaimed_pglist; - re_len = MDBX_PNL_SIZE(re_list); - } - - /* Done for a kick-reclaim mode, actually no page needed */ - if 
(unlikely(flags & MDBX_ALLOC_SLOT)) { - mdbx_debug("early-return NULL-page for %s mode", "MDBX_ALLOC_SLOT"); - ret.err = MDBX_SUCCESS; - ret.page = NULL; - return ret; - } - - /* Don't try to coalesce too much. */ - if (flags & MDBX_COALESCE) { - if (re_len /* current size */ > coalesce_threshold || - (re_len > prev_re_len && - re_len - prev_re_len /* delta from prev */ >= - coalesce_threshold / 2)) { - mdbx_trace("clear %s %s", "MDBX_COALESCE", "since got threshold"); - flags &= ~MDBX_COALESCE; - } - } - } - - if (F_ISSET(flags, MDBX_COALESCE | MDBX_ALLOC_GC)) { - mdbx_debug_extra("clear %s and continue", "MDBX_COALESCE"); - flags &= ~MDBX_COALESCE; - continue; - } - - /* There is no suitable pages in the GC and to be able to allocate - * we should CHOICE one of: - * - make a new steady checkpoint if reclaiming was stopped by - * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; - * - kick lagging reader(s) if reclaiming was stopped by ones of it. - * - extend the database file. */ - - /* Will use new pages from the map if nothing is suitable in the GC. */ - range_begin = 0; - pgno = txn->mt_next_pgno; - const size_t next = (size_t)pgno + num; - - if (flags & MDBX_ALLOC_GC) { - const MDBX_meta *const head = constmeta_prefer_last(env); - const MDBX_meta *const steady = constmeta_prefer_steady(env); - /* does reclaiming stopped at the last steady point? 
*/ - if (head != steady && META_IS_STEADY(steady) && - oldest == constmeta_txnid(env, steady)) { - mdbx_debug("gc-kick-steady: head %" PRIaTXN "-%s, tail %" PRIaTXN - "-%s, oldest %" PRIaTXN, - constmeta_txnid(env, head), mdbx_durable_str(head), - constmeta_txnid(env, steady), mdbx_durable_str(steady), - oldest); - ret.err = MDBX_RESULT_TRUE; - const pgno_t autosync_threshold = - atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); - const uint64_t autosync_period = - atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); - /* wipe the last steady-point if one of: - * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified - * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted - * otherwise, make a new steady-point if one of: - * - auto-sync threshold is specified and reached; - * - upper limit of database size is reached; - * - database is full (with the current file size) - * AND auto-sync threshold it NOT specified */ - if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && - ((autosync_threshold | autosync_period) == 0 || - next >= steady->mm_geo.now)) { - /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode - * without any auto-sync threshold(s). */ - ret.err = mdbx_wipe_steady(env, oldest); - mdbx_debug("gc-wipe-steady, rc %d", ret.err); - mdbx_assert(env, steady != meta_prefer_steady(env)); - } else if ((flags & MDBX_ALLOC_NEW) == 0 || - (autosync_threshold && - atomic_load32(&env->me_lck->mti_unsynced_pages, - mo_Relaxed) >= autosync_threshold) || - (autosync_period && - mdbx_osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, - mo_Relaxed) >= - autosync_period) || - next >= txn->mt_geo.upper || - (next >= txn->mt_end_pgno && - (autosync_threshold | autosync_period) == 0)) { - /* make steady checkpoint. 
*/ - MDBX_meta meta = *head; - ret.err = mdbx_sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta); - mdbx_debug("gc-make-steady, rc %d", ret.err); - mdbx_assert(env, steady != meta_prefer_steady(env)); - } - if (ret.err == MDBX_SUCCESS) { - if (mdbx_find_oldest(txn) > oldest) - continue; - /* it is reasonable check/kick lagging reader(s) here, - * since we made a new steady point or wipe the last. */ - if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, oldest) > oldest) - continue; - } else if (unlikely(ret.err != MDBX_RESULT_TRUE)) - goto fail; - } - } - - /* don't kick lagging reader(s) if is enough unallocated space - * at the end of database file. */ - if ((flags & MDBX_ALLOC_NEW) && next <= txn->mt_end_pgno) - goto done; - if ((flags & MDBX_ALLOC_GC) && oldest < txn->mt_txnid - xMDBX_TXNID_STEP && - mdbx_kick_longlived_readers(env, oldest) > oldest) - continue; - - ret.err = MDBX_NOTFOUND; - if (flags & MDBX_ALLOC_NEW) { - ret.err = MDBX_MAP_FULL; - if (next < txn->mt_geo.upper && txn->mt_geo.grow_pv) { - mdbx_assert(env, next > txn->mt_end_pgno); - const pgno_t grow_step = pv2pages(txn->mt_geo.grow_pv); - size_t aligned = pgno_align2os_pgno( - env, (pgno_t)(next + grow_step - next % grow_step)); - - if (aligned > txn->mt_geo.upper) - aligned = txn->mt_geo.upper; - mdbx_assert(env, aligned > txn->mt_end_pgno); - - mdbx_verbose("try growth datafile to %zu pages (+%zu)", aligned, - aligned - txn->mt_end_pgno); - ret.err = mdbx_mapresize_implicit(env, txn->mt_next_pgno, - (pgno_t)aligned, txn->mt_geo.upper); - if (ret.err == MDBX_SUCCESS) { - env->me_txn->mt_end_pgno = (pgno_t)aligned; - goto done; - } - - mdbx_error("unable growth datafile to %zu pages (+%zu), errcode %d", - aligned, aligned - txn->mt_end_pgno, ret.err); - } else { - mdbx_notice("gc-alloc: next %zu > upper %" PRIaPGNO, next, - txn->mt_geo.upper); - } - } - - fail: - mdbx_assert(env, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - 
MDBX_ENABLE_REFUND)); - int level; - const char *what; - if (likely(!(flags & MDBX_ALLOC_FAKE))) { - txn->mt_flags |= MDBX_TXN_ERROR; - level = MDBX_LOG_ERROR; - what = "pages"; - } else { - level = (flags & MDBX_ALLOC_NOLOG) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; - what = (flags & MDBX_ALLOC_SLOT) ? "gc-slot/backlog" : "backlog-pages"; - } - if (mdbx_log_enabled(level)) - mdbx_debug_log(level, __func__, __LINE__, - "unable alloc %u %s, flags 0x%x, errcode %d\n", num, what, - flags, ret.err); - - mdbx_assert(env, ret.err != MDBX_SUCCESS); - ret.page = NULL; - return ret; + scan += dir; + } while (--left > 2); } -done: - mdbx_assert(env, !(flags & MDBX_ALLOC_SLOT)); - mdbx_ensure(env, pgno >= NUM_METAS); - if (unlikely(flags & MDBX_ALLOC_FAKE)) { - mdbx_debug("return NULL-page for %u pages %s allocation", num, - "gc-slot/backlog"); - ret.page = NULL; - ret.err = MDBX_SUCCESS; - return ret; + } + const pgno_t pgno = *target; +#if MDBX_PNL_ASCENDING + /* вырезаем элемент с перемещением хвоста */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); + for (const pgno_t *const end = txn->tw.relist + len - 1; target <= end; + ++target) + *target = target[1]; +#else + /* перемещать хвост не нужно, просто усекаем список */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - 1); +#endif + return pgno; +} + +__hot static pgno_t relist_get_sequence(MDBX_txn *txn, const size_t num, + uint8_t flags) { + const size_t len = MDBX_PNL_GETSIZE(txn->tw.relist); + pgno_t *edge = MDBX_PNL_EDGE(txn->tw.relist); + assert(len >= num && num > 1); + const size_t seq = num - 1; +#if !MDBX_PNL_ASCENDING + if (edge[-(ptrdiff_t)seq] - *edge == seq) { + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + assert(edge == scan4range_checker(txn->tw.relist, seq)); + /* перемещать хвост не нужно, просто усекаем список */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); + return *edge; + } +#endif + pgno_t *target = scan4seq_impl(edge, len, seq); + assert(target == scan4range_checker(txn->tw.relist, seq)); + if (target)
{ + if (unlikely(flags & MDBX_ALLOC_RESERVE)) + return P_INVALID; + const pgno_t pgno = *target; + /* вырезаем найденную последовательность с перемещением хвоста */ + MDBX_PNL_SETSIZE(txn->tw.relist, len - num); +#if MDBX_PNL_ASCENDING + for (const pgno_t *const end = txn->tw.relist + len - num; target <= end; + ++target) + *target = target[num]; +#else + for (const pgno_t *const end = txn->tw.relist + len; ++target <= end;) + target[-(ptrdiff_t)num] = *target; +#endif + return pgno; + } + return 0; +} + +#if MDBX_ENABLE_MINCORE +static __inline bool bit_tas(uint64_t *field, char bit) { + const uint64_t m = UINT64_C(1) << bit; + const bool r = (*field & m) != 0; + *field |= m; + return r; +} + +static bool mincore_fetch(MDBX_env *const env, const size_t unit_begin) { + MDBX_lockinfo *const lck = env->me_lck; + for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { + const ptrdiff_t dist = unit_begin - lck->mti_mincore_cache.begin[i]; + if (likely(dist >= 0 && dist < 64)) { + const pgno_t tmp_begin = lck->mti_mincore_cache.begin[i]; + const uint64_t tmp_mask = lck->mti_mincore_cache.mask[i]; + do { + lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; + lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; + } while (--i); + lck->mti_mincore_cache.begin[0] = tmp_begin; + lck->mti_mincore_cache.mask[0] = tmp_mask; + return bit_tas(lck->mti_mincore_cache.mask, (char)dist); + } } + size_t pages = 64; + unsigned unit_log = sys_pagesize_ln2; + unsigned shift = 0; + if (env->me_psize > env->me_os_psize) { + unit_log = env->me_psize2log; + shift = env->me_psize2log - sys_pagesize_ln2; + pages <<= shift; + } + + const size_t offset = unit_begin << unit_log; + size_t length = pages << sys_pagesize_ln2; + if (offset + length > env->me_dxb_mmap.current) { + length = env->me_dxb_mmap.current - offset; + pages = length >> sys_pagesize_ln2; + } + +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.mincore.weak += 1; +#endif 
/* MDBX_ENABLE_PGOP_STAT */ + uint8_t *const vector = alloca(pages); + if (unlikely(mincore(ptr_disp(env->me_dxb_mmap.base, offset), length, + (void *)vector))) { + NOTICE("mincore(+%zu, %zu), err %d", offset, length, errno); + return false; + } + + for (size_t i = 1; i < ARRAY_LENGTH(lck->mti_mincore_cache.begin); ++i) { + lck->mti_mincore_cache.begin[i] = lck->mti_mincore_cache.begin[i - 1]; + lck->mti_mincore_cache.mask[i] = lck->mti_mincore_cache.mask[i - 1]; + } + lck->mti_mincore_cache.begin[0] = unit_begin; + + uint64_t mask = 0; +#ifdef MINCORE_INCORE + STATIC_ASSERT(MINCORE_INCORE == 1); +#endif + for (size_t i = 0; i < pages; ++i) { + uint64_t bit = (vector[i] & 1) == 0; + bit <<= i >> shift; + mask |= bit; + } + + lck->mti_mincore_cache.mask[0] = ~mask; + return bit_tas(lck->mti_mincore_cache.mask, 0); +} +#endif /* MDBX_ENABLE_MINCORE */ + +MDBX_MAYBE_UNUSED static __inline bool mincore_probe(MDBX_env *const env, + const pgno_t pgno) { +#if MDBX_ENABLE_MINCORE + const size_t offset_aligned = + floor_powerof2(pgno2bytes(env, pgno), env->me_os_psize); + const unsigned unit_log2 = (env->me_psize2log > sys_pagesize_ln2) + ? env->me_psize2log + : sys_pagesize_ln2; + const size_t unit_begin = offset_aligned >> unit_log2; + eASSERT(env, (unit_begin << unit_log2) == offset_aligned); + const ptrdiff_t dist = unit_begin - env->me_lck->mti_mincore_cache.begin[0]; + if (likely(dist >= 0 && dist < 64)) + return bit_tas(env->me_lck->mti_mincore_cache.mask, (char)dist); + return mincore_fetch(env, unit_begin); +#else + (void)env; + (void)pgno; + return false; +#endif /* MDBX_ENABLE_MINCORE */ +} + +static __inline pgr_t page_alloc_finalize(MDBX_env *const env, + MDBX_txn *const txn, + const MDBX_cursor *const mc, + const pgno_t pgno, const size_t num) { +#if MDBX_ENABLE_PROFGC + size_t majflt_before; + const uint64_t cputime_before = osal_cputime(&majflt_before); + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? 
&env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; +#else + (void)mc; +#endif /* MDBX_ENABLE_PROFGC */ + ENSURE(env, pgno >= NUM_METAS); + + pgr_t ret; + bool need_clean = (env->me_flags & MDBX_PAGEPERTURB) != 0; if (env->me_flags & MDBX_WRITEMAP) { ret.page = pgno2page(env, pgno); - /* LY: reset no-access flag from mdbx_page_loose() */ - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); MDBX_ASAN_UNPOISON_MEMORY_REGION(ret.page, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + + /* Содержимое выделенной страницы не нужно, но если страница отсутствует + * в ОЗУ (что весьма вероятно), то любое обращение к ней приведет + * к page-fault: + * - прерыванию по отсутствию страницы; + * - переключение контекста в режим ядра с засыпанием процесса; + * - чтение страницы с диска; + * - обновление PTE и пробуждением процесса; + * - переключение контекста по доступности ЦПУ. + * + * Пытаемся минимизировать накладные расходы записывая страницу, что при + * наличии unified page cache приведет к появлению страницы в ОЗУ без чтения + * с диска. При этом запись на диск должна быть отложена адекватным ядром, + * так как страница отображена в память в режиме чтения-записи и следом в + * неё пишет ЦПУ. */ + + /* В случае если страница в памяти процесса, то излишняя запись может быть + * достаточно дорогой. Кроме системного вызова и копирования данных, в особо + * одаренных ОС при этом могут включаться файловая система, выделяться + * временная страница, пополняться очереди асинхронного выполнения, + * обновляться PTE с последующей генерацией page-fault и чтением данных из + * грязной I/O очереди. Из-за этого штраф за лишнюю запись может быть + * сравним с избегаемым ненужным чтением. */ + if (env->me_prefault_write) { + void *const pattern = ptr_disp( + env->me_pbuf, need_clean ? 
env->me_psize : env->me_psize * 2); + size_t file_offset = pgno2bytes(env, pgno); + if (likely(num == 1)) { + if (!mincore_probe(env, pgno)) { + osal_pwrite(env->me_lazy_fd, pattern, env->me_psize, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + need_clean = false; + } + } else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + size_t n = 0, cleared = 0; + for (size_t i = 0; i < num; ++i) { + if (!mincore_probe(env, pgno + (pgno_t)i)) { + ++cleared; + iov[n].iov_len = env->me_psize; + iov[n].iov_base = pattern; + if (unlikely(++n == MDBX_AUXILARY_IOV_MAX)) { + osal_pwritev(env->me_lazy_fd, iov, MDBX_AUXILARY_IOV_MAX, + file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + file_offset += pgno2bytes(env, MDBX_AUXILARY_IOV_MAX); + n = 0; + } + } + } + if (likely(n > 0)) { + osal_pwritev(env->me_lazy_fd, iov, n, file_offset); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.prefault.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + if (cleared == num) + need_clean = false; + } + } } else { - ret.page = mdbx_page_malloc(txn, num); + ret.page = page_malloc(txn, num); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; - goto fail; + goto bailout; } } - if (range_begin) { - mdbx_cassert(mc, (mc->mc_flags & C_GCFREEZE) == 0); - mdbx_tassert(txn, pgno < txn->mt_next_pgno); - mdbx_tassert(txn, pgno == re_list[range_begin]); - /* Cutoff allocated pages from tw.reclaimed_pglist */ -#if MDBX_PNL_ASCENDING - for (unsigned i = range_begin + num; i <= re_len;) - re_list[range_begin++] = re_list[i++]; - MDBX_PNL_SIZE(re_list) = re_len = range_begin - 1; -#else - MDBX_PNL_SIZE(re_list) = re_len -= num; - for (unsigned i = range_begin - num; i < re_len;) - re_list[++i] = re_list[++range_begin]; -#endif - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - } else { - 
txn->mt_next_pgno = pgno + num; - mdbx_assert(env, txn->mt_next_pgno <= txn->mt_end_pgno); - } - - if (unlikely(env->me_flags & MDBX_PAGEPERTURB)) + if (unlikely(need_clean)) memset(ret.page, -1, pgno2bytes(env, num)); - VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); + VALGRIND_MAKE_MEM_UNDEFINED(ret.page, pgno2bytes(env, num)); ret.page->mp_pgno = pgno; ret.page->mp_leaf2_ksize = 0; ret.page->mp_flags = 0; - if ((mdbx_assert_enabled() || mdbx_audit_enabled()) && num > 1) { - ret.page->mp_pages = num; + if ((ASSERT_ENABLED() || AUDIT_ENABLED()) && num > 1) { + ret.page->mp_pages = (pgno_t)num; ret.page->mp_flags = P_OVERFLOW; } - ret.err = mdbx_page_dirty(txn, ret.page, num); - if (unlikely(ret.err != MDBX_SUCCESS)) - goto fail; - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + ret.err = page_dirty(txn, ret.page, (pgno_t)num); +bailout: + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); +#if MDBX_ENABLE_PROFGC + size_t majflt_after; + prof->xtime_cpu += osal_cputime(&majflt_after) - cputime_before; + prof->majflt += majflt_after - majflt_before; +#endif /* MDBX_ENABLE_PROFGC */ return ret; } -/* Copy the used portions of a non-overflow page. */ -__hot static void mdbx_page_copy(MDBX_page *dst, const MDBX_page *src, - size_t psize) { - STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); - STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); - if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { - size_t upper = src->mp_upper, lower = src->mp_lower, unused = upper - lower; +static pgr_t page_alloc_slowpath(const MDBX_cursor *const mc, const size_t num, + uint8_t flags) { +#if MDBX_ENABLE_PROFGC + const uint64_t monotime_before = osal_monotime(); +#endif /* MDBX_ENABLE_PROFGC */ - /* If page isn't full, just copy the used portion. Adjust - * alignment so memcpy may copy words instead of bytes. 
*/ - if (unused >= MDBX_CACHELINE_SIZE * 2) { - lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); - upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); - memcpy(dst, src, lower); - dst = (void *)((char *)dst + upper); - src = (void *)((char *)src + upper); - psize -= upper; + pgr_t ret; + MDBX_txn *const txn = mc->mc_txn; + MDBX_env *const env = txn->mt_env; +#if MDBX_ENABLE_PROFGC + profgc_stat_t *const prof = (mc->mc_dbi == FREE_DBI) + ? &env->me_lck->mti_pgop_stat.gc_prof.self + : &env->me_lck->mti_pgop_stat.gc_prof.work; + prof->spe_counter += 1; +#endif /* MDBX_ENABLE_PROFGC */ + + eASSERT(env, num > 0 || (flags & MDBX_ALLOC_RESERVE)); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + + pgno_t pgno = 0; + size_t newnext; + if (num > 1) { +#if MDBX_ENABLE_PROFGC + prof->xpages += 1; +#endif /* MDBX_ENABLE_PROFGC */ + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + } else { + eASSERT(env, num == 0 || MDBX_PNL_GETSIZE(txn->tw.relist) == 0); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE) || num == 0); + } + + //--------------------------------------------------------------------------- + + if (unlikely(!is_gc_usable(txn, mc, flags))) { + eASSERT(env, txn->mt_flags & MDBX_TXN_DRAINED_GC); + goto no_gc; + } + + eASSERT(env, (flags & (MDBX_ALLOC_COALESCE | MDBX_ALLOC_LIFO | + MDBX_ALLOC_SHOULD_SCAN)) == 0); + flags += (env->me_flags & MDBX_LIFORECLAIM) ? MDBX_ALLOC_LIFO : 0; + + if (/* Не коагулируем записи при подготовке резерва для обновления GC. + * Иначе попытка увеличить резерв может приводить к необходимости ещё + * большего резерва из-за увеличения списка переработанных страниц. 
*/ + (flags & MDBX_ALLOC_RESERVE) == 0) { + if (txn->mt_dbs[FREE_DBI].md_branch_pages && + MDBX_PNL_GETSIZE(txn->tw.relist) < env->me_maxgc_ov1page / 2) + flags += MDBX_ALLOC_COALESCE; + } + + MDBX_cursor *const gc = ptr_disp(env->me_txn0, sizeof(MDBX_txn)); + eASSERT(env, mc != gc && gc->mc_next == nullptr); + gc->mc_txn = txn; + gc->mc_flags = 0; + + env->me_prefault_write = env->me_options.prefault_write; + if (env->me_prefault_write) { + /* Проверка посредством minicore() существенно снижает затраты, но в + * простейших случаях (тривиальный бенчмарк) интегральная производительность + * становится вдвое меньше. А на платформах без mincore() и с проблемной + * подсистемой виртуальной памяти ситуация может быть многократно хуже. + * Поэтому избегаем затрат в ситуациях когда prefaukt-write скорее всего не + * нужна. */ + const bool readahead_enabled = env->me_lck->mti_readahead_anchor & 1; + const pgno_t readahead_edge = env->me_lck->mti_readahead_anchor >> 1; + if (/* Не суетимся если GC почти пустая и БД маленькая */ + (txn->mt_dbs[FREE_DBI].md_branch_pages == 0 && + txn->mt_geo.now < 1234) || + /* Не суетимся если страница в зоне включенного упреждающего чтения */ + (readahead_enabled && pgno + num < readahead_edge)) + env->me_prefault_write = false; + } + +retry_gc_refresh_oldest:; + txnid_t oldest = txn_oldest_reader(txn); +retry_gc_have_oldest: + if (unlikely(oldest >= txn->mt_txnid)) { + ERROR("unexpected/invalid oldest-readed txnid %" PRIaTXN + " for current-txnid %" PRIaTXN, + oldest, txn->mt_txnid); + ret.err = MDBX_PROBLEM; + goto fail; + } + const txnid_t detent = oldest + 1; + + txnid_t id = 0; + MDBX_cursor_op op = MDBX_FIRST; + if (flags & MDBX_ALLOC_LIFO) { + if (!txn->tw.lifo_reclaimed) { + txn->tw.lifo_reclaimed = txl_alloc(); + if (unlikely(!txn->tw.lifo_reclaimed)) { + ret.err = MDBX_ENOMEM; + goto fail; + } + } + /* Begin lookup backward from oldest reader */ + id = detent - 1; + op = MDBX_SET_RANGE; + } else if (txn->tw.last_reclaimed) { + /* 
Continue lookup forward from last-reclaimed */ + id = txn->tw.last_reclaimed + 1; + if (id >= detent) + goto depleted_gc; + op = MDBX_SET_RANGE; + } + +next_gc:; + MDBX_val key; + key.iov_base = &id; + key.iov_len = sizeof(id); + +#if MDBX_ENABLE_PROFGC + prof->rsteps += 1; +#endif /* MDBX_ENABLE_PROFGC */ + + /* Seek first/next GC record */ + ret.err = cursor_get(gc, &key, NULL, op); + if (unlikely(ret.err != MDBX_SUCCESS)) { + if (unlikely(ret.err != MDBX_NOTFOUND)) + goto fail; + if ((flags & MDBX_ALLOC_LIFO) && op == MDBX_SET_RANGE) { + op = MDBX_PREV; + goto next_gc; + } + goto depleted_gc; + } + if (unlikely(key.iov_len != sizeof(txnid_t))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + id = unaligned_peek_u64(4, key.iov_base); + if (flags & MDBX_ALLOC_LIFO) { + op = MDBX_PREV; + if (id >= detent || is_already_reclaimed(txn, id)) + goto next_gc; + } else { + op = MDBX_NEXT; + if (unlikely(id >= detent)) + goto depleted_gc; + } + txn->mt_flags &= ~MDBX_TXN_DRAINED_GC; + + /* Reading next GC record */ + MDBX_val data; + MDBX_page *const mp = gc->mc_pg[gc->mc_top]; + if (unlikely((ret.err = node_read(gc, page_node(mp, gc->mc_ki[gc->mc_top]), + &data, mp)) != MDBX_SUCCESS)) + goto fail; + + pgno_t *gc_pnl = (pgno_t *)data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(gc_pnl) || + !pnl_check(gc_pnl, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + + const size_t gc_len = MDBX_PNL_GETSIZE(gc_pnl); + TRACE("gc-read: id #%" PRIaTXN " len %zu, re-list will %zu ", id, gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist)); + + if (unlikely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= + env->me_maxgc_ov1page)) { + /* Don't try to coalesce too much. 
*/ + if (flags & MDBX_ALLOC_SHOULD_SCAN) { + eASSERT(env, flags & MDBX_ALLOC_COALESCE); + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); + eASSERT(env, num > 0); +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.coalescences += 1; +#endif /* MDBX_ENABLE_PROFGC */ + TRACE("clear %s %s", "MDBX_ALLOC_COALESCE", "since got threshold"); + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + if (likely(num == 1)) { + pgno = relist_get_single(txn); + goto done; + } + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + flags -= MDBX_ALLOC_COALESCE | MDBX_ALLOC_SHOULD_SCAN; + } + if (unlikely(/* list is too long already */ MDBX_PNL_GETSIZE( + txn->tw.relist) >= env->me_options.rp_augment_limit) && + ((/* not a slot-request from gc-update */ num && + /* have enough unallocated space */ txn->mt_geo.upper >= + txn->mt_next_pgno + num) || + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) >= MDBX_PGL_LIMIT)) { + /* Stop reclaiming to avoid large/overflow the page list. This is a rare + * case while search for a continuously multi-page region in a + * large database, see https://libmdbx.dqdkfa.ru/dead-github/issues/123 */ + NOTICE("stop reclaiming %s: %zu (current) + %zu " + "(chunk) -> %zu, rp_augment_limit %u", + likely(gc_len + MDBX_PNL_GETSIZE(txn->tw.relist) < MDBX_PGL_LIMIT) + ? 
"since rp_augment_limit was reached" + : "to avoid PNL overflow", + MDBX_PNL_GETSIZE(txn->tw.relist), gc_len, + gc_len + MDBX_PNL_GETSIZE(txn->tw.relist), + env->me_options.rp_augment_limit); + goto depleted_gc; } } - memcpy(dst, src, psize); + + /* Remember ID of readed GC record */ + txn->tw.last_reclaimed = id; + if (flags & MDBX_ALLOC_LIFO) { + ret.err = txl_append(&txn->tw.lifo_reclaimed, id); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + } + + /* Append PNL from GC record to tw.relist */ + ret.err = pnl_need(&txn->tw.relist, gc_len); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + DEBUG_EXTRA("readed GC-pnl txn %" PRIaTXN " root %" PRIaPGNO + " len %zu, PNL", + id, txn->mt_dbs[FREE_DBI].md_root, gc_len); + for (size_t i = gc_len; i; i--) + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, gc_pnl[i]); + DEBUG_EXTRA_PRINT(", next_pgno %u\n", txn->mt_next_pgno); + } + + /* Merge in descending sorted order */ + pnl_merge(txn->tw.relist, gc_pnl); + flags |= MDBX_ALLOC_SHOULD_SCAN; + if (AUDIT_ENABLED()) { + if (unlikely(!pnl_check(txn->tw.relist, txn->mt_next_pgno))) { + ret.err = MDBX_CORRUPTED; + goto fail; + } + } else { + eASSERT(env, pnl_check_allocated(txn->tw.relist, txn->mt_next_pgno)); + } + eASSERT(env, dirtylist_check(txn)); + + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.relist) == 0 || + MDBX_PNL_MOST(txn->tw.relist) < txn->mt_next_pgno); + if (MDBX_ENABLE_REFUND && MDBX_PNL_GETSIZE(txn->tw.relist) && + unlikely(MDBX_PNL_MOST(txn->tw.relist) == txn->mt_next_pgno - 1)) { + /* Refund suitable pages into "unallocated" space */ + txn_refund(txn); + } + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + + /* Done for a kick-reclaim mode, actually no page needed */ + if (unlikely(num == 0)) { + eASSERT(env, ret.err == MDBX_SUCCESS); + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "early-exit for slot", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto early_exit; + } + + /* TODO: 
delete reclaimed records */ + + eASSERT(env, op == MDBX_PREV || op == MDBX_NEXT); + if (flags & MDBX_ALLOC_COALESCE) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "coalesce-continue", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto next_gc; + } + +scan: + eASSERT(env, flags & MDBX_ALLOC_SHOULD_SCAN); + eASSERT(env, num > 0); + if (MDBX_PNL_GETSIZE(txn->tw.relist) >= num) { + eASSERT(env, MDBX_PNL_LAST(txn->tw.relist) < txn->mt_next_pgno && + MDBX_PNL_FIRST(txn->tw.relist) < txn->mt_next_pgno); + if (likely(num == 1)) { + eASSERT(env, !(flags & MDBX_ALLOC_RESERVE)); + pgno = relist_get_single(txn); + goto done; + } + pgno = relist_get_sequence(txn, num, flags); + if (likely(pgno)) + goto done; + } + flags -= MDBX_ALLOC_SHOULD_SCAN; + if (ret.err == MDBX_SUCCESS) { + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "continue-search", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + goto next_gc; + } + +depleted_gc: + TRACE("%s: last id #%" PRIaTXN ", re-len %zu", "gc-depleted", id, + MDBX_PNL_GETSIZE(txn->tw.relist)); + ret.err = MDBX_NOTFOUND; + if (flags & MDBX_ALLOC_SHOULD_SCAN) + goto scan; + txn->mt_flags |= MDBX_TXN_DRAINED_GC; + + //------------------------------------------------------------------------- + + /* There is no suitable pages in the GC and to be able to allocate + * we should CHOICE one of: + * - make a new steady checkpoint if reclaiming was stopped by + * the last steady-sync, or wipe it in the MDBX_UTTERLY_NOSYNC mode; + * - kick lagging reader(s) if reclaiming was stopped by ones of it. + * - extend the database file. */ + + /* Will use new pages from the map if nothing is suitable in the GC. */ + newnext = txn->mt_next_pgno + num; + + /* Does reclaiming stopped at the last steady point? 
*/ + const meta_ptr_t recent = meta_recent(env, &txn->tw.troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &txn->tw.troika); + if (recent.ptr_c != prefer_steady.ptr_c && prefer_steady.is_steady && + detent == prefer_steady.txnid + 1) { + DEBUG("gc-kick-steady: recent %" PRIaTXN "-%s, steady %" PRIaTXN + "-%s, detent %" PRIaTXN, + recent.txnid, durable_caption(recent.ptr_c), prefer_steady.txnid, + durable_caption(prefer_steady.ptr_c), detent); + const pgno_t autosync_threshold = + atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); + const uint64_t autosync_period = + atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; + /* wipe the last steady-point if one of: + * - UTTERLY_NOSYNC mode AND auto-sync threshold is NOT specified + * - UTTERLY_NOSYNC mode AND free space at steady-point is exhausted + * otherwise, make a new steady-point if one of: + * - auto-sync threshold is specified and reached; + * - upper limit of database size is reached; + * - database is full (with the current file size) + * AND auto-sync threshold it NOT specified */ + if (F_ISSET(env->me_flags, MDBX_UTTERLY_NOSYNC) && + ((autosync_threshold | autosync_period) == 0 || + newnext >= prefer_steady.ptr_c->mm_geo.now)) { + /* wipe steady checkpoint in MDBX_UTTERLY_NOSYNC mode + * without any auto-sync threshold(s). 
*/ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wipes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + ret.err = wipe_steady(txn, detent); + DEBUG("gc-wipe-steady, rc %d", ret.err); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + if ((autosync_threshold && + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + autosync_threshold) || + (autosync_period && + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period) || + newnext >= txn->mt_geo.upper || + ((num == 0 || newnext >= txn->mt_end_pgno) && + (autosync_threshold | autosync_period) == 0)) { + /* make steady checkpoint. */ +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.flushes += 1; +#endif /* MDBX_ENABLE_PROFGC */ + MDBX_meta meta = *recent.ptr_c; + ret.err = sync_locked(env, env->me_flags & MDBX_WRITEMAP, &meta, + &txn->tw.troika); + DEBUG("gc-make-steady, rc %d", ret.err); + eASSERT(env, ret.err != MDBX_RESULT_TRUE); + if (unlikely(ret.err != MDBX_SUCCESS)) + goto fail; + eASSERT(env, prefer_steady.ptr_c != + meta_prefer_steady(env, &txn->tw.troika).ptr_c); + goto retry_gc_refresh_oldest; + } + } + + if (unlikely(true == atomic_load32(&env->me_lck->mti_readers_refresh_flag, + mo_AcquireRelease))) { + oldest = txn_oldest_reader(txn); + if (oldest >= detent) + goto retry_gc_have_oldest; + } + + /* Avoid kick lagging reader(s) if is enough unallocated space + * at the end of database file. 
*/ + if (!(flags & MDBX_ALLOC_RESERVE) && newnext <= txn->mt_end_pgno) { + eASSERT(env, pgno == 0); + goto done; + } + + if (oldest < txn->mt_txnid - xMDBX_TXNID_STEP) { + oldest = kick_longlived_readers(env, oldest); + if (oldest >= detent) + goto retry_gc_have_oldest; + } + + //--------------------------------------------------------------------------- + +no_gc: + eASSERT(env, pgno == 0); +#ifndef MDBX_ENABLE_BACKLOG_DEPLETED +#define MDBX_ENABLE_BACKLOG_DEPLETED 0 +#endif /* MDBX_ENABLE_BACKLOG_DEPLETED*/ + if (MDBX_ENABLE_BACKLOG_DEPLETED && + unlikely(!(txn->mt_flags & MDBX_TXN_DRAINED_GC))) { + ret.err = MDBX_BACKLOG_DEPLETED; + goto fail; + } + if (flags & MDBX_ALLOC_RESERVE) { + ret.err = MDBX_NOTFOUND; + goto fail; + } + + /* Will use new pages from the map if nothing is suitable in the GC. */ + newnext = txn->mt_next_pgno + num; + if (newnext <= txn->mt_end_pgno) + goto done; + + if (newnext > txn->mt_geo.upper || !txn->mt_geo.grow_pv) { + NOTICE("gc-alloc: next %zu > upper %" PRIaPGNO, newnext, txn->mt_geo.upper); + ret.err = MDBX_MAP_FULL; + goto fail; + } + + eASSERT(env, newnext > txn->mt_end_pgno); + const size_t grow_step = pv2pages(txn->mt_geo.grow_pv); + size_t aligned = pgno_align2os_pgno( + env, (pgno_t)(newnext + grow_step - newnext % grow_step)); + + if (aligned > txn->mt_geo.upper) + aligned = txn->mt_geo.upper; + eASSERT(env, aligned >= newnext); + + VERBOSE("try growth datafile to %zu pages (+%zu)", aligned, + aligned - txn->mt_end_pgno); + ret.err = dxb_resize(env, txn->mt_next_pgno, (pgno_t)aligned, + txn->mt_geo.upper, implicit_grow); + if (ret.err != MDBX_SUCCESS) { + ERROR("unable growth datafile to %zu pages (+%zu), errcode %d", aligned, + aligned - txn->mt_end_pgno, ret.err); + goto fail; + } + env->me_txn->mt_end_pgno = (pgno_t)aligned; + eASSERT(env, pgno == 0); + + //--------------------------------------------------------------------------- + +done: + ret.err = MDBX_SUCCESS; + if (likely((flags & MDBX_ALLOC_RESERVE) == 0)) { + if 
(pgno) { + eASSERT(env, pgno + num <= txn->mt_next_pgno && pgno >= NUM_METAS); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + } else { + pgno = txn->mt_next_pgno; + txn->mt_next_pgno += (pgno_t)num; + eASSERT(env, txn->mt_next_pgno <= txn->mt_end_pgno); + eASSERT(env, pgno >= NUM_METAS && pgno + num <= txn->mt_next_pgno); + } + + ret = page_alloc_finalize(env, txn, mc, pgno, num); + if (unlikely(ret.err != MDBX_SUCCESS)) { + fail: + eASSERT(env, ret.err != MDBX_SUCCESS); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + int level; + const char *what; + if (flags & MDBX_ALLOC_RESERVE) { + level = + (flags & MDBX_ALLOC_UNIMPORTANT) ? MDBX_LOG_DEBUG : MDBX_LOG_NOTICE; + what = num ? "reserve-pages" : "fetch-slot"; + } else { + txn->mt_flags |= MDBX_TXN_ERROR; + level = MDBX_LOG_ERROR; + what = "pages"; + } + if (LOG_ENABLED(level)) + debug_log(level, __func__, __LINE__, + "unable alloc %zu %s, alloc-flags 0x%x, err %d, txn-flags " + "0x%x, re-list-len %zu, loose-count %zu, gc: height %u, " + "branch %zu, leaf %zu, large %zu, entries %zu\n", + num, what, flags, ret.err, txn->mt_flags, + MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count, + txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); + ret.page = NULL; + } + } else { + early_exit: + DEBUG("return NULL for %zu pages for ALLOC_%s, rc %d", num, + num ? 
"RESERVE" : "SLOT", ret.err); + ret.page = NULL; + } + +#if MDBX_ENABLE_PROFGC + prof->rtime_monotonic += osal_monotime() - monotime_before; +#endif /* MDBX_ENABLE_PROFGC */ + return ret; +} + +__hot static pgr_t page_alloc(const MDBX_cursor *const mc) { + MDBX_txn *const txn = mc->mc_txn; + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, F_ISSET(txn->mt_dbistate[mc->mc_dbi], DBI_DIRTY | DBI_VALID)); + + /* If there are any loose pages, just use them */ + while (likely(txn->tw.loose_pages)) { +#if MDBX_ENABLE_REFUND + if (unlikely(txn->tw.loose_refund_wl > txn->mt_next_pgno)) { + txn_refund(txn); + if (!txn->tw.loose_pages) + break; + } +#endif /* MDBX_ENABLE_REFUND */ + + MDBX_page *lp = txn->tw.loose_pages; + MDBX_ASAN_UNPOISON_MEMORY_REGION(lp, txn->mt_env->me_psize); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + txn->tw.loose_pages = mp_next(lp); + txn->tw.loose_count--; + DEBUG_EXTRA("db %d use loose page %" PRIaPGNO, DDBI(mc), lp->mp_pgno); + tASSERT(txn, lp->mp_pgno < txn->mt_next_pgno); + tASSERT(txn, lp->mp_pgno >= NUM_METAS); + VALGRIND_MAKE_MEM_UNDEFINED(page_data(lp), page_space(txn->mt_env)); + lp->mp_txnid = txn->mt_front; + pgr_t ret = {lp, MDBX_SUCCESS}; + return ret; + } + + if (likely(MDBX_PNL_GETSIZE(txn->tw.relist) > 0)) + return page_alloc_finalize(txn->mt_env, txn, mc, relist_get_single(txn), 1); + + return page_alloc_slowpath(mc, 1, MDBX_ALLOC_DEFAULT); +} + +/* Copy the used portions of a page. 
*/ +__hot static void page_copy(MDBX_page *const dst, const MDBX_page *const src, + const size_t size) { + STATIC_ASSERT(UINT16_MAX > MAX_PAGESIZE - PAGEHDRSZ); + STATIC_ASSERT(MIN_PAGESIZE > PAGEHDRSZ + NODESIZE * 4); + void *copy_dst = dst; + const void *copy_src = src; + size_t copy_len = size; + if (src->mp_flags & P_LEAF2) { + copy_len = PAGEHDRSZ + src->mp_leaf2_ksize * page_numkeys(src); + if (unlikely(copy_len > size)) + goto bailout; + } + if ((src->mp_flags & (P_LEAF2 | P_OVERFLOW)) == 0) { + size_t upper = src->mp_upper, lower = src->mp_lower; + intptr_t unused = upper - lower; + /* If page isn't full, just copy the used portion. Adjust + * alignment so memcpy may copy words instead of bytes. */ + if (unused > MDBX_CACHELINE_SIZE * 3) { + lower = ceil_powerof2(lower + PAGEHDRSZ, sizeof(void *)); + upper = floor_powerof2(upper + PAGEHDRSZ, sizeof(void *)); + if (unlikely(upper > copy_len)) + goto bailout; + memcpy(copy_dst, copy_src, lower); + copy_dst = ptr_disp(copy_dst, upper); + copy_src = ptr_disp(copy_src, upper); + copy_len -= upper; + } + } + memcpy(copy_dst, copy_src, copy_len); + return; + +bailout: + if (src->mp_flags & P_LEAF2) + bad_page(src, "%s addr %p, n-keys %zu, ksize %u", + "invalid/corrupted source page", __Wpedantic_format_voidptr(src), + page_numkeys(src), src->mp_leaf2_ksize); + else + bad_page(src, "%s addr %p, upper %u", "invalid/corrupted source page", + __Wpedantic_format_voidptr(src), src->mp_upper); + memset(dst, -1, size); } /* Pull a page off the txn's spill list, if present. * * If a page being referenced was spilled to disk in this txn, bring * it back and make it dirty/writable again. 
*/ -static struct page_result __must_check_result -mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { - mdbx_verbose("unspill page %" PRIaPGNO, mp->mp_pgno); - mdbx_tassert(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); - mdbx_tassert(txn, IS_SPILLED(txn, mp)); +static pgr_t __must_check_result page_unspill(MDBX_txn *const txn, + const MDBX_page *const mp) { + VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + tASSERT(txn, IS_SPILLED(txn, mp)); const MDBX_txn *scan = txn; - struct page_result ret; + pgr_t ret; do { - mdbx_tassert(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); - const unsigned si = mdbx_search_spilled(scan, mp->mp_pgno); + tASSERT(txn, (scan->mt_flags & MDBX_TXN_SPILLS) != 0); + const size_t si = search_spilled(scan, mp->mp_pgno); if (!si) continue; const unsigned npages = IS_OVERFLOW(mp) ? mp->mp_pages : 1; - ret.page = mdbx_page_malloc(txn, npages); + ret.page = page_malloc(txn, npages); if (unlikely(!ret.page)) { ret.err = MDBX_ENOMEM; return ret; } - mdbx_page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); + page_copy(ret.page, mp, pgno2bytes(txn->mt_env, npages)); if (scan == txn) { /* If in current txn, this page is no longer spilled. * If it happens to be the last page, truncate the spill list. * Otherwise mark it as deleted by setting the LSB. 
*/ - mdbx_spill_remove(txn, si, npages); + spill_remove(txn, si, npages); } /* otherwise, if belonging to a parent txn, the * page remains spilled until child commits */ - ret.err = mdbx_page_dirty(txn, ret.page, npages); + ret.err = page_dirty(txn, ret.page, npages); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; #if MDBX_ENABLE_PGOP_STAT @@ -7064,11 +7850,11 @@ mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { return ret; } while (likely((scan = scan->mt_parent) != nullptr && (scan->mt_flags & MDBX_TXN_SPILLS) != 0)); - mdbx_error("Page %" PRIaPGNO " mod-txnid %" PRIaTXN - " not found in the spill-list(s), current txn %" PRIaTXN - " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, - mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, - txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); + ERROR("Page %" PRIaPGNO " mod-txnid %" PRIaTXN + " not found in the spill-list(s), current txn %" PRIaTXN + " front %" PRIaTXN ", root txn %" PRIaTXN " front %" PRIaTXN, + mp->mp_pgno, mp->mp_txnid, txn->mt_txnid, txn->mt_front, + txn->mt_env->me_txn0->mt_txnid, txn->mt_env->me_txn0->mt_front); ret.err = MDBX_PROBLEM; ret.page = NULL; return ret; @@ -7080,46 +7866,79 @@ mdbx_page_unspill(MDBX_txn *const txn, const MDBX_page *const mp) { * [in] mc cursor pointing to the page to be touched * * Returns 0 on success, non-zero on failure. 
*/ -__hot static int mdbx_page_touch(MDBX_cursor *mc) { +__hot static int page_touch(MDBX_cursor *mc) { const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; MDBX_page *np; MDBX_txn *txn = mc->mc_txn; int rc; - if (mdbx_assert_enabled()) { + tASSERT(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, F_ISSET(*mc->mc_dbistate, DBI_DIRTY | DBI_VALID)); + tASSERT(txn, !IS_OVERFLOW(mp)); + if (ASSERT_ENABLED()) { if (mc->mc_flags & C_SUB) { MDBX_xcursor *mx = container_of(mc->mc_db, MDBX_xcursor, mx_db); MDBX_cursor_couple *couple = container_of(mx, MDBX_cursor_couple, inner); - mdbx_tassert(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); - mdbx_tassert(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); - mdbx_tassert(txn, *couple->outer.mc_dbistate & DBI_DIRTY); - } else { - mdbx_tassert(txn, *mc->mc_dbistate & DBI_DIRTY); + tASSERT(txn, mc->mc_db == &couple->outer.mc_xcursor->mx_db); + tASSERT(txn, mc->mc_dbx == &couple->outer.mc_xcursor->mx_dbx); + tASSERT(txn, *couple->outer.mc_dbistate & DBI_DIRTY); } - mdbx_tassert(txn, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); - mdbx_tassert(txn, !IS_OVERFLOW(mp)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); + tASSERT(txn, dirtylist_check(txn)); } - if (IS_MODIFIABLE(txn, mp) || IS_SUBP(mp)) + if (IS_MODIFIABLE(txn, mp)) { + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) && !MDBX_AVOID_MSYNC); + return MDBX_SUCCESS; + } + if (IS_SUBP(mp)) + return MDBX_SUCCESS; + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + const size_t n = dpl_search(txn, mp->mp_pgno); + if (MDBX_AVOID_MSYNC && + unlikely(txn->tw.dirtylist->items[n].pgno != mp->mp_pgno)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP)); + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length + 1); + VERBOSE("unspill page %" PRIaPGNO, mp->mp_pgno); + np = (MDBX_page *)mp; +#if MDBX_ENABLE_PGOP_STAT + txn->mt_env->me_lck->mti_pgop_stat.unspill.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + return 
page_dirty(txn, np, 1); + } + tASSERT(txn, n > 0 && n <= txn->tw.dirtylist->length); + tASSERT(txn, txn->tw.dirtylist->items[n].pgno == mp->mp_pgno && + txn->tw.dirtylist->items[n].ptr == mp); + if (!MDBX_AVOID_MSYNC || (txn->mt_flags & MDBX_WRITEMAP) == 0) { + size_t *const ptr = + ptr_disp(txn->tw.dirtylist->items[n].ptr, -(ptrdiff_t)sizeof(size_t)); + *ptr = txn->tw.dirtylru; + } return MDBX_SUCCESS; + } + if (IS_SUBP(mp)) { + np = (MDBX_page *)mp; + np->mp_txnid = txn->mt_front; + return MDBX_SUCCESS; + } + tASSERT(txn, !IS_OVERFLOW(mp)); if (IS_FROZEN(txn, mp)) { /* CoW the page */ - rc = mdbx_pnl_need(&txn->tw.retired_pages, 1); + rc = pnl_need(&txn->tw.retired_pages, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; - const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + const pgr_t par = page_alloc(mc); rc = par.err; np = par.page; if (unlikely(rc != MDBX_SUCCESS)) goto fail; const pgno_t pgno = np->mp_pgno; - mdbx_debug("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), - mp->mp_pgno, pgno); - mdbx_tassert(txn, mp->mp_pgno != pgno); - mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); + DEBUG("touched db %d page %" PRIaPGNO " -> %" PRIaPGNO, DDBI(mc), + mp->mp_pgno, pgno); + tASSERT(txn, mp->mp_pgno != pgno); + pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); /* Update the parent page, if any, to point to the new page */ if (mc->mc_top) { MDBX_page *parent = mc->mc_pg[mc->mc_top - 1]; @@ -7132,43 +7951,43 @@ __hot static int mdbx_page_touch(MDBX_cursor *mc) { #if MDBX_ENABLE_PGOP_STAT txn->mt_env->me_lck->mti_pgop_stat.cow.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_page_copy(np, mp, txn->mt_env->me_psize); + page_copy(np, mp, txn->mt_env->me_psize); np->mp_pgno = pgno; np->mp_txnid = txn->mt_front; } else if (IS_SPILLED(txn, mp)) { - struct page_result pur = mdbx_page_unspill(txn, mp); + pgr_t pur = page_unspill(txn, mp); np = pur.page; rc = pur.err; if (likely(rc == MDBX_SUCCESS)) { - mdbx_tassert(txn, np != 
nullptr); + tASSERT(txn, np != nullptr); goto done; } goto fail; } else { if (unlikely(!txn->mt_parent)) { - mdbx_error("Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + IS_BRANCH(mp) ? "branch" : "leaf", mp->mp_pgno, mp->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); rc = MDBX_PROBLEM; goto fail; } - mdbx_debug("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); - mdbx_tassert(txn, txn->tw.dirtylist->length <= - MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); + DEBUG("clone db %d page %" PRIaPGNO, DDBI(mc), mp->mp_pgno); + tASSERT(txn, + txn->tw.dirtylist->length <= MDBX_PGL_LIMIT + MDBX_PNL_GRANULATE); /* No - copy it */ - np = mdbx_page_malloc(txn, 1); + np = page_malloc(txn, 1); if (unlikely(!np)) { rc = MDBX_ENOMEM; goto fail; } - mdbx_page_copy(np, mp, txn->mt_env->me_psize); + page_copy(np, mp, txn->mt_env->me_psize); /* insert a clone of parent's dirty page, so don't touch dirtyroom */ - rc = mdbx_page_dirty(txn, np, 1); + rc = page_dirty(txn, np, 1); if (unlikely(rc != MDBX_SUCCESS)) goto fail; @@ -7209,8 +8028,49 @@ fail: return rc; } -__cold static int mdbx_env_sync_internal(MDBX_env *env, bool force, - bool nonblock) { +static int meta_sync(const MDBX_env *env, const meta_ptr_t head) { + eASSERT(env, atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != + (uint32_t)head.txnid); + /* Функция может вызываться (в том числе) при (env->me_flags & + * MDBX_NOMETASYNC) == 0 и env->me_fd4meta == env->me_dsync_fd, например если + * предыдущая транзакция была выполненна с флагом MDBX_NOMETASYNC. 
*/ + + int rc = MDBX_RESULT_TRUE; + if (env->me_flags & MDBX_WRITEMAP) { + if (!MDBX_AVOID_MSYNC) { + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(head.ptr_c); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + ptr_dist(page, env->me_map)); + + if (likely(rc == MDBX_SUCCESS) && env->me_fd4meta == env->me_lazy_fd) { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + } + } else { + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + } + + if (likely(rc == MDBX_SUCCESS)) + env->me_lck->mti_meta_sync_txnid.weak = (uint32_t)head.txnid; + return rc; +} + +__cold static int env_sync(MDBX_env *env, bool force, bool nonblock) { bool locked = false; int rc = MDBX_RESULT_TRUE /* means "nothing to sync" */; @@ -7226,62 +8086,84 @@ retry:; goto bailout; } - const pgno_t unsynced_pages = - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed); - volatile const MDBX_meta *head = meta_prefer_last(env); - const txnid_t head_txnid = meta_txnid(env, head); - const uint32_t synched_meta_txnid_u32 = - atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); - if (unsynced_pages == 0 && synched_meta_txnid_u32 == (uint32_t)head_txnid && - META_IS_STEADY(head)) - goto bailout; + const bool inside_txn = (env->me_txn0->mt_owner == osal_thread_self()); + meta_ptr_t head; + if (inside_txn | locked) + head = meta_recent(env, &env->me_txn0->tw.troika); + else { + const meta_troika_t troika = 
meta_tap(env); + head = meta_recent(env, &troika); + } + const uint64_t unsynced_pages = + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed); + if (unsynced_pages == 0) { + const uint32_t synched_meta_txnid_u32 = + atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); + if (synched_meta_txnid_u32 == (uint32_t)head.txnid && head.is_steady) + goto bailout; + } - const pgno_t autosync_threshold = + if (!inside_txn && locked && (env->me_flags & MDBX_WRITEMAP) && + unlikely(head.ptr_c->mm_geo.next > + bytes2pgno(env, env->me_dxb_mmap.current))) { + rc = dxb_resize(env, head.ptr_c->mm_geo.next, head.ptr_c->mm_geo.now, + head.ptr_c->mm_geo.upper, implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } + + const size_t autosync_threshold = atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; if (force || (autosync_threshold && unsynced_pages >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= - autosync_period)) + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() - eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP /* clear flags for full steady sync */; - const bool inside_txn = (env->me_txn0->mt_owner == mdbx_thread_self()); if (!inside_txn) { if (!locked) { - int err; +#if MDBX_ENABLE_PGOP_STAT unsigned wops = 0; +#endif /* MDBX_ENABLE_PGOP_STAT */ + + int err; /* pre-sync to avoid latency for writer */ - if (unsynced_pages > /* FIXME: define threshold */ 16 && + if (unsynced_pages > /* FIXME: define threshold */ 42 && (flags & MDBX_SAFE_NOSYNC) == 0) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); if (flags & MDBX_WRITEMAP) { /* Acquire guard to avoid collision with remap */ #if 
defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + osal_srwlock_AcquireShared(&env->me_remap_guard); #else - err = mdbx_fastmutex_acquire(&env->me_remap_guard); + err = osal_fastmutex_acquire(&env->me_remap_guard); if (unlikely(err != MDBX_SUCCESS)) return err; #endif - const size_t usedbytes = pgno_align2os_bytes(env, head->mm_geo.next); - err = mdbx_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); + const size_t usedbytes = + pgno_align2os_bytes(env, head.ptr_c->mm_geo.next); + err = osal_msync(&env->me_dxb_mmap, 0, usedbytes, MDBX_SYNC_DATA); #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); #else - int unlock_err = mdbx_fastmutex_release(&env->me_remap_guard); + int unlock_err = osal_fastmutex_release(&env->me_remap_guard); if (unlikely(unlock_err != MDBX_SUCCESS) && err == MDBX_SUCCESS) err = unlock_err; #endif } else - err = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); + err = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA); if (unlikely(err != MDBX_SUCCESS)) return err; - /* pre-sync done */ +#if MDBX_ENABLE_PGOP_STAT wops = 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + /* pre-sync done */ rc = MDBX_SUCCESS /* means "some data was synced" */; } @@ -7293,25 +8175,25 @@ retry:; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += wops; #endif /* MDBX_ENABLE_PGOP_STAT */ + env->me_txn0->tw.troika = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); goto retry; } - env->me_txn0->mt_txnid = head_txnid; - mdbx_assert(env, head_txnid == meta_txnid(env, head)); - mdbx_assert(env, head_txnid == mdbx_recent_committed_txnid(env)); - mdbx_find_oldest(env->me_txn0); + eASSERT(env, head.txnid == recent_committed_txnid(env)); + env->me_txn0->mt_txnid = head.txnid; + txn_oldest_reader(env->me_txn0); flags |= MDBX_SHRINK_ALLOWED; } - mdbx_assert(env, inside_txn || locked); - mdbx_assert(env, !inside_txn || (flags & 
MDBX_SHRINK_ALLOWED) == 0); + eASSERT(env, inside_txn || locked); + eASSERT(env, !inside_txn || (flags & MDBX_SHRINK_ALLOWED) == 0); - if (!META_IS_STEADY(head) || - ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { - mdbx_debug("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIaPGNO, - data_page((const void *)head)->mp_pgno, mdbx_durable_str(head), - unsynced_pages); - MDBX_meta meta = *head; - rc = mdbx_sync_locked(env, flags, &meta); + if (!head.is_steady || ((flags & MDBX_SAFE_NOSYNC) == 0 && unsynced_pages)) { + DEBUG("meta-head %" PRIaPGNO ", %s, sync_pending %" PRIu64, + data_page(head.ptr_c)->mp_pgno, durable_caption(head.ptr_c), + unsynced_pages); + MDBX_meta meta = *head.ptr_c; + rc = sync_locked(env, flags, &meta, &env->me_txn0->tw.troika); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -7319,19 +8201,8 @@ retry:; /* LY: sync meta-pages if MDBX_NOMETASYNC enabled * and someone was not synced above. */ if (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != - (uint32_t)head_txnid) { -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ - rc = (flags & MDBX_WRITEMAP) - ? 
mdbx_msync(&env->me_dxb_mmap, 0, - pgno_align2os_bytes(env, NUM_METAS), - MDBX_SYNC_DATA | MDBX_SYNC_IODQ) - : mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (likely(rc == MDBX_SUCCESS)) - atomic_store32(&env->me_lck->mti_meta_sync_txnid, (uint32_t)head_txnid, - mo_Relaxed); - } + (uint32_t)head.txnid) + rc = meta_sync(env, head); bailout: if (locked) @@ -7347,7 +8218,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { return MDBX_EBADSIGN; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { ((MDBX_env *)env)->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -7359,7 +8230,7 @@ static __inline int check_env(const MDBX_env *env, const bool wanna_active) { if (wanna_active) { if (unlikely((env->me_flags & MDBX_ENV_ACTIVE) == 0)) return MDBX_EPERM; - mdbx_assert(env, env->me_map != nullptr); + eASSERT(env, env->me_map != nullptr); } return MDBX_SUCCESS; @@ -7370,7 +8241,7 @@ __cold int mdbx_env_sync_ex(MDBX_env *env, bool force, bool nonblock) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_env_sync_internal(env, force, nonblock); + return env_sync(env, force, nonblock); } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API @@ -7382,8 +8253,10 @@ __cold int mdbx_env_sync_poll(MDBX_env *env) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Back up parent txn's cursors, then grab the originals for tracking */ -static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { - for (int i = parent->mt_numdbs; --i >= 0;) { +static int cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { + tASSERT(parent, parent->mt_cursors[FREE_DBI] == nullptr); + nested->mt_cursors[FREE_DBI] = nullptr; + for (int i = parent->mt_numdbs; --i > FREE_DBI;) { nested->mt_cursors[i] = NULL; MDBX_cursor *mc = parent->mt_cursors[i]; if (mc != NULL) { @@ -7393,7 +8266,7 @@ static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { bk = mc; if (mc->mc_signature != 
MDBX_MC_LIVE) continue; - bk = mdbx_malloc(size); + bk = osal_malloc(size); if (unlikely(!bk)) return MDBX_ENOMEM; #if MDBX_DEBUG @@ -7427,24 +8300,27 @@ static int mdbx_cursor_shadow(MDBX_txn *parent, MDBX_txn *nested) { * [in] merge true to keep changes to parent cursors, false to revert. * * Returns 0 on success, non-zero on failure. */ -static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { - for (int i = txn->mt_numdbs; --i >= 0;) { - MDBX_cursor *next, *mc = txn->mt_cursors[i]; +static void cursors_eot(MDBX_txn *txn, const bool merge) { + tASSERT(txn, txn->mt_cursors[FREE_DBI] == nullptr); + for (intptr_t i = txn->mt_numdbs; --i > FREE_DBI;) { + MDBX_cursor *mc = txn->mt_cursors[i]; if (!mc) continue; - txn->mt_cursors[i] = NULL; + txn->mt_cursors[i] = nullptr; do { const unsigned stage = mc->mc_signature; - MDBX_cursor *bk = mc->mc_backup; - next = mc->mc_next; - mdbx_ensure(txn->mt_env, - stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); - mdbx_cassert(mc, mc->mc_dbi == (unsigned)i); + MDBX_cursor *const next = mc->mc_next; + MDBX_cursor *const bk = mc->mc_backup; + ENSURE(txn->mt_env, + stage == MDBX_MC_LIVE || (stage == MDBX_MC_WAIT4EOT && bk)); + cASSERT(mc, mc->mc_dbi == (MDBX_dbi)i); if (bk) { MDBX_xcursor *mx = mc->mc_xcursor; - mdbx_cassert(mc, mx == bk->mc_xcursor); - mdbx_tassert(txn, txn->mt_parent != NULL); - mdbx_ensure(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + tASSERT(txn, txn->mt_parent != NULL); + /* Zap: Using uninitialized memory '*mc->mc_backup'. 
*/ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); + ENSURE(txn->mt_env, bk->mc_signature == MDBX_MC_LIVE); + tASSERT(txn, mx == bk->mc_xcursor); if (stage == MDBX_MC_WAIT4EOT /* Cursor was closed by user */) mc->mc_signature = stage /* Promote closed state to parent txn */; else if (merge) { @@ -7468,28 +8344,29 @@ static void mdbx_cursors_eot(MDBX_txn *txn, const bool merge) { *mx = *(MDBX_xcursor *)(bk + 1); } bk->mc_signature = 0; - mdbx_free(bk); + osal_free(bk); } else { - mdbx_ensure(txn->mt_env, stage == MDBX_MC_LIVE); + ENSURE(txn->mt_env, stage == MDBX_MC_LIVE); mc->mc_signature = MDBX_MC_READY4CLOSE /* Cursor may be reused */; mc->mc_flags = 0 /* reset C_UNTRACK */; } - } while ((mc = next) != NULL); + mc = next; + } while (mc); } } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) /* Find largest mvcc-snapshot still referenced by this process. */ -static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { +static pgno_t find_largest_this(MDBX_env *env, pgno_t largest) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck != NULL /* exclusive mode */)) { - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease) == env->me_pid) { - /* mdbx_jitter4testing(true); */ + /* jitter4testing(true); */ const pgno_t snap_pages = atomic_load32( &lck->mti_readers[i].mr_snapshot_pages_used, mo_Relaxed); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); @@ -7510,7 +8387,7 @@ static pgno_t mdbx_find_largest_this(MDBX_env *env, pgno_t largest) { return largest; } -static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { +static void txn_valgrind(MDBX_env *env, MDBX_txn *txn) { #if !defined(__SANITIZE_ADDRESS__) if (!RUNNING_ON_VALGRIND) return; @@ -7526,10 +8403,9 @@ static void 
mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { } else { /* transaction end */ bool should_unlock = false; pgno_t last = MAX_PAGENO + 1; - if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) { + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) { /* inside write-txn */ - const MDBX_meta *head = constmeta_prefer_last(env); - last = head->mm_geo.next; + last = meta_recent(env, &env->me_txn0->tw.troika).ptr_v->mm_geo.next; } else if (env->me_flags & MDBX_RDONLY) { /* read-only mode, no write-txn, no wlock mutex */ last = NUM_METAS; @@ -7542,15 +8418,16 @@ static void mdbx_txn_valgrind(MDBX_env *env, MDBX_txn *txn) { return; } - last = mdbx_find_largest_this(env, last); + last = find_largest_this(env, last); const pgno_t edge = env->me_poison_edge; if (edge > last) { - mdbx_assert(env, last >= NUM_METAS); + eASSERT(env, last >= NUM_METAS); env->me_poison_edge = last; - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, last), + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, pgno2bytes(env, last)), pgno2bytes(env, edge - last)); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + pgno2bytes(env, last), - pgno2bytes(env, edge - last)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->me_map, pgno2bytes(env, last)), + pgno2bytes(env, edge - last)); } if (should_unlock) mdbx_txn_unlock(env); @@ -7564,48 +8441,48 @@ typedef struct { } bind_rslot_result; static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { - mdbx_assert(env, env->me_lck_mmap.lck); - mdbx_assert(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); - mdbx_assert(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); + eASSERT(env, env->me_lck_mmap.lck); + eASSERT(env, env->me_lck->mti_magic_and_version == MDBX_LOCK_MAGIC); + eASSERT(env, env->me_lck->mti_os_and_format == MDBX_LOCK_FORMAT); - bind_rslot_result result = {mdbx_rdt_lock(env), nullptr}; + bind_rslot_result result = {osal_rdt_lock(env), nullptr}; if 
(unlikely(MDBX_IS_ERROR(result.err))) return result; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = MDBX_PANIC; return result; } if (unlikely(!env->me_map)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = MDBX_EPERM; return result; } if (unlikely(env->me_live_reader != env->me_pid)) { - result.err = mdbx_rpid_set(env); + result.err = osal_rpid_set(env); if (unlikely(result.err != MDBX_SUCCESS)) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); return result; } env->me_live_reader = env->me_pid; } result.err = MDBX_SUCCESS; - unsigned slot, nreaders; + size_t slot, nreaders; while (1) { - nreaders = atomic_load32(&env->me_lck->mti_numreaders, mo_Relaxed); + nreaders = env->me_lck->mti_numreaders.weak; for (slot = 0; slot < nreaders; slot++) - if (atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, mo_Relaxed) == - 0) + if (!atomic_load32(&env->me_lck->mti_readers[slot].mr_pid, + mo_AcquireRelease)) break; if (likely(slot < env->me_maxreaders)) break; - result.err = mdbx_cleanup_dead_readers(env, true, NULL); + result.err = cleanup_dead_readers(env, true, NULL); if (result.err != MDBX_RESULT_TRUE) { - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); result.err = (result.err == MDBX_SUCCESS) ? MDBX_READERS_FULL : result.err; return result; @@ -7618,17 +8495,16 @@ static bind_rslot_result bind_rslot(MDBX_env *env, const uintptr_t tid) { * slot, next publish it in lck->mti_numreaders. After * that, it is safe for mdbx_env_close() to touch it. * When it will be closed, we can finally claim it. */ - atomic_store32(&result.rslot->mr_pid, 0, mo_Relaxed); + atomic_store32(&result.rslot->mr_pid, 0, mo_AcquireRelease); safe64_reset(&result.rslot->mr_txnid, true); if (slot == nreaders) - atomic_store32(&env->me_lck->mti_numreaders, ++nreaders, mo_Relaxed); - atomic_store64(&result.rslot->mr_tid, (env->me_flags & MDBX_NOTLS) ? 
0 : tid, - mo_Relaxed); - atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_Relaxed); - mdbx_rdt_unlock(env); + env->me_lck->mti_numreaders.weak = (uint32_t)++nreaders; + result.rslot->mr_tid.weak = (env->me_flags & MDBX_NOTLS) ? 0 : tid; + atomic_store32(&result.rslot->mr_pid, env->me_pid, mo_AcquireRelease); + osal_rdt_unlock(env); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - mdbx_assert(env, env->me_live_reader == env->me_pid); + eASSERT(env, env->me_live_reader == env->me_pid); thread_rthc_set(env->me_txkey, result.rslot); } return result; @@ -7643,22 +8519,22 @@ __cold int mdbx_thread_register(const MDBX_env *env) { return (env->me_flags & MDBX_EXCLUSIVE) ? MDBX_EINVAL : MDBX_EPERM; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); return MDBX_EINVAL /* MDBX_NOTLS mode */; } - mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r != NULL)) { - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); if (unlikely(r->mr_pid.weak != env->me_pid)) return MDBX_BAD_RSLOT; return MDBX_RESULT_TRUE /* already registered */; } - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); if (env->me_txn0 && unlikely(env->me_txn0->mt_owner == tid)) return MDBX_TXN_OVERLAPPING; return bind_rslot((MDBX_env *)env, tid).err; @@ -7673,22 +8549,23 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { return MDBX_RESULT_TRUE; if (unlikely((env->me_flags & MDBX_ENV_TXKEY) == 0)) { - mdbx_assert(env, !env->me_lck_mmap.lck || 
(env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); return MDBX_RESULT_TRUE /* MDBX_NOTLS mode */; } - mdbx_assert(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | - MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); + eASSERT(env, (env->me_flags & (MDBX_NOTLS | MDBX_ENV_TXKEY | + MDBX_EXCLUSIVE)) == MDBX_ENV_TXKEY); MDBX_reader *r = thread_rthc_get(env->me_txkey); if (unlikely(r == NULL)) return MDBX_RESULT_TRUE /* not registered */; - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); if (unlikely(r->mr_pid.weak != env->me_pid || - r->mr_tid.weak != mdbx_thread_self())) + r->mr_tid.weak != osal_thread_self())) return MDBX_BAD_RSLOT; + eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); if (unlikely(r->mr_txnid.weak < SAFE64_INVALID_THRESHOLD)) return MDBX_BUSY /* transaction is still active */; @@ -7700,45 +8577,43 @@ __cold int mdbx_thread_unregister(const MDBX_env *env) { } /* check against https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ -static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, - bool report) { - const txnid_t head_txnid = meta_txnid(env, meta); - const txnid_t freedb_mod_txnid = meta->mm_dbs[FREE_DBI].md_mod_txnid; - const txnid_t maindb_mod_txnid = meta->mm_dbs[MAIN_DBI].md_mod_txnid; +static bool coherency_check(const MDBX_env *env, const txnid_t txnid, + const volatile MDBX_db *dbs, + const volatile MDBX_meta *meta, bool report) { + const txnid_t freedb_mod_txnid = dbs[FREE_DBI].md_mod_txnid; + const txnid_t maindb_mod_txnid = dbs[MAIN_DBI].md_mod_txnid; - const pgno_t freedb_root_pgno = meta->mm_dbs[FREE_DBI].md_root; + const pgno_t freedb_root_pgno = dbs[FREE_DBI].md_root; const MDBX_page *freedb_root = (env->me_map && freedb_root_pgno != P_INVALID) ? 
pgno2page(env, freedb_root_pgno) : nullptr; - const pgno_t maindb_root_pgno = meta->mm_dbs[MAIN_DBI].md_root; + const pgno_t maindb_root_pgno = dbs[MAIN_DBI].md_root; const MDBX_page *maindb_root = (env->me_map && maindb_root_pgno != P_INVALID) ? pgno2page(env, maindb_root_pgno) : nullptr; - const uint64_t magic_and_version = - unaligned_peek_u64(4, &meta->mm_magic_and_version); + unaligned_peek_u64_volatile(4, &meta->mm_magic_and_version); + bool ok = true; - if (unlikely(!head_txnid || head_txnid < freedb_mod_txnid || + if (unlikely(txnid < freedb_mod_txnid || (!freedb_mod_txnid && freedb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - mdbx_warning( - "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "free", freedb_mod_txnid, head_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN + " for meta_txnid %" PRIaTXN " %s", + "free", freedb_mod_txnid, txnid, + "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } - if (unlikely(head_txnid < maindb_mod_txnid || + if (unlikely(txnid < maindb_mod_txnid || (!maindb_mod_txnid && maindb_root && likely(magic_and_version == MDBX_DATA_MAGIC)))) { if (report) - mdbx_warning( - "catch invalid %sdb.mod_txnid %" PRIaTXN " for meta_txnid %" PRIaTXN - " %s", - "main", maindb_mod_txnid, head_txnid, - "(workaround for incoherent flaw of unified page/buffer cache)"); + WARNING("catch invalid %sdb.mod_txnid %" PRIaTXN + " for meta_txnid %" PRIaTXN " %s", + "main", maindb_mod_txnid, txnid, + "(workaround for incoherent flaw of unified page/buffer cache)"); ok = false; } if (likely(freedb_root && freedb_mod_txnid)) { @@ -7748,7 +8623,7 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, const txnid_t root_txnid = freedb_root->mp_txnid; if (unlikely(root_txnid != freedb_mod_txnid)) { if (report) - mdbx_warning( + WARNING( "catch invalid root_page %" PRIaPGNO " 
mod_txnid %" PRIaTXN " for %sdb.mod_txnid %" PRIaTXN " %s", freedb_root_pgno, root_txnid, "free", freedb_mod_txnid, @@ -7763,7 +8638,7 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, const txnid_t root_txnid = maindb_root->mp_txnid; if (unlikely(root_txnid != maindb_mod_txnid)) { if (report) - mdbx_warning( + WARNING( "catch invalid root_page %" PRIaPGNO " mod_txnid %" PRIaTXN " for %sdb.mod_txnid %" PRIaTXN " %s", maindb_root_pgno, root_txnid, "main", maindb_mod_txnid, @@ -7771,24 +8646,29 @@ static bool meta_checktxnid(const MDBX_env *env, const MDBX_meta *meta, ok = false; } } + if (unlikely(!ok) && report) + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; return ok; } -/* check with timeout as the workaround - * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ -static int meta_waittxnid(const MDBX_env *env, const MDBX_meta *meta, - uint64_t *timestamp) { - if (likely(meta_checktxnid(env, meta, !*timestamp))) - return MDBX_SUCCESS; - - if (!*timestamp) - *timestamp = mdbx_osal_monotime(); - else if (unlikely(mdbx_osal_monotime() - *timestamp > 65536 / 10)) { - mdbx_error("bailout waiting for valid snapshot %s", - "(workaround for incoherent flaw of unified page/buffer cache)"); - return MDBX_CORRUPTED; +__cold static int coherency_timeout(uint64_t *timestamp, pgno_t pgno) { + if (likely(timestamp && *timestamp == 0)) + *timestamp = osal_monotime(); + else if (unlikely(!timestamp || osal_monotime() - *timestamp > + osal_16dot16_to_monotime(65536 / 10))) { + if (pgno) + ERROR("bailout waiting for %" PRIaPGNO " page arrival %s", pgno, + "(workaround for incoherent flaw of unified page/buffer cache)"); + else + ERROR("bailout waiting for valid snapshot (%s)", + "workaround for incoherent flaw of unified page/buffer cache"); + return MDBX_PROBLEM; } + osal_memory_fence(mo_AcquireRelease, true); #if 
defined(_WIN32) || defined(_WIN64) SwitchToThread(); #elif defined(__linux__) || defined(__gnu_linux__) || defined(_UNIX03_SOURCE) @@ -7801,13 +8681,54 @@ static int meta_waittxnid(const MDBX_env *env, const MDBX_meta *meta, return MDBX_RESULT_TRUE; } +/* check with timeout as the workaround + * for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ +__hot static int coherency_check_readed(const MDBX_env *env, + const txnid_t txnid, + const volatile MDBX_db *dbs, + const volatile MDBX_meta *meta, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + if (unlikely(!coherency_check(env, txnid, dbs, meta, report))) + return coherency_timeout(timestamp, 0); + return MDBX_SUCCESS; +} + +static int coherency_check_written(const MDBX_env *env, const txnid_t txnid, + const volatile MDBX_meta *meta, + uint64_t *timestamp) { + const bool report = !(timestamp && *timestamp); + const txnid_t head_txnid = meta_txnid(meta); + if (unlikely(head_txnid < MIN_TXNID || (head_txnid < txnid))) { + if (report) { + env->me_lck->mti_pgop_stat.incoherence.weak = + (env->me_lck->mti_pgop_stat.incoherence.weak >= INT32_MAX) + ? INT32_MAX + : env->me_lck->mti_pgop_stat.incoherence.weak + 1; + WARNING("catch %s txnid %" PRIaTXN " for meta_%" PRIaPGNO " %s", + (head_txnid < MIN_TXNID) ? "invalid" : "unexpected", head_txnid, + bytes2pgno(env, ptr_dist(meta, env->me_map)), + "(workaround for incoherent flaw of unified page/buffer cache)"); + } + return coherency_timeout(timestamp, 0); + } + return coherency_check_readed(env, head_txnid, meta->mm_dbs, meta, timestamp); +} + +static bool coherency_check_meta(const MDBX_env *env, + const volatile MDBX_meta *meta, bool report) { + uint64_t timestamp = 0; + return coherency_check_written(env, 0, meta, report ? &timestamp : nullptr) == + MDBX_SUCCESS; +} + /* Common code for mdbx_txn_begin() and mdbx_txn_renew(). 
*/ -static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { +static int txn_renew(MDBX_txn *txn, const unsigned flags) { MDBX_env *env = txn->mt_env; int rc; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -7826,28 +8747,28 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { STATIC_ASSERT(offsetof(MDBX_lockinfo, mti_readers) % MDBX_CACHELINE_SIZE == 0); - const uintptr_t tid = mdbx_thread_self(); + const uintptr_t tid = osal_thread_self(); if (flags & MDBX_TXN_RDONLY) { - mdbx_assert(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); + eASSERT(env, (flags & ~(MDBX_TXN_RO_BEGIN_FLAGS | MDBX_WRITEMAP)) == 0); txn->mt_flags = MDBX_TXN_RDONLY | (env->me_flags & (MDBX_NOTLS | MDBX_WRITEMAP)); MDBX_reader *r = txn->to.reader; STATIC_ASSERT(sizeof(uintptr_t) <= sizeof(r->mr_tid)); if (likely(env->me_flags & MDBX_ENV_TXKEY)) { - mdbx_assert(env, !(env->me_flags & MDBX_NOTLS)); + eASSERT(env, !(env->me_flags & MDBX_NOTLS)); r = thread_rthc_get(env->me_txkey); if (likely(r)) { if (unlikely(!r->mr_pid.weak) && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { + (runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN)) { thread_rthc_set(env->me_txkey, nullptr); r = nullptr; } else { - mdbx_assert(env, r->mr_pid.weak == env->me_pid); - mdbx_assert(env, r->mr_tid.weak == mdbx_thread_self()); + eASSERT(env, r->mr_pid.weak == env->me_pid); + eASSERT(env, r->mr_tid.weak == osal_thread_self()); } } } else { - mdbx_assert(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); + eASSERT(env, !env->me_lck_mmap.lck || (env->me_flags & MDBX_NOTLS)); } if (likely(r)) { @@ -7862,12 +8783,12 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } txn->to.reader = r; if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) { - mdbx_assert(env, txn->mt_txnid == 0); - mdbx_assert(env, txn->mt_owner == 0); - 
mdbx_assert(env, txn->mt_numdbs == 0); + eASSERT(env, txn->mt_txnid == 0); + eASSERT(env, txn->mt_owner == 0); + eASSERT(env, txn->mt_numdbs == 0); if (likely(r)) { - mdbx_assert(env, r->mr_snapshot_pages_used.weak == 0); - mdbx_assert(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, r->mr_snapshot_pages_used.weak == 0); + eASSERT(env, r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); atomic_store32(&r->mr_snapshot_pages_used, 0, mo_Relaxed); } txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; @@ -7875,109 +8796,102 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Seek & fetch the last meta */ - if (likely(/* not recovery mode */ env->me_stuck_meta < 0)) { - uint64_t timestamp = 0; - while (1) { - volatile const MDBX_meta *const meta = meta_prefer_last(env); - mdbx_jitter4testing(false); - const txnid_t snap = meta_txnid(env, meta); - mdbx_jitter4testing(false); - if (likely(r)) { - safe64_reset(&r->mr_txnid, false); - atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, - mo_Relaxed); - atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64_volatile(4, meta->mm_pages_retired), - mo_Relaxed); - safe64_write(&r->mr_txnid, snap); - mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); - mdbx_assert( - env, r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.weak == snap); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_AcquireRelease); - } else { - /* exclusive mode without lck */ - } - mdbx_jitter4testing(true); + uint64_t timestamp = 0; + size_t loop = 0; + meta_troika_t troika = meta_tap(env); + while (1) { + const meta_ptr_t head = + likely(env->me_stuck_meta < 0) + ? 
/* regular */ meta_recent(env, &troika) + : /* recovery mode */ meta_ptr(env, env->me_stuck_meta); + if (likely(r)) { + safe64_reset(&r->mr_txnid, false); + atomic_store32(&r->mr_snapshot_pages_used, head.ptr_v->mm_geo.next, + mo_Relaxed); + atomic_store64( + &r->mr_snapshot_pages_retired, + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired), + mo_Relaxed); + safe64_write(&r->mr_txnid, head.txnid); + eASSERT(env, r->mr_pid.weak == osal_getpid()); + eASSERT(env, + r->mr_tid.weak == + ((env->me_flags & MDBX_NOTLS) ? 0 : osal_thread_self())); + eASSERT(env, r->mr_txnid.weak == head.txnid || + (r->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD && + head.txnid < env->me_lck->mti_oldest_reader.weak)); + atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, + mo_AcquireRelease); + } else { + /* exclusive mode without lck */ + eASSERT(env, !env->me_lck_mmap.lck && + env->me_lck == (void *)&env->x_lckless_stub); + } + jitter4testing(true); - /* Snap the state from current meta-head */ - txn->mt_txnid = snap; - txn->mt_geo = meta->mm_geo; - STATIC_ASSERT(CORE_DBS == 2); - txn->mt_dbs[0] = meta->mm_dbs[0]; - txn->mt_dbs[1] = meta->mm_dbs[1]; - txn->mt_canary = meta->mm_canary; + /* Snap the state from current meta-head */ + txn->mt_txnid = head.txnid; + txn->mt_geo = head.ptr_v->mm_geo; + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + txn->mt_canary = head.ptr_v->mm_canary; - /* LY: Retry on a race, ITS#7970. - * The barrier is not needed here since C11-atomics are used, - * but it is reasonable paranoia to avoid compiler misoptimization - * and makes clarity for code readers. 
*/ - mdbx_compiler_barrier(); - if (likely(meta == meta_prefer_last(env) && - snap == meta_txnid(env, meta) && - snap >= atomic_load64(&env->me_lck->mti_oldest_reader, - mo_AcquireRelease))) { - /* workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269 */ - rc = meta_waittxnid(env, (const MDBX_meta *)meta, &timestamp); - mdbx_jitter4testing(false); - if (likely(rc == MDBX_SUCCESS)) - break; - if (likely(rc == MDBX_RESULT_TRUE)) - continue; + if (likely(env->me_stuck_meta < 0) && + unlikely(meta_should_retry(env, &troika) || + head.txnid < atomic_load64(&env->me_lck->mti_oldest_reader, + mo_AcquireRelease))) { + if (unlikely(++loop > 42)) { + ERROR("bailout waiting for valid snapshot (%s)", + "metapages are too volatile"); + rc = MDBX_PROBLEM; + txn->mt_txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->mr_txnid, false); goto bailout; } + timestamp = 0; + continue; } - } else { - /* r/o recovery mode */ - MDBX_meta *const meta = METAPAGE(env, env->me_stuck_meta); - txn->mt_txnid = constmeta_txnid(env, meta); - txn->mt_geo = meta->mm_geo; - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); - txn->mt_canary = meta->mm_canary; - if (likely(r)) { - atomic_store32(&r->mr_snapshot_pages_used, meta->mm_geo.next, - mo_Relaxed); - atomic_store64(&r->mr_snapshot_pages_retired, - unaligned_peek_u64(4, meta->mm_pages_retired), - mo_Relaxed); - atomic_store64(&r->mr_txnid, txn->mt_txnid, mo_Relaxed); - mdbx_jitter4testing(false); - mdbx_assert(env, r->mr_pid.weak == mdbx_getpid()); - mdbx_assert( - env, r->mr_tid.weak == - ((env->me_flags & MDBX_NOTLS) ? 
0 : mdbx_thread_self())); - mdbx_assert(env, r->mr_txnid.weak == txn->mt_txnid); - atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, - mo_Relaxed); + + rc = coherency_check_readed(env, head.txnid, txn->mt_dbs, head.ptr_v, + ×tamp); + jitter4testing(false); + if (likely(rc == MDBX_SUCCESS)) + break; + + if (unlikely(rc != MDBX_RESULT_TRUE)) { + txn->mt_txnid = INVALID_TXNID; + if (likely(r)) + safe64_reset(&r->mr_txnid, false); + goto bailout; } } if (unlikely(txn->mt_txnid < MIN_TXNID || txn->mt_txnid > MAX_TXNID)) { - mdbx_error("%s", "environment corrupted by died writer, must shutdown!"); + ERROR("%s", "environment corrupted by died writer, must shutdown!"); + if (likely(r)) + safe64_reset(&r->mr_txnid, false); + txn->mt_txnid = INVALID_TXNID; rc = MDBX_CORRUPTED; goto bailout; } - mdbx_assert(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); + eASSERT(env, txn->mt_txnid >= env->me_lck->mti_oldest_reader.weak); txn->mt_dbxs = env->me_dbxs; /* mostly static anyway */ - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); txn->mt_numdbs = env->me_numdbs; } else { - mdbx_assert(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | - MDBX_WRITEMAP)) == 0); + eASSERT(env, (flags & ~(MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS | + MDBX_WRITEMAP)) == 0); if (unlikely(txn->mt_owner == tid || /* not recovery mode */ env->me_stuck_meta >= 0)) return MDBX_BUSY; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && (env->me_flags & MDBX_NOTLS) == 0 && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { - const unsigned snap_nreaders = + (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) { + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { if 
(atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) == env->me_pid && unlikely(atomic_load64(&lck->mti_readers[i].mr_tid, mo_Relaxed) == @@ -7990,8 +8904,8 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } /* Not yet touching txn == env->me_txn0, it may be active */ - mdbx_jitter4testing(false); - rc = mdbx_txn_lock(env, F_ISSET(flags, MDBX_TXN_TRY)); + jitter4testing(false); + rc = mdbx_txn_lock(env, !!(flags & MDBX_TXN_TRY)); if (unlikely(rc)) return rc; if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { @@ -8005,23 +8919,23 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { } #endif /* Windows */ - mdbx_jitter4testing(false); - const MDBX_meta *meta = constmeta_prefer_last(env); + txn->tw.troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); uint64_t timestamp = 0; while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { - rc = meta_waittxnid(env, (const MDBX_meta *)meta, &timestamp); + rc = coherency_check_readed(env, head.txnid, head.ptr_v->mm_dbs, + head.ptr_v, &timestamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - mdbx_jitter4testing(false); - txn->mt_canary = meta->mm_canary; - const txnid_t snap = constmeta_txnid(env, meta); - txn->mt_txnid = safe64_txnid_next(snap); + txn->mt_canary = head.ptr_c->mm_canary; + eASSERT(env, meta_txnid(head.ptr_v) == head.txnid); + txn->mt_txnid = safe64_txnid_next(head.txnid); if (unlikely(txn->mt_txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto bailout; } @@ -8032,88 +8946,160 @@ static int mdbx_txn_renew0(MDBX_txn *txn, const unsigned flags) { #if MDBX_ENABLE_REFUND txn->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ - MDBX_PNL_SIZE(txn->tw.retired_pages) = 0; - txn->tw.spill_pages = NULL; - txn->tw.spill_least_removed = 0; + MDBX_PNL_SETSIZE(txn->tw.retired_pages, 0); + txn->tw.spilled.list = 
NULL; + txn->tw.spilled.least_removed = 0; txn->tw.last_reclaimed = 0; if (txn->tw.lifo_reclaimed) - MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = 0; + MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, 0); env->me_txn = txn; txn->mt_numdbs = env->me_numdbs; memcpy(txn->mt_dbiseqs, env->me_dbiseqs, txn->mt_numdbs * sizeof(unsigned)); /* Copy the DB info and flags */ - memcpy(txn->mt_dbs, meta->mm_dbs, CORE_DBS * sizeof(MDBX_db)); + memcpy(txn->mt_dbs, head.ptr_c->mm_dbs, CORE_DBS * sizeof(MDBX_db)); /* Moved to here to avoid a data race in read TXNs */ - txn->mt_geo = meta->mm_geo; + txn->mt_geo = head.ptr_c->mm_geo; - rc = mdbx_dpl_alloc(txn); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; - txn->tw.dirtylru = MDBX_DEBUG ? ~42u : 0; + if ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) { + rc = dpl_alloc(txn); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + txn->tw.dirtyroom = txn->mt_env->me_options.dp_limit; + txn->tw.dirtylru = MDBX_DEBUG ? UINT32_MAX / 3 - 42 : 0; + } else { + tASSERT(txn, txn->tw.dirtylist == nullptr); + txn->tw.dirtylist = nullptr; + txn->tw.dirtyroom = MAX_PAGENO; + txn->tw.dirtylru = 0; + } + eASSERT(env, txn->tw.writemap_dirty_npages == 0); + eASSERT(env, txn->tw.writemap_spilled_npages == 0); } /* Setup db info */ - mdbx_compiler_barrier(); + osal_compiler_barrier(); memset(txn->mt_cursors, 0, sizeof(MDBX_cursor *) * txn->mt_numdbs); - for (unsigned i = CORE_DBS; i < txn->mt_numdbs; i++) { + for (size_t i = CORE_DBS; i < txn->mt_numdbs; i++) { const unsigned db_flags = env->me_dbflags[i]; txn->mt_dbs[i].md_flags = db_flags & DB_PERSISTENT_FLAGS; txn->mt_dbistate[i] = (db_flags & DB_VALID) ? 
DBI_VALID | DBI_USRVALID | DBI_STALE : 0; } txn->mt_dbistate[MAIN_DBI] = DBI_VALID | DBI_USRVALID; + rc = + setup_dbx(&txn->mt_dbxs[MAIN_DBI], &txn->mt_dbs[MAIN_DBI], env->me_psize); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; txn->mt_dbistate[FREE_DBI] = DBI_VALID; txn->mt_front = txn->mt_txnid + ((flags & (MDBX_WRITEMAP | MDBX_RDONLY)) == 0); if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) { - mdbx_warning("%s", "environment had fatal error, must shutdown!"); + WARNING("%s", "environment had fatal error, must shutdown!"); rc = MDBX_PANIC; } else { - const size_t size = - pgno2bytes(env, (txn->mt_flags & MDBX_TXN_RDONLY) ? txn->mt_next_pgno - : txn->mt_end_pgno); - if (unlikely(size > env->me_dxb_mmap.limit)) { + const size_t size_bytes = pgno2bytes(env, txn->mt_end_pgno); + const size_t used_bytes = pgno2bytes(env, txn->mt_next_pgno); + const size_t required_bytes = + (txn->mt_flags & MDBX_TXN_RDONLY) ? used_bytes : size_bytes; + if (unlikely(required_bytes > env->me_dxb_mmap.current)) { + /* Размер БД (для пишущих транзакций) или используемых данных (для + * читающих транзакций) больше предыдущего/текущего размера внутри + * процесса, увеличиваем. Сюда также попадает случай увеличения верхней + * границы размера БД и отображения. В читающих транзакциях нельзя + * изменять размер файла, который может быть больше необходимого этой + * транзакции. */ if (txn->mt_geo.upper > MAX_PAGENO + 1 || bytes2pgno(env, pgno2bytes(env, txn->mt_geo.upper)) != txn->mt_geo.upper) { rc = MDBX_UNABLE_EXTEND_MAPSIZE; goto bailout; } - rc = mdbx_mapresize(env, txn->mt_next_pgno, txn->mt_end_pgno, - txn->mt_geo.upper, - (txn->mt_flags & MDBX_TXN_RDONLY) ? 
true : false); - if (rc != MDBX_SUCCESS) + rc = dxb_resize(env, txn->mt_next_pgno, txn->mt_end_pgno, + txn->mt_geo.upper, implicit_grow); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + } else if (unlikely(size_bytes < env->me_dxb_mmap.current)) { + /* Размер БД меньше предыдущего/текущего размера внутри процесса, можно + * уменьшить, но всё сложнее: + * - размер файла согласован со всеми читаемыми снимками на момент + * коммита последней транзакции; + * - в читающей транзакции размер файла может быть больше и него нельзя + * изменять, в том числе менять madvise (меньша размера файла нельзя, + * а за размером нет смысла). + * - в пишущей транзакции уменьшать размер файла можно только после + * проверки размера читаемых снимков, но в этом нет смысла, так как + * это будет сделано при фиксации транзакции. + * + * В сухом остатке, можно только установить dxb_mmap.current равным + * размеру файла, а это проще сделать без вызова dxb_resize() и усложения + * внутренней логики. + * + * В этой тактике есть недостаток: если пишущите транзакции не регулярны, + * и при завершении такой транзакции файл БД остаётся не-уменьшеным из-за + * читающих транзакций использующих предыдущие снимки. 
*/ +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_AcquireShared(&env->me_remap_guard); +#else + rc = osal_fastmutex_acquire(&env->me_remap_guard); +#endif + if (likely(rc == MDBX_SUCCESS)) { + rc = osal_filesize(env->me_dxb_mmap.fd, &env->me_dxb_mmap.filesize); + if (likely(rc == MDBX_SUCCESS)) { + eASSERT(env, env->me_dxb_mmap.filesize >= required_bytes); + if (env->me_dxb_mmap.current > env->me_dxb_mmap.filesize) + env->me_dxb_mmap.current = (size_t)env->me_dxb_mmap.filesize; + } +#if defined(_WIN32) || defined(_WIN64) + osal_srwlock_ReleaseShared(&env->me_remap_guard); +#else + int err = osal_fastmutex_release(&env->me_remap_guard); + if (unlikely(err) && likely(rc == MDBX_SUCCESS)) + rc = err; +#endif + } + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } + eASSERT(env, + pgno2bytes(env, txn->mt_next_pgno) <= env->me_dxb_mmap.current); + eASSERT(env, env->me_dxb_mmap.limit >= env->me_dxb_mmap.current); if (txn->mt_flags & MDBX_TXN_RDONLY) { #if defined(_WIN32) || defined(_WIN64) - if (((size > env->me_dbgeo.lower && env->me_dbgeo.shrink) || + if (((used_bytes > env->me_dbgeo.lower && env->me_dbgeo.shrink) || (mdbx_RunningUnderWine() && /* under Wine acquisition of remap_guard is always required, * since Wine don't support section extending, * i.e. in both cases unmap+map are required. */ - size < env->me_dbgeo.upper && env->me_dbgeo.grow)) && + used_bytes < env->me_dbgeo.upper && env->me_dbgeo.grow)) && /* avoid recursive use SRW */ (txn->mt_flags & MDBX_NOTLS) == 0) { txn->mt_flags |= MDBX_SHRINK_ALLOWED; - mdbx_srwlock_AcquireShared(&env->me_remap_guard); + osal_srwlock_AcquireShared(&env->me_remap_guard); } #endif /* Windows */ } else { - env->me_dxb_mmap.current = size; - env->me_dxb_mmap.filesize = - (env->me_dxb_mmap.filesize < size) ? 
size : env->me_dxb_mmap.filesize; + if (unlikely(txn->mt_dbs[FREE_DBI].md_flags != MDBX_INTEGERKEY)) { + ERROR("unexpected/invalid db-flags 0x%u for GC/FreeDB", + txn->mt_dbs[FREE_DBI].md_flags); + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + + tASSERT(txn, txn == env->me_txn0); + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); + rc = cursor_init(gc, txn, FREE_DBI); + if (rc != MDBX_SUCCESS) + goto bailout; } #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, txn); + txn_valgrind(env, txn); #endif txn->mt_owner = tid; return MDBX_SUCCESS; } bailout: - mdbx_tassert(txn, rc != MDBX_SUCCESS); - mdbx_txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); + tASSERT(txn, rc != MDBX_SUCCESS); + txn_end(txn, MDBX_END_SLOT | MDBX_END_FAIL_BEGIN); return rc; } @@ -8127,13 +9113,13 @@ static __always_inline int check_txn(const MDBX_txn *txn, int bad_bits) { if (unlikely(txn->mt_flags & bad_bits)) return MDBX_BAD_TXN; - mdbx_tassert(txn, (txn->mt_flags & MDBX_NOTLS) == - ((txn->mt_flags & MDBX_TXN_RDONLY) - ? txn->mt_env->me_flags & MDBX_NOTLS - : 0)); + tASSERT(txn, (txn->mt_flags & MDBX_NOTLS) == + ((txn->mt_flags & MDBX_TXN_RDONLY) + ? txn->mt_env->me_flags & MDBX_NOTLS + : 0)); #if MDBX_TXN_CHECKOWNER STATIC_ASSERT(MDBX_NOTLS > MDBX_TXN_FINISHED + MDBX_TXN_RDONLY); - if (unlikely(txn->mt_owner != mdbx_thread_self()) && + if (unlikely(txn->mt_owner != osal_thread_self()) && (txn->mt_flags & (MDBX_NOTLS | MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) < (MDBX_TXN_FINISHED | MDBX_TXN_RDONLY)) return txn->mt_owner ? 
MDBX_THREAD_MISMATCH : MDBX_BAD_TXN; @@ -8150,7 +9136,7 @@ static __always_inline int check_txn_rw(const MDBX_txn *txn, int bad_bits) { if (unlikely(err)) return err; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) return MDBX_EACCESS; return MDBX_SUCCESS; @@ -8173,14 +9159,14 @@ int mdbx_txn_renew(MDBX_txn *txn) { return rc; } - rc = mdbx_txn_renew0(txn, MDBX_TXN_RDONLY); + rc = txn_renew(txn, MDBX_TXN_RDONLY); if (rc == MDBX_SUCCESS) { - txn->mt_owner = mdbx_thread_self(); - mdbx_debug("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + txn->mt_owner = osal_thread_self(); + DEBUG("renew txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', + (void *)txn, (void *)txn->mt_env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; } @@ -8207,9 +9193,6 @@ void *mdbx_txn_get_userctx(const MDBX_txn *txn) { int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, MDBX_txn **ret, void *context) { - MDBX_txn *txn; - unsigned size, tsize; - if (unlikely(!ret)) return MDBX_EINVAL; *ret = NULL; @@ -8228,6 +9211,7 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, flags |= env->me_flags & MDBX_WRITEMAP; + MDBX_txn *txn = nullptr; if (parent) { /* Nested transactions: Max 1 child, write txns only, no writemap */ rc = check_txn_rw(parent, @@ -8237,63 +9221,69 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (env->me_options.spill_parent4child_denominator) { /* Spill dirty-pages of parent to provide dirtyroom for child txn */ - rc = mdbx_txn_spill(parent, nullptr, - parent->tw.dirtylist->length / - 
env->me_options.spill_parent4child_denominator); + rc = txn_spill(parent, nullptr, + parent->tw.dirtylist->length / + env->me_options.spill_parent4child_denominator); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + tASSERT(parent, audit_ex(parent, 0, false) == 0); flags |= parent->mt_flags & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_SPILLS); } else if (flags & MDBX_TXN_RDONLY) { if (env->me_txn0 && - unlikely(env->me_txn0->mt_owner == mdbx_thread_self()) && - (mdbx_runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) + unlikely(env->me_txn0->mt_owner == osal_thread_self()) && + (runtime_flags & MDBX_DBG_LEGACY_OVERLAP) == 0) return MDBX_TXN_OVERLAPPING; } else { /* Reuse preallocated write txn. However, do not touch it until - * mdbx_txn_renew0() succeeds, since it currently may be active. */ + * txn_renew() succeeds, since it currently may be active. */ txn = env->me_txn0; goto renew; } - size = env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); - size += tsize = sizeof(MDBX_txn); - if (unlikely((txn = mdbx_malloc(size)) == NULL)) { - mdbx_debug("calloc: %s", "failed"); + const size_t base = (flags & MDBX_TXN_RDONLY) + ? sizeof(MDBX_txn) - sizeof(txn->tw) + sizeof(txn->to) + : sizeof(MDBX_txn); + const size_t size = + base + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + 1); + txn = osal_malloc(size); + if (unlikely(txn == nullptr)) { + DEBUG("calloc: %s", "failed"); return MDBX_ENOMEM; } #if MDBX_DEBUG memset(txn, 0xCD, size); VALGRIND_MAKE_MEM_UNDEFINED(txn, size); #endif /* MDBX_DEBUG */ - memset(txn, 0, tsize); + MDBX_ANALYSIS_ASSUME(size > base); + memset(txn, 0, + (MDBX_GOOFY_MSVC_STATIC_ANALYZER && base > size) ? 
size : base); + txn->mt_dbs = ptr_disp(txn, base); + txn->mt_cursors = ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); + txn->mt_dbistate = ptr_disp(txn, size - env->me_maxdbs); txn->mt_dbxs = env->me_dbxs; /* static */ - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); - txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbistate = (uint8_t *)txn + size - env->me_maxdbs; txn->mt_flags = flags; txn->mt_env = env; if (parent) { - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); txn->mt_dbiseqs = parent->mt_dbiseqs; txn->mt_geo = parent->mt_geo; - rc = mdbx_dpl_alloc(txn); + rc = dpl_alloc(txn); if (likely(rc == MDBX_SUCCESS)) { - const unsigned len = - MDBX_PNL_SIZE(parent->tw.reclaimed_pglist) + parent->tw.loose_count; - txn->tw.reclaimed_pglist = - mdbx_pnl_alloc((len > MDBX_PNL_INITIAL) ? len : MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.reclaimed_pglist)) + const size_t len = + MDBX_PNL_GETSIZE(parent->tw.relist) + parent->tw.loose_count; + txn->tw.relist = + pnl_alloc((len > MDBX_PNL_INITIAL) ? 
len : MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.relist)) rc = MDBX_ENOMEM; } if (unlikely(rc != MDBX_SUCCESS)) { nested_failed: - mdbx_pnl_free(txn->tw.reclaimed_pglist); - mdbx_dpl_free(txn); - mdbx_free(txn); + pnl_free(txn->tw.relist); + dpl_free(txn); + osal_free(txn); return rc; } @@ -8301,51 +9291,50 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, if (parent->tw.loose_count) { do { MDBX_page *lp = parent->tw.loose_pages; - const unsigned di = mdbx_dpl_exist(parent, lp->mp_pgno); - mdbx_tassert(parent, di && parent->tw.dirtylist->items[di].ptr == lp); - mdbx_tassert(parent, lp->mp_flags == P_LOOSE); - rc = - mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, lp->mp_pgno, 1); + tASSERT(parent, lp->mp_flags == P_LOOSE); + rc = pnl_insert_range(&parent->tw.relist, lp->mp_pgno, 1); if (unlikely(rc != MDBX_SUCCESS)) goto nested_failed; - parent->tw.loose_pages = lp->mp_next; + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + parent->tw.loose_pages = mp_next(lp); /* Remove from dirty list */ - mdbx_page_wash(parent, di, lp, 1); + page_wash(parent, dpl_exist(parent, lp->mp_pgno), lp, 1); } while (parent->tw.loose_pages); parent->tw.loose_count = 0; #if MDBX_ENABLE_REFUND parent->tw.loose_refund_wl = 0; #endif /* MDBX_ENABLE_REFUND */ - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); } txn->tw.dirtyroom = parent->tw.dirtyroom; txn->tw.dirtylru = parent->tw.dirtylru; - mdbx_dpl_sort(parent); - if (parent->tw.spill_pages) - mdbx_spill_purge(parent); + dpl_sort(parent); + if (parent->tw.spilled.list) + spill_purge(parent); - mdbx_tassert(txn, MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) >= - MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)); - memcpy(txn->tw.reclaimed_pglist, parent->tw.reclaimed_pglist, - MDBX_PNL_SIZEOF(parent->tw.reclaimed_pglist)); - mdbx_assert(env, mdbx_pnl_check4assert( - 
txn->tw.reclaimed_pglist, - (txn->mt_next_pgno /* LY: intentional assignment here, - only for assertion */ - = parent->mt_next_pgno) - - MDBX_ENABLE_REFUND)); + tASSERT(txn, MDBX_PNL_ALLOCLEN(txn->tw.relist) >= + MDBX_PNL_GETSIZE(parent->tw.relist)); + memcpy(txn->tw.relist, parent->tw.relist, + MDBX_PNL_SIZEOF(parent->tw.relist)); + eASSERT(env, pnl_check_allocated( + txn->tw.relist, + (txn->mt_next_pgno /* LY: intentional assignment here, + only for assertion */ + = parent->mt_next_pgno) - + MDBX_ENABLE_REFUND)); txn->tw.last_reclaimed = parent->tw.last_reclaimed; if (parent->tw.lifo_reclaimed) { txn->tw.lifo_reclaimed = parent->tw.lifo_reclaimed; parent->tw.lifo_reclaimed = - (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.lifo_reclaimed); + (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.lifo_reclaimed); } txn->tw.retired_pages = parent->tw.retired_pages; parent->tw.retired_pages = - (void *)(intptr_t)MDBX_PNL_SIZE(parent->tw.retired_pages); + (void *)(intptr_t)MDBX_PNL_GETSIZE(parent->tw.retired_pages); txn->mt_txnid = parent->mt_txnid; txn->mt_front = parent->mt_front + 1; @@ -8359,55 +9348,56 @@ int mdbx_txn_begin_ex(MDBX_env *env, MDBX_txn *parent, MDBX_txn_flags_t flags, txn->mt_numdbs = parent->mt_numdbs; txn->mt_owner = parent->mt_owner; memcpy(txn->mt_dbs, parent->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); + txn->tw.troika = parent->tw.troika; /* Copy parent's mt_dbistate, but clear DB_NEW */ - for (unsigned i = 0; i < txn->mt_numdbs; i++) + for (size_t i = 0; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] = parent->mt_dbistate[i] & ~(DBI_FRESH | DBI_CREAT | DBI_DIRTY); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); env->me_txn = txn; - rc = mdbx_cursor_shadow(parent, txn); - if (mdbx_audit_enabled() && mdbx_assert_enabled()) { + rc = cursor_shadow(parent, txn); + if (AUDIT_ENABLED() && ASSERT_ENABLED()) { txn->mt_signature = MDBX_MT_SIGNATURE; - mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); + tASSERT(txn, audit_ex(txn, 0, false) == 0); } if (unlikely(rc != MDBX_SUCCESS)) - mdbx_txn_end(txn, MDBX_END_FAIL_BEGINCHILD); + txn_end(txn, MDBX_END_FAIL_BEGINCHILD); } else { /* MDBX_TXN_RDONLY */ txn->mt_dbiseqs = env->me_dbiseqs; renew: - rc = mdbx_txn_renew0(txn, flags); + rc = txn_renew(txn, flags); } if (unlikely(rc != MDBX_SUCCESS)) { if (txn != env->me_txn0) - mdbx_free(txn); + osal_free(txn); } else { if (flags & (MDBX_TXN_RDONLY_PREPARE - MDBX_TXN_RDONLY)) - mdbx_assert(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); + eASSERT(env, txn->mt_flags == (MDBX_TXN_RDONLY | MDBX_TXN_FINISHED)); else if (flags & MDBX_TXN_RDONLY) - mdbx_assert(env, (txn->mt_flags & - ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | - /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); + eASSERT(env, (txn->mt_flags & + ~(MDBX_NOTLS | MDBX_TXN_RDONLY | MDBX_WRITEMAP | + /* Win32: SRWL flag */ MDBX_SHRINK_ALLOWED)) == 0); else { - mdbx_assert(env, (txn->mt_flags & ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | - MDBX_NOMETASYNC | MDBX_SAFE_NOSYNC | - MDBX_TXN_SPILLS)) == 0); - assert(!txn->tw.spill_pages && !txn->tw.spill_least_removed); + eASSERT(env, (txn->mt_flags & + ~(MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED | MDBX_NOMETASYNC | + MDBX_SAFE_NOSYNC | MDBX_TXN_SPILLS)) == 0); + 
assert(!txn->tw.spilled.list && !txn->tw.spilled.least_removed); } txn->mt_signature = MDBX_MT_SIGNATURE; txn->mt_userctx = context; *ret = txn; - mdbx_debug("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', - (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + DEBUG("begin txn %" PRIaTXN "%c %p on env %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, + (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); } return rc; @@ -8423,7 +9413,7 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_env *const env = txn->mt_env; #if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { + if (unlikely(env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } @@ -8433,24 +9423,21 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_used = pgno2bytes(env, txn->mt_geo.next); if (txn->mt_flags & MDBX_TXN_RDONLY) { - volatile const MDBX_meta *head_meta; - txnid_t head_txnid; + meta_ptr_t head; uint64_t head_retired; + meta_troika_t troika = meta_tap(env); do { /* fetch info from volatile head */ - head_meta = meta_prefer_last(env); - head_txnid = meta_txnid(env, head_meta); + head = meta_recent(env, &troika); head_retired = - unaligned_peek_u64_volatile(4, head_meta->mm_pages_retired); - info->txn_space_limit_soft = pgno2bytes(env, head_meta->mm_geo.now); - info->txn_space_limit_hard = pgno2bytes(env, head_meta->mm_geo.upper); + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); + info->txn_space_limit_soft = pgno2bytes(env, head.ptr_v->mm_geo.now); + info->txn_space_limit_hard = pgno2bytes(env, head.ptr_v->mm_geo.upper); info->txn_space_leftover = - pgno2bytes(env, head_meta->mm_geo.now - head_meta->mm_geo.next); - mdbx_compiler_barrier(); - } 
while (unlikely(head_meta != meta_prefer_last(env) || - head_txnid != meta_txnid(env, head_meta))); + pgno2bytes(env, head.ptr_v->mm_geo.now - head.ptr_v->mm_geo.next); + } while (unlikely(meta_should_retry(env, &troika))); - info->txn_reader_lag = head_txnid - info->txn_id; + info->txn_reader_lag = head.txnid - info->txn_id; info->txn_space_dirty = info->txn_space_retired = 0; uint64_t reader_snapshot_pages_retired; if (txn->to.reader && @@ -8464,13 +9451,13 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && info->txn_reader_lag > 1 && lck) { /* find next more recent reader */ - txnid_t next_reader = head_txnid; - const unsigned snap_nreaders = + txnid_t next_reader = head.txnid; + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { retry: if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - mdbx_jitter4testing(true); + jitter4testing(true); const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); const uint64_t snap_retired = @@ -8503,23 +9490,25 @@ int mdbx_txn_info(const MDBX_txn *txn, MDBX_txn_info *info, bool scan_rlt) { info->txn_space_limit_soft = pgno2bytes(env, txn->mt_geo.now); info->txn_space_limit_hard = pgno2bytes(env, txn->mt_geo.upper); info->txn_space_retired = pgno2bytes( - env, txn->mt_child ? (unsigned)(uintptr_t)txn->tw.retired_pages - : MDBX_PNL_SIZE(txn->tw.retired_pages)); + env, txn->mt_child ? (size_t)txn->tw.retired_pages + : MDBX_PNL_GETSIZE(txn->tw.retired_pages)); info->txn_space_leftover = pgno2bytes(env, txn->tw.dirtyroom); - info->txn_space_dirty = - pgno2bytes(env, txn->mt_env->me_options.dp_limit - txn->tw.dirtyroom); + info->txn_space_dirty = pgno2bytes( + env, txn->tw.dirtylist ? 
txn->tw.dirtylist->pages_including_loose + : (txn->tw.writemap_dirty_npages + + txn->tw.writemap_spilled_npages)); info->txn_reader_lag = INT64_MAX; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (scan_rlt && lck) { txnid_t oldest_snapshot = txn->mt_txnid; - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); if (snap_nreaders) { - oldest_snapshot = mdbx_find_oldest(txn); + oldest_snapshot = txn_oldest_reader(txn); if (oldest_snapshot == txn->mt_txnid - 1) { /* check if there is at least one reader */ bool exists = false; - for (unsigned i = 0; i < snap_nreaders; ++i) { + for (size_t i = 0; i < snap_nreaders; ++i) { if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_Relaxed) && txn->mt_txnid > safe64_read(&lck->mti_readers[i].mr_txnid)) { exists = true; @@ -8559,37 +9548,45 @@ int mdbx_txn_flags(const MDBX_txn *txn) { } /* Check for misused dbi handles */ -#define TXN_DBI_CHANGED(txn, dbi) \ - ((txn)->mt_dbiseqs[dbi] != (txn)->mt_env->me_dbiseqs[dbi]) +static __inline bool dbi_changed(MDBX_txn *txn, size_t dbi) { + if (txn->mt_dbiseqs == txn->mt_env->me_dbiseqs) + return false; + if (likely( + txn->mt_dbiseqs[dbi].weak == + atomic_load32((MDBX_atomic_uint32_t *)&txn->mt_env->me_dbiseqs[dbi], + mo_AcquireRelease))) + return false; + return true; +} -static __inline unsigned dbi_seq(const MDBX_env *const env, unsigned slot) { - unsigned v = env->me_dbiseqs[slot] + 1; +static __inline unsigned dbi_seq(const MDBX_env *const env, size_t slot) { + unsigned v = env->me_dbiseqs[slot].weak + 1; return v + (v == 0); } static void dbi_import_locked(MDBX_txn *txn) { const MDBX_env *const env = txn->mt_env; - unsigned n = env->me_numdbs; - for (unsigned i = CORE_DBS; i < n; ++i) { + size_t n = env->me_numdbs; + for (size_t i = CORE_DBS; i < n; ++i) { if (i >= txn->mt_numdbs) { txn->mt_cursors[i] = NULL; if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[i] = 0; + txn->mt_dbiseqs[i].weak = 0; 
txn->mt_dbistate[i] = 0; } - if ((TXN_DBI_CHANGED(txn, i) && + if ((dbi_changed(txn, i) && (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0) || ((env->me_dbflags[i] & DB_VALID) && !(txn->mt_dbistate[i] & DBI_VALID))) { - mdbx_tassert(txn, (txn->mt_dbistate[i] & - (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); + tASSERT(txn, + (txn->mt_dbistate[i] & (DBI_CREAT | DBI_DIRTY | DBI_FRESH)) == 0); txn->mt_dbiseqs[i] = env->me_dbiseqs[i]; txn->mt_dbs[i].md_flags = env->me_dbflags[i] & DB_PERSISTENT_FLAGS; txn->mt_dbistate[i] = 0; if (env->me_dbflags[i] & DB_VALID) { txn->mt_dbistate[i] = DBI_VALID | DBI_USRVALID | DBI_STALE; - mdbx_tassert(txn, txn->mt_dbxs[i].md_cmp != NULL); - mdbx_tassert(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); + tASSERT(txn, txn->mt_dbxs[i].md_cmp != NULL); + tASSERT(txn, txn->mt_dbxs[i].md_name.iov_base != NULL); } } } @@ -8600,12 +9597,12 @@ static void dbi_import_locked(MDBX_txn *txn) { else { if ((txn->mt_dbistate[n] & DBI_USRVALID) == 0) { if (txn->mt_dbiseqs != env->me_dbiseqs) - txn->mt_dbiseqs[n] = 0; + txn->mt_dbiseqs[n].weak = 0; txn->mt_dbistate[n] = 0; } ++n; } - txn->mt_numdbs = n; + txn->mt_numdbs = (MDBX_dbi)n; } /* Import DBI which opened after txn started into context */ @@ -8614,43 +9611,47 @@ __cold static bool dbi_import(MDBX_txn *txn, MDBX_dbi dbi) { (dbi >= txn->mt_numdbs && dbi >= txn->mt_env->me_numdbs)) return false; - mdbx_ensure(txn->mt_env, mdbx_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == - MDBX_SUCCESS); + ENSURE(txn->mt_env, + osal_fastmutex_acquire(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); dbi_import_locked(txn); - mdbx_ensure(txn->mt_env, mdbx_fastmutex_release(&txn->mt_env->me_dbi_lock) == - MDBX_SUCCESS); + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); return txn->mt_dbistate[dbi] & DBI_USRVALID; } /* Export or close DBI handles opened in this txn. 
*/ static void dbi_update(MDBX_txn *txn, int keep) { - mdbx_tassert(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); + tASSERT(txn, !txn->mt_parent && txn == txn->mt_env->me_txn0); MDBX_dbi n = txn->mt_numdbs; if (n) { bool locked = false; MDBX_env *const env = txn->mt_env; - for (unsigned i = n; --i >= CORE_DBS;) { + for (size_t i = n; --i >= CORE_DBS;) { if (likely((txn->mt_dbistate[i] & DBI_CREAT) == 0)) continue; if (!locked) { - mdbx_ensure(env, - mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } - if (env->me_numdbs <= i || txn->mt_dbiseqs[i] != env->me_dbiseqs[i]) + if (env->me_numdbs <= i || + txn->mt_dbiseqs[i].weak != env->me_dbiseqs[i].weak) continue /* dbi explicitly closed and/or then re-opened by other txn */; if (keep) { env->me_dbflags[i] = txn->mt_dbs[i].md_flags | DB_VALID; } else { - char *ptr = env->me_dbxs[i].md_name.iov_base; - if (ptr) { + const MDBX_val name = env->me_dbxs[i].md_name; + if (name.iov_base) { + env->me_dbxs[i].md_name.iov_base = nullptr; + eASSERT(env, env->me_dbflags[i] == 0); + atomic_store32(&env->me_dbiseqs[i], dbi_seq(env, i), + mo_AcquireRelease); env->me_dbxs[i].md_name.iov_len = 0; - mdbx_memory_fence(mo_AcquireRelease, true); - mdbx_assert(env, env->me_dbflags[i] == 0); - env->me_dbiseqs[i] = dbi_seq(env, i); - env->me_dbxs[i].md_name.iov_base = NULL; - mdbx_free(ptr); + if (name.iov_len) + osal_free(name.iov_base); + } else { + eASSERT(env, name.iov_len == 0); + eASSERT(env, env->me_dbflags[i] == 0); } } } @@ -8658,8 +9659,7 @@ static void dbi_update(MDBX_txn *txn, int keep) { n = env->me_numdbs; if (n > CORE_DBS && unlikely(!(env->me_dbflags[n - 1] & DB_VALID))) { if (!locked) { - mdbx_ensure(env, - mdbx_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_acquire(&env->me_dbi_lock) == MDBX_SUCCESS); locked = true; } @@ -8670,30 +9670,29 @@ static void dbi_update(MDBX_txn *txn, int 
keep) { } if (unlikely(locked)) - mdbx_ensure(env, - mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } } /* Filter-out pgno list from transaction's dirty-page list */ -static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, - const bool spilled) { - if (MDBX_PNL_SIZE(pl) && txn->tw.dirtylist->length) { - mdbx_tassert( - txn, mdbx_pnl_check4assert(pl, (size_t)txn->mt_next_pgno << spilled)); - MDBX_dpl *dl = mdbx_dpl_sort(txn); +static void dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, const bool spilled) { + tASSERT(txn, (txn->mt_flags & MDBX_TXN_RDONLY) == 0); + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + if (MDBX_PNL_GETSIZE(pl) && txn->tw.dirtylist->length) { + tASSERT(txn, pnl_check_allocated(pl, (size_t)txn->mt_next_pgno << spilled)); + MDBX_dpl *dl = dpl_sort(txn); /* Scanning in ascend order */ - const int step = MDBX_PNL_ASCENDING ? 1 : -1; - const int begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_SIZE(pl); - const int end = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(pl) + 1 : 0; - mdbx_tassert(txn, pl[begin] <= pl[end - step]); + const intptr_t step = MDBX_PNL_ASCENDING ? 1 : -1; + const intptr_t begin = MDBX_PNL_ASCENDING ? 1 : MDBX_PNL_GETSIZE(pl); + const intptr_t end = MDBX_PNL_ASCENDING ? 
MDBX_PNL_GETSIZE(pl) + 1 : 0; + tASSERT(txn, pl[begin] <= pl[end - step]); - unsigned r = mdbx_dpl_search(txn, pl[begin] >> spilled); - mdbx_tassert(txn, dl->sorted == dl->length); - for (int i = begin; r <= dl->length;) { /* scan loop */ + size_t w, r = dpl_search(txn, pl[begin] >> spilled); + tASSERT(txn, dl->sorted == dl->length); + for (intptr_t i = begin; r <= dl->length;) { /* scan loop */ assert(i != end); - mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); pgno_t pl_pgno = pl[i] >> spilled; pgno_t dp_pgno = dl->items[r].pgno; if (likely(dp_pgno != pl_pgno)) { @@ -8706,12 +9705,13 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } /* update loop */ - unsigned w = r; + unsigned npages; + w = r; remove_dl: - if ((txn->mt_env->me_flags & MDBX_WRITEMAP) == 0) { - MDBX_page *dp = dl->items[r].ptr; - mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dl, r)); - } + npages = dpl_npages(dl, r); + dl->pages_including_loose -= npages; + if (!MDBX_AVOID_MSYNC || !(txn->mt_flags & MDBX_WRITEMAP)) + dpage_free(txn->mt_env, dl->items[r].ptr, npages); ++r; next_i: i += step; @@ -8721,7 +9721,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } else { while (r <= dl->length) { assert(i != end); - mdbx_tassert(txn, !spilled || (pl[i] & 1) == 0); + tASSERT(txn, !spilled || (pl[i] & 1) == 0); pl_pgno = pl[i] >> spilled; dp_pgno = dl->items[r].pgno; if (dp_pgno < pl_pgno) @@ -8734,8 +9734,7 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, } dl->sorted = dpl_setlen(dl, w - 1); txn->tw.dirtyroom += r - w; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == (txn->mt_parent ? txn->mt_parent->tw.dirtyroom : txn->mt_env->me_options.dp_limit)); return; @@ -8747,49 +9746,48 @@ static void mdbx_dpl_sift(MDBX_txn *const txn, MDBX_PNL pl, * May be called twice for readonly txns: First reset it, then abort. 
* [in] txn the transaction handle to end * [in] mode why and how to end the transaction */ -static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { +static int txn_end(MDBX_txn *txn, const unsigned mode) { MDBX_env *env = txn->mt_env; static const char *const names[] = MDBX_END_NAMES; #if MDBX_ENV_CHECKPID - if (unlikely(txn->mt_env->me_pid != mdbx_getpid())) { + if (unlikely(txn->mt_env->me_pid != osal_getpid())) { env->me_flags |= MDBX_FATAL_ERROR; return MDBX_PANIC; } #endif /* MDBX_ENV_CHECKPID */ - mdbx_debug("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - names[mode & MDBX_END_OPMASK], txn->mt_txnid, - (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, - (void *)env, txn->mt_dbs[MAIN_DBI].md_root, - txn->mt_dbs[FREE_DBI].md_root); + DEBUG("%s txn %" PRIaTXN "%c %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + names[mode & MDBX_END_OPMASK], txn->mt_txnid, + (txn->mt_flags & MDBX_TXN_RDONLY) ? 'r' : 'w', (void *)txn, (void *)env, + txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); if (!(mode & MDBX_END_EOTDONE)) /* !(already closed cursors) */ - mdbx_cursors_eot(txn, false); + cursors_eot(txn, false); int rc = MDBX_SUCCESS; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) { + if (txn->mt_flags & MDBX_TXN_RDONLY) { if (txn->to.reader) { MDBX_reader *slot = txn->to.reader; - mdbx_assert(env, slot->mr_pid.weak == env->me_pid); - if (likely(!F_ISSET(txn->mt_flags, MDBX_TXN_FINISHED))) { - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); - mdbx_assert(env, txn->mt_txnid == slot->mr_txnid.weak && - slot->mr_txnid.weak >= - env->me_lck->mti_oldest_reader.weak); + eASSERT(env, slot->mr_pid.weak == env->me_pid); + if (likely(!(txn->mt_flags & MDBX_TXN_FINISHED))) { + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); + eASSERT(env, + txn->mt_txnid == slot->mr_txnid.weak && + 
slot->mr_txnid.weak >= env->me_lck->mti_oldest_reader.weak); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif atomic_store32(&slot->mr_snapshot_pages_used, 0, mo_Relaxed); safe64_reset(&slot->mr_txnid, false); atomic_store32(&env->me_lck->mti_readers_refresh_flag, true, mo_Relaxed); } else { - mdbx_assert(env, slot->mr_pid.weak == env->me_pid); - mdbx_assert(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); + eASSERT(env, slot->mr_pid.weak == env->me_pid); + eASSERT(env, slot->mr_txnid.weak >= SAFE64_INVALID_THRESHOLD); } if (mode & MDBX_END_SLOT) { if ((env->me_flags & MDBX_ENV_TXKEY) == 0) @@ -8799,103 +9797,103 @@ static int mdbx_txn_end(MDBX_txn *txn, const unsigned mode) { } #if defined(_WIN32) || defined(_WIN64) if (txn->mt_flags & MDBX_SHRINK_ALLOWED) - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); #endif txn->mt_numdbs = 0; /* prevent further DBI activity */ txn->mt_flags = MDBX_TXN_RDONLY | MDBX_TXN_FINISHED; txn->mt_owner = 0; } else if (!(txn->mt_flags & MDBX_TXN_FINISHED)) { - mdbx_ensure(env, txn->mt_txnid >= - /* paranoia is appropriate here */ env->me_lck - ->mti_oldest_reader.weak); + ENSURE(env, txn->mt_txnid >= + /* paranoia is appropriate here */ env->me_lck + ->mti_oldest_reader.weak); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (txn == env->me_txn0) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif txn->mt_flags = MDBX_TXN_FINISHED; txn->mt_owner = 0; env->me_txn = txn->mt_parent; - mdbx_pnl_free(txn->tw.spill_pages); - txn->tw.spill_pages = nullptr; + pnl_free(txn->tw.spilled.list); + txn->tw.spilled.list = nullptr; if (txn == env->me_txn0) { - mdbx_assert(env, txn->mt_parent == NULL); + eASSERT(env, txn->mt_parent == NULL); /* Export or close DBI handles created in this txn */ dbi_update(txn, mode & MDBX_END_UPDATE); - 
mdbx_pnl_shrink(&txn->tw.retired_pages); - mdbx_pnl_shrink(&txn->tw.reclaimed_pglist); + pnl_shrink(&txn->tw.retired_pages); + pnl_shrink(&txn->tw.relist); if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); + dlist_free(txn); /* The writer mutex was locked in mdbx_txn_begin. */ mdbx_txn_unlock(env); } else { - mdbx_assert(env, txn->mt_parent != NULL); + eASSERT(env, txn->mt_parent != NULL); MDBX_txn *const parent = txn->mt_parent; - mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); - mdbx_assert(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert( - env, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + eASSERT(env, memcmp(&txn->tw.troika, &parent->tw.troika, + sizeof(meta_troika_t)) == 0); if (txn->tw.lifo_reclaimed) { - mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) >= - (unsigned)(uintptr_t)parent->tw.lifo_reclaimed); - MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) = - (unsigned)(uintptr_t)parent->tw.lifo_reclaimed; + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) >= + (uintptr_t)parent->tw.lifo_reclaimed); + MDBX_PNL_SETSIZE(txn->tw.lifo_reclaimed, + (uintptr_t)parent->tw.lifo_reclaimed); parent->tw.lifo_reclaimed = txn->tw.lifo_reclaimed; } if (txn->tw.retired_pages) { - mdbx_assert(env, MDBX_PNL_SIZE(txn->tw.retired_pages) >= - (unsigned)(uintptr_t)parent->tw.retired_pages); - MDBX_PNL_SIZE(txn->tw.retired_pages) = - (unsigned)(uintptr_t)parent->tw.retired_pages; + eASSERT(env, MDBX_PNL_GETSIZE(txn->tw.retired_pages) >= + (uintptr_t)parent->tw.retired_pages); + MDBX_PNL_SETSIZE(txn->tw.retired_pages, + (uintptr_t)parent->tw.retired_pages); parent->tw.retired_pages = txn->tw.retired_pages; } 
parent->mt_child = nullptr; parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; parent->tw.dirtylru = txn->tw.dirtylru; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); - if (!(env->me_flags & MDBX_WRITEMAP)) - mdbx_dlist_free(txn); - mdbx_dpl_free(txn); - mdbx_pnl_free(txn->tw.reclaimed_pglist); + tASSERT(parent, dirtylist_check(parent)); + tASSERT(parent, audit_ex(parent, 0, false) == 0); + dlist_free(txn); + dpl_free(txn); + pnl_free(txn->tw.relist); if (parent->mt_geo.upper != txn->mt_geo.upper || parent->mt_geo.now != txn->mt_geo.now) { /* undo resize performed by child txn */ - rc = mdbx_mapresize_implicit(env, parent->mt_next_pgno, - parent->mt_geo.now, parent->mt_geo.upper); + rc = dxb_resize(env, parent->mt_next_pgno, parent->mt_geo.now, + parent->mt_geo.upper, impilict_shrink); if (rc == MDBX_EPERM) { /* unable undo resize (it is regular for Windows), * therefore promote size changes from child to the parent txn */ - mdbx_warning("unable undo resize performed by child txn, promote to " - "the parent (%u->%u, %u->%u)", - txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, - parent->mt_geo.upper); + WARNING("unable undo resize performed by child txn, promote to " + "the parent (%u->%u, %u->%u)", + txn->mt_geo.now, parent->mt_geo.now, txn->mt_geo.upper, + parent->mt_geo.upper); parent->mt_geo.now = txn->mt_geo.now; parent->mt_geo.upper = txn->mt_geo.upper; parent->mt_flags |= MDBX_TXN_DIRTY; rc = MDBX_SUCCESS; } else if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_error("error %d while undo resize performed by child txn, fail " - "the parent", - rc); + ERROR("error %d while undo resize performed by child txn, fail " + "the parent", + rc); parent->mt_flags |= MDBX_TXN_ERROR; - if (!env->me_dxb_mmap.address) + if (!env->me_dxb_mmap.base) env->me_flags |= MDBX_FATAL_ERROR; } } } } - mdbx_assert(env, txn == env->me_txn0 || txn->mt_owner == 0); + eASSERT(env, txn == env->me_txn0 || txn->mt_owner == 0); if 
((mode & MDBX_END_FREE) != 0 && txn != env->me_txn0) { txn->mt_signature = 0; - mdbx_free(txn); + osal_free(txn); } return rc; @@ -8911,10 +9909,10 @@ int mdbx_txn_reset(MDBX_txn *txn) { return MDBX_EINVAL; /* LY: don't close DBI-handles */ - rc = mdbx_txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); + rc = txn_end(txn, MDBX_END_RESET | MDBX_END_UPDATE); if (rc == MDBX_SUCCESS) { - mdbx_tassert(txn, txn->mt_signature == MDBX_MT_SIGNATURE); - mdbx_tassert(txn, txn->mt_owner == 0); + tASSERT(txn, txn->mt_signature == MDBX_MT_SIGNATURE); + tASSERT(txn, txn->mt_owner == 0); } return rc; } @@ -8937,10 +9935,10 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY)) + if (txn->mt_flags & MDBX_TXN_RDONLY) /* LY: don't close DBI-handles */ - return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | - MDBX_END_FREE); + return txn_end(txn, MDBX_END_ABORT | MDBX_END_UPDATE | MDBX_END_SLOT | + MDBX_END_FREE); if (unlikely(txn->mt_flags & MDBX_TXN_FINISHED)) return MDBX_BAD_TXN; @@ -8948,34 +9946,33 @@ int mdbx_txn_abort(MDBX_txn *txn) { if (txn->mt_child) mdbx_txn_abort(txn->mt_child); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - return mdbx_txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); + tASSERT(txn, (txn->mt_flags & MDBX_TXN_ERROR) || dirtylist_check(txn)); + return txn_end(txn, MDBX_END_ABORT | MDBX_END_SLOT | MDBX_END_FREE); } /* Count all the pages in each DB and in the GC and make sure * it matches the actual number of pages being used. 
*/ -__cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, - bool dont_filter_gc) { - pgno_t pending = 0; - if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { - pending = txn->tw.loose_count + MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + - (MDBX_PNL_SIZE(txn->tw.retired_pages) - retired_stored); - } +__cold static int audit_ex(MDBX_txn *txn, size_t retired_stored, + bool dont_filter_gc) { + size_t pending = 0; + if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) + pending = txn->tw.loose_count + MDBX_PNL_GETSIZE(txn->tw.relist) + + (MDBX_PNL_GETSIZE(txn->tw.retired_pages) - retired_stored); MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, txn, FREE_DBI); + int rc = cursor_init(&cx.outer, txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - pgno_t gc = 0; + size_t gc = 0; MDBX_val key, data; - while ((rc = mdbx_cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { + while ((rc = cursor_get(&cx.outer, &key, &data, MDBX_NEXT)) == 0) { if (!dont_filter_gc) { if (unlikely(key.iov_len != sizeof(txnid_t))) return MDBX_CORRUPTED; txnid_t id = unaligned_peek_u64(4, key.iov_base); if (txn->tw.lifo_reclaimed) { - for (unsigned i = 1; i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed); ++i) + for (size_t i = 1; i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed); ++i) if (id == txn->tw.lifo_reclaimed[i]) goto skip; } else if (id <= txn->tw.last_reclaimed) @@ -8985,30 +9982,31 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, gc += *(pgno_t *)data.iov_base; skip:; } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); - for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) + for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) txn->mt_dbistate[i] &= ~DBI_AUDITED; - pgno_t used = NUM_METAS; - for (MDBX_dbi i = FREE_DBI; i <= MAIN_DBI; i++) { + size_t used = NUM_METAS; + for (size_t i = FREE_DBI; i <= MAIN_DBI; i++) { if (!(txn->mt_dbistate[i] & DBI_VALID)) continue; - rc = mdbx_cursor_init(&cx.outer, txn, i); + rc = 
cursor_init(&cx.outer, txn, i); if (unlikely(rc != MDBX_SUCCESS)) return rc; txn->mt_dbistate[i] |= DBI_AUDITED; if (txn->mt_dbs[i].md_root == P_INVALID) continue; - used += txn->mt_dbs[i].md_branch_pages + txn->mt_dbs[i].md_leaf_pages + - txn->mt_dbs[i].md_overflow_pages; + used += (size_t)txn->mt_dbs[i].md_branch_pages + + (size_t)txn->mt_dbs[i].md_leaf_pages + + (size_t)txn->mt_dbs[i].md_overflow_pages; if (i != MAIN_DBI) continue; - rc = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + rc = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (rc == MDBX_SUCCESS) { MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (unsigned j = 0; j < page_numkeys(mp); j++) { + for (size_t j = 0; j < page_numkeys(mp); j++) { MDBX_node *node = page_node(mp, j); if (node_flags(node) == F_SUBDATA) { if (unlikely(node_ds(node) != sizeof(MDBX_db))) @@ -9018,7 +10016,7 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) { for (MDBX_dbi k = txn->mt_numdbs; --k > MAIN_DBI;) { if ((txn->mt_dbistate[k] & DBI_VALID) && - /* txn->mt_dbxs[k].md_name.iov_len > 0 && */ + /* txn->mt_dbxs[k].md_name.iov_base && */ node_ks(node) == txn->mt_dbxs[k].md_name.iov_len && memcmp(node_key(node), txn->mt_dbxs[k].md_name.iov_base, node_ks(node)) == 0) { @@ -9029,33 +10027,35 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, } } } - used += - db->md_branch_pages + db->md_leaf_pages + db->md_overflow_pages; + used += (size_t)db->md_branch_pages + (size_t)db->md_leaf_pages + + (size_t)db->md_overflow_pages; } } - rc = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + rc = cursor_sibling(&cx.outer, SIBLING_RIGHT); } - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); } - for (MDBX_dbi i = FREE_DBI; i < txn->mt_numdbs; i++) { + for (size_t i = FREE_DBI; i < txn->mt_numdbs; i++) { if ((txn->mt_dbistate[i] & (DBI_VALID | DBI_AUDITED | DBI_STALE)) != DBI_VALID) continue; for (MDBX_txn *t = 
txn; t; t = t->mt_parent) if (F_ISSET(t->mt_dbistate[i], DBI_DIRTY | DBI_CREAT)) { - used += t->mt_dbs[i].md_branch_pages + t->mt_dbs[i].md_leaf_pages + - t->mt_dbs[i].md_overflow_pages; + used += (size_t)t->mt_dbs[i].md_branch_pages + + (size_t)t->mt_dbs[i].md_leaf_pages + + (size_t)t->mt_dbs[i].md_overflow_pages; txn->mt_dbistate[i] |= DBI_AUDITED; break; } + MDBX_ANALYSIS_ASSUME(txn != nullptr); if (!(txn->mt_dbistate[i] & DBI_AUDITED)) { - mdbx_warning("audit %s@%" PRIaTXN - ": unable account dbi %d / \"%*s\", state 0x%02x", - txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, - (int)txn->mt_dbxs[i].md_name.iov_len, - (const char *)txn->mt_dbxs[i].md_name.iov_base, - txn->mt_dbistate[i]); + WARNING("audit %s@%" PRIaTXN + ": unable account dbi %zd / \"%*s\", state 0x%02x", + txn->mt_parent ? "nested-" : "", txn->mt_txnid, i, + (int)txn->mt_dbxs[i].md_name.iov_len, + (const char *)txn->mt_dbxs[i].md_name.iov_base, + txn->mt_dbistate[i]); } } @@ -9063,91 +10063,167 @@ __cold static int mdbx_audit_ex(MDBX_txn *txn, unsigned retired_stored, return MDBX_SUCCESS; if ((txn->mt_flags & MDBX_TXN_RDONLY) == 0) - mdbx_error("audit @%" PRIaTXN ": %u(pending) = %u(loose) + " - "%u(reclaimed) + %u(retired-pending) - %u(retired-stored)", - txn->mt_txnid, pending, txn->tw.loose_count, - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.retired_pages ? MDBX_PNL_SIZE(txn->tw.retired_pages) : 0, - retired_stored); - mdbx_error("audit @%" PRIaTXN ": %" PRIaPGNO "(pending) + %" PRIaPGNO - "(gc) + %" PRIaPGNO "(count) = %" PRIaPGNO "(total) <> %" PRIaPGNO - "(allocated)", - txn->mt_txnid, pending, gc, used, pending + gc + used, - txn->mt_next_pgno); + ERROR("audit @%" PRIaTXN ": %zu(pending) = %zu(loose) + " + "%zu(reclaimed) + %zu(retired-pending) - %zu(retired-stored)", + txn->mt_txnid, pending, txn->tw.loose_count, + MDBX_PNL_GETSIZE(txn->tw.relist), + txn->tw.retired_pages ? 
MDBX_PNL_GETSIZE(txn->tw.retired_pages) : 0, + retired_stored); + ERROR("audit @%" PRIaTXN ": %zu(pending) + %zu" + "(gc) + %zu(count) = %zu(total) <> %zu" + "(allocated)", + txn->mt_txnid, pending, gc, used, pending + gc + used, + (size_t)txn->mt_next_pgno); return MDBX_PROBLEM; } -static __always_inline unsigned backlog_size(MDBX_txn *txn) { - return MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) + txn->tw.loose_count; +typedef struct gc_update_context { + size_t retired_stored, loop; + size_t settled, cleaned_slot, reused_slot, filled_slot; + txnid_t cleaned_id, rid; + bool lifo, dense; +#if MDBX_ENABLE_BIGFOOT + txnid_t bigfoot; +#endif /* MDBX_ENABLE_BIGFOOT */ + MDBX_cursor cursor; +} gcu_context_t; + +static __inline int gcu_context_init(MDBX_txn *txn, gcu_context_t *ctx) { + memset(ctx, 0, offsetof(gcu_context_t, cursor)); + ctx->lifo = (txn->mt_env->me_flags & MDBX_LIFORECLAIM) != 0; +#if MDBX_ENABLE_BIGFOOT + ctx->bigfoot = txn->mt_txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + return cursor_init(&ctx->cursor, txn, FREE_DBI); } -/* LY: Prepare a backlog of pages to modify GC itself, - * while reclaiming is prohibited. It should be enough to prevent search - * in mdbx_page_alloc() during a deleting, when GC tree is unbalanced. */ -static int mdbx_prep_backlog(MDBX_txn *txn, MDBX_cursor *gc_cursor, - const size_t pnl_bytes, unsigned *retired_stored) { - const unsigned linear4list = number_of_ovpages(txn->mt_env, pnl_bytes); - const unsigned backlog4cow = txn->mt_dbs[FREE_DBI].md_depth; - const unsigned backlog4rebalance = backlog4cow + 1; +static __always_inline size_t gcu_backlog_size(MDBX_txn *txn) { + return MDBX_PNL_GETSIZE(txn->tw.relist) + txn->tw.loose_count; +} - if (likely(linear4list == 1 && - backlog_size(txn) > (pnl_bytes - ? 
backlog4rebalance - : (backlog4cow + backlog4rebalance)))) - return MDBX_SUCCESS; - - mdbx_trace(">> pnl_bytes %zu, backlog %u, 4list %u, 4cow %u, 4rebalance %u", - pnl_bytes, backlog_size(txn), linear4list, backlog4cow, - backlog4rebalance); - - MDBX_val gc_key, fake_val; - int err; - if (unlikely(linear4list > 2)) { - gc_key.iov_base = fake_val.iov_base = nullptr; - gc_key.iov_len = sizeof(txnid_t); - fake_val.iov_len = pnl_bytes; - err = mdbx_cursor_spill(gc_cursor, &gc_key, &fake_val); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - - gc_cursor->mc_flags &= ~C_RECLAIMING; - err = mdbx_cursor_touch(gc_cursor); - mdbx_trace("== after-touch, backlog %u, err %d", backlog_size(txn), err); - - if (unlikely(linear4list > 1) && err == MDBX_SUCCESS) { - if (retired_stored) { - gc_key.iov_base = &txn->mt_txnid; - gc_key.iov_len = sizeof(txn->mt_txnid); - const struct cursor_set_result csr = - mdbx_cursor_set(gc_cursor, &gc_key, &fake_val, MDBX_SET); +static int gcu_clean_stored_retired(MDBX_txn *txn, gcu_context_t *ctx) { + int err = MDBX_SUCCESS; + if (ctx->retired_stored) { + MDBX_cursor *const gc = ptr_disp(txn, sizeof(MDBX_txn)); + tASSERT(txn, txn == txn->mt_env->me_txn0 && gc->mc_next == nullptr); + gc->mc_txn = txn; + gc->mc_flags = 0; + gc->mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = gc; + do { + MDBX_val key, val; +#if MDBX_ENABLE_BIGFOOT + key.iov_base = &ctx->bigfoot; +#else + key.iov_base = &txn->mt_txnid; +#endif /* MDBX_ENABLE_BIGFOOT */ + key.iov_len = sizeof(txnid_t); + const struct cursor_set_result csr = cursor_set(gc, &key, &val, MDBX_SET); if (csr.err == MDBX_SUCCESS && csr.exact) { - *retired_stored = 0; - err = mdbx_cursor_del(gc_cursor, 0); - mdbx_trace("== clear-4linear, backlog %u, err %d", backlog_size(txn), - err); + ctx->retired_stored = 0; + err = cursor_del(gc, 0); + TRACE("== clear-4linear, backlog %zu, err %d", gcu_backlog_size(txn), + err); } } - err = - mdbx_page_alloc(gc_cursor, linear4list, MDBX_ALLOC_GC 
| MDBX_ALLOC_FAKE) - .err; - mdbx_trace("== after-4linear, backlog %u, err %d", backlog_size(txn), err); - mdbx_cassert(gc_cursor, - backlog_size(txn) >= linear4list || err != MDBX_SUCCESS); +#if MDBX_ENABLE_BIGFOOT + while (!err && --ctx->bigfoot >= txn->mt_txnid); +#else + while (0); +#endif /* MDBX_ENABLE_BIGFOOT */ + txn->mt_cursors[FREE_DBI] = gc->mc_next; + gc->mc_next = nullptr; + } + return err; +} + +static int gcu_touch(gcu_context_t *ctx) { + MDBX_val key, val; + key.iov_base = val.iov_base = nullptr; + key.iov_len = sizeof(txnid_t); + val.iov_len = MDBX_PNL_SIZEOF(ctx->cursor.mc_txn->tw.retired_pages); + ctx->cursor.mc_flags |= C_GCU; + int err = cursor_touch(&ctx->cursor, &key, &val); + ctx->cursor.mc_flags -= C_GCU; + return err; +} + +/* Prepare a backlog of pages to modify GC itself, while reclaiming is + * prohibited. It should be enough to prevent search in page_alloc_slowpath() + * during a deleting, when GC tree is unbalanced. */ +static int gcu_prepare_backlog(MDBX_txn *txn, gcu_context_t *ctx) { + const size_t for_cow = txn->mt_dbs[FREE_DBI].md_depth; + const size_t for_rebalance = for_cow + 1 + + (txn->mt_dbs[FREE_DBI].md_depth + 1ul >= + txn->mt_dbs[FREE_DBI].md_branch_pages); + size_t for_split = ctx->retired_stored == 0; + + const intptr_t retired_left = + MDBX_PNL_SIZEOF(txn->tw.retired_pages) - ctx->retired_stored; + size_t for_relist = 0; + if (MDBX_ENABLE_BIGFOOT && retired_left > 0) { + for_relist = (retired_left + txn->mt_env->me_maxgc_ov1page - 1) / + txn->mt_env->me_maxgc_ov1page; + const size_t per_branch_page = txn->mt_env->me_maxgc_per_branch; + for (size_t entries = for_relist; entries > 1; for_split += entries) + entries = (entries + per_branch_page - 1) / per_branch_page; + } else if (!MDBX_ENABLE_BIGFOOT && retired_left != 0) { + for_relist = + number_of_ovpages(txn->mt_env, MDBX_PNL_SIZEOF(txn->tw.retired_pages)); } - while (backlog_size(txn) < backlog4cow + linear4list && err == MDBX_SUCCESS) - err = 
mdbx_page_alloc(gc_cursor, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE | - MDBX_ALLOC_NOLOG) + const size_t for_tree_before_touch = for_cow + for_rebalance + for_split; + const size_t for_tree_after_touch = for_rebalance + for_split; + const size_t for_all_before_touch = for_relist + for_tree_before_touch; + const size_t for_all_after_touch = for_relist + for_tree_after_touch; + + if (likely(for_relist < 2 && gcu_backlog_size(txn) > for_all_before_touch)) + return MDBX_SUCCESS; + + TRACE(">> retired-stored %zu, left %zi, backlog %zu, need %zu (4list %zu, " + "4split %zu, " + "4cow %zu, 4tree %zu)", + ctx->retired_stored, retired_left, gcu_backlog_size(txn), + for_all_before_touch, for_relist, for_split, for_cow, + for_tree_before_touch); + + int err = gcu_touch(ctx); + TRACE("== after-touch, backlog %zu, err %d", gcu_backlog_size(txn), err); + + if (!MDBX_ENABLE_BIGFOOT && unlikely(for_relist > 1) && + MDBX_PNL_GETSIZE(txn->tw.retired_pages) != ctx->retired_stored && + err == MDBX_SUCCESS) { + if (unlikely(ctx->retired_stored)) { + err = gcu_clean_stored_retired(txn, ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (!ctx->retired_stored) + return /* restart by tail-recursion */ gcu_prepare_backlog(txn, ctx); + } + err = page_alloc_slowpath(&ctx->cursor, for_relist, MDBX_ALLOC_RESERVE).err; + TRACE("== after-4linear, backlog %zu, err %d", gcu_backlog_size(txn), err); + cASSERT(&ctx->cursor, + gcu_backlog_size(txn) >= for_relist || err != MDBX_SUCCESS); + } + + while (gcu_backlog_size(txn) < for_all_after_touch && err == MDBX_SUCCESS) + err = page_alloc_slowpath(&ctx->cursor, 0, + MDBX_ALLOC_RESERVE | MDBX_ALLOC_UNIMPORTANT) .err; - gc_cursor->mc_flags |= C_RECLAIMING; - mdbx_trace("<< backlog %u, err %d", backlog_size(txn), err); + TRACE("<< backlog %zu, err %d, gc: height %u, branch %zu, leaf %zu, large " + "%zu, entries %zu", + gcu_backlog_size(txn), err, txn->mt_dbs[FREE_DBI].md_depth, + (size_t)txn->mt_dbs[FREE_DBI].md_branch_pages, + 
(size_t)txn->mt_dbs[FREE_DBI].md_leaf_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_overflow_pages, + (size_t)txn->mt_dbs[FREE_DBI].md_entries); + tASSERT(txn, + err != MDBX_NOTFOUND || (txn->mt_flags & MDBX_TXN_DRAINED_GC) != 0); return (err != MDBX_NOTFOUND) ? err : MDBX_SUCCESS; } -static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { +static __inline void gcu_clean_reserved(MDBX_env *env, MDBX_val pnl) { /* PNL is initially empty, zero out at least the length */ memset(pnl.iov_base, 0, sizeof(pgno_t)); if ((env->me_flags & (MDBX_WRITEMAP | MDBX_NOMEMINIT)) == 0) @@ -9166,164 +10242,144 @@ static __inline void clean_reserved_gc_pnl(MDBX_env *env, MDBX_val pnl) { * "checks and balances") to partially bypass the fundamental design problems * inherited from LMDB. So do not try to understand it completely in order to * avoid your madness. */ -static int mdbx_update_gc(MDBX_txn *txn) { - /* txn->tw.reclaimed_pglist[] can grow and shrink during this call. - * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. - * Page numbers cannot disappear from txn->tw.retired_pages[]. */ +static int update_gc(MDBX_txn *txn, gcu_context_t *ctx) { + TRACE("\n>>> @%" PRIaTXN, txn->mt_txnid); MDBX_env *const env = txn->mt_env; - const bool lifo = (env->me_flags & MDBX_LIFORECLAIM) != 0; - const char *dbg_prefix_mode = lifo ? " lifo" : " fifo"; + const char *const dbg_prefix_mode = ctx->lifo ? 
" lifo" : " fifo"; (void)dbg_prefix_mode; - mdbx_trace("\n>>> @%" PRIaTXN, txn->mt_txnid); + ctx->cursor.mc_next = txn->mt_cursors[FREE_DBI]; + txn->mt_cursors[FREE_DBI] = &ctx->cursor; - unsigned retired_stored = 0, loop = 0; - MDBX_cursor_couple couple; - int rc = mdbx_cursor_init(&couple.outer, txn, FREE_DBI); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout_notracking; - - couple.outer.mc_flags |= C_RECLAIMING; - couple.outer.mc_next = txn->mt_cursors[FREE_DBI]; - txn->mt_cursors[FREE_DBI] = &couple.outer; - bool dense_gc = false; + /* txn->tw.relist[] can grow and shrink during this call. + * txn->tw.last_reclaimed and txn->tw.retired_pages[] can only grow. + * But page numbers cannot disappear from txn->tw.retired_pages[]. */ retry: - ++loop; - mdbx_trace("%s", " >> restart"); - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (unlikely(/* paranoia */ loop > ((MDBX_DEBUG > 0) ? 12 : 42))) { - mdbx_error("too more loops %u, bailout", loop); + if (ctx->loop++) + TRACE("%s", " >> restart"); + int rc = MDBX_SUCCESS; + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); + if (unlikely(/* paranoia */ ctx->loop > ((MDBX_DEBUG > 0) ? 
12 : 42))) { + ERROR("too more loops %zu, bailout", ctx->loop); rc = MDBX_PROBLEM; goto bailout; } - if (unlikely(dense_gc) && retired_stored) { - rc = mdbx_prep_backlog(txn, &couple.outer, - MDBX_PNL_SIZEOF(txn->tw.retired_pages), - &retired_stored); + if (unlikely(ctx->dense)) { + rc = gcu_clean_stored_retired(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - unsigned settled = 0, cleaned_gc_slot = 0, reused_gc_slot = 0, - filled_gc_slot = ~0u; - txnid_t cleaned_gc_id = 0, gc_rid = txn->tw.last_reclaimed; + ctx->settled = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->filled_slot = ~0u; + ctx->cleaned_id = 0; + ctx->rid = txn->tw.last_reclaimed; while (true) { /* Come back here after each Put() in case retired-list changed */ - MDBX_val key, data; - mdbx_trace("%s", " >> continue"); + TRACE("%s", " >> continue"); - if (retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages) && - MDBX_PNL_SIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page) { - rc = mdbx_prep_backlog(txn, &couple.outer, - MDBX_PNL_SIZEOF(txn->tw.retired_pages), - &retired_stored); + if (ctx->retired_stored != MDBX_PNL_GETSIZE(txn->tw.retired_pages) && + (ctx->loop == 1 || ctx->retired_stored > env->me_maxgc_ov1page || + MDBX_PNL_GETSIZE(txn->tw.retired_pages) > env->me_maxgc_ov1page)) { + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (lifo) { - if (cleaned_gc_slot < (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)) { - settled = 0; - cleaned_gc_slot = 0; - reused_gc_slot = 0; - filled_gc_slot = ~0u; + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + MDBX_val key, data; + if (ctx->lifo) { + if (ctx->cleaned_slot < (txn->tw.lifo_reclaimed + ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) + : 0)) { + ctx->settled = 0; + ctx->cleaned_slot = 0; + ctx->reused_slot = 0; + ctx->filled_slot = ~0u; /* LY: cleanup reclaimed records. */ do { - cleaned_gc_id = txn->tw.lifo_reclaimed[++cleaned_gc_slot]; - mdbx_tassert(txn, - cleaned_gc_slot > 0 && - cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - key.iov_base = &cleaned_gc_id; - key.iov_len = sizeof(cleaned_gc_id); - rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_SET); + ctx->cleaned_id = txn->tw.lifo_reclaimed[++ctx->cleaned_slot]; + tASSERT(txn, + ctx->cleaned_slot > 0 && + ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + key.iov_base = &ctx->cleaned_id; + key.iov_len = sizeof(ctx->cleaned_id); + rc = cursor_set(&ctx->cursor, &key, NULL, MDBX_SET).err; if (rc == MDBX_NOTFOUND) continue; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (likely(!dense_gc)) { - rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); + if (likely(!ctx->dense)) { + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, - cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - mdbx_trace("%s: cleanup-reclaimed-id [%u]%" PRIaTXN, dbg_prefix_mode, - cleaned_gc_slot, cleaned_gc_id); - mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); - rc = mdbx_cursor_del(&couple.outer, 0); + tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + TRACE("%s: cleanup-reclaimed-id [%zu]%" PRIaTXN, dbg_prefix_mode, + ctx->cleaned_slot, ctx->cleaned_id); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor); + rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } while (cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); - mdbx_txl_sort(txn->tw.lifo_reclaimed); + } while (ctx->cleaned_slot < MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); + txl_sort(txn->tw.lifo_reclaimed); } } else { - /* If using records from GC which we have not yet deleted, - * now delete them and any we reserved for 
tw.reclaimed_pglist. */ - while (cleaned_gc_id <= txn->tw.last_reclaimed) { - rc = mdbx_cursor_first(&couple.outer, &key, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND) - break; + /* Удаляем оставшиеся вынутые из GC записи. */ + while (ctx->cleaned_id <= txn->tw.last_reclaimed) { + rc = cursor_first(&ctx->cursor, &key, NULL); + if (rc == MDBX_NOTFOUND) + break; + if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - gc_rid = cleaned_gc_id; - settled = 0; - reused_gc_slot = 0; - cleaned_gc_id = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(cleaned_gc_id < MIN_TXNID || cleaned_gc_id > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } - if (cleaned_gc_id > txn->tw.last_reclaimed) + ctx->rid = ctx->cleaned_id; + ctx->settled = 0; + ctx->reused_slot = 0; + ctx->cleaned_id = unaligned_peek_u64(4, key.iov_base); + if (ctx->cleaned_id > txn->tw.last_reclaimed) break; - if (likely(!dense_gc)) { - rc = mdbx_prep_backlog(txn, &couple.outer, 0, nullptr); + if (likely(!ctx->dense)) { + rc = gcu_prepare_backlog(txn, ctx); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, cleaned_gc_id <= txn->tw.last_reclaimed); - mdbx_tassert(txn, cleaned_gc_id < env->me_lck->mti_oldest_reader.weak); - mdbx_trace("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, - cleaned_gc_id); - mdbx_tassert(txn, *txn->mt_cursors == &couple.outer); - rc = mdbx_cursor_del(&couple.outer, 0); + tASSERT(txn, ctx->cleaned_id <= txn->tw.last_reclaimed); + tASSERT(txn, ctx->cleaned_id <= env->me_lck->mti_oldest_reader.weak); + TRACE("%s: cleanup-reclaimed-id %" PRIaTXN, dbg_prefix_mode, + ctx->cleaned_id); + tASSERT(txn, *txn->mt_cursors == &ctx->cursor); + rc = cursor_del(&ctx->cursor, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } - mdbx_tassert(txn, - 
mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } /* return suitable into unallocated space */ - if (mdbx_refund(txn)) { - mdbx_tassert( - txn, mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + if (txn_refund(txn)) { + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9331,69 +10387,90 @@ retry: /* handle loose pages - put ones into the reclaimed- or retired-list */ if (txn->tw.loose_pages) { - /* Return loose page numbers to tw.reclaimed_pglist, + tASSERT(txn, txn->tw.loose_count > 0); + /* Return loose page numbers to tw.relist, * though usually none are left at this point. * The pages themselves remain in dirtylist. */ if (unlikely(!txn->tw.lifo_reclaimed && txn->tw.last_reclaimed < 1)) { - if (txn->tw.loose_count > 0) { - /* Put loose page numbers in tw.retired_pages, - * since unable to return them to tw.reclaimed_pglist. 
*/ - if (unlikely((rc = mdbx_pnl_need(&txn->tw.retired_pages, - txn->tw.loose_count)) != 0)) - goto bailout; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) - mdbx_pnl_xappend(txn->tw.retired_pages, mp->mp_pgno); - mdbx_trace("%s: append %u loose-pages to retired-pages", - dbg_prefix_mode, txn->tw.loose_count); + TRACE("%s: try allocate gc-slot for %zu loose-pages", dbg_prefix_mode, + txn->tw.loose_count); + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; + if (rc == MDBX_SUCCESS) { + TRACE("%s: retry since gc-slot for %zu loose-pages available", + dbg_prefix_mode, txn->tw.loose_count); + continue; } + + /* Put loose page numbers in tw.retired_pages, + * since unable to return them to tw.relist. */ + if (unlikely((rc = pnl_need(&txn->tw.retired_pages, + txn->tw.loose_count)) != 0)) + goto bailout; + for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + pnl_xappend(txn->tw.retired_pages, lp->mp_pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + } + TRACE("%s: append %zu loose-pages to retired-pages", dbg_prefix_mode, + txn->tw.loose_count); } else { /* Room for loose pages + temp PNL with same */ - rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, - 2 * txn->tw.loose_count + 2); + rc = pnl_need(&txn->tw.relist, 2 * txn->tw.loose_count + 2); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - MDBX_PNL loose = txn->tw.reclaimed_pglist + - MDBX_PNL_ALLOCLEN(txn->tw.reclaimed_pglist) - + MDBX_PNL loose = txn->tw.relist + MDBX_PNL_ALLOCLEN(txn->tw.relist) - txn->tw.loose_count - 1; - unsigned count = 0; - for (MDBX_page *mp = txn->tw.loose_pages; mp; mp = mp->mp_next) { - mdbx_tassert(txn, mp->mp_flags == P_LOOSE); - loose[++count] = mp->mp_pgno; + size_t count = 0; + for (MDBX_page *lp = txn->tw.loose_pages; lp; lp = mp_next(lp)) { + tASSERT(txn, lp->mp_flags == P_LOOSE); + loose[++count] = lp->mp_pgno; + 
MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); } - mdbx_tassert(txn, count == txn->tw.loose_count); - MDBX_PNL_SIZE(loose) = count; - mdbx_pnl_sort(loose, txn->mt_next_pgno); - mdbx_pnl_xmerge(txn->tw.reclaimed_pglist, loose); - mdbx_trace("%s: append %u loose-pages to reclaimed-pages", - dbg_prefix_mode, txn->tw.loose_count); + tASSERT(txn, count == txn->tw.loose_count); + MDBX_PNL_SETSIZE(loose, count); + pnl_sort(loose, txn->mt_next_pgno); + pnl_merge(txn->tw.relist, loose); + TRACE("%s: append %zu loose-pages to reclaimed-pages", dbg_prefix_mode, + txn->tw.loose_count); } /* filter-out list of dirty-pages from loose-pages */ MDBX_dpl *const dl = txn->tw.dirtylist; - unsigned w = 0; - for (unsigned r = w; ++r <= dl->length;) { - MDBX_page *dp = dl->items[r].ptr; - mdbx_tassert(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); - mdbx_tassert(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); - if ((dp->mp_flags & P_LOOSE) == 0) { - if (++w != r) - dl->items[w] = dl->items[r]; - } else { - mdbx_tassert(txn, dp->mp_flags == P_LOOSE); - if ((env->me_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(env, dp, 1); + if (dl) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, dl->sorted <= dl->length); + size_t w = 0, sorted_out = 0; + for (size_t r = w; ++r <= dl->length;) { + MDBX_page *dp = dl->items[r].ptr; + tASSERT(txn, dp->mp_flags == P_LOOSE || IS_MODIFIABLE(txn, dp)); + tASSERT(txn, dpl_endpgno(dl, r) <= txn->mt_next_pgno); + if ((dp->mp_flags & P_LOOSE) == 0) { + if (++w != r) + dl->items[w] = dl->items[r]; + } else { + tASSERT(txn, dp->mp_flags == P_LOOSE); + sorted_out += dl->sorted >= r; + if (!MDBX_AVOID_MSYNC || !(env->me_flags & MDBX_WRITEMAP)) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + dpage_free(env, dp, 1); + } + } } + TRACE("%s: filtered-out loose-pages from %zu -> %zu dirty-pages", + dbg_prefix_mode, dl->length, 
w); + tASSERT(txn, txn->tw.loose_count == dl->length - w); + dl->sorted -= sorted_out; + tASSERT(txn, dl->sorted <= w); + dpl_setlen(dl, w); + dl->pages_including_loose -= txn->tw.loose_count; + txn->tw.dirtyroom += txn->tw.loose_count; + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); } - mdbx_trace("%s: filtered-out loose-pages from %u -> %u dirty-pages", - dbg_prefix_mode, dl->length, w); - mdbx_tassert(txn, txn->tw.loose_count == dl->length - w); - dpl_setlen(dl, w); - dl->sorted = 0; - txn->tw.dirtyroom += txn->tw.loose_count; - mdbx_tassert(txn, - txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); txn->tw.loose_pages = NULL; txn->tw.loose_count = 0; #if MDBX_ENABLE_REFUND @@ -9401,52 +10478,127 @@ retry: #endif /* MDBX_ENABLE_REFUND */ } - const unsigned amount = (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); /* handle retired-list - store ones into single gc-record */ - if (retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)) { - if (unlikely(!retired_stored)) { + if (ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + if (unlikely(!ctx->retired_stored)) { /* Make sure last page of GC is touched and on retired-list */ - couple.outer.mc_flags &= ~C_RECLAIMING; - rc = mdbx_page_search(&couple.outer, NULL, - MDBX_PS_LAST | MDBX_PS_MODIFY); - couple.outer.mc_flags |= C_RECLAIMING; + rc = cursor_last(&ctx->cursor, nullptr, nullptr); + if (likely(rc != MDBX_SUCCESS)) + rc = gcu_touch(ctx); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_NOTFOUND) goto bailout; } + +#if MDBX_ENABLE_BIGFOOT + size_t retired_pages_before; + do { + if (ctx->bigfoot > txn->mt_txnid) { + rc = gcu_clean_stored_retired(txn, ctx); + if 
(unlikely(rc != MDBX_SUCCESS)) + goto bailout; + tASSERT(txn, ctx->bigfoot <= txn->mt_txnid); + } + + retired_pages_before = MDBX_PNL_GETSIZE(txn->tw.retired_pages); + rc = gcu_prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", dbg_prefix_mode, + retired_pages_before, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } + + pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + ctx->retired_stored = 0; + ctx->bigfoot = txn->mt_txnid; + do { + if (ctx->retired_stored) { + rc = gcu_prepare_backlog(txn, ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + if (ctx->retired_stored >= + MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + TRACE("%s: retired-list changed (%zu -> %zu), retry", + dbg_prefix_mode, retired_pages_before, + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + break; + } + } + key.iov_len = sizeof(txnid_t); + key.iov_base = &ctx->bigfoot; + const size_t left = + MDBX_PNL_GETSIZE(txn->tw.retired_pages) - ctx->retired_stored; + const size_t chunk = + (left > env->me_maxgc_ov1page && ctx->bigfoot < MAX_TXNID) + ? env->me_maxgc_ov1page + : left; + data.iov_len = (chunk + 1) * sizeof(pgno_t); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + + if (retired_pages_before == MDBX_PNL_GETSIZE(txn->tw.retired_pages)) { + const size_t at = (ctx->lifo == MDBX_PNL_ASCENDING) + ? left - chunk + : ctx->retired_stored; + pgno_t *const begin = txn->tw.retired_pages + at; + /* MDBX_PNL_ASCENDING == false && LIFO == false: + * - the larger pgno is at the beginning of retired list + * and should be placed with the larger txnid. + * MDBX_PNL_ASCENDING == true && LIFO == true: + * - the larger pgno is at the ending of retired list + * and should be placed with the smaller txnid. 
+ */ + const pgno_t save = *begin; + *begin = (pgno_t)chunk; + memcpy(data.iov_base, begin, data.iov_len); + *begin = save; + TRACE("%s: put-retired/bigfoot @ %" PRIaTXN + " (slice #%u) #%zu [%zu..%zu] of %zu", + dbg_prefix_mode, ctx->bigfoot, + (unsigned)(ctx->bigfoot - txn->mt_txnid), chunk, at, + at + chunk, retired_pages_before); + } + ctx->retired_stored += chunk; + } while (ctx->retired_stored < + MDBX_PNL_GETSIZE(txn->tw.retired_pages) && + (++ctx->bigfoot, true)); + } while (retired_pages_before != MDBX_PNL_GETSIZE(txn->tw.retired_pages)); +#else /* Write to last page of GC */ - key.iov_len = sizeof(txn->mt_txnid); + key.iov_len = sizeof(txnid_t); key.iov_base = &txn->mt_txnid; do { + gcu_prepare_backlog(txn, ctx); data.iov_len = MDBX_PNL_SIZEOF(txn->tw.retired_pages); - mdbx_prep_backlog(txn, &couple.outer, data.iov_len, &retired_stored); - rc = mdbx_cursor_put(&couple.outer, &key, &data, MDBX_RESERVE); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; /* Retry if tw.retired_pages[] grew during the Put() */ } while (data.iov_len < MDBX_PNL_SIZEOF(txn->tw.retired_pages)); - retired_stored = (unsigned)MDBX_PNL_SIZE(txn->tw.retired_pages); - mdbx_pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); - mdbx_assert(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); + ctx->retired_stored = MDBX_PNL_GETSIZE(txn->tw.retired_pages); + pnl_sort(txn->tw.retired_pages, txn->mt_next_pgno); + eASSERT(env, data.iov_len == MDBX_PNL_SIZEOF(txn->tw.retired_pages)); memcpy(data.iov_base, txn->tw.retired_pages, data.iov_len); - mdbx_trace("%s: put-retired #%u @ %" PRIaTXN, dbg_prefix_mode, - retired_stored, txn->mt_txnid); - - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) { - unsigned i = retired_stored; - mdbx_debug_extra("PNL write txn %" PRIaTXN " root %" PRIaPGNO - " num %u, PNL", - txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); + TRACE("%s: put-retired #%zu @ %" PRIaTXN, dbg_prefix_mode, + 
ctx->retired_stored, txn->mt_txnid); +#endif /* MDBX_ENABLE_BIGFOOT */ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) { + size_t i = ctx->retired_stored; + DEBUG_EXTRA("txn %" PRIaTXN " root %" PRIaPGNO " num %zu, retired-PNL", + txn->mt_txnid, txn->mt_dbs[FREE_DBI].md_root, i); for (; i; i--) - mdbx_debug_extra_print(" %" PRIaPGNO, txn->tw.retired_pages[i]); - mdbx_debug_extra_print("%s\n", "."); + DEBUG_EXTRA_PRINT(" %" PRIaPGNO, txn->tw.retired_pages[i]); + DEBUG_EXTRA_PRINT("%s\n", "."); } - if (unlikely(amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) && - settled)) { - mdbx_trace("%s: reclaimed-list changed %u -> %u, retry", - dbg_prefix_mode, amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + if (unlikely(amount != MDBX_PNL_GETSIZE(txn->tw.relist) && + ctx->settled)) { + TRACE("%s: reclaimed-list changed %zu -> %zu, retry", dbg_prefix_mode, + amount, MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry /* rare case, but avoids GC fragmentation and one cycle. */ ; @@ -9455,250 +10607,226 @@ retry: } /* handle reclaimed and lost pages - merge and store both into gc */ - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, txn->tw.loose_count == 0); + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, txn->tw.loose_count == 0); - mdbx_trace("%s", " >> reserving"); - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored, false); + TRACE("%s", " >> reserving"); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - const unsigned left = amount - settled; - mdbx_trace("%s: amount %u, settled %d, left %d, lifo-reclaimed-slots %u, " - "reused-gc-slots %u", - dbg_prefix_mode, amount, settled, (int)left, - txn->tw.lifo_reclaimed - ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0, - reused_gc_slot); - if (0 >= (int)left) + const size_t left = amount - ctx->settled; + TRACE("%s: amount %zu, settled %zd, left %zd, lifo-reclaimed-slots %zu, " + "reused-gc-slots %zu", + dbg_prefix_mode, amount, ctx->settled, left, + txn->tw.lifo_reclaimed ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0, + ctx->reused_slot); + if (0 >= (intptr_t)left) break; - const unsigned prefer_max_scatter = 257; + const size_t prefer_max_scatter = MDBX_ENABLE_BIGFOOT ? MDBX_TXL_MAX : 257; txnid_t reservation_gc_id; - if (lifo) { + if (ctx->lifo) { if (txn->tw.lifo_reclaimed == nullptr) { - txn->tw.lifo_reclaimed = mdbx_txl_alloc(); + txn->tw.lifo_reclaimed = txl_alloc(); if (unlikely(!txn->tw.lifo_reclaimed)) { rc = MDBX_ENOMEM; goto bailout; } } - if ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < - prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + if (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * env->me_maxgc_ov1page && - !dense_gc) { - /* LY: need just a txn-id for save page list. */ + !ctx->dense) { + /* Hужен свободный для для сохранения списка страниц. 
*/ bool need_cleanup = false; - txnid_t snap_oldest; + txnid_t snap_oldest = 0; retry_rid: - couple.outer.mc_flags &= ~C_RECLAIMING; do { - snap_oldest = mdbx_find_oldest(txn); - rc = - mdbx_page_alloc(&couple.outer, 0, - MDBX_ALLOC_GC | MDBX_ALLOC_SLOT | MDBX_ALLOC_FAKE) - .err; + rc = page_alloc_slowpath(&ctx->cursor, 0, MDBX_ALLOC_RESERVE).err; + snap_oldest = env->me_lck->mti_oldest_reader.weak; if (likely(rc == MDBX_SUCCESS)) { - mdbx_trace("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, - MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); + TRACE("%s: took @%" PRIaTXN " from GC", dbg_prefix_mode, + MDBX_PNL_LAST(txn->tw.lifo_reclaimed)); need_cleanup = true; } - } while (rc == MDBX_SUCCESS && - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < - prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * - env->me_maxgc_ov1page); - couple.outer.mc_flags |= C_RECLAIMING; + } while ( + rc == MDBX_SUCCESS && + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > + (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot) * + env->me_maxgc_ov1page); if (likely(rc == MDBX_SUCCESS)) { - mdbx_trace("%s: got enough from GC.", dbg_prefix_mode); + TRACE("%s: got enough from GC.", dbg_prefix_mode); continue; } else if (unlikely(rc != MDBX_NOTFOUND)) /* LY: some troubles... 
*/ goto bailout; - if (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { + if (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { if (need_cleanup) { - mdbx_txl_sort(txn->tw.lifo_reclaimed); - cleaned_gc_slot = 0; + txl_sort(txn->tw.lifo_reclaimed); + ctx->cleaned_slot = 0; } - gc_rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); + ctx->rid = MDBX_PNL_LAST(txn->tw.lifo_reclaimed); } else { - mdbx_tassert(txn, txn->tw.last_reclaimed == 0); - if (unlikely(mdbx_find_oldest(txn) != snap_oldest)) - /* should retry mdbx_page_alloc(MDBX_ALLOC_GC) + tASSERT(txn, txn->tw.last_reclaimed == 0); + if (unlikely(txn_oldest_reader(txn) != snap_oldest)) + /* should retry page_alloc_slowpath() * if the oldest reader changes since the last attempt */ goto retry_rid; /* no reclaimable GC entries, * therefore no entries with ID < mdbx_find_oldest(txn) */ - txn->tw.last_reclaimed = gc_rid = snap_oldest - 1; - mdbx_trace("%s: none recycled yet, set rid to @%" PRIaTXN, - dbg_prefix_mode, gc_rid); + txn->tw.last_reclaimed = ctx->rid = snap_oldest; + TRACE("%s: none recycled yet, set rid to @%" PRIaTXN, dbg_prefix_mode, + ctx->rid); } - /* LY: GC is empty, will look any free txn-id in high2low order. */ - while (MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && - left > ((unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot) * + /* В GC нет годных к переработке записей, + * будем использовать свободные id в обратном порядке. 
*/ + while (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) < prefer_max_scatter && + left > (MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - + ctx->reused_slot) * env->me_maxgc_ov1page) { - if (unlikely(gc_rid <= MIN_TXNID)) { - if (unlikely(MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) <= - reused_gc_slot)) { - mdbx_notice("** restart: reserve depleted (reused_gc_slot %u >= " - "lifo_reclaimed %u" PRIaTXN, - reused_gc_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + if (unlikely(ctx->rid <= MIN_TXNID)) { + if (unlikely(MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) <= + ctx->reused_slot)) { + NOTICE("** restart: reserve depleted (reused_gc_slot %zu >= " + "lifo_reclaimed %zu" PRIaTXN, + ctx->reused_slot, + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); goto retry; } break; } - mdbx_tassert(txn, gc_rid >= MIN_TXNID && gc_rid <= MAX_TXNID); - --gc_rid; - key.iov_base = &gc_rid; - key.iov_len = sizeof(gc_rid); - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + tASSERT(txn, ctx->rid >= MIN_TXNID && ctx->rid <= MAX_TXNID); + ctx->rid -= 1; + key.iov_base = &ctx->rid; + key.iov_len = sizeof(ctx->rid); + rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc == MDBX_SUCCESS)) { - mdbx_debug("%s: GC's id %" PRIaTXN - " is used, continue bottom-up search", - dbg_prefix_mode, gc_rid); - ++gc_rid; - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_FIRST); - if (rc == MDBX_NOTFOUND) { - mdbx_debug("%s: GC is empty (going dense-mode)", dbg_prefix_mode); - dense_gc = true; - break; - } + DEBUG("%s: GC's id %" PRIaTXN " is present, going to first", + dbg_prefix_mode, ctx->rid); + rc = cursor_first(&ctx->cursor, &key, nullptr); if (unlikely(rc != MDBX_SUCCESS || key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } + const txnid_t gc_first = 
unaligned_peek_u64(4, key.iov_base); if (gc_first <= MIN_TXNID) { - mdbx_debug("%s: no free GC's id(s) less than %" PRIaTXN - " (going dense-mode)", - dbg_prefix_mode, gc_rid); - dense_gc = true; + DEBUG("%s: no free GC's id(s) less than %" PRIaTXN + " (going dense-mode)", + dbg_prefix_mode, ctx->rid); + ctx->dense = true; break; } - gc_rid = gc_first - 1; + ctx->rid = gc_first - 1; } - mdbx_assert(env, !dense_gc); - rc = mdbx_txl_append(&txn->tw.lifo_reclaimed, gc_rid); + eASSERT(env, !ctx->dense); + rc = txl_append(&txn->tw.lifo_reclaimed, ctx->rid); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - if (reused_gc_slot) + if (ctx->reused_slot) /* rare case, but it is better to clear and re-create GC entries * with less fragmentation. */ need_cleanup = true; else - cleaned_gc_slot += + ctx->cleaned_slot += 1 /* mark cleanup is not needed for added slot. */; - mdbx_trace("%s: append @%" PRIaTXN - " to lifo-reclaimed, cleaned-gc-slot = %u", - dbg_prefix_mode, gc_rid, cleaned_gc_slot); + TRACE("%s: append @%" PRIaTXN + " to lifo-reclaimed, cleaned-gc-slot = %zu", + dbg_prefix_mode, ctx->rid, ctx->cleaned_slot); } - if (need_cleanup || dense_gc) { - if (cleaned_gc_slot) - mdbx_trace( - "%s: restart inner-loop to clear and re-create GC entries", - dbg_prefix_mode); - cleaned_gc_slot = 0; + if (need_cleanup || ctx->dense) { + if (ctx->cleaned_slot) { + TRACE("%s: restart to clear and re-create GC entries", + dbg_prefix_mode); + goto retry; + } continue; } } - const unsigned i = - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot; - mdbx_tassert(txn, i > 0 && i <= MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + const size_t i = + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot; + tASSERT(txn, i > 0 && i <= MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); reservation_gc_id = txn->tw.lifo_reclaimed[i]; - mdbx_trace("%s: take @%" PRIaTXN " from lifo-reclaimed[%u]", - dbg_prefix_mode, reservation_gc_id, i); + TRACE("%s: take @%" PRIaTXN " from 
lifo-reclaimed[%zu]", dbg_prefix_mode, + reservation_gc_id, i); } else { - mdbx_tassert(txn, txn->tw.lifo_reclaimed == NULL); - if (unlikely(gc_rid == 0)) { - gc_rid = mdbx_find_oldest(txn) - 1; - rc = mdbx_cursor_get(&couple.outer, &key, NULL, MDBX_FIRST); - if (rc == MDBX_SUCCESS) { - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(key.iov_len != sizeof(txnid_t))) { + tASSERT(txn, txn->tw.lifo_reclaimed == NULL); + if (unlikely(ctx->rid == 0)) { + ctx->rid = txn_oldest_reader(txn); + rc = cursor_first(&ctx->cursor, &key, nullptr); + if (likely(rc == MDBX_SUCCESS)) { + if (unlikely(key.iov_len != sizeof(txnid_t))) { rc = MDBX_CORRUPTED; goto bailout; } - txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); - if (!MDBX_DISABLE_PAGECHECKS && - unlikely(gc_first < MIN_TXNID || gc_first > MAX_TXNID)) { - rc = MDBX_CORRUPTED; - goto bailout; - } - if (gc_rid >= gc_first) - gc_rid = gc_first - 1; - if (unlikely(gc_rid == 0)) { - mdbx_error("%s", "** no GC tail-space to store (going dense-mode)"); - dense_gc = true; + const txnid_t gc_first = unaligned_peek_u64(4, key.iov_base); + if (ctx->rid >= gc_first) + ctx->rid = gc_first - 1; + if (unlikely(ctx->rid == 0)) { + ERROR("%s", "** no GC tail-space to store (going dense-mode)"); + ctx->dense = true; goto retry; } } else if (rc != MDBX_NOTFOUND) goto bailout; - txn->tw.last_reclaimed = gc_rid; - cleaned_gc_id = gc_rid + 1; + txn->tw.last_reclaimed = ctx->rid; + ctx->cleaned_id = ctx->rid + 1; } - reservation_gc_id = gc_rid--; - mdbx_trace("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, - reservation_gc_id); + reservation_gc_id = ctx->rid--; + TRACE("%s: take @%" PRIaTXN " from head-gc-id", dbg_prefix_mode, + reservation_gc_id); } - ++reused_gc_slot; + ++ctx->reused_slot; - unsigned chunk = left; + size_t chunk = left; if (unlikely(chunk > env->me_maxgc_ov1page)) { - const unsigned avail_gc_slots = + const size_t avail_gc_slots = txn->tw.lifo_reclaimed - ? 
(unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - - reused_gc_slot + 1 - : (gc_rid < INT16_MAX) ? (unsigned)gc_rid - : INT16_MAX; + ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + 1 + : (ctx->rid < INT16_MAX) ? (size_t)ctx->rid + : INT16_MAX; if (avail_gc_slots > 1) { +#if MDBX_ENABLE_BIGFOOT + chunk = (chunk < env->me_maxgc_ov1page * (size_t)2) + ? chunk / 2 + : env->me_maxgc_ov1page; +#else if (chunk < env->me_maxgc_ov1page * 2) chunk /= 2; else { - const unsigned threshold = + const size_t threshold = env->me_maxgc_ov1page * ((avail_gc_slots < prefer_max_scatter) ? avail_gc_slots : prefer_max_scatter); if (left < threshold) chunk = env->me_maxgc_ov1page; else { - const unsigned tail = left - threshold + env->me_maxgc_ov1page + 1; - unsigned span = 1; - unsigned avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / - sizeof(pgno_t)) /* - 1 + span */; + const size_t tail = left - threshold + env->me_maxgc_ov1page + 1; + size_t span = 1; + size_t avail = ((pgno2bytes(env, span) - PAGEHDRSZ) / + sizeof(pgno_t)) /* - 1 + span */; if (tail > avail) { - for (unsigned i = amount - span; i > 0; --i) { - if (MDBX_PNL_ASCENDING - ? (txn->tw.reclaimed_pglist[i] + span) - : (txn->tw.reclaimed_pglist[i] - span) == - txn->tw.reclaimed_pglist[i + span]) { + for (size_t i = amount - span; i > 0; --i) { + if (MDBX_PNL_ASCENDING ? (txn->tw.relist[i] + span) + : (txn->tw.relist[i] - span) == + txn->tw.relist[i + span]) { span += 1; - avail = (unsigned)((pgno2bytes(env, span) - PAGEHDRSZ) / - sizeof(pgno_t)) - - 1 + span; + avail = + ((pgno2bytes(env, span) - PAGEHDRSZ) / sizeof(pgno_t)) - + 1 + span; if (avail >= tail) break; } @@ -9707,29 +10835,30 @@ retry: chunk = (avail >= tail) ? tail - span : (avail_gc_slots > 3 && - reused_gc_slot < prefer_max_scatter - 3) + ctx->reused_slot < prefer_max_scatter - 3) ? 
avail - span : tail; } } +#endif /* MDBX_ENABLE_BIGFOOT */ } } - mdbx_tassert(txn, chunk > 0); + tASSERT(txn, chunk > 0); - mdbx_trace("%s: gc_rid %" PRIaTXN ", reused_gc_slot %u, reservation-id " - "%" PRIaTXN, - dbg_prefix_mode, gc_rid, reused_gc_slot, reservation_gc_id); + TRACE("%s: gc_rid %" PRIaTXN ", reused_gc_slot %zu, reservation-id " + "%" PRIaTXN, + dbg_prefix_mode, ctx->rid, ctx->reused_slot, reservation_gc_id); - mdbx_trace("%s: chunk %u, gc-per-ovpage %u", dbg_prefix_mode, chunk, - env->me_maxgc_ov1page); + TRACE("%s: chunk %zu, gc-per-ovpage %u", dbg_prefix_mode, chunk, + env->me_maxgc_ov1page); - mdbx_tassert(txn, reservation_gc_id < env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, reservation_gc_id <= env->me_lck->mti_oldest_reader.weak); if (unlikely( reservation_gc_id < MIN_TXNID || - reservation_gc_id >= + reservation_gc_id > atomic_load64(&env->me_lck->mti_oldest_reader, mo_Relaxed))) { - mdbx_error("** internal error (reservation_gc_id %" PRIaTXN ")", - reservation_gc_id); + ERROR("** internal error (reservation_gc_id %" PRIaTXN ")", + reservation_gc_id); rc = MDBX_PROBLEM; goto bailout; } @@ -9737,164 +10866,155 @@ retry: key.iov_len = sizeof(reservation_gc_id); key.iov_base = &reservation_gc_id; data.iov_len = (chunk + 1) * sizeof(pgno_t); - mdbx_trace("%s: reserve %u [%u...%u) @%" PRIaTXN, dbg_prefix_mode, chunk, - settled + 1, settled + chunk + 1, reservation_gc_id); - mdbx_prep_backlog(txn, &couple.outer, data.iov_len, nullptr); - rc = mdbx_cursor_put(&couple.outer, &key, &data, - MDBX_RESERVE | MDBX_NOOVERWRITE); - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + TRACE("%s: reserve %zu [%zu...%zu) @%" PRIaTXN, dbg_prefix_mode, chunk, + ctx->settled + 1, ctx->settled + chunk + 1, reservation_gc_id); + gcu_prepare_backlog(txn, ctx); + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, + MDBX_RESERVE | MDBX_NOOVERWRITE); + tASSERT(txn, 
pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - clean_reserved_gc_pnl(env, data); - settled += chunk; - mdbx_trace("%s: settled %u (+%u), continue", dbg_prefix_mode, settled, - chunk); + gcu_clean_reserved(env, data); + ctx->settled += chunk; + TRACE("%s: settled %zu (+%zu), continue", dbg_prefix_mode, ctx->settled, + chunk); if (txn->tw.lifo_reclaimed && - unlikely(amount < MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) && - (loop < 5 || MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) - amount > - env->me_maxgc_ov1page)) { - mdbx_notice("** restart: reclaimed-list growth %u -> %u", amount, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + unlikely(amount < MDBX_PNL_GETSIZE(txn->tw.relist)) && + (ctx->loop < 5 || + MDBX_PNL_GETSIZE(txn->tw.relist) - amount > env->me_maxgc_ov1page)) { + NOTICE("** restart: reclaimed-list growth %zu -> %zu", amount, + MDBX_PNL_GETSIZE(txn->tw.relist)); goto retry; } continue; } - mdbx_tassert( - txn, - cleaned_gc_slot == - (txn->tw.lifo_reclaimed ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) : 0)); + tASSERT(txn, + ctx->cleaned_slot == (txn->tw.lifo_reclaimed + ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) + : 0)); - mdbx_trace("%s", " >> filling"); + TRACE("%s", " >> filling"); /* Fill in the reserved records */ - filled_gc_slot = + ctx->filled_slot = txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - reused_gc_slot - : reused_gc_slot; + ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) - ctx->reused_slot + : ctx->reused_slot; rc = MDBX_SUCCESS; - mdbx_tassert(txn, - mdbx_pnl_check4assert(txn->tw.reclaimed_pglist, - txn->mt_next_pgno - MDBX_ENABLE_REFUND)); - mdbx_tassert(txn, mdbx_dirtylist_check(txn)); - if (MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)) { + tASSERT(txn, pnl_check_allocated(txn->tw.relist, + txn->mt_next_pgno - MDBX_ENABLE_REFUND)); + tASSERT(txn, dirtylist_check(txn)); + if (MDBX_PNL_GETSIZE(txn->tw.relist)) { MDBX_val key, data; key.iov_len = data.iov_len = 0; /* avoid MSVC warning */ key.iov_base = data.iov_base = NULL; - const unsigned amount = MDBX_PNL_SIZE(txn->tw.reclaimed_pglist); - unsigned left = amount; + const size_t amount = MDBX_PNL_GETSIZE(txn->tw.relist); + size_t left = amount; if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); - rc = mdbx_cursor_first(&couple.outer, &key, &data); + tASSERT(txn, ctx->lifo == 0); + rc = cursor_first(&ctx->cursor, &key, &data); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } else { - mdbx_tassert(txn, lifo != 0); + tASSERT(txn, ctx->lifo != 0); } while (true) { txnid_t fill_gc_id; - mdbx_trace("%s: left %u of %u", dbg_prefix_mode, left, - (unsigned)MDBX_PNL_SIZE(txn->tw.reclaimed_pglist)); + TRACE("%s: left %zu of %zu", dbg_prefix_mode, left, + MDBX_PNL_GETSIZE(txn->tw.relist)); if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); + tASSERT(txn, ctx->lifo == 0); fill_gc_id = unaligned_peek_u64(4, key.iov_base); - if (filled_gc_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { - mdbx_notice( - "** restart: reserve depleted (filled_slot %u, fill_id %" PRIaTXN + if (ctx->filled_slot-- == 0 || fill_gc_id > txn->tw.last_reclaimed) { + NOTICE( + "** restart: reserve depleted (filled_slot %zu, fill_id %" PRIaTXN " > last_reclaimed %" PRIaTXN, - filled_gc_slot, fill_gc_id, txn->tw.last_reclaimed); + ctx->filled_slot, fill_gc_id, txn->tw.last_reclaimed); goto retry; } } else { - mdbx_tassert(txn, lifo != 0); 
- if (++filled_gc_slot > - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)) { - mdbx_notice("** restart: reserve depleted (filled_gc_slot %u > " - "lifo_reclaimed %u" PRIaTXN, - filled_gc_slot, - (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + tASSERT(txn, ctx->lifo != 0); + if (++ctx->filled_slot > MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)) { + NOTICE("** restart: reserve depleted (filled_gc_slot %zu > " + "lifo_reclaimed %zu" PRIaTXN, + ctx->filled_slot, MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); goto retry; } - fill_gc_id = txn->tw.lifo_reclaimed[filled_gc_slot]; - mdbx_trace("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%u]", - dbg_prefix_mode, fill_gc_id, filled_gc_slot); + fill_gc_id = txn->tw.lifo_reclaimed[ctx->filled_slot]; + TRACE("%s: seek-reservation @%" PRIaTXN " at lifo_reclaimed[%zu]", + dbg_prefix_mode, fill_gc_id, ctx->filled_slot); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_SET_KEY); + rc = cursor_set(&ctx->cursor, &key, &data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - mdbx_tassert(txn, cleaned_gc_slot == - (txn->tw.lifo_reclaimed - ? MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : 0)); - mdbx_tassert(txn, fill_gc_id > 0 && - fill_gc_id < env->me_lck->mti_oldest_reader.weak); + tASSERT(txn, ctx->cleaned_slot == + (txn->tw.lifo_reclaimed + ? 
MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) + : 0)); + tASSERT(txn, fill_gc_id > 0 && + fill_gc_id <= env->me_lck->mti_oldest_reader.weak); key.iov_base = &fill_gc_id; key.iov_len = sizeof(fill_gc_id); - mdbx_tassert(txn, data.iov_len >= sizeof(pgno_t) * 2); - couple.outer.mc_flags |= C_GCFREEZE; - unsigned chunk = (unsigned)(data.iov_len / sizeof(pgno_t)) - 1; + tASSERT(txn, data.iov_len >= sizeof(pgno_t) * 2); + size_t chunk = data.iov_len / sizeof(pgno_t) - 1; if (unlikely(chunk > left)) { - mdbx_trace("%s: chunk %u > left %u, @%" PRIaTXN, dbg_prefix_mode, chunk, - left, fill_gc_id); - if ((loop < 5 && chunk - left > loop / 2) || + TRACE("%s: chunk %zu > left %zu, @%" PRIaTXN, dbg_prefix_mode, chunk, + left, fill_gc_id); + if ((ctx->loop < 5 && chunk - left > ctx->loop / 2) || chunk - left > env->me_maxgc_ov1page) { data.iov_len = (left + 1) * sizeof(pgno_t); - if (loop < 7) - couple.outer.mc_flags &= ~C_GCFREEZE; } chunk = left; } - rc = mdbx_cursor_put(&couple.outer, &key, &data, - MDBX_CURRENT | MDBX_RESERVE); - couple.outer.mc_flags &= ~C_GCFREEZE; + rc = cursor_put_nochecklen(&ctx->cursor, &key, &data, + MDBX_CURRENT | MDBX_RESERVE); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - clean_reserved_gc_pnl(env, data); + gcu_clean_reserved(env, data); if (unlikely(txn->tw.loose_count || - amount != MDBX_PNL_SIZE(txn->tw.reclaimed_pglist))) { - mdbx_notice("** restart: reclaimed-list growth (%u -> %u, loose +%u)", - amount, MDBX_PNL_SIZE(txn->tw.reclaimed_pglist), - txn->tw.loose_count); + amount != MDBX_PNL_GETSIZE(txn->tw.relist))) { + NOTICE("** restart: reclaimed-list growth (%zu -> %zu, loose +%zu)", + amount, MDBX_PNL_GETSIZE(txn->tw.relist), txn->tw.loose_count); goto retry; } if (unlikely(txn->tw.lifo_reclaimed - ? cleaned_gc_slot < MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) - : cleaned_gc_id < txn->tw.last_reclaimed)) { - mdbx_notice("%s", "** restart: reclaimed-slots changed"); + ? 
ctx->cleaned_slot < + MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) + : ctx->cleaned_id < txn->tw.last_reclaimed)) { + NOTICE("%s", "** restart: reclaimed-slots changed"); goto retry; } - if (unlikely(retired_stored != MDBX_PNL_SIZE(txn->tw.retired_pages))) { - mdbx_tassert(txn, - retired_stored < MDBX_PNL_SIZE(txn->tw.retired_pages)); - mdbx_notice("** restart: retired-list growth (%u -> %u)", - retired_stored, MDBX_PNL_SIZE(txn->tw.retired_pages)); + if (unlikely(ctx->retired_stored != + MDBX_PNL_GETSIZE(txn->tw.retired_pages))) { + tASSERT(txn, + ctx->retired_stored < MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + NOTICE("** restart: retired-list growth (%zu -> %zu)", + ctx->retired_stored, MDBX_PNL_GETSIZE(txn->tw.retired_pages)); goto retry; } pgno_t *dst = data.iov_base; - *dst++ = chunk; - pgno_t *src = MDBX_PNL_BEGIN(txn->tw.reclaimed_pglist) + left - chunk; + *dst++ = (pgno_t)chunk; + pgno_t *src = MDBX_PNL_BEGIN(txn->tw.relist) + left - chunk; memcpy(dst, src, chunk * sizeof(pgno_t)); pgno_t *from = src, *to = src + chunk; - mdbx_trace("%s: fill %u [ %u:%" PRIaPGNO "...%u:%" PRIaPGNO - "] @%" PRIaTXN, - dbg_prefix_mode, chunk, - (unsigned)(from - txn->tw.reclaimed_pglist), from[0], - (unsigned)(to - txn->tw.reclaimed_pglist), to[-1], fill_gc_id); + TRACE("%s: fill %zu [ %zu:%" PRIaPGNO "...%zu:%" PRIaPGNO "] @%" PRIaTXN, + dbg_prefix_mode, chunk, from - txn->tw.relist, from[0], + to - txn->tw.relist, to[-1], fill_gc_id); left -= chunk; - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, retired_stored + amount - left, true); + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, ctx->retired_stored + amount - left, true); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -9904,51 +11024,53 @@ retry: } if (txn->tw.lifo_reclaimed == nullptr) { - mdbx_tassert(txn, lifo == 0); - rc = mdbx_cursor_next(&couple.outer, &key, &data, MDBX_NEXT); + tASSERT(txn, ctx->lifo == 0); + rc = cursor_next(&ctx->cursor, &key, &data, MDBX_NEXT); if (unlikely(rc != MDBX_SUCCESS)) goto 
bailout; } else { - mdbx_tassert(txn, lifo != 0); + tASSERT(txn, ctx->lifo != 0); } } } - mdbx_tassert(txn, rc == MDBX_SUCCESS); + tASSERT(txn, rc == MDBX_SUCCESS); if (unlikely(txn->tw.loose_count != 0)) { - mdbx_notice("** restart: got %u loose pages", txn->tw.loose_count); + NOTICE("** restart: got %zu loose pages", txn->tw.loose_count); goto retry; } - if (unlikely(filled_gc_slot != + if (unlikely(ctx->filled_slot != (txn->tw.lifo_reclaimed - ? (unsigned)MDBX_PNL_SIZE(txn->tw.lifo_reclaimed) + ? MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed) : 0))) { - const bool will_retry = loop < 9; - mdbx_notice("** %s: reserve excess (filled-slot %u, loop %u)", - will_retry ? "restart" : "ignore", filled_gc_slot, loop); + const bool will_retry = ctx->loop < 9; + NOTICE("** %s: reserve excess (filled-slot %zu, loop %zu)", + will_retry ? "restart" : "ignore", ctx->filled_slot, ctx->loop); if (will_retry) goto retry; } - mdbx_tassert(txn, - txn->tw.lifo_reclaimed == NULL || - cleaned_gc_slot == MDBX_PNL_SIZE(txn->tw.lifo_reclaimed)); + tASSERT(txn, + txn->tw.lifo_reclaimed == NULL || + ctx->cleaned_slot == MDBX_PNL_GETSIZE(txn->tw.lifo_reclaimed)); bailout: - txn->mt_cursors[FREE_DBI] = couple.outer.mc_next; + txn->mt_cursors[FREE_DBI] = ctx->cursor.mc_next; -bailout_notracking: - MDBX_PNL_SIZE(txn->tw.reclaimed_pglist) = 0; - mdbx_trace("<<< %u loops, rc = %d", loop, rc); + MDBX_PNL_SETSIZE(txn->tw.relist, 0); +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.wloops += ctx->loop; +#endif /* MDBX_ENABLE_PROFGC */ + TRACE("<<< %zu loops, rc = %d", ctx->loop, rc); return rc; } -static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { - MDBX_dpl *const dl = - (txn->mt_flags & MDBX_WRITEMAP) ? 
txn->tw.dirtylist : mdbx_dpl_sort(txn); +static int txn_write(MDBX_txn *txn, iov_ctx_t *ctx) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + MDBX_dpl *const dl = dpl_sort(txn); int rc = MDBX_SUCCESS; - unsigned r, w; + size_t r, w, total_npages = 0; for (w = 0, r = 1; r <= dl->length; ++r) { MDBX_page *dp = dl->items[r].ptr; if (dp->mp_flags & P_LOOSE) { @@ -9956,25 +11078,34 @@ static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { continue; } unsigned npages = dpl_npages(dl, r); + total_npages += npages; rc = iov_page(txn, ctx, dp, npages); if (unlikely(rc != MDBX_SUCCESS)) - break; + return rc; } - if (ctx->iov_items) { - /* iov_page() frees dirty-pages and reset iov_items in case of failure. */ - mdbx_tassert(txn, rc == MDBX_SUCCESS); - rc = mdbx_iov_write(txn, ctx); + if (!iov_empty(ctx)) { + tASSERT(txn, rc == MDBX_SUCCESS); + rc = iov_write(ctx); } + if (likely(rc == MDBX_SUCCESS) && ctx->fd == txn->mt_env->me_lazy_fd) { + txn->mt_env->me_lck->mti_unsynced_pages.weak += total_npages; + if (!txn->mt_env->me_lck->mti_eoos_timestamp.weak) + txn->mt_env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); + } + + txn->tw.dirtylist->pages_including_loose -= total_npages; while (r <= dl->length) dl->items[++w] = dl->items[r++]; dl->sorted = dpl_setlen(dl, w); txn->tw.dirtyroom += r - 1 - w; - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + tASSERT(txn, txn->tw.dirtylist->length == txn->tw.loose_count); + tASSERT(txn, txn->tw.dirtylist->pages_including_loose == txn->tw.loose_count); return rc; } @@ -9982,8 +11113,7 @@ static int mdbx_txn_write(MDBX_txn *txn, struct mdbx_iov_ctx *ctx) { static __always_inline bool check_dbi(MDBX_txn *txn, MDBX_dbi dbi, unsigned validity) { if (likely(dbi < txn->mt_numdbs)) { - mdbx_memory_fence(mo_AcquireRelease, false); - if (likely(!TXN_DBI_CHANGED(txn, dbi))) { + if (likely(!dbi_changed(txn, dbi))) { if (likely(txn->mt_dbistate[dbi] & validity)) return true; if (likely(dbi < CORE_DBS || @@ -9999,48 +11129,47 @@ int mdbx_txn_commit(MDBX_txn *txn) { return __inline_mdbx_txn_commit(txn); } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ /* Merge child txn into parent */ -static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, - const unsigned parent_retired_len) { - MDBX_dpl *const src = mdbx_dpl_sort(txn); +static __inline void txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, + const size_t parent_retired_len) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0); + MDBX_dpl *const src = dpl_sort(txn); /* Remove refunded pages from parent's dirty list */ - MDBX_dpl *const dst = mdbx_dpl_sort(parent); + MDBX_dpl *const dst = dpl_sort(parent); if (MDBX_ENABLE_REFUND) { - unsigned n = dst->length; + size_t n = dst->length; while (n && dst->items[n].pgno >= parent->mt_next_pgno) { - if (!(txn->mt_env->me_flags & MDBX_WRITEMAP)) { - MDBX_page *dp = dst->items[n].ptr; - mdbx_dpage_free(txn->mt_env, dp, dpl_npages(dst, n)); - } + const unsigned npages = dpl_npages(dst, n); + dpage_free(txn->mt_env, dst->items[n].ptr, npages); --n; } parent->tw.dirtyroom += dst->sorted - n; dst->sorted = dpl_setlen(dst, n); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); } /* Remove reclaimed pages from parent's dirty list */ - const MDBX_PNL reclaimed_list = parent->tw.reclaimed_pglist; - mdbx_dpl_sift(parent, reclaimed_list, false); + const MDBX_PNL reclaimed_list = parent->tw.relist; + dpl_sift(parent, reclaimed_list, false); /* Move retired pages from parent's dirty & spilled list to reclaimed */ - unsigned r, w, d, s, l; + size_t r, w, d, s, l; for (r = w = parent_retired_len; - ++r <= MDBX_PNL_SIZE(parent->tw.retired_pages);) { + ++r <= MDBX_PNL_GETSIZE(parent->tw.retired_pages);) { const pgno_t pgno = parent->tw.retired_pages[r]; - const unsigned di = mdbx_dpl_exist(parent, pgno); - const unsigned si = !di ? mdbx_search_spilled(parent, pgno) : 0; + const size_t di = dpl_exist(parent, pgno); + const size_t si = !di ? 
search_spilled(parent, pgno) : 0; unsigned npages; const char *kind; if (di) { MDBX_page *dp = dst->items[di].ptr; - mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); + tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | + P_OVERFLOW | P_SPILLED)) == 0); npages = dpl_npages(dst, di); - mdbx_page_wash(parent, di, dp, npages); + page_wash(parent, di, dp, npages); kind = "dirty"; l = 1; if (unlikely(npages > l)) { @@ -10055,7 +11184,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, /* Список retired страниц не сортирован, но для ускорения сортировки * дополняется в соответствии с MDBX_PNL_ASCENDING */ #if MDBX_PNL_ASCENDING - const unsigned len = MDBX_PNL_SIZE(parent->tw.retired_pages); + const size_t len = MDBX_PNL_GETSIZE(parent->tw.retired_pages); while (r < len && parent->tw.retired_pages[r + 1] == pgno + l) { ++r; if (++l == npages) @@ -10072,54 +11201,54 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } else if (unlikely(si)) { l = npages = 1; - mdbx_spill_remove(parent, si, 1); + spill_remove(parent, si, 1); kind = "spilled"; } else { parent->tw.retired_pages[++w] = pgno; continue; } - mdbx_debug("reclaim retired parent's %u->%u %s page %" PRIaPGNO, npages, l, - kind, pgno); - int err = mdbx_pnl_insert_range(&parent->tw.reclaimed_pglist, pgno, l); - mdbx_ensure(txn->mt_env, err == MDBX_SUCCESS); + DEBUG("reclaim retired parent's %u -> %zu %s page %" PRIaPGNO, npages, l, + kind, pgno); + int err = pnl_insert_range(&parent->tw.relist, pgno, l); + ENSURE(txn->mt_env, err == MDBX_SUCCESS); } - MDBX_PNL_SIZE(parent->tw.retired_pages) = w; + MDBX_PNL_SETSIZE(parent->tw.retired_pages, w); /* Filter-out parent spill list */ - if (parent->tw.spill_pages && MDBX_PNL_SIZE(parent->tw.spill_pages) > 0) { - const MDBX_PNL sl = mdbx_spill_purge(parent); - unsigned len = MDBX_PNL_SIZE(sl); + if (parent->tw.spilled.list && + 
MDBX_PNL_GETSIZE(parent->tw.spilled.list) > 0) { + const MDBX_PNL sl = spill_purge(parent); + size_t len = MDBX_PNL_GETSIZE(sl); if (len) { /* Remove refunded pages from parent's spill list */ if (MDBX_ENABLE_REFUND && MDBX_PNL_MOST(sl) >= (parent->mt_next_pgno << 1)) { #if MDBX_PNL_ASCENDING - unsigned i = MDBX_PNL_SIZE(sl); + size_t i = MDBX_PNL_GETSIZE(sl); assert(MDBX_PNL_MOST(sl) == MDBX_PNL_LAST(sl)); do { if ((sl[i] & 1) == 0) - mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); i -= 1; } while (i && sl[i] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_SIZE(sl) = i; + MDBX_PNL_SETSIZE(sl, i); #else assert(MDBX_PNL_MOST(sl) == MDBX_PNL_FIRST(sl)); - unsigned i = 0; + size_t i = 0; do { ++i; if ((sl[i] & 1) == 0) - mdbx_debug("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); + DEBUG("refund parent's spilled page %" PRIaPGNO, sl[i] >> 1); } while (i < len && sl[i + 1] >= (parent->mt_next_pgno << 1)); - MDBX_PNL_SIZE(sl) = len -= i; + MDBX_PNL_SETSIZE(sl, len -= i); memmove(sl + 1, sl + 1 + i, len * sizeof(sl[0])); #endif } - mdbx_tassert( - txn, mdbx_pnl_check4assert(sl, (size_t)parent->mt_next_pgno << 1)); + tASSERT(txn, pnl_check_allocated(sl, (size_t)parent->mt_next_pgno << 1)); /* Remove reclaimed pages from parent's spill list */ - s = MDBX_PNL_SIZE(sl), r = MDBX_PNL_SIZE(reclaimed_list); + s = MDBX_PNL_GETSIZE(sl), r = MDBX_PNL_GETSIZE(reclaimed_list); /* Scanning from end to begin */ while (s && r) { if (sl[s] & 1) { @@ -10133,9 +11262,9 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, s -= !cmp; r -= cmp; } else { - mdbx_debug("remove reclaimed parent's spilled page %" PRIaPGNO, - reclaimed_pgno); - mdbx_spill_remove(parent, s, 1); + DEBUG("remove reclaimed parent's spilled page %" PRIaPGNO, + reclaimed_pgno); + spill_remove(parent, s, 1); --s; --r; } @@ -10143,10 +11272,10 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, 
MDBX_txn *const txn, /* Remove anything in our dirty list from parent's spill list */ /* Scanning spill list in descend order */ - const int step = MDBX_PNL_ASCENDING ? -1 : 1; - s = MDBX_PNL_ASCENDING ? MDBX_PNL_SIZE(sl) : 1; + const intptr_t step = MDBX_PNL_ASCENDING ? -1 : 1; + s = MDBX_PNL_ASCENDING ? MDBX_PNL_GETSIZE(sl) : 1; d = src->length; - while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_SIZE(sl))) { + while (d && (MDBX_PNL_ASCENDING ? s > 0 : s <= MDBX_PNL_GETSIZE(sl))) { if (sl[s] & 1) { s += step; continue; @@ -10164,41 +11293,40 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, continue; } - mdbx_debug("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, - dirty_pgno_form); - mdbx_spill_remove(parent, s, 1); + DEBUG("remove dirtied parent's spilled %u page %" PRIaPGNO, npages, + dirty_pgno_form); + spill_remove(parent, s, 1); s += step; } /* Squash deleted pagenums if we deleted any */ - mdbx_spill_purge(parent); + spill_purge(parent); } } /* Remove anything in our spill list from parent's dirty list */ - if (txn->tw.spill_pages) { - mdbx_tassert(txn, mdbx_pnl_check4assert(txn->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); - mdbx_dpl_sift(parent, txn->tw.spill_pages, true); - mdbx_tassert(parent, - parent->tw.dirtyroom + parent->tw.dirtylist->length == - (parent->mt_parent ? parent->mt_parent->tw.dirtyroom - : parent->mt_env->me_options.dp_limit)); + if (txn->tw.spilled.list) { + tASSERT(txn, pnl_check_allocated(txn->tw.spilled.list, + (size_t)parent->mt_next_pgno << 1)); + dpl_sift(parent, txn->tw.spilled.list, true); + tASSERT(parent, + parent->tw.dirtyroom + parent->tw.dirtylist->length == + (parent->mt_parent ? 
parent->mt_parent->tw.dirtyroom + : parent->mt_env->me_options.dp_limit)); } /* Find length of merging our dirty list with parent's and release * filter-out pages */ for (l = 0, d = dst->length, s = src->length; d > 0 && s > 0;) { MDBX_page *sp = src->items[s].ptr; - mdbx_tassert(parent, - (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); + tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); const unsigned s_npages = dpl_npages(src, s); const pgno_t s_pgno = src->items[s].pgno; MDBX_page *dp = dst->items[d].ptr; - mdbx_tassert(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | - P_OVERFLOW | P_SPILLED)) == 0); + tASSERT(parent, (dp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_SPILLED)) == 0); const unsigned d_npages = dpl_npages(dst, d); const pgno_t d_pgno = dst->items[d].pgno; @@ -10214,19 +11342,17 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, ++l; } else { dst->items[d--].ptr = nullptr; - if ((txn->mt_flags & MDBX_WRITEMAP) == 0) - mdbx_dpage_free(txn->mt_env, dp, d_npages); + dpage_free(txn->mt_env, dp, d_npages); } } assert(dst->sorted == dst->length); - mdbx_tassert(parent, dst->detent >= l + d + s); + tASSERT(parent, dst->detent >= l + d + s); dst->sorted = l + d + s; /* the merged length */ while (s > 0) { MDBX_page *sp = src->items[s].ptr; - mdbx_tassert(parent, - (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | - P_LOOSE | P_SPILLED)) == 0); + tASSERT(parent, (sp->mp_flags & ~(P_LEAF | P_LEAF2 | P_BRANCH | P_OVERFLOW | + P_LOOSE | P_SPILLED)) == 0); if (sp->mp_flags != P_LOOSE) { sp->mp_txnid = parent->mt_front; sp->mp_flags &= ~P_SPILLED; @@ -10248,7 +11374,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } ++w; } - mdbx_notice("squash to begin for extending-merge %u -> %u", d, w - 1); + NOTICE("squash to begin for extending-merge %zu -> %zu", d, w - 
1); d = w - 1; continue; } @@ -10278,7 +11404,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } } } else { - /* from begin to end with dst shrinking (a lot of new overflow pages) */ + /* from begin to end with shrinking (a lot of new large/overflow pages) */ for (l = s = d = 1; s <= src->length && d <= dst->length;) { if (unlikely(l >= d)) { /* squash to get a gap of free space for merge */ @@ -10290,7 +11416,7 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, } --w; } - mdbx_notice("squash to end for shrinking-merge %u -> %u", d, w + 1); + NOTICE("squash to end for shrinking-merge %zu -> %zu", d, w + 1); d = w + 1; continue; } @@ -10324,100 +11450,143 @@ static __inline void mdbx_txn_merge(MDBX_txn *const parent, MDBX_txn *const txn, assert(parent->tw.dirtyroom <= parent->mt_env->me_options.dp_limit); dpl_setlen(dst, dst->sorted); parent->tw.dirtylru = txn->tw.dirtylru; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); - mdbx_dpl_free(txn); - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { + /* В текущем понимании выгоднее пересчитать кол-во страниц, + * чем подмешивать лишние ветвления и вычисления в циклы выше. */ + dst->pages_including_loose = 0; + for (r = 1; r <= dst->length; ++r) + dst->pages_including_loose += dpl_npages(dst, r); + + tASSERT(parent, dirtylist_check(parent)); + dpl_free(txn); + + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { /* Must not fail since space was preserved above. 
*/ - mdbx_pnl_xmerge(parent->tw.spill_pages, txn->tw.spill_pages); - mdbx_pnl_free(txn->tw.spill_pages); + pnl_merge(parent->tw.spilled.list, txn->tw.spilled.list); + pnl_free(txn->tw.spilled.list); } else { - parent->tw.spill_pages = txn->tw.spill_pages; - parent->tw.spill_least_removed = txn->tw.spill_least_removed; + parent->tw.spilled.list = txn->tw.spilled.list; + parent->tw.spilled.least_removed = txn->tw.spilled.least_removed; } - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); } parent->mt_flags &= ~MDBX_TXN_HAS_CHILD; - if (parent->tw.spill_pages) { - assert(mdbx_pnl_check4assert(parent->tw.spill_pages, - (size_t)parent->mt_next_pgno << 1)); - if (MDBX_PNL_SIZE(parent->tw.spill_pages)) + if (parent->tw.spilled.list) { + assert(pnl_check_allocated(parent->tw.spilled.list, + (size_t)parent->mt_next_pgno << 1)); + if (MDBX_PNL_GETSIZE(parent->tw.spilled.list)) parent->mt_flags |= MDBX_TXN_SPILLS; } } +static void take_gcprof(MDBX_txn *txn, MDBX_commit_latency *latency) { + MDBX_env *const env = txn->mt_env; + if (MDBX_ENABLE_PROFGC) { + pgop_stat_t *const ptr = &env->me_lck->mti_pgop_stat; + latency->gc_prof.work_counter = ptr->gc_prof.work.spe_counter; + latency->gc_prof.work_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.work.rtime_monotonic); + latency->gc_prof.work_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.work.xtime_cpu); + latency->gc_prof.work_rsteps = ptr->gc_prof.work.rsteps; + latency->gc_prof.work_xpages = ptr->gc_prof.work.xpages; + latency->gc_prof.work_majflt = ptr->gc_prof.work.majflt; + + latency->gc_prof.self_counter = ptr->gc_prof.self.spe_counter; + latency->gc_prof.self_rtime_monotonic = + osal_monotime_to_16dot16(ptr->gc_prof.self.rtime_monotonic); + latency->gc_prof.self_xtime_cpu = + osal_monotime_to_16dot16(ptr->gc_prof.self.xtime_cpu); + latency->gc_prof.self_rsteps = ptr->gc_prof.self.rsteps; + latency->gc_prof.self_xpages = ptr->gc_prof.self.xpages; + 
latency->gc_prof.self_majflt = ptr->gc_prof.self.majflt; + + latency->gc_prof.wloops = ptr->gc_prof.wloops; + latency->gc_prof.coalescences = ptr->gc_prof.coalescences; + latency->gc_prof.wipes = ptr->gc_prof.wipes; + latency->gc_prof.flushes = ptr->gc_prof.flushes; + latency->gc_prof.kicks = ptr->gc_prof.kicks; + if (txn == env->me_txn0) + memset(&ptr->gc_prof, 0, sizeof(ptr->gc_prof)); + } else + memset(&latency->gc_prof, 0, sizeof(latency->gc_prof)); +} + int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { STATIC_ASSERT(MDBX_TXN_FINISHED == MDBX_TXN_BLOCKED - MDBX_TXN_HAS_CHILD - MDBX_TXN_ERROR); - const uint64_t ts_0 = latency ? mdbx_osal_monotime() : 0; - uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0; - uint32_t audit_duration = 0; + const uint64_t ts_0 = latency ? osal_monotime() : 0; + uint64_t ts_1 = 0, ts_2 = 0, ts_3 = 0, ts_4 = 0, ts_5 = 0, gc_cputime = 0; int rc = check_txn(txn, MDBX_TXN_FINISHED); - if (unlikely(rc != MDBX_SUCCESS)) - goto provide_latency; + if (unlikely(rc != MDBX_SUCCESS)) { + if (latency) + memset(latency, 0, sizeof(*latency)); + return rc; + } + + MDBX_env *const env = txn->mt_env; +#if MDBX_ENV_CHECKPID + if (unlikely(env->me_pid != osal_getpid())) { + env->me_flags |= MDBX_FATAL_ERROR; + if (latency) + memset(latency, 0, sizeof(*latency)); + return MDBX_PANIC; + } +#endif /* MDBX_ENV_CHECKPID */ if (unlikely(txn->mt_flags & MDBX_TXN_ERROR)) { rc = MDBX_RESULT_TRUE; goto fail; } - MDBX_env *env = txn->mt_env; -#if MDBX_ENV_CHECKPID - if (unlikely(env->me_pid != mdbx_getpid())) { - env->me_flags |= MDBX_FATAL_ERROR; - rc = MDBX_PANIC; - goto provide_latency; - } -#endif /* MDBX_ENV_CHECKPID */ - - /* mdbx_txn_end() mode for a commit which writes nothing */ + /* txn_end() mode for a commit which writes nothing */ unsigned end_mode = MDBX_END_PURE_COMMIT | MDBX_END_UPDATE | MDBX_END_SLOT | MDBX_END_FREE; - if (unlikely(F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) goto 
done; if (txn->mt_child) { rc = mdbx_txn_commit_ex(txn->mt_child, NULL); - mdbx_tassert(txn, txn->mt_child == NULL); + tASSERT(txn, txn->mt_child == NULL); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } if (unlikely(txn != env->me_txn)) { - mdbx_debug("%s", "attempt to commit unknown transaction"); + DEBUG("%s", "attempt to commit unknown transaction"); rc = MDBX_EINVAL; goto fail; } if (txn->mt_parent) { - mdbx_tassert(txn, mdbx_audit_ex(txn, 0, false) == 0); - mdbx_assert(env, txn != env->me_txn0); + tASSERT(txn, audit_ex(txn, 0, false) == 0); + eASSERT(env, txn != env->me_txn0); MDBX_txn *const parent = txn->mt_parent; - mdbx_assert(env, parent->mt_signature == MDBX_MT_SIGNATURE); - mdbx_assert(env, parent->mt_child == txn && - (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); - mdbx_assert(env, mdbx_dirtylist_check(txn)); + eASSERT(env, parent->mt_signature == MDBX_MT_SIGNATURE); + eASSERT(env, parent->mt_child == txn && + (parent->mt_flags & MDBX_TXN_HAS_CHILD) != 0); + eASSERT(env, dirtylist_check(txn)); if (txn->tw.dirtylist->length == 0 && !(txn->mt_flags & MDBX_TXN_DIRTY) && parent->mt_numdbs == txn->mt_numdbs) { for (int i = txn->mt_numdbs; --i >= 0;) { - mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); if ((txn->mt_dbistate[i] & DBI_STALE) && !(parent->mt_dbistate[i] & DBI_STALE)) - mdbx_tassert(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], - sizeof(MDBX_db)) == 0); + tASSERT(txn, memcmp(&parent->mt_dbs[i], &txn->mt_dbs[i], + sizeof(MDBX_db)) == 0); } - mdbx_tassert(txn, memcmp(&parent->mt_geo, &txn->mt_geo, - sizeof(parent->mt_geo)) == 0); - mdbx_tassert(txn, memcmp(&parent->mt_canary, &txn->mt_canary, - sizeof(parent->mt_canary)) == 0); - mdbx_tassert(txn, !txn->tw.spill_pages || - MDBX_PNL_SIZE(txn->tw.spill_pages) == 0); - mdbx_tassert(txn, txn->tw.loose_count == 0); + tASSERT(txn, memcmp(&parent->mt_geo, &txn->mt_geo, + sizeof(parent->mt_geo)) == 0); + tASSERT(txn, 
memcmp(&parent->mt_canary, &txn->mt_canary, + sizeof(parent->mt_canary)) == 0); + tASSERT(txn, !txn->tw.spilled.list || + MDBX_PNL_GETSIZE(txn->tw.spilled.list) == 0); + tASSERT(txn, txn->tw.loose_count == 0); /* fast completion of pure nested transaction */ end_mode = MDBX_END_PURE_COMMIT | MDBX_END_SLOT | MDBX_END_FREE; @@ -10426,32 +11595,30 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { /* Preserve space for spill list to avoid parent's state corruption * if allocation fails. */ - const unsigned parent_retired_len = - (unsigned)(uintptr_t)parent->tw.retired_pages; - mdbx_tassert(txn, - parent_retired_len <= MDBX_PNL_SIZE(txn->tw.retired_pages)); - const unsigned retired_delta = - MDBX_PNL_SIZE(txn->tw.retired_pages) - parent_retired_len; + const size_t parent_retired_len = (uintptr_t)parent->tw.retired_pages; + tASSERT(txn, parent_retired_len <= MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + const size_t retired_delta = + MDBX_PNL_GETSIZE(txn->tw.retired_pages) - parent_retired_len; if (retired_delta) { - rc = mdbx_pnl_need(&txn->tw.reclaimed_pglist, retired_delta); + rc = pnl_need(&txn->tw.relist, retired_delta); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - if (txn->tw.spill_pages) { - if (parent->tw.spill_pages) { - rc = mdbx_pnl_need(&parent->tw.spill_pages, - MDBX_PNL_SIZE(txn->tw.spill_pages)); + if (txn->tw.spilled.list) { + if (parent->tw.spilled.list) { + rc = pnl_need(&parent->tw.spilled.list, + MDBX_PNL_GETSIZE(txn->tw.spilled.list)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - mdbx_spill_purge(txn); + spill_purge(txn); } if (unlikely(txn->tw.dirtylist->length + parent->tw.dirtylist->length > parent->tw.dirtylist->detent && - !mdbx_dpl_reserve(parent, txn->tw.dirtylist->length + - parent->tw.dirtylist->length))) { + !dpl_reserve(parent, txn->tw.dirtylist->length + + parent->tw.dirtylist->length))) { rc = MDBX_ENOMEM; goto fail; } @@ -10464,9 +11631,9 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) 
{ parent->tw.retired_pages = txn->tw.retired_pages; txn->tw.retired_pages = NULL; - mdbx_pnl_free(parent->tw.reclaimed_pglist); - parent->tw.reclaimed_pglist = txn->tw.reclaimed_pglist; - txn->tw.reclaimed_pglist = NULL; + pnl_free(parent->tw.relist); + parent->tw.relist = txn->tw.relist; + txn->tw.relist = NULL; parent->tw.last_reclaimed = txn->tw.last_reclaimed; parent->mt_geo = txn->mt_geo; @@ -10481,64 +11648,76 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { parent->tw.loose_pages = txn->tw.loose_pages; /* Merge our cursors into parent's and close them */ - mdbx_cursors_eot(txn, true); + cursors_eot(txn, true); end_mode |= MDBX_END_EOTDONE; /* Update parent's DBs array */ memcpy(parent->mt_dbs, txn->mt_dbs, txn->mt_numdbs * sizeof(MDBX_db)); parent->mt_numdbs = txn->mt_numdbs; - for (unsigned i = 0; i < txn->mt_numdbs; i++) { + for (size_t i = 0; i < txn->mt_numdbs; i++) { /* preserve parent's status */ const uint8_t state = txn->mt_dbistate[i] | (parent->mt_dbistate[i] & (DBI_CREAT | DBI_FRESH | DBI_DIRTY)); - mdbx_debug("db %u dbi-state %s 0x%02x -> 0x%02x", i, - (parent->mt_dbistate[i] != state) ? "update" : "still", - parent->mt_dbistate[i], state); + DEBUG("dbi %zu dbi-state %s 0x%02x -> 0x%02x", i, + (parent->mt_dbistate[i] != state) ? "update" : "still", + parent->mt_dbistate[i], state); parent->mt_dbistate[i] = state; } - ts_1 = latency ? mdbx_osal_monotime() : 0; - mdbx_txn_merge(parent, txn, parent_retired_len); - ts_2 = latency ? 
mdbx_osal_monotime() : 0; + if (latency) { + ts_1 = osal_monotime(); + ts_2 = /* no gc-update */ ts_1; + ts_3 = /* no audit */ ts_2; + ts_4 = /* no write */ ts_3; + ts_5 = /* no sync */ ts_4; + } + txn_merge(parent, txn, parent_retired_len); env->me_txn = parent; parent->mt_child = NULL; - mdbx_tassert(parent, mdbx_dirtylist_check(parent)); + tASSERT(parent, dirtylist_check(parent)); #if MDBX_ENABLE_REFUND - mdbx_refund(parent); - if (mdbx_assert_enabled()) { + txn_refund(parent); + if (ASSERT_ENABLED()) { /* Check parent's loose pages not suitable for refund */ - for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = lp->mp_next) - mdbx_tassert(parent, lp->mp_pgno < parent->tw.loose_refund_wl && - lp->mp_pgno + 1 < parent->mt_next_pgno); + for (MDBX_page *lp = parent->tw.loose_pages; lp; lp = mp_next(lp)) { + tASSERT(parent, lp->mp_pgno < parent->tw.loose_refund_wl && + lp->mp_pgno + 1 < parent->mt_next_pgno); + MDBX_ASAN_UNPOISON_MEMORY_REGION(&mp_next(lp), sizeof(MDBX_page *)); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(lp), sizeof(MDBX_page *)); + } /* Check parent's reclaimed pages not suitable for refund */ - if (MDBX_PNL_SIZE(parent->tw.reclaimed_pglist)) - mdbx_tassert(parent, MDBX_PNL_MOST(parent->tw.reclaimed_pglist) + 1 < - parent->mt_next_pgno); + if (MDBX_PNL_GETSIZE(parent->tw.relist)) + tASSERT(parent, + MDBX_PNL_MOST(parent->tw.relist) + 1 < parent->mt_next_pgno); } #endif /* MDBX_ENABLE_REFUND */ - ts_4 = ts_3 = latency ? mdbx_osal_monotime() : 0; txn->mt_signature = 0; - mdbx_free(txn); - mdbx_tassert(parent, mdbx_audit_ex(parent, 0, false) == 0); + osal_free(txn); + tASSERT(parent, audit_ex(parent, 0, false) == 0); rc = MDBX_SUCCESS; goto provide_latency; } - mdbx_tassert(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == - (txn->mt_parent ? 
txn->mt_parent->tw.dirtyroom - : txn->mt_env->me_options.dp_limit)); - mdbx_cursors_eot(txn, false); + if (!txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.dirtyroom + txn->tw.dirtylist->length == + (txn->mt_parent ? txn->mt_parent->tw.dirtyroom + : txn->mt_env->me_options.dp_limit)); + } + cursors_eot(txn, false); end_mode |= MDBX_END_EOTDONE; - if (txn->tw.dirtylist->length == 0 && + if ((!txn->tw.dirtylist || txn->tw.dirtylist->length == 0) && (txn->mt_flags & (MDBX_TXN_DIRTY | MDBX_TXN_SPILLS)) == 0) { - for (int i = txn->mt_numdbs; --i >= 0;) - mdbx_tassert(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); + for (intptr_t i = txn->mt_numdbs; --i >= 0;) + tASSERT(txn, (txn->mt_dbistate[i] & DBI_DIRTY) == 0); #if defined(MDBX_NOSUCCESS_EMPTY_COMMIT) && MDBX_NOSUCCESS_EMPTY_COMMIT - rc = mdbx_txn_end(txn, end_mode); + rc = txn_end(txn, end_mode); if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = MDBX_RESULT_TRUE; @@ -10548,10 +11727,10 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { #endif /* MDBX_NOSUCCESS_EMPTY_COMMIT */ } - mdbx_debug("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO - "/%" PRIaPGNO, - txn->mt_txnid, (void *)txn, (void *)env, - txn->mt_dbs[MAIN_DBI].md_root, txn->mt_dbs[FREE_DBI].md_root); + DEBUG("committing txn %" PRIaTXN " %p on mdbenv %p, root page %" PRIaPGNO + "/%" PRIaPGNO, + txn->mt_txnid, (void *)txn, (void *)env, txn->mt_dbs[MAIN_DBI].md_root, + txn->mt_dbs[FREE_DBI].md_root); /* Update DB root pointers */ if (txn->mt_numdbs > CORE_DBS) { @@ -10559,32 +11738,41 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { MDBX_val data; data.iov_len = sizeof(MDBX_db); - rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) goto fail; for (MDBX_dbi i 
= CORE_DBS; i < txn->mt_numdbs; i++) { if (txn->mt_dbistate[i] & DBI_DIRTY) { MDBX_db *db = &txn->mt_dbs[i]; - mdbx_debug("update main's entry for sub-db %u, mod_txnid %" PRIaTXN - " -> %" PRIaTXN, - i, db->md_mod_txnid, txn->mt_txnid); + DEBUG("update main's entry for sub-db %u, mod_txnid %" PRIaTXN + " -> %" PRIaTXN, + i, db->md_mod_txnid, txn->mt_txnid); + /* Может быть mod_txnid > front после коммита вложенных тразакций */ db->md_mod_txnid = txn->mt_txnid; data.iov_base = db; - WITH_CURSOR_TRACKING(couple.outer, - rc = mdbx_cursor_put(&couple.outer, - &txn->mt_dbxs[i].md_name, - &data, F_SUBDATA)); + WITH_CURSOR_TRACKING( + couple.outer, + rc = cursor_put_nochecklen(&couple.outer, &txn->mt_dbxs[i].md_name, + &data, F_SUBDATA)); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } } } - ts_1 = latency ? mdbx_osal_monotime() : 0; - rc = mdbx_update_gc(txn); + ts_1 = latency ? osal_monotime() : 0; + + gcu_context_t gcu_ctx; + gc_cputime = latency ? osal_cputime(nullptr) : 0; + rc = gcu_context_init(txn, &gcu_ctx); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = update_gc(txn, &gcu_ctx); + gc_cputime = latency ? osal_cputime(nullptr) - gc_cputime : 0; if (unlikely(rc != MDBX_SUCCESS)) goto fail; + tASSERT(txn, txn->tw.loose_count == 0); txn->mt_dbs[FREE_DBI].md_mod_txnid = (txn->mt_dbistate[FREE_DBI] & DBI_DIRTY) ? txn->mt_txnid : txn->mt_dbs[FREE_DBI].md_mod_txnid; @@ -10593,152 +11781,386 @@ int mdbx_txn_commit_ex(MDBX_txn *txn, MDBX_commit_latency *latency) { ? txn->mt_txnid : txn->mt_dbs[MAIN_DBI].md_mod_txnid; - ts_2 = latency ? mdbx_osal_monotime() : 0; - if (mdbx_audit_enabled()) { - rc = mdbx_audit_ex(txn, MDBX_PNL_SIZE(txn->tw.retired_pages), true); - const uint64_t audit_end = mdbx_osal_monotime(); - audit_duration = mdbx_osal_monotime_to_16dot16(audit_end - ts_2); - ts_2 = audit_end; + ts_2 = latency ? 
osal_monotime() : 0; + ts_3 = ts_2; + if (AUDIT_ENABLED()) { + rc = audit_ex(txn, MDBX_PNL_GETSIZE(txn->tw.retired_pages), true); + ts_3 = osal_monotime(); if (unlikely(rc != MDBX_SUCCESS)) goto fail; } - struct mdbx_iov_ctx ctx; - mdbx_iov_init(txn, &ctx); - rc = mdbx_txn_write(txn, &ctx); - if (likely(rc == MDBX_SUCCESS)) - mdbx_iov_done(txn, &ctx); - /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ - ts_3 = latency ? mdbx_osal_monotime() : 0; - - if (likely(rc == MDBX_SUCCESS)) { - const MDBX_meta *head = constmeta_prefer_last(env); - MDBX_meta meta; - memcpy(meta.mm_magic_and_version, head->mm_magic_and_version, 8); - meta.mm_extra_flags = head->mm_extra_flags; - meta.mm_validator_id = head->mm_validator_id; - meta.mm_extra_pagehdr = head->mm_extra_pagehdr; - unaligned_poke_u64(4, meta.mm_pages_retired, - unaligned_peek_u64(4, head->mm_pages_retired) + - MDBX_PNL_SIZE(txn->tw.retired_pages)); - meta.mm_geo = txn->mt_geo; - meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; - meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; - meta.mm_canary = txn->mt_canary; - meta_set_txnid(env, &meta, txn->mt_txnid); - - rc = mdbx_sync_locked( - env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, &meta); + bool need_flush_for_nometasync = false; + const meta_ptr_t head = meta_recent(env, &txn->tw.troika); + const uint32_t meta_sync_txnid = + atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed); + /* sync prev meta */ + if (head.is_steady && meta_sync_txnid != (uint32_t)head.txnid) { + /* Исправление унаследованного от LMDB недочета: + * + * Всё хорошо, если все процессы работающие с БД не используют WRITEMAP. + * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате fdatasync() при записи данных этой транзакции. + * + * Всё хорошо, если все процессы работающие с БД используют WRITEMAP + * без MDBX_AVOID_MSYNC. 
+ * Тогда мета-страница (обновленная, но не сброшенная на диск) будет + * сохранена в результате msync() при записи данных этой транзакции. + * + * Если же в процессах работающих с БД используется оба метода, как sync() + * в режиме MDBX_WRITEMAP, так и записи через файловый дескриптор, то + * становится невозможным обеспечить фиксацию на диске мета-страницы + * предыдущей транзакции и данных текущей транзакции, за счет одной + * sync-операцией выполняемой после записи данных текущей транзакции. + * Соответственно, требуется явно обновлять мета-страницу, что полностью + * уничтожает выгоду от NOMETASYNC. */ + const uint32_t txnid_dist = + ((txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; + /* Смысл "магии" в том, чтобы избежать отдельного вызова fdatasync() + * или msync() для гарантированной фиксации на диске мета-страницы, + * которая была "лениво" отправлена на запись в предыдущей транзакции, + * но не сброшена на диск из-за активного режима MDBX_NOMETASYNC. */ + if ( +#if defined(_WIN32) || defined(_WIN64) + !env->me_overlapped_fd && +#endif + meta_sync_txnid == (uint32_t)head.txnid - txnid_dist) + need_flush_for_nometasync = true; + else { + rc = meta_sync(env, head); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "presync-meta", rc); + goto fail; + } + } } - ts_4 = latency ? mdbx_osal_monotime() : 0; + + if (txn->tw.dirtylist) { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + tASSERT(txn, txn->tw.loose_count == 0); + + mdbx_filehandle_t fd = +#if defined(_WIN32) || defined(_WIN64) + env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; + (void)need_flush_for_nometasync; +#else +#define MDBX_WRITETHROUGH_THRESHOLD_DEFAULT 2 + (need_flush_for_nometasync || + env->me_dsync_fd == INVALID_HANDLE_VALUE || + txn->tw.dirtylist->length > env->me_options.writethrough_threshold || + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) + ? env->me_lazy_fd + : env->me_dsync_fd; +#endif /* Windows */ + + iov_ctx_t write_ctx; + rc = iov_init(txn, &write_ctx, txn->tw.dirtylist->length, + txn->tw.dirtylist->pages_including_loose, fd, false); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "iov-init", rc); + goto fail; + } + + rc = txn_write(txn, &write_ctx); + if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("txn-%s: error %d", "write", rc); + goto fail; + } + } else { + tASSERT(txn, (txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + env->me_lck->mti_unsynced_pages.weak += txn->tw.writemap_dirty_npages; + if (!env->me_lck->mti_eoos_timestamp.weak) + env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); + } + + /* TODO: use ctx.flush_begin & ctx.flush_end for range-sync */ + ts_4 = latency ? 
osal_monotime() : 0; + + MDBX_meta meta; + memcpy(meta.mm_magic_and_version, head.ptr_c->mm_magic_and_version, 8); + meta.mm_extra_flags = head.ptr_c->mm_extra_flags; + meta.mm_validator_id = head.ptr_c->mm_validator_id; + meta.mm_extra_pagehdr = head.ptr_c->mm_extra_pagehdr; + unaligned_poke_u64(4, meta.mm_pages_retired, + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired) + + MDBX_PNL_GETSIZE(txn->tw.retired_pages)); + meta.mm_geo = txn->mt_geo; + meta.mm_dbs[FREE_DBI] = txn->mt_dbs[FREE_DBI]; + meta.mm_dbs[MAIN_DBI] = txn->mt_dbs[MAIN_DBI]; + meta.mm_canary = txn->mt_canary; + + txnid_t commit_txnid = txn->mt_txnid; +#if MDBX_ENABLE_BIGFOOT + if (gcu_ctx.bigfoot > txn->mt_txnid) { + commit_txnid = gcu_ctx.bigfoot; + TRACE("use @%" PRIaTXN " (+%zu) for commit bigfoot-txn", commit_txnid, + (size_t)(commit_txnid - txn->mt_txnid)); + } +#endif + meta_set_txnid(env, &meta, commit_txnid); + + rc = sync_locked(env, env->me_flags | txn->mt_flags | MDBX_SHRINK_ALLOWED, + &meta, &txn->tw.troika); + + ts_5 = latency ? osal_monotime() : 0; if (unlikely(rc != MDBX_SUCCESS)) { env->me_flags |= MDBX_FATAL_ERROR; + ERROR("txn-%s: error %d", "sync", rc); goto fail; } end_mode = MDBX_END_COMMITTED | MDBX_END_UPDATE | MDBX_END_EOTDONE; done: - rc = mdbx_txn_end(txn, end_mode); + if (latency) + take_gcprof(txn, latency); + rc = txn_end(txn, end_mode); provide_latency: if (latency) { - latency->audit = audit_duration; - latency->preparation = - ts_1 ? mdbx_osal_monotime_to_16dot16(ts_1 - ts_0) : 0; - latency->gc = - (ts_1 && ts_2) ? mdbx_osal_monotime_to_16dot16(ts_2 - ts_1) : 0; - latency->write = - (ts_2 && ts_3) ? mdbx_osal_monotime_to_16dot16(ts_3 - ts_2) : 0; - latency->sync = - (ts_3 && ts_4) ? mdbx_osal_monotime_to_16dot16(ts_4 - ts_3) : 0; - const uint64_t ts_5 = mdbx_osal_monotime(); - latency->ending = ts_4 ? mdbx_osal_monotime_to_16dot16(ts_5 - ts_4) : 0; - latency->whole = mdbx_osal_monotime_to_16dot16(ts_5 - ts_0); + latency->preparation = ts_1 ? 
osal_monotime_to_16dot16(ts_1 - ts_0) : 0; + latency->gc_wallclock = + (ts_2 > ts_1) ? osal_monotime_to_16dot16(ts_2 - ts_1) : 0; + latency->gc_cputime = gc_cputime ? osal_monotime_to_16dot16(gc_cputime) : 0; + latency->audit = (ts_3 > ts_2) ? osal_monotime_to_16dot16(ts_3 - ts_2) : 0; + latency->write = (ts_4 > ts_3) ? osal_monotime_to_16dot16(ts_4 - ts_3) : 0; + latency->sync = (ts_5 > ts_4) ? osal_monotime_to_16dot16(ts_5 - ts_4) : 0; + const uint64_t ts_6 = osal_monotime(); + latency->ending = ts_5 ? osal_monotime_to_16dot16(ts_6 - ts_5) : 0; + latency->whole = osal_monotime_to_16dot16_noUnderflow(ts_6 - ts_0); } return rc; fail: + txn->mt_flags |= MDBX_TXN_ERROR; + if (latency) + take_gcprof(txn, latency); mdbx_txn_abort(txn); goto provide_latency; } -static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, - const MDBX_page *const page, - const unsigned meta_number, - unsigned *guess_pagesize) { +static __always_inline int cmp_int_inline(const size_t expected_alignment, + const MDBX_val *a, + const MDBX_val *b) { + if (likely(a->iov_len == b->iov_len)) { + if (sizeof(size_t) > 7 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + if (likely(a->iov_len == 4)) + return CMP2INT(unaligned_peek_u32(expected_alignment, a->iov_base), + unaligned_peek_u32(expected_alignment, b->iov_base)); + if (sizeof(size_t) < 8 && likely(a->iov_len == 8)) + return CMP2INT(unaligned_peek_u64(expected_alignment, a->iov_base), + unaligned_peek_u64(expected_alignment, b->iov_base)); + } + ERROR("mismatch and/or invalid size %p.%zu/%p.%zu for INTEGERKEY/INTEGERDUP", + a->iov_base, a->iov_len, b->iov_base, b->iov_len); + return 0; +} + +__hot static int cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(1, a, b); +} + +/* Compare two items pointing at 2-byte aligned unsigned int's. 
*/ +#if MDBX_UNALIGNED_OK < 2 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +__hot static int cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(2, a, b); +} +#else +#define cmp_int_align2 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ + +/* Compare two items pointing at aligned unsigned int's. */ +#if MDBX_UNALIGNED_OK < 4 || \ + (MDBX_DEBUG || MDBX_FORCE_ASSERTIONS || !defined(NDEBUG)) +__hot static int cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { + return cmp_int_inline(4, a, b); +} +#else +#define cmp_int_align4 cmp_int_unaligned +#endif /* !MDBX_UNALIGNED_OK || debug */ + +/* Compare two items lexically */ +__hot static int cmp_lexical(const MDBX_val *a, const MDBX_val *b) { + if (a->iov_len == b->iov_len) + return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; + + const int diff_len = (a->iov_len < b->iov_len) ? -1 : 1; + const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; + int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; + return likely(diff_data) ? diff_data : diff_len; +} + +MDBX_NOTHROW_PURE_FUNCTION static __always_inline unsigned +tail3le(const uint8_t *p, size_t l) { + STATIC_ASSERT(sizeof(unsigned) > 2); + // 1: 0 0 0 + // 2: 0 1 1 + // 3: 0 1 2 + return p[0] | p[l >> 1] << 8 | p[l - 1] << 16; +} + +/* Compare two items in reverse byte order */ +__hot static int cmp_reverse(const MDBX_val *a, const MDBX_val *b) { + size_t left = (a->iov_len < b->iov_len) ? 
a->iov_len : b->iov_len; + if (likely(left)) { + const uint8_t *pa = ptr_disp(a->iov_base, a->iov_len); + const uint8_t *pb = ptr_disp(b->iov_base, b->iov_len); + while (left >= sizeof(size_t)) { + pa -= sizeof(size_t); + pb -= sizeof(size_t); + left -= sizeof(size_t); + STATIC_ASSERT(sizeof(size_t) == 4 || sizeof(size_t) == 8); + if (sizeof(size_t) == 4) { + uint32_t xa = unaligned_peek_u32(1, pa); + uint32_t xb = unaligned_peek_u32(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap32(xa); + xb = osal_bswap32(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } else { + uint64_t xa = unaligned_peek_u64(1, pa); + uint64_t xb = unaligned_peek_u64(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap64(xa); + xb = osal_bswap64(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + } + if (sizeof(size_t) == 8 && left >= 4) { + pa -= 4; + pb -= 4; + left -= 4; + uint32_t xa = unaligned_peek_u32(1, pa); + uint32_t xb = unaligned_peek_u32(1, pb); +#if __BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__ + xa = osal_bswap32(xa); + xb = osal_bswap32(xb); +#endif /* __BYTE_ORDER__ != __ORDER_BIG_ENDIAN__ */ + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + if (left) { + unsigned xa = tail3le(pa - left, left); + unsigned xb = tail3le(pb - left, left); + if (xa != xb) + return (xa < xb) ? -1 : 1; + } + } + return CMP2INT(a->iov_len, b->iov_len); +} + +/* Fast non-lexically comparator */ +__hot static int cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { + int diff = CMP2INT(a->iov_len, b->iov_len); + return (likely(diff) || a->iov_len == 0) + ? 
diff + : memcmp(a->iov_base, b->iov_base, a->iov_len); +} + +__hot static bool eq_fast_slowpath(const uint8_t *a, const uint8_t *b, + size_t l) { + if (likely(l > 3)) { + if (MDBX_UNALIGNED_OK >= 4 && likely(l < 9)) + return ((unaligned_peek_u32(1, a) - unaligned_peek_u32(1, b)) | + (unaligned_peek_u32(1, a + l - 4) - + unaligned_peek_u32(1, b + l - 4))) == 0; + if (MDBX_UNALIGNED_OK >= 8 && sizeof(size_t) > 7 && likely(l < 17)) + return ((unaligned_peek_u64(1, a) - unaligned_peek_u64(1, b)) | + (unaligned_peek_u64(1, a + l - 8) - + unaligned_peek_u64(1, b + l - 8))) == 0; + return memcmp(a, b, l) == 0; + } + if (likely(l)) + return tail3le(a, l) == tail3le(b, l); + return true; +} + +static __always_inline bool eq_fast(const MDBX_val *a, const MDBX_val *b) { + return unlikely(a->iov_len == b->iov_len) && + eq_fast_slowpath(a->iov_base, b->iov_base, a->iov_len); +} + +static int validate_meta(MDBX_env *env, MDBX_meta *const meta, + const MDBX_page *const page, + const unsigned meta_number, unsigned *guess_pagesize) { const uint64_t magic_and_version = unaligned_peek_u64(4, &meta->mm_magic_and_version); if (unlikely(magic_and_version != MDBX_DATA_MAGIC && magic_and_version != MDBX_DATA_MAGIC_LEGACY_COMPAT && magic_and_version != MDBX_DATA_MAGIC_LEGACY_DEVEL)) { - mdbx_error("meta[%u] has invalid magic/version %" PRIx64, meta_number, - magic_and_version); + ERROR("meta[%u] has invalid magic/version %" PRIx64, meta_number, + magic_and_version); return ((magic_and_version >> 8) != MDBX_MAGIC) ? 
MDBX_INVALID : MDBX_VERSION_MISMATCH; } if (unlikely(page->mp_pgno != meta_number)) { - mdbx_error("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, - page->mp_pgno); + ERROR("meta[%u] has invalid pageno %" PRIaPGNO, meta_number, page->mp_pgno); return MDBX_INVALID; } if (unlikely(page->mp_flags != P_META)) { - mdbx_error("page #%u not a meta-page", meta_number); + ERROR("page #%u not a meta-page", meta_number); return MDBX_INVALID; } /* LY: check pagesize */ if (unlikely(!is_powerof2(meta->mm_psize) || meta->mm_psize < MIN_PAGESIZE || meta->mm_psize > MAX_PAGESIZE)) { - mdbx_warning("meta[%u] has invalid pagesize (%u), skip it", meta_number, - meta->mm_psize); + WARNING("meta[%u] has invalid pagesize (%u), skip it", meta_number, + meta->mm_psize); return is_powerof2(meta->mm_psize) ? MDBX_VERSION_MISMATCH : MDBX_INVALID; } if (guess_pagesize && *guess_pagesize != meta->mm_psize) { *guess_pagesize = meta->mm_psize; - mdbx_verbose("meta[%u] took pagesize %u", meta_number, meta->mm_psize); + VERBOSE("meta[%u] took pagesize %u", meta_number, meta->mm_psize); } const txnid_t txnid = unaligned_peek_u64(4, &meta->mm_txnid_a); if (unlikely(txnid != unaligned_peek_u64(4, &meta->mm_txnid_b))) { - mdbx_warning("meta[%u] not completely updated, skip it", meta_number); + WARNING("meta[%u] not completely updated, skip it", meta_number); return MDBX_RESULT_TRUE; } /* LY: check signature as a checksum */ if (META_IS_STEADY(meta) && - unlikely(unaligned_peek_u64(4, &meta->mm_datasync_sign) != - meta_sign(meta))) { - mdbx_warning("meta[%u] has invalid steady-checksum (0x%" PRIx64 - " != 0x%" PRIx64 "), skip it", - meta_number, unaligned_peek_u64(4, &meta->mm_datasync_sign), - meta_sign(meta)); + unlikely(unaligned_peek_u64(4, &meta->mm_sign) != meta_sign(meta))) { + WARNING("meta[%u] has invalid steady-checksum (0x%" PRIx64 " != 0x%" PRIx64 + "), skip it", + meta_number, unaligned_peek_u64(4, &meta->mm_sign), + meta_sign(meta)); return MDBX_RESULT_TRUE; } - 
mdbx_debug("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, - meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, - meta->mm_geo.next, meta->mm_geo.now, meta->mm_geo.upper, - pv2pages(meta->mm_geo.grow_pv), pv2pages(meta->mm_geo.shrink_pv), - txnid, mdbx_durable_str(meta)); + DEBUG("checking meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + page->mp_pgno, meta->mm_dbs[MAIN_DBI].md_root, + meta->mm_dbs[FREE_DBI].md_root, meta->mm_geo.lower, meta->mm_geo.next, + meta->mm_geo.now, meta->mm_geo.upper, pv2pages(meta->mm_geo.grow_pv), + pv2pages(meta->mm_geo.shrink_pv), txnid, durable_caption(meta)); if (unlikely(txnid < MIN_TXNID || txnid > MAX_TXNID)) { - mdbx_warning("meta[%u] has invalid txnid %" PRIaTXN ", skip it", - meta_number, txnid); + WARNING("meta[%u] has invalid txnid %" PRIaTXN ", skip it", meta_number, + txnid); return MDBX_RESULT_TRUE; } /* LY: check min-pages value */ if (unlikely(meta->mm_geo.lower < MIN_PAGENO || meta->mm_geo.lower > MAX_PAGENO + 1)) { - mdbx_warning("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.lower); + WARNING("meta[%u] has invalid min-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.lower); return MDBX_INVALID; } @@ -10746,16 +12168,16 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (unlikely(meta->mm_geo.upper < MIN_PAGENO || meta->mm_geo.upper > MAX_PAGENO + 1 || meta->mm_geo.upper < meta->mm_geo.lower)) { - mdbx_warning("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.upper); + WARNING("meta[%u] has invalid max-pages (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.upper); return MDBX_INVALID; } /* LY: check last_pgno */ if 
(unlikely(meta->mm_geo.next < MIN_PAGENO || meta->mm_geo.next - 1 > MAX_PAGENO)) { - mdbx_warning("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next); + WARNING("meta[%u] has invalid next-pageno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next); return MDBX_CORRUPTED; } @@ -10763,20 +12185,20 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, const uint64_t used_bytes = meta->mm_geo.next * (uint64_t)meta->mm_psize; if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { /* Here could be a race with DB-shrinking performed by other process */ - int err = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); + int err = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(err != MDBX_SUCCESS)) return err; if (unlikely(used_bytes > env->me_dxb_mmap.filesize)) { - mdbx_warning("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 - "), skip it", - meta_number, used_bytes, env->me_dxb_mmap.filesize); + WARNING("meta[%u] used-bytes (%" PRIu64 ") beyond filesize (%" PRIu64 + "), skip it", + meta_number, used_bytes, env->me_dxb_mmap.filesize); return MDBX_CORRUPTED; } } if (unlikely(meta->mm_geo.next - 1 > MAX_PAGENO || used_bytes > MAX_MAPSIZE)) { - mdbx_warning("meta[%u] has too large used-space (%" PRIu64 "), skip it", - meta_number, used_bytes); + WARNING("meta[%u] has too large used-space (%" PRIu64 "), skip it", + meta_number, used_bytes); return MDBX_TOO_LARGE; } @@ -10789,24 +12211,24 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, if (unlikely(mapsize_min < MIN_MAPSIZE || mapsize_min > MAX_MAPSIZE)) { if (MAX_MAPSIZE != MAX_MAPSIZE64 && mapsize_min > MAX_MAPSIZE && mapsize_min <= MAX_MAPSIZE64) { - mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && - used_bytes <= MAX_MAPSIZE); - mdbx_warning("meta[%u] has too large min-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_min, 
used_bytes); + eASSERT(env, + meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large min-mapsize (%" PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_min, used_bytes); geo_lower = (pgno_t)((mapsize_min = MAX_MAPSIZE) / meta->mm_psize); if (geo_lower > MAX_PAGENO + 1) { geo_lower = MAX_PAGENO + 1; mapsize_min = geo_lower * (uint64_t)meta->mm_psize; } - mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "lower", geo_lower, meta->mm_geo.lower); + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "lower", geo_lower, meta->mm_geo.lower); meta->mm_geo.lower = geo_lower; } else { - mdbx_warning("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_min); + WARNING("meta[%u] has invalid min-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_min); return MDBX_VERSION_MISMATCH; } } @@ -10819,25 +12241,25 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, ceil_powerof2((size_t)mapsize_max, env->me_os_psize) / (size_t)meta->mm_psize)) { if (mapsize_max > MAX_MAPSIZE64) { - mdbx_warning("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", - meta_number, mapsize_max); + WARNING("meta[%u] has invalid max-mapsize (%" PRIu64 "), skip it", + meta_number, mapsize_max); return MDBX_VERSION_MISMATCH; } /* allow to open large DB from a 32-bit environment */ - mdbx_assert(env, meta->mm_geo.next - 1 <= MAX_PAGENO && - used_bytes <= MAX_MAPSIZE); - mdbx_warning("meta[%u] has too large max-mapsize (%" PRIu64 "), " - "but size of used space still acceptable (%" PRIu64 ")", - meta_number, mapsize_max, used_bytes); + eASSERT(env, + meta->mm_geo.next - 1 <= MAX_PAGENO && used_bytes <= MAX_MAPSIZE); + WARNING("meta[%u] has too large max-mapsize (%" 
PRIu64 "), " + "but size of used space still acceptable (%" PRIu64 ")", + meta_number, mapsize_max, used_bytes); geo_upper = (pgno_t)((mapsize_max = MAX_MAPSIZE) / meta->mm_psize); if (geo_upper > MAX_PAGENO + 1) { geo_upper = MAX_PAGENO + 1; mapsize_max = geo_upper * (uint64_t)meta->mm_psize; } - mdbx_warning("meta[%u] consider get-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "upper", geo_upper, meta->mm_geo.upper); + WARNING("meta[%u] consider get-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "upper", geo_upper, meta->mm_geo.upper); meta->mm_geo.upper = geo_upper; } @@ -10855,16 +12277,16 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, geo_now = geo_upper; if (unlikely(meta->mm_geo.next > geo_now)) { - mdbx_warning("meta[%u] next-pageno (%" PRIaPGNO - ") is beyond end-pgno (%" PRIaPGNO "), skip it", - meta_number, meta->mm_geo.next, geo_now); + WARNING("meta[%u] next-pageno (%" PRIaPGNO + ") is beyond end-pgno (%" PRIaPGNO "), skip it", + meta_number, meta->mm_geo.next, geo_now); return MDBX_CORRUPTED; } if (meta->mm_geo.now != geo_now) { - mdbx_warning("meta[%u] consider geo-%s pageno is %" PRIaPGNO - " instead of wrong %" PRIaPGNO - ", will be corrected on next commit(s)", - meta_number, "now", geo_now, meta->mm_geo.now); + WARNING("meta[%u] consider geo-%s pageno is %" PRIaPGNO + " instead of wrong %" PRIaPGNO + ", will be corrected on next commit(s)", + meta_number, "now", geo_now, meta->mm_geo.now); meta->mm_geo.now = geo_now; } @@ -10875,12 +12297,12 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, meta->mm_dbs[FREE_DBI].md_entries || meta->mm_dbs[FREE_DBI].md_leaf_pages || meta->mm_dbs[FREE_DBI].md_overflow_pages)) { - mdbx_warning("meta[%u] has false-empty %s, skip it", meta_number, "GC"); + WARNING("meta[%u] has false-empty %s, skip it", meta_number, "GC"); return 
MDBX_CORRUPTED; } } else if (unlikely(meta->mm_dbs[FREE_DBI].md_root >= meta->mm_geo.next)) { - mdbx_warning("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", - meta_number, "GC", meta->mm_dbs[FREE_DBI].md_root); + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "GC", meta->mm_dbs[FREE_DBI].md_root); return MDBX_CORRUPTED; } @@ -10891,49 +12313,48 @@ static int mdbx_validate_meta(MDBX_env *env, MDBX_meta *const meta, meta->mm_dbs[MAIN_DBI].md_entries || meta->mm_dbs[MAIN_DBI].md_leaf_pages || meta->mm_dbs[MAIN_DBI].md_overflow_pages)) { - mdbx_warning("meta[%u] has false-empty %s", meta_number, "MainDB"); + WARNING("meta[%u] has false-empty %s", meta_number, "MainDB"); return MDBX_CORRUPTED; } } else if (unlikely(meta->mm_dbs[MAIN_DBI].md_root >= meta->mm_geo.next)) { - mdbx_warning("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", - meta_number, "MainDB", meta->mm_dbs[MAIN_DBI].md_root); + WARNING("meta[%u] has invalid %s-root %" PRIaPGNO ", skip it", meta_number, + "MainDB", meta->mm_dbs[MAIN_DBI].md_root); return MDBX_CORRUPTED; } if (unlikely(meta->mm_dbs[FREE_DBI].md_mod_txnid > txnid)) { - mdbx_warning("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); + WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->mm_dbs[FREE_DBI].md_mod_txnid, "GC"); return MDBX_CORRUPTED; } if (unlikely(meta->mm_dbs[MAIN_DBI].md_mod_txnid > txnid)) { - mdbx_warning("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", - meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); + WARNING("meta[%u] has wrong md_mod_txnid %" PRIaTXN " for %s, skip it", + meta_number, meta->mm_dbs[MAIN_DBI].md_mod_txnid, "MainDB"); return MDBX_CORRUPTED; } return MDBX_SUCCESS; } -static int mdbx_validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, - MDBX_meta *dest) { +static int validate_meta_copy(MDBX_env *env, const MDBX_meta *meta, 
+ MDBX_meta *dest) { *dest = *meta; - return mdbx_validate_meta(env, dest, data_page(meta), - bytes2pgno(env, (uint8_t *)meta - env->me_map), - nullptr); + return validate_meta(env, dest, data_page(meta), + bytes2pgno(env, ptr_dist(meta, env->me_map)), nullptr); } /* Read the environment parameters of a DB environment * before mapping it into memory. */ -__cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, - const int lck_exclusive, - const mdbx_mode_t mode_bits) { - int rc = mdbx_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); +__cold static int read_header(MDBX_env *env, MDBX_meta *dest, + const int lck_exclusive, + const mdbx_mode_t mode_bits) { + memset(dest, 0, sizeof(MDBX_meta)); + int rc = osal_filesize(env->me_lazy_fd, &env->me_dxb_mmap.filesize); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(dest, 0, sizeof(MDBX_meta)); - unaligned_poke_u64(4, dest->mm_datasync_sign, MDBX_DATASIGN_WEAK); + unaligned_poke_u64(4, dest->mm_sign, MDBX_DATASIGN_WEAK); rc = MDBX_CORRUPTED; /* Read twice all meta pages so we can find the latest one. 
*/ @@ -10950,54 +12371,84 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, char buffer[MIN_PAGESIZE]; unsigned retryleft = 42; while (1) { - mdbx_trace("reading meta[%d]: offset %u, bytes %u, retry-left %u", - meta_number, offset, MIN_PAGESIZE, retryleft); - int err = mdbx_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + TRACE("reading meta[%d]: offset %u, bytes %u, retry-left %u", meta_number, + offset, MIN_PAGESIZE, retryleft); + int err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && + env->me_dxb_mmap.filesize == 0 && + mode_bits /* non-zero for DB creation */ != 0) { + NOTICE("read meta: empty file (%d, %s)", err, mdbx_strerror(err)); + return err; + } +#if defined(_WIN32) || defined(_WIN64) + if (err == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + err = osal_pread(env->me_lazy_fd, buffer, MIN_PAGESIZE, offset); + if (err == ERROR_LOCK_VIOLATION && --retryleft) { + WARNING("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); + continue; + } + } +#endif /* Windows */ if (err != MDBX_SUCCESS) { - if (err == MDBX_ENODATA && offset == 0 && loop_count == 0 && - env->me_dxb_mmap.filesize == 0 && - mode_bits /* non-zero for DB creation */ != 0) - mdbx_notice("read meta: empty file (%d, %s)", err, - mdbx_strerror(err)); - else - mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } char again[MIN_PAGESIZE]; - err = mdbx_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); + err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); +#if defined(_WIN32) || defined(_WIN64) + if (err == ERROR_LOCK_VIOLATION) { + SleepEx(0, true); + err = osal_pread(env->me_lazy_fd, again, MIN_PAGESIZE, offset); + if (err == ERROR_LOCK_VIOLATION && --retryleft) { + WARNING("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, 
err, + mdbx_strerror(err)); + continue; + } + } +#endif /* Windows */ if (err != MDBX_SUCCESS) { - mdbx_error("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, - mdbx_strerror(err)); + ERROR("read meta[%u,%u]: %i, %s", offset, MIN_PAGESIZE, err, + mdbx_strerror(err)); return err; } if (memcmp(buffer, again, MIN_PAGESIZE) == 0 || --retryleft == 0) break; - mdbx_verbose("meta[%u] was updated, re-read it", meta_number); + VERBOSE("meta[%u] was updated, re-read it", meta_number); } if (!retryleft) { - mdbx_error("meta[%u] is too volatile, skip it", meta_number); + ERROR("meta[%u] is too volatile, skip it", meta_number); continue; } MDBX_page *const page = (MDBX_page *)buffer; MDBX_meta *const meta = page_meta(page); - rc = mdbx_validate_meta(env, meta, page, meta_number, &guess_pagesize); + rc = validate_meta(env, meta, page, meta_number, &guess_pagesize); if (rc != MDBX_SUCCESS) continue; - if ((env->me_stuck_meta < 0) - ? meta_ot(meta_bootid_match(meta) ? prefer_last : prefer_steady, - env, dest, meta) - : (meta_number == (unsigned)env->me_stuck_meta)) { + bool latch; + if (env->me_stuck_meta >= 0) + latch = (meta_number == (unsigned)env->me_stuck_meta); + else if (meta_bootid_match(meta)) + latch = meta_choice_recent( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + else + latch = meta_choice_steady( + meta->unsafe_txnid, SIGN_IS_STEADY(meta->unsafe_sign), + dest->unsafe_txnid, SIGN_IS_STEADY(dest->unsafe_sign)); + if (latch) { *dest = *meta; if (!lck_exclusive && !META_IS_STEADY(dest)) loop_limit += 1; /* LY: should re-read to hush race with update */ - mdbx_verbose("latch meta[%u]", meta_number); + VERBOSE("latch meta[%u]", meta_number); } } @@ -11005,7 +12456,7 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, (env->me_stuck_meta < 0 && !(META_IS_STEADY(dest) || meta_weak_acceptable(env, dest, lck_exclusive)))) { - mdbx_error("%s", "no usable meta-pages, database is 
corrupted"); + ERROR("%s", "no usable meta-pages, database is corrupted"); if (rc == MDBX_SUCCESS) { /* TODO: try to restore the database by fully checking b-tree structure * for the each meta page, if the corresponding option was given */ @@ -11017,18 +12468,18 @@ __cold static int mdbx_read_header(MDBX_env *env, MDBX_meta *dest, return MDBX_SUCCESS; } -__cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, - unsigned num) { - mdbx_ensure(env, is_powerof2(env->me_psize)); - mdbx_ensure(env, env->me_psize >= MIN_PAGESIZE); - mdbx_ensure(env, env->me_psize <= MAX_PAGESIZE); - mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); +__cold static MDBX_page *meta_model(const MDBX_env *env, MDBX_page *model, + size_t num) { + ENSURE(env, is_powerof2(env->me_psize)); + ENSURE(env, env->me_psize >= MIN_PAGESIZE); + ENSURE(env, env->me_psize <= MAX_PAGESIZE); + ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); memset(model, 0, env->me_psize); - model->mp_pgno = num; + model->mp_pgno = (pgno_t)num; model->mp_flags = P_META; MDBX_meta *const model_meta = page_meta(model); unaligned_poke_u64(4, model_meta->mm_magic_and_version, MDBX_DATA_MAGIC); @@ -11041,68 +12492,50 @@ __cold static MDBX_page *mdbx_meta_model(const MDBX_env *env, MDBX_page *model, model_meta->mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); model_meta->mm_geo.next = NUM_METAS; - mdbx_ensure(env, model_meta->mm_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); - mdbx_ensure(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); - mdbx_ensure(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); - 
mdbx_ensure(env, model_meta->mm_geo.next >= MIN_PAGENO); - mdbx_ensure(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); - mdbx_ensure(env, model_meta->mm_geo.grow_pv == - pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); - mdbx_ensure(env, model_meta->mm_geo.shrink_pv == - pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); + ENSURE(env, model_meta->mm_geo.lower >= MIN_PAGENO); + ENSURE(env, model_meta->mm_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, model_meta->mm_geo.now >= model_meta->mm_geo.lower); + ENSURE(env, model_meta->mm_geo.now <= model_meta->mm_geo.upper); + ENSURE(env, model_meta->mm_geo.next >= MIN_PAGENO); + ENSURE(env, model_meta->mm_geo.next <= model_meta->mm_geo.now); + ENSURE(env, model_meta->mm_geo.grow_pv == + pages2pv(pv2pages(model_meta->mm_geo.grow_pv))); + ENSURE(env, model_meta->mm_geo.shrink_pv == + pages2pv(pv2pages(model_meta->mm_geo.shrink_pv))); model_meta->mm_psize = env->me_psize; model_meta->mm_dbs[FREE_DBI].md_flags = MDBX_INTEGERKEY; model_meta->mm_dbs[FREE_DBI].md_root = P_INVALID; model_meta->mm_dbs[MAIN_DBI].md_root = P_INVALID; meta_set_txnid(env, model_meta, MIN_TXNID + num); - unaligned_poke_u64(4, model_meta->mm_datasync_sign, meta_sign(model_meta)); - mdbx_assert(env, meta_checktxnid(env, model_meta, true)); - return (MDBX_page *)((uint8_t *)model + env->me_psize); + unaligned_poke_u64(4, model_meta->mm_sign, meta_sign(model_meta)); + eASSERT(env, coherency_check_meta(env, model_meta, true)); + return ptr_disp(model, env->me_psize); } /* Fill in most of the zeroed meta-pages for an empty database environment. * Return pointer to recently (head) meta-page. 
*/ -__cold static MDBX_meta *mdbx_init_metas(const MDBX_env *env, void *buffer) { +__cold static MDBX_meta *init_metas(const MDBX_env *env, void *buffer) { MDBX_page *page0 = (MDBX_page *)buffer; - MDBX_page *page1 = mdbx_meta_model(env, page0, 0); - MDBX_page *page2 = mdbx_meta_model(env, page1, 1); - mdbx_meta_model(env, page2, 2); - mdbx_assert(env, !meta_eq(env, page_meta(page0), page_meta(page1))); - mdbx_assert(env, !meta_eq(env, page_meta(page1), page_meta(page2))); - mdbx_assert(env, !meta_eq(env, page_meta(page2), page_meta(page0))); + MDBX_page *page1 = meta_model(env, page0, 0); + MDBX_page *page2 = meta_model(env, page1, 1); + meta_model(env, page2, 2); return page_meta(page2); } -#if MDBX_ENABLE_MADVISE && !(defined(_WIN32) || defined(_WIN64)) -static size_t mdbx_madvise_threshold(const MDBX_env *env, - const size_t largest_bytes) { - /* TODO: use options */ - const unsigned factor = 9; - const size_t threshold = (largest_bytes < (65536ul << factor)) - ? 65536 /* minimal threshold */ - : (largest_bytes > (MEGABYTE * 4 << factor)) - ? 
MEGABYTE * 4 /* maximal threshold */ - : largest_bytes >> factor; - return bytes_align2os_bytes(env, threshold); -} -#endif /* MDBX_ENABLE_MADVISE */ - -static int mdbx_sync_locked(MDBX_env *env, unsigned flags, - MDBX_meta *const pending) { - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); +static int sync_locked(MDBX_env *env, unsigned flags, MDBX_meta *const pending, + meta_troika_t *const troika) { + eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); const MDBX_meta *const meta0 = METAPAGE(env, 0); const MDBX_meta *const meta1 = METAPAGE(env, 1); const MDBX_meta *const meta2 = METAPAGE(env, 2); - const MDBX_meta *const head = constmeta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, troika); int rc; - mdbx_assert(env, meta_eq_mask(env) == 0); - mdbx_assert(env, - pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); - mdbx_assert(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); - mdbx_assert(env, pending->mm_geo.next <= pending->mm_geo.now); + eASSERT(env, + pending < METAPAGE(env, 0) || pending > METAPAGE(env, NUM_METAS)); + eASSERT(env, (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0); + eASSERT(env, pending->mm_geo.next <= pending->mm_geo.now); if (flags & MDBX_SAFE_NOSYNC) { /* Check auto-sync conditions */ @@ -11110,118 +12543,145 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, atomic_load32(&env->me_lck->mti_autosync_threshold, mo_Relaxed); const uint64_t autosync_period = atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed); + uint64_t eoos_timestamp; if ((autosync_threshold && - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) >= autosync_threshold) || (autosync_period && - mdbx_osal_monotime() - - atomic_load64(&env->me_lck->mti_sync_timestamp, mo_Relaxed) >= - autosync_period)) + (eoos_timestamp = + atomic_load64(&env->me_lck->mti_eoos_timestamp, mo_Relaxed)) && + osal_monotime() 
- eoos_timestamp >= autosync_period)) flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; /* force steady */ } pgno_t shrink = 0; if (flags & MDBX_SHRINK_ALLOWED) { - /* LY: check conditions to discard unused pages */ - const pgno_t largest_pgno = mdbx_find_largest( - env, (head->mm_geo.next > pending->mm_geo.next) ? head->mm_geo.next - : pending->mm_geo.next); - mdbx_assert(env, largest_pgno >= NUM_METAS); + const size_t prev_discarded_pgno = + atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); + if (prev_discarded_pgno < pending->mm_geo.next) + env->me_lck->mti_discarded_tail.weak = pending->mm_geo.next; + else if (prev_discarded_pgno >= + pending->mm_geo.next + env->me_madv_threshold) { + /* LY: check conditions to discard unused pages */ + const pgno_t largest_pgno = find_largest_snapshot( + env, (head.ptr_c->mm_geo.next > pending->mm_geo.next) + ? head.ptr_c->mm_geo.next + : pending->mm_geo.next); + eASSERT(env, largest_pgno >= NUM_METAS); + #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - const pgno_t edge = env->me_poison_edge; - if (edge > largest_pgno) { - env->me_poison_edge = largest_pgno; - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + pgno2bytes(env, largest_pgno), - pgno2bytes(env, edge - largest_pgno)); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + - pgno2bytes(env, largest_pgno), - pgno2bytes(env, edge - largest_pgno)); - } + const pgno_t edge = env->me_poison_edge; + if (edge > largest_pgno) { + env->me_poison_edge = largest_pgno; + VALGRIND_MAKE_MEM_NOACCESS( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + MDBX_ASAN_POISON_MEMORY_REGION( + ptr_disp(env->me_map, pgno2bytes(env, largest_pgno)), + pgno2bytes(env, edge - largest_pgno)); + } #endif /* MDBX_USE_VALGRIND || __SANITIZE_ADDRESS__ */ + #if MDBX_ENABLE_MADVISE && \ (defined(MADV_DONTNEED) || defined(POSIX_MADV_DONTNEED)) - const size_t largest_bytes = pgno2bytes(env, largest_pgno); - /* threshold to avoid unreasonable 
frequent madvise() calls */ - const size_t madvise_threshold = mdbx_madvise_threshold(env, largest_bytes); - const size_t discard_edge_bytes = bytes_align2os_bytes( - env, ((MDBX_RDONLY & - (env->me_lck_mmap.lck ? env->me_lck_mmap.lck->mti_envmode.weak - : env->me_flags)) - ? largest_bytes - : largest_bytes + madvise_threshold)); - const pgno_t discard_edge_pgno = bytes2pgno(env, discard_edge_bytes); - const pgno_t prev_discarded_pgno = - atomic_load32(&env->me_lck->mti_discarded_tail, mo_Relaxed); - if (prev_discarded_pgno >= - discard_edge_pgno + bytes2pgno(env, madvise_threshold)) { - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", largest_pgno, - prev_discarded_pgno); - atomic_store32(&env->me_lck->mti_discarded_tail, discard_edge_pgno, - mo_Relaxed); - const size_t prev_discarded_bytes = - ceil_powerof2(pgno2bytes(env, prev_discarded_pgno), env->me_os_psize); - mdbx_ensure(env, prev_discarded_bytes > discard_edge_bytes); + const size_t discard_edge_pgno = pgno_align2os_pgno(env, largest_pgno); + if (prev_discarded_pgno >= discard_edge_pgno + env->me_madv_threshold) { + const size_t prev_discarded_bytes = + pgno_align2os_bytes(env, prev_discarded_pgno); + const size_t discard_edge_bytes = pgno2bytes(env, discard_edge_pgno); + /* из-за выравнивания prev_discarded_bytes и discard_edge_bytes + * могут быть равны */ + if (prev_discarded_bytes > discard_edge_bytes) { + NOTICE("shrink-MADV_%s %zu..%zu", "DONTNEED", discard_edge_pgno, + prev_discarded_pgno); + munlock_after(env, discard_edge_pgno, + bytes_align2os_bytes(env, env->me_dxb_mmap.current)); + const uint32_t munlocks_before = + atomic_load32(&env->me_lck->mti_mlcnt[1], mo_Relaxed); #if defined(MADV_DONTNEED) - int advise = MADV_DONTNEED; + int advise = MADV_DONTNEED; #if defined(MADV_FREE) && \ 0 /* MADV_FREE works for only anonymous vma at the moment */ - if ((env->me_flags & MDBX_WRITEMAP) && - mdbx_linux_kernel_version > 0x04050000) - advise = MADV_FREE; + if ((env->me_flags & MDBX_WRITEMAP) && + 
linux_kernel_version > 0x04050000) + advise = MADV_FREE; #endif /* MADV_FREE */ - int err = madvise(env->me_map + discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, advise) - ? ignore_enosys(errno) - : MDBX_SUCCESS; + int err = madvise(ptr_disp(env->me_map, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, advise) + ? ignore_enosys(errno) + : MDBX_SUCCESS; #else - int err = ignore_enosys(posix_madvise( - env->me_map + discard_edge_bytes, - prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); + int err = ignore_enosys(posix_madvise( + ptr_disp(env->me_map, discard_edge_bytes), + prev_discarded_bytes - discard_edge_bytes, POSIX_MADV_DONTNEED)); #endif - if (unlikely(MDBX_IS_ERROR(err))) - return err; - } + if (unlikely(MDBX_IS_ERROR(err))) { + const uint32_t mlocks_after = + atomic_load32(&env->me_lck->mti_mlcnt[0], mo_Relaxed); + if (err == MDBX_EINVAL) { + const int severity = (mlocks_after - munlocks_before) + ? MDBX_LOG_NOTICE + : MDBX_LOG_WARN; + if (LOG_ENABLED(severity)) + debug_log( + severity, __func__, __LINE__, + "%s-madvise: ignore EINVAL (%d) since some pages maybe " + "locked (%u/%u mlcnt-processes)", + "shrink", err, mlocks_after, munlocks_before); + } else { + ERROR("%s-madvise(%s, %zu, +%zu), %u/%u mlcnt-processes, err %d", + "shrink", "DONTNEED", discard_edge_bytes, + prev_discarded_bytes - discard_edge_bytes, mlocks_after, + munlocks_before, err); + return err; + } + } else + env->me_lck->mti_discarded_tail.weak = discard_edge_pgno; + } + } #endif /* MDBX_ENABLE_MADVISE && (MADV_DONTNEED || POSIX_MADV_DONTNEED) */ - /* LY: check conditions to shrink datafile */ - const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; - pgno_t shrink_step = 0; - if (pending->mm_geo.shrink_pv && - pending->mm_geo.now - pending->mm_geo.next > - (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + backlog_gap) { - if (pending->mm_geo.now > largest_pgno && - pending->mm_geo.now - largest_pgno > shrink_step + 
backlog_gap) { - pgno_t grow_step = 0; - const pgno_t aligner = - pending->mm_geo.grow_pv - ? (grow_step = pv2pages(pending->mm_geo.grow_pv)) - : shrink_step; - const pgno_t with_backlog_gap = largest_pgno + backlog_gap; - const pgno_t aligned = pgno_align2os_pgno( - env, with_backlog_gap + aligner - with_backlog_gap % aligner); - const pgno_t bottom = - (aligned > pending->mm_geo.lower) ? aligned : pending->mm_geo.lower; - if (pending->mm_geo.now > bottom) { - if (META_IS_STEADY(meta_prefer_steady(env))) - /* force steady, but only if steady-checkpoint is present */ - flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; - shrink = pending->mm_geo.now - bottom; - pending->mm_geo.now = bottom; - if (unlikely(constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a))) { - const txnid_t txnid = - safe64_txnid_next(unaligned_peek_u64(4, pending->mm_txnid_a)); - mdbx_notice("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, - unaligned_peek_u64(4, pending->mm_txnid_a), txnid); - mdbx_ensure(env, env->me_txn0->mt_owner != mdbx_thread_self() && - !env->me_txn); - if (unlikely(txnid > MAX_TXNID)) { - rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); - goto fail; + /* LY: check conditions to shrink datafile */ + const pgno_t backlog_gap = 3 + pending->mm_dbs[FREE_DBI].md_depth * 3; + pgno_t shrink_step = 0; + if (pending->mm_geo.shrink_pv && + pending->mm_geo.now - pending->mm_geo.next > + (shrink_step = pv2pages(pending->mm_geo.shrink_pv)) + + backlog_gap) { + if (pending->mm_geo.now > largest_pgno && + pending->mm_geo.now - largest_pgno > shrink_step + backlog_gap) { + const pgno_t aligner = + pending->mm_geo.grow_pv + ? /* grow_step */ pv2pages(pending->mm_geo.grow_pv) + : shrink_step; + const pgno_t with_backlog_gap = largest_pgno + backlog_gap; + const pgno_t aligned = + pgno_align2os_pgno(env, (size_t)with_backlog_gap + aligner - + with_backlog_gap % aligner); + const pgno_t bottom = (aligned > pending->mm_geo.lower) + ? 
aligned + : pending->mm_geo.lower; + if (pending->mm_geo.now > bottom) { + if (TROIKA_HAVE_STEADY(troika)) + /* force steady, but only if steady-checkpoint is present */ + flags &= MDBX_WRITEMAP | MDBX_SHRINK_ALLOWED; + shrink = pending->mm_geo.now - bottom; + pending->mm_geo.now = bottom; + if (unlikely(head.txnid == pending->unsafe_txnid)) { + const txnid_t txnid = safe64_txnid_next(pending->unsafe_txnid); + NOTICE("force-forward pending-txn %" PRIaTXN " -> %" PRIaTXN, + pending->unsafe_txnid, txnid); + ENSURE(env, !env->me_txn0 || + (env->me_txn0->mt_owner != osal_thread_self() && + !env->me_txn)); + if (unlikely(txnid > MAX_TXNID)) { + rc = MDBX_TXN_FULL; + ERROR("txnid overflow, raise %d", rc); + goto fail; + } + meta_set_txnid(env, pending, txnid); + eASSERT(env, coherency_check_meta(env, pending, true)); } - meta_set_txnid(env, pending, txnid); - mdbx_assert(env, meta_checktxnid(env, pending, true)); } } } @@ -11230,126 +12690,134 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, /* LY: step#1 - sync previously written/updated data-pages */ rc = MDBX_RESULT_FALSE /* carry steady */; - if (atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { - mdbx_assert(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - enum mdbx_syncmode_bits mode_bits = MDBX_SYNC_NONE; + if (atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed)) { + eASSERT(env, ((flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); + enum osal_syncmode_bits mode_bits = MDBX_SYNC_NONE; + unsigned sync_op = 0; if ((flags & MDBX_SAFE_NOSYNC) == 0) { + sync_op = 1; mode_bits = MDBX_SYNC_DATA; - if (pending->mm_geo.next > meta_prefer_steady(env)->mm_geo.now) + if (pending->mm_geo.next > + meta_prefer_steady(env, troika).ptr_c->mm_geo.now) mode_bits |= MDBX_SYNC_SIZE; if (flags & MDBX_NOMETASYNC) mode_bits |= MDBX_SYNC_IODQ; - } + } else if (unlikely(env->me_incore)) + goto skip_incore_sync; + if (flags & MDBX_WRITEMAP) { #if MDBX_ENABLE_PGOP_STAT - 
env->me_lck->mti_pgop_stat.wops.weak += 1; + env->me_lck->mti_pgop_stat.msync.weak += sync_op; +#else + (void)sync_op; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (flags & MDBX_WRITEMAP) rc = - mdbx_msync(&env->me_dxb_mmap, 0, + osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, pending->mm_geo.next), mode_bits); - else - rc = mdbx_fsync(env->me_lazy_fd, mode_bits); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += sync_op; +#else + (void)sync_op; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->me_lazy_fd, mode_bits); + } if (unlikely(rc != MDBX_SUCCESS)) goto fail; rc = (flags & MDBX_SAFE_NOSYNC) ? MDBX_RESULT_TRUE /* carry non-steady */ : MDBX_RESULT_FALSE /* carry steady */; } - mdbx_assert(env, meta_checktxnid(env, pending, true)); + eASSERT(env, coherency_check_meta(env, pending, true)); /* Steady or Weak */ if (rc == MDBX_RESULT_FALSE /* carry steady */) { - atomic_store64(&env->me_lck->mti_sync_timestamp, mdbx_osal_monotime(), - mo_Relaxed); - unaligned_poke_u64(4, pending->mm_datasync_sign, meta_sign(pending)); - atomic_store32(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); + unaligned_poke_u64(4, pending->mm_sign, meta_sign(pending)); + atomic_store64(&env->me_lck->mti_eoos_timestamp, 0, mo_Relaxed); + atomic_store64(&env->me_lck->mti_unsynced_pages, 0, mo_Relaxed); } else { assert(rc == MDBX_RESULT_TRUE /* carry non-steady */); - unaligned_poke_u64(4, pending->mm_datasync_sign, MDBX_DATASIGN_WEAK); + skip_incore_sync: + eASSERT(env, env->me_lck->mti_unsynced_pages.weak > 0); + /* Может быть нулевым если unsynced_pages > 0 в результате спиллинга. 
+ * eASSERT(env, env->me_lck->mti_eoos_timestamp.weak != 0); */ + unaligned_poke_u64(4, pending->mm_sign, MDBX_DATASIGN_WEAK); } + const bool legal4overwrite = + head.txnid == pending->unsafe_txnid && + memcmp(&head.ptr_c->mm_dbs, &pending->mm_dbs, sizeof(pending->mm_dbs)) == + 0 && + memcmp(&head.ptr_c->mm_canary, &pending->mm_canary, + sizeof(pending->mm_canary)) == 0 && + memcmp(&head.ptr_c->mm_geo, &pending->mm_geo, sizeof(pending->mm_geo)) == + 0; MDBX_meta *target = nullptr; - if (constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a)) { - mdbx_assert(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - mdbx_assert(env, memcmp(&head->mm_canary, &pending->mm_canary, - sizeof(head->mm_canary)) == 0); - mdbx_assert(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(pending->mm_geo)) == 0); - if (!META_IS_STEADY(head) && META_IS_STEADY(pending)) - target = (MDBX_meta *)head; + if (head.txnid == pending->unsafe_txnid) { + ENSURE(env, legal4overwrite); + if (!head.is_steady && META_IS_STEADY(pending)) + target = (MDBX_meta *)head.ptr_c; else { - mdbx_ensure(env, meta_eq(env, head, pending)); - mdbx_debug("%s", "skip update meta"); + WARNING("%s", "skip update meta"); return MDBX_SUCCESS; } - } else if (head == meta0) - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta1, meta2); - else if (head == meta1) - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta2); - else { - mdbx_assert(env, head == meta2); - target = (MDBX_meta *)meta_ancient_prefer_weak(env, meta0, meta1); + } else { + const unsigned troika_tail = troika->tail_and_flags & 3; + ENSURE(env, troika_tail < NUM_METAS && troika_tail != troika->recent && + troika_tail != troika->prefer_steady); + target = (MDBX_meta *)meta_tail(env, troika).ptr_c; } /* LY: step#2 - update meta-page. 
*/ - mdbx_debug( - "writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO - ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, - pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, - pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, - pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), - unaligned_peek_u64(4, pending->mm_txnid_a), mdbx_durable_str(pending)); + DEBUG("writing meta%" PRIaPGNO " = root %" PRIaPGNO "/%" PRIaPGNO + ", geo %" PRIaPGNO "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + data_page(target)->mp_pgno, pending->mm_dbs[MAIN_DBI].md_root, + pending->mm_dbs[FREE_DBI].md_root, pending->mm_geo.lower, + pending->mm_geo.next, pending->mm_geo.now, pending->mm_geo.upper, + pv2pages(pending->mm_geo.grow_pv), pv2pages(pending->mm_geo.shrink_pv), + pending->unsafe_txnid, durable_caption(pending)); - mdbx_debug("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta0 == head) ? "head" - : (meta0 == target) ? "tail" - : "stay", - mdbx_durable_str(meta0), meta_txnid(env, meta0), - meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); - mdbx_debug("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta1 == head) ? "head" - : (meta1 == target) ? "tail" - : "stay", - mdbx_durable_str(meta1), meta_txnid(env, meta1), - meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); - mdbx_debug("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO - "/%" PRIaPGNO, - (meta2 == head) ? "head" - : (meta2 == target) ? "tail" - : "stay", - mdbx_durable_str(meta2), meta_txnid(env, meta2), - meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); + DEBUG("meta0: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta0 == head.ptr_c) ? "head" + : (meta0 == target) ? 
"tail" + : "stay", + durable_caption(meta0), constmeta_txnid(meta0), + meta0->mm_dbs[MAIN_DBI].md_root, meta0->mm_dbs[FREE_DBI].md_root); + DEBUG("meta1: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta1 == head.ptr_c) ? "head" + : (meta1 == target) ? "tail" + : "stay", + durable_caption(meta1), constmeta_txnid(meta1), + meta1->mm_dbs[MAIN_DBI].md_root, meta1->mm_dbs[FREE_DBI].md_root); + DEBUG("meta2: %s, %s, txn_id %" PRIaTXN ", root %" PRIaPGNO "/%" PRIaPGNO, + (meta2 == head.ptr_c) ? "head" + : (meta2 == target) ? "tail" + : "stay", + durable_caption(meta2), constmeta_txnid(meta2), + meta2->mm_dbs[MAIN_DBI].md_root, meta2->mm_dbs[FREE_DBI].md_root); - mdbx_assert(env, !meta_eq(env, pending, meta0)); - mdbx_assert(env, !meta_eq(env, pending, meta1)); - mdbx_assert(env, !meta_eq(env, pending, meta2)); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta0) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta0))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta1) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta1))); + eASSERT(env, pending->unsafe_txnid != constmeta_txnid(meta2) || + (META_IS_STEADY(pending) && !META_IS_STEADY(meta2))); - mdbx_assert(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); - mdbx_ensure(env, - target == head || constmeta_txnid(env, target) < - unaligned_peek_u64(4, pending->mm_txnid_a)); -#if MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ + eASSERT(env, ((env->me_flags ^ flags) & MDBX_WRITEMAP) == 0); + ENSURE(env, target == head.ptr_c || + constmeta_txnid(target) < pending->unsafe_txnid); if (flags & MDBX_WRITEMAP) { - mdbx_jitter4testing(true); - if (likely(target != head)) { + jitter4testing(true); + if (likely(target != head.ptr_c)) { /* LY: 'invalidate' the meta. 
*/ - meta_update_begin(env, target, - unaligned_peek_u64(4, pending->mm_txnid_a)); - unaligned_poke_u64(4, target->mm_datasync_sign, MDBX_DATASIGN_WEAK); + meta_update_begin(env, target, pending->unsafe_txnid); + unaligned_poke_u64(4, target->mm_sign, MDBX_DATASIGN_WEAK); #ifndef NDEBUG /* debug: provoke failure to catch a violators, but don't touch mm_psize * to allow readers catch actual pagesize. */ - uint8_t *provoke_begin = (uint8_t *)&target->mm_dbs[FREE_DBI].md_root; - uint8_t *provoke_end = (uint8_t *)&target->mm_datasync_sign; - memset(provoke_begin, 0xCC, provoke_end - provoke_begin); - mdbx_jitter4testing(false); + void *provoke_begin = &target->mm_dbs[FREE_DBI].md_root; + void *provoke_end = &target->mm_sign; + memset(provoke_begin, 0xCC, ptr_dist(provoke_end, provoke_begin)); + jitter4testing(false); #endif /* LY: update info */ @@ -11358,76 +12826,115 @@ static int mdbx_sync_locked(MDBX_env *env, unsigned flags, target->mm_dbs[MAIN_DBI] = pending->mm_dbs[MAIN_DBI]; target->mm_canary = pending->mm_canary; memcpy(target->mm_pages_retired, pending->mm_pages_retired, 8); - mdbx_jitter4testing(true); + jitter4testing(true); /* LY: 'commit' the meta */ meta_update_end(env, target, unaligned_peek_u64(4, pending->mm_txnid_b)); - mdbx_jitter4testing(true); - mdbx_assert(env, meta_checktxnid(env, target, true)); + jitter4testing(true); + eASSERT(env, coherency_check_meta(env, target, true)); } else { - /* dangerous case (target == head), only mm_datasync_sign could + /* dangerous case (target == head), only mm_sign could * me updated, check assertions once again */ - mdbx_ensure(env, constmeta_txnid(env, head) == - unaligned_peek_u64(4, pending->mm_txnid_a) && - !META_IS_STEADY(head) && META_IS_STEADY(pending)); - mdbx_ensure(env, memcmp(&head->mm_geo, &pending->mm_geo, - sizeof(head->mm_geo)) == 0); - mdbx_ensure(env, memcmp(&head->mm_dbs, &pending->mm_dbs, - sizeof(head->mm_dbs)) == 0); - mdbx_ensure(env, memcmp(&head->mm_canary, &pending->mm_canary, - 
sizeof(head->mm_canary)) == 0); + eASSERT(env, + legal4overwrite && !head.is_steady && META_IS_STEADY(pending)); + } + memcpy(target->mm_sign, pending->mm_sign, 8); + osal_flush_incoherent_cpu_writeback(); + jitter4testing(true); + if (!env->me_incore) { + if (!MDBX_AVOID_MSYNC) { + /* sync meta-pages */ +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync( + &env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), + (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE + : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } else { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + const MDBX_page *page = data_page(target); + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + ptr_dist(page, env->me_map)); + if (likely(rc == MDBX_SUCCESS)) { + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), + env->me_os_psize); + if ((flags & MDBX_NOMETASYNC) == 0 && + env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + } + } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; } - memcpy(target->mm_datasync_sign, pending->mm_datasync_sign, 8); - mdbx_flush_incoherent_cpu_writeback(); - mdbx_jitter4testing(true); - /* sync meta-pages */ - rc = - mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, NUM_METAS), - (flags & MDBX_NOMETASYNC) ? MDBX_SYNC_NONE - : MDBX_SYNC_DATA | MDBX_SYNC_IODQ); - if (unlikely(rc != MDBX_SUCCESS)) - goto fail; } else { - const MDBX_meta undo_meta = *target; - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? 
env->me_dsync_fd - : env->me_lazy_fd; #if MDBX_ENABLE_PGOP_STAT env->me_lck->mti_pgop_stat.wops.weak += 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - rc = mdbx_pwrite(fd, pending, sizeof(MDBX_meta), - (uint8_t *)target - env->me_map); + const MDBX_meta undo_meta = *target; + rc = osal_pwrite(env->me_fd4meta, pending, sizeof(MDBX_meta), + ptr_dist(target, env->me_map)); if (unlikely(rc != MDBX_SUCCESS)) { undo: - mdbx_debug("%s", "write failed, disk error?"); + DEBUG("%s", "write failed, disk error?"); /* On a failure, the pagecache still contains the new data. * Try write some old data back, to prevent it from being used. */ - mdbx_pwrite(fd, &undo_meta, sizeof(MDBX_meta), - (uint8_t *)target - env->me_map); + osal_pwrite(env->me_fd4meta, &undo_meta, sizeof(MDBX_meta), + ptr_dist(target, env->me_map)); goto fail; } - mdbx_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); + osal_flush_incoherent_mmap(target, sizeof(MDBX_meta), env->me_os_psize); /* sync meta-pages */ - if ((flags & MDBX_NOMETASYNC) == 0 && fd == env->me_lazy_fd) { - rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + if ((flags & MDBX_NOMETASYNC) == 0 && env->me_fd4meta == env->me_lazy_fd && + !env->me_incore) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (rc != MDBX_SUCCESS) goto undo; } - mdbx_assert(env, meta_checktxnid(env, target, true)); } + + uint64_t timestamp = 0; + while ("workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269") { + rc = + coherency_check_written(env, pending->unsafe_txnid, target, ×tamp); + if (likely(rc == MDBX_SUCCESS)) + break; + if (unlikely(rc != MDBX_RESULT_TRUE)) + goto fail; + } + + const uint32_t sync_txnid_dist = + ((flags & MDBX_NOMETASYNC) == 0) ? 0 + : ((flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC) + ? 
MDBX_NOMETASYNC_LAZY_FD + : MDBX_NOMETASYNC_LAZY_WRITEMAP; env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)unaligned_peek_u64(4, pending->mm_txnid_a) - - ((flags & MDBX_NOMETASYNC) ? UINT32_MAX / 3 : 0); + pending->mm_txnid_a[__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__].weak - + sync_txnid_dist; + + *troika = meta_tap(env); + for (MDBX_txn *txn = env->me_txn0; txn; txn = txn->mt_child) + if (troika != &txn->tw.troika) + txn->tw.troika = *troika; /* LY: shrink datafile if needed */ if (unlikely(shrink)) { - mdbx_verbose("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", - pending->mm_geo.now, shrink); - rc = mdbx_mapresize_implicit(env, pending->mm_geo.next, pending->mm_geo.now, - pending->mm_geo.upper); + VERBOSE("shrink to %" PRIaPGNO " pages (-%" PRIaPGNO ")", + pending->mm_geo.now, shrink); + rc = dxb_resize(env, pending->mm_geo.next, pending->mm_geo.now, + pending->mm_geo.upper, impilict_shrink); if (rc != MDBX_SUCCESS && rc != MDBX_EPERM) goto fail; - mdbx_assert(env, meta_checktxnid(env, target, true)); + eASSERT(env, coherency_check_meta(env, target, true)); } MDBX_lockinfo *const lck = env->me_lck_mmap.lck; @@ -11443,7 +12950,7 @@ fail: } static void recalculate_merge_threshold(MDBX_env *env) { - const unsigned bytes = page_space(env); + const size_t bytes = page_space(env); env->me_merge_threshold = (uint16_t)(bytes - (bytes * env->me_options.merge_threshold_16dot16_percent >> @@ -11455,24 +12962,27 @@ static void recalculate_merge_threshold(MDBX_env *env) { : bytes / 4 /* 25 % */)); } -__cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { +__cold static void setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(PTRDIFF_MAX > MAX_MAPSIZE); STATIC_ASSERT(MIN_PAGESIZE > sizeof(MDBX_page) + sizeof(MDBX_meta)); - mdbx_ensure(env, is_powerof2(pagesize)); - mdbx_ensure(env, pagesize >= MIN_PAGESIZE); - mdbx_ensure(env, pagesize <= MAX_PAGESIZE); + ENSURE(env, is_powerof2(pagesize)); + ENSURE(env, pagesize >= 
MIN_PAGESIZE); + ENSURE(env, pagesize <= MAX_PAGESIZE); env->me_psize = (unsigned)pagesize; if (env->me_pbuf) { - mdbx_memalign_free(env->me_pbuf); + osal_memalign_free(env->me_pbuf); env->me_pbuf = nullptr; } STATIC_ASSERT(MAX_GC1OVPAGE(MIN_PAGESIZE) > 4); STATIC_ASSERT(MAX_GC1OVPAGE(MAX_PAGESIZE) < MDBX_PGL_LIMIT); const intptr_t maxgc_ov1page = (pagesize - PAGEHDRSZ) / sizeof(pgno_t) - 1; - mdbx_ensure(env, maxgc_ov1page > 42 && - maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); + ENSURE(env, + maxgc_ov1page > 42 && maxgc_ov1page < (intptr_t)MDBX_PGL_LIMIT / 4); env->me_maxgc_ov1page = (unsigned)maxgc_ov1page; + env->me_maxgc_per_branch = + (unsigned)((pagesize - PAGEHDRSZ) / + (sizeof(indx_t) + sizeof(MDBX_node) + sizeof(txnid_t))); STATIC_ASSERT(LEAF_NODE_MAX(MIN_PAGESIZE) > sizeof(MDBX_db) + NODESIZE + 42); STATIC_ASSERT(LEAF_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); @@ -11481,16 +12991,16 @@ __cold static void mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { STATIC_ASSERT(BRANCH_NODE_MAX(MAX_PAGESIZE) < UINT16_MAX); const intptr_t branch_nodemax = BRANCH_NODE_MAX(pagesize); const intptr_t leaf_nodemax = LEAF_NODE_MAX(pagesize); - mdbx_ensure(env, - branch_nodemax > (intptr_t)(NODESIZE + 42) && + ENSURE(env, branch_nodemax > (intptr_t)(NODESIZE + 42) && branch_nodemax % 2 == 0 && leaf_nodemax > (intptr_t)(sizeof(MDBX_db) + NODESIZE + 42) && leaf_nodemax >= branch_nodemax && leaf_nodemax < (int)UINT16_MAX && leaf_nodemax % 2 == 0); env->me_leaf_nodemax = (unsigned)leaf_nodemax; + env->me_branch_nodemax = (unsigned)branch_nodemax; env->me_psize2log = (uint8_t)log2n_powerof2(pagesize); - mdbx_assert(env, pgno2bytes(env, 1) == pagesize); - mdbx_assert(env, bytes2pgno(env, pagesize + pagesize) == 2); + eASSERT(env, pgno2bytes(env, 1) == pagesize); + eASSERT(env, bytes2pgno(env, pagesize + pagesize) == 2); recalculate_merge_threshold(env); const pgno_t max_pgno = bytes2pgno(env, MAX_MAPSIZE); @@ -11499,7 +13009,7 @@ __cold static void 
mdbx_setup_pagesize(MDBX_env *env, const size_t pagesize) { intptr_t total_ram_pages, avail_ram_pages; int err = mdbx_get_sysraminfo(nullptr, &total_ram_pages, &avail_ram_pages); if (unlikely(err != MDBX_SUCCESS)) - mdbx_error("mdbx_get_sysraminfo(), rc %d", err); + ERROR("mdbx_get_sysraminfo(), rc %d", err); else { size_t reasonable_dpl_limit = (size_t)(total_ram_pages + avail_ram_pages) / 42; @@ -11531,64 +13041,93 @@ lckless_stub(const MDBX_env *env) { } __cold int mdbx_env_create(MDBX_env **penv) { - MDBX_env *env = mdbx_calloc(1, sizeof(MDBX_env)); + if (unlikely(!penv)) + return MDBX_EINVAL; + *penv = nullptr; + +#ifdef MDBX_HAVE_C11ATOMICS + if (unlikely(!atomic_is_lock_free((const volatile uint32_t *)penv))) { + ERROR("lock-free atomic ops for %u-bit types is required", 32); + return MDBX_INCOMPATIBLE; + } +#if MDBX_64BIT_ATOMIC + if (unlikely(!atomic_is_lock_free((const volatile uint64_t *)penv))) { + ERROR("lock-free atomic ops for %u-bit types is required", 64); + return MDBX_INCOMPATIBLE; + } +#endif /* MDBX_64BIT_ATOMIC */ +#endif /* MDBX_HAVE_C11ATOMICS */ + + const size_t os_psize = osal_syspagesize(); + if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { + ERROR("unsuitable system pagesize %" PRIuPTR, os_psize); + return MDBX_INCOMPATIBLE; + } + +#if defined(__linux__) || defined(__gnu_linux__) + if (unlikely(linux_kernel_version < 0x04000000)) { + /* 2022-09-01: Прошло уже больше двух после окончания какой-либо поддержки + * самого "долгоиграющего" ядра 3.16.85 ветки 3.x */ + ERROR("too old linux kernel %u.%u.%u.%u, the >= 4.0.0 is required", + linux_kernel_version >> 24, (linux_kernel_version >> 16) & 255, + (linux_kernel_version >> 8) & 255, linux_kernel_version & 255); + return MDBX_INCOMPATIBLE; + } +#endif /* Linux */ + + MDBX_env *env = osal_calloc(1, sizeof(MDBX_env)); if (unlikely(!env)) return MDBX_ENOMEM; env->me_maxreaders = DEFAULT_READERS; env->me_maxdbs = env->me_numdbs = CORE_DBS; - env->me_lazy_fd = 
INVALID_HANDLE_VALUE; - env->me_dsync_fd = INVALID_HANDLE_VALUE; - env->me_lfd = INVALID_HANDLE_VALUE; - env->me_pid = mdbx_getpid(); + env->me_lazy_fd = env->me_dsync_fd = env->me_fd4meta = env->me_lfd = + INVALID_HANDLE_VALUE; + env->me_pid = osal_getpid(); env->me_stuck_meta = -1; - env->me_options.dp_reserve_limit = 1024; - env->me_options.rp_augment_limit = 256 * 1024; - env->me_options.dp_limit = 64 * 1024; - if (env->me_options.dp_limit > MAX_PAGENO + 1 - NUM_METAS) - env->me_options.dp_limit = MAX_PAGENO + 1 - NUM_METAS; + env->me_options.rp_augment_limit = MDBX_PNL_INITIAL; + env->me_options.dp_reserve_limit = MDBX_PNL_INITIAL; env->me_options.dp_initial = MDBX_PNL_INITIAL; - if (env->me_options.dp_initial > env->me_options.dp_limit) - env->me_options.dp_initial = env->me_options.dp_limit; env->me_options.spill_max_denominator = 8; env->me_options.spill_min_denominator = 8; env->me_options.spill_parent4child_denominator = 0; env->me_options.dp_loose_limit = 64; env->me_options.merge_threshold_16dot16_percent = 65536 / 4 /* 25% */; - int rc; - const size_t os_psize = mdbx_syspagesize(); - if (unlikely(!is_powerof2(os_psize) || os_psize < MIN_PAGESIZE)) { - mdbx_error("unsuitable system pagesize %" PRIuPTR, os_psize); - rc = MDBX_INCOMPATIBLE; - goto bailout; - } - env->me_os_psize = (unsigned)os_psize; - mdbx_setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? env->me_os_psize - : MAX_PAGESIZE); +#if !(defined(_WIN32) || defined(_WIN64)) + env->me_options.writethrough_threshold = +#if defined(__linux__) || defined(__gnu_linux__) + mdbx_RunningOnWSL1 ? MAX_PAGENO : +#endif /* Linux */ + MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; +#endif /* Windows */ - rc = mdbx_fastmutex_init(&env->me_dbi_lock); + env->me_os_psize = (unsigned)os_psize; + setup_pagesize(env, (env->me_os_psize < MAX_PAGESIZE) ? 
env->me_os_psize + : MAX_PAGESIZE); + + int rc = osal_fastmutex_init(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; #if defined(_WIN32) || defined(_WIN64) - mdbx_srwlock_Init(&env->me_remap_guard); + osal_srwlock_Init(&env->me_remap_guard); InitializeCriticalSection(&env->me_windowsbug_lock); #else - rc = mdbx_fastmutex_init(&env->me_remap_guard); + rc = osal_fastmutex_init(&env->me_remap_guard); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_fastmutex_destroy(&env->me_dbi_lock); + osal_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - rc = mdbx_ipclock_stub(&stub->mti_wlock); + rc = osal_ipclock_stub(&stub->mti_wlock); #endif /* MDBX_LOCKING */ if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_fastmutex_destroy(&env->me_remap_guard); - mdbx_fastmutex_destroy(&env->me_dbi_lock); + osal_fastmutex_destroy(&env->me_remap_guard); + osal_fastmutex_destroy(&env->me_dbi_lock); goto bailout; } #endif /* Windows */ @@ -11599,8 +13138,7 @@ __cold int mdbx_env_create(MDBX_env **penv) { return MDBX_SUCCESS; bailout: - mdbx_free(env); - *penv = nullptr; + osal_free(env); return rc; } @@ -11644,7 +13182,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, return rc; const bool inside_txn = - (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()); + (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()); #if MDBX_DEBUG if (growth_step < 0) { @@ -11661,24 +13199,24 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; - const MDBX_geo *geo = nullptr; - if (inside_txn) - geo = &env->me_txn->mt_geo; - else { + if (!inside_txn) { int err = mdbx_txn_lock(env, false); if (unlikely(err != MDBX_SUCCESS)) return err; need_unlock = true; - - const MDBX_meta *head = constmeta_prefer_last(env); - geo = &head->mm_geo; - env->me_txn0->mt_txnid = 
constmeta_txnid(env, head); - mdbx_find_oldest(env->me_txn0); + env->me_txn0->tw.troika = meta_tap(env); + eASSERT(env, !env->me_txn && !env->me_txn0->mt_child); + env->me_txn0->mt_txnid = + env->me_txn0->tw.troika.txnid[env->me_txn0->tw.troika.recent]; + txn_oldest_reader(env->me_txn0); } - /* get untouched params from current write-txn or DB */ + /* get untouched params from current TXN or DB */ if (pagesize <= 0 || pagesize >= INT_MAX) pagesize = env->me_psize; + const MDBX_geo *const geo = + inside_txn ? &env->me_txn->mt_geo + : &meta_recent(env, &env->me_txn0->tw.troika).ptr_c->mm_geo; if (size_lower < 0) size_lower = pgno2bytes(env, geo->lower); if (size_now < 0) @@ -11694,7 +13232,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, rc = MDBX_EINVAL; goto bailout; } - const size_t usedbytes = pgno2bytes(env, mdbx_find_largest(env, geo->next)); + const size_t usedbytes = + pgno2bytes(env, find_largest_snapshot(env, geo->next)); if ((size_t)size_upper < usedbytes) { rc = MDBX_MAP_FULL; goto bailout; @@ -11714,7 +13253,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pagesize = env->me_os_psize; if ((uintptr_t)pagesize > MAX_PAGESIZE) pagesize = MAX_PAGESIZE; - mdbx_assert(env, (uintptr_t)pagesize >= MIN_PAGESIZE); + eASSERT(env, (uintptr_t)pagesize >= MIN_PAGESIZE); } else if (pagesize == 0 /* minimal */) pagesize = MIN_PAGESIZE; @@ -11824,7 +13363,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if ((size_t)size_upper < (size_t)size_lower) size_lower = size_upper; } - mdbx_assert(env, (size_upper - size_lower) % env->me_os_psize == 0); + eASSERT(env, (size_upper - size_lower) % env->me_os_psize == 0); if (size_now < size_lower) size_now = size_lower; @@ -11853,7 +13392,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (!env->me_map) { /* save user's geo-params for future open/create */ if (pagesize != (intptr_t)env->me_psize) - 
mdbx_setup_pagesize(env, pagesize); + setup_pagesize(env, pagesize); env->me_dbgeo.lower = size_lower; env->me_dbgeo.now = size_now; env->me_dbgeo.upper = size_upper; @@ -11861,53 +13400,54 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, growth_step)))); env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(pages2pv(bytes2pgno(env, shrink_threshold)))); + adjust_defaults(env); - mdbx_ensure(env, env->me_dbgeo.lower >= MIN_MAPSIZE); - mdbx_ensure(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); - mdbx_ensure(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.lower % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.lower >= MIN_MAPSIZE); + ENSURE(env, env->me_dbgeo.lower / (unsigned)pagesize >= MIN_PAGENO); + ENSURE(env, env->me_dbgeo.lower % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.lower % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.upper <= MAX_MAPSIZE); - mdbx_ensure(env, - env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1); - mdbx_ensure(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.upper % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.upper <= MAX_MAPSIZE); + ENSURE(env, env->me_dbgeo.upper / (unsigned)pagesize <= MAX_PAGENO + 1); + ENSURE(env, env->me_dbgeo.upper % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.upper % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.now >= env->me_dbgeo.lower); - mdbx_ensure(env, env->me_dbgeo.now <= env->me_dbgeo.upper); - mdbx_ensure(env, env->me_dbgeo.now % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.now % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.now >= env->me_dbgeo.lower); + ENSURE(env, env->me_dbgeo.now <= env->me_dbgeo.upper); + ENSURE(env, env->me_dbgeo.now % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.now % env->me_os_psize == 0); - mdbx_ensure(env, 
env->me_dbgeo.grow % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.grow % env->me_os_psize == 0); - mdbx_ensure(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); - mdbx_ensure(env, env->me_dbgeo.shrink % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.grow % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.grow % env->me_os_psize == 0); + ENSURE(env, env->me_dbgeo.shrink % (unsigned)pagesize == 0); + ENSURE(env, env->me_dbgeo.shrink % env->me_os_psize == 0); rc = MDBX_SUCCESS; } else { /* apply new params to opened environment */ - mdbx_ensure(env, pagesize == (intptr_t)env->me_psize); + ENSURE(env, pagesize == (intptr_t)env->me_psize); MDBX_meta meta; memset(&meta, 0, sizeof(meta)); const MDBX_geo *current_geo; if (!inside_txn) { - mdbx_assert(env, need_unlock); - const MDBX_meta *head = constmeta_prefer_last(env); + eASSERT(env, need_unlock); + const meta_ptr_t head = meta_recent(env, &env->me_txn0->tw.troika); uint64_t timestamp = 0; while ("workaround for " "https://libmdbx.dqdkfa.ru/dead-github/issues/269") { - meta = *head; - rc = meta_waittxnid(env, &meta, ×tamp); + meta = *head.ptr_c; + rc = coherency_check_readed(env, head.txnid, meta.mm_dbs, &meta, + ×tamp); if (likely(rc == MDBX_SUCCESS)) break; if (unlikely(rc != MDBX_RESULT_TRUE)) goto bailout; } - const txnid_t txnid = safe64_txnid_next(constmeta_txnid(env, &meta)); + const txnid_t txnid = safe64_txnid_next(head.txnid); if (unlikely(txnid > MAX_TXNID)) { rc = MDBX_TXN_FULL; - mdbx_error("txnid overflow, raise %d", rc); + ERROR("txnid overflow, raise %d", rc); goto bailout; } meta_set_txnid(env, &meta, txnid); @@ -11916,6 +13456,13 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, current_geo = &env->me_txn->mt_geo; } + /* update env-geo to avoid influences */ + env->me_dbgeo.now = pgno2bytes(env, current_geo->now); + env->me_dbgeo.lower = pgno2bytes(env, current_geo->lower); + env->me_dbgeo.upper = pgno2bytes(env, current_geo->upper); + 
env->me_dbgeo.grow = pgno2bytes(env, pv2pages(current_geo->grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(current_geo->shrink_pv)); + MDBX_geo new_geo; new_geo.lower = bytes2pgno(env, size_lower); new_geo.now = bytes2pgno(env, size_now); @@ -11924,22 +13471,19 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, new_geo.shrink_pv = pages2pv(bytes2pgno(env, shrink_threshold)); new_geo.next = current_geo->next; - mdbx_ensure(env, - pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); - mdbx_ensure(env, - pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); - mdbx_ensure(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); - mdbx_ensure(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); - mdbx_ensure(env, - new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); + ENSURE(env, pgno_align2os_bytes(env, new_geo.lower) == (size_t)size_lower); + ENSURE(env, pgno_align2os_bytes(env, new_geo.upper) == (size_t)size_upper); + ENSURE(env, pgno_align2os_bytes(env, new_geo.now) == (size_t)size_now); + ENSURE(env, new_geo.grow_pv == pages2pv(pv2pages(new_geo.grow_pv))); + ENSURE(env, new_geo.shrink_pv == pages2pv(pv2pages(new_geo.shrink_pv))); - mdbx_ensure(env, (size_t)size_lower >= MIN_MAPSIZE); - mdbx_ensure(env, new_geo.lower >= MIN_PAGENO); - mdbx_ensure(env, (size_t)size_upper <= MAX_MAPSIZE); - mdbx_ensure(env, new_geo.upper <= MAX_PAGENO + 1); - mdbx_ensure(env, new_geo.now >= new_geo.next); - mdbx_ensure(env, new_geo.upper >= new_geo.now); - mdbx_ensure(env, new_geo.now >= new_geo.lower); + ENSURE(env, (size_t)size_lower >= MIN_MAPSIZE); + ENSURE(env, new_geo.lower >= MIN_PAGENO); + ENSURE(env, (size_t)size_upper <= MAX_MAPSIZE); + ENSURE(env, new_geo.upper <= MAX_PAGENO + 1); + ENSURE(env, new_geo.now >= new_geo.next); + ENSURE(env, new_geo.upper >= new_geo.now); + ENSURE(env, new_geo.now >= new_geo.lower); if (memcmp(current_geo, &new_geo, sizeof(MDBX_geo)) != 0) { #if 
defined(_WIN32) || defined(_WIN64) @@ -11951,7 +13495,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, rc = MDBX_EPERM; goto bailout; } - int err = mdbx_rdt_lock(env); + int err = osal_rdt_lock(env); if (unlikely(MDBX_IS_ERROR(err))) { rc = err; goto bailout; @@ -11972,7 +13516,7 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, } } - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -11980,8 +13524,8 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, if (new_geo.now != current_geo->now || new_geo.upper != current_geo->upper) { - rc = mdbx_mapresize(env, current_geo->next, new_geo.now, new_geo.upper, - false); + rc = dxb_resize(env, current_geo->next, new_geo.now, new_geo.upper, + explicit_resize); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } @@ -11990,18 +13534,22 @@ mdbx_env_set_geometry(MDBX_env *env, intptr_t size_lower, intptr_t size_now, env->me_txn->mt_flags |= MDBX_TXN_DIRTY; } else { meta.mm_geo = new_geo; - rc = mdbx_sync_locked(env, env->me_flags, &meta); - } - - if (likely(rc == MDBX_SUCCESS)) { - /* store new geo to env to avoid influences */ - env->me_dbgeo.now = pgno2bytes(env, new_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, new_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); + rc = sync_locked(env, env->me_flags, &meta, &env->me_txn0->tw.troika); + if (likely(rc == MDBX_SUCCESS)) { + env->me_dbgeo.now = pgno2bytes(env, new_geo.now = meta.mm_geo.now); + env->me_dbgeo.upper = + pgno2bytes(env, new_geo.upper = meta.mm_geo.upper); + } } } + if (likely(rc == MDBX_SUCCESS)) { + /* update env-geo to avoid influences */ + eASSERT(env, env->me_dbgeo.now == pgno2bytes(env, new_geo.now)); + env->me_dbgeo.lower = pgno2bytes(env, new_geo.lower); + 
eASSERT(env, env->me_dbgeo.upper == pgno2bytes(env, new_geo.upper)); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(new_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(new_geo.shrink_pv)); + } } bailout: @@ -12033,25 +13581,25 @@ __cold int mdbx_env_get_maxreaders(const MDBX_env *env, unsigned *readers) { #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ __cold static int alloc_page_buf(MDBX_env *env) { - return env->me_pbuf - ? MDBX_SUCCESS - : mdbx_memalign_alloc(env->me_os_psize, env->me_psize * NUM_METAS, - &env->me_pbuf); + return env->me_pbuf ? MDBX_SUCCESS + : osal_memalign_alloc(env->me_os_psize, + env->me_psize * (size_t)NUM_METAS, + &env->me_pbuf); } /* Further setup required for opening an MDBX environment */ -__cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, - const mdbx_mode_t mode_bits) { - MDBX_meta meta; +__cold static int setup_dxb(MDBX_env *env, const int lck_rc, + const mdbx_mode_t mode_bits) { + MDBX_meta header; int rc = MDBX_RESULT_FALSE; - int err = mdbx_read_header(env, &meta, lck_rc, mode_bits); + int err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE || err != MDBX_ENODATA || (env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) return err; - mdbx_debug("%s", "create new database"); + DEBUG("%s", "create new database"); rc = /* new database */ MDBX_RESULT_TRUE; if (!env->me_dbgeo.now) { @@ -12065,49 +13613,51 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, if (unlikely(err != MDBX_SUCCESS)) return err; - meta = *mdbx_init_metas(env, env->me_pbuf); - err = mdbx_pwrite(env->me_lazy_fd, env->me_pbuf, env->me_psize * NUM_METAS, - 0); + header = *init_metas(env, env->me_pbuf); + err = osal_pwrite(env->me_lazy_fd, env->me_pbuf, + env->me_psize * (size_t)NUM_METAS, 0); if (unlikely(err != MDBX_SUCCESS)) return err; - err = mdbx_ftruncate(env->me_lazy_fd, - env->me_dxb_mmap.filesize = 
env->me_dbgeo.now); + err = osal_ftruncate(env->me_lazy_fd, env->me_dxb_mmap.filesize = + env->me_dxb_mmap.current = + env->me_dbgeo.now); if (unlikely(err != MDBX_SUCCESS)) return err; #ifndef NDEBUG /* just for checking */ - err = mdbx_read_header(env, &meta, lck_rc, mode_bits); + err = read_header(env, &header, lck_rc, mode_bits); if (unlikely(err != MDBX_SUCCESS)) return err; #endif } - mdbx_verbose( - "header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO "/%" PRIaPGNO - "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.grow_pv), pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), mdbx_durable_str(&meta)); + VERBOSE("header: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO " +%u -%u, txn_id %" PRIaTXN + ", %s", + header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, + header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, + header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), + pv2pages(header.mm_geo.shrink_pv), + unaligned_peek_u64(4, header.mm_txnid_a), durable_caption(&header)); - if (env->me_psize != meta.mm_psize) - mdbx_setup_pagesize(env, meta.mm_psize); - const size_t used_bytes = pgno2bytes(env, meta.mm_geo.next); + if (env->me_psize != header.mm_psize) + setup_pagesize(env, header.mm_psize); + const size_t used_bytes = pgno2bytes(env, header.mm_geo.next); const size_t used_aligned2os_bytes = ceil_powerof2(used_bytes, env->me_os_psize); if ((env->me_flags & MDBX_RDONLY) /* readonly */ || lck_rc != MDBX_RESULT_TRUE /* not exclusive */ || /* recovery mode */ env->me_stuck_meta >= 0) { /* use present params from db */ - const size_t pagesize = meta.mm_psize; + const size_t pagesize = header.mm_psize; err = mdbx_env_set_geometry( - env, meta.mm_geo.lower * pagesize, meta.mm_geo.now * pagesize, 
- meta.mm_geo.upper * pagesize, pv2pages(meta.mm_geo.grow_pv) * pagesize, - pv2pages(meta.mm_geo.shrink_pv) * pagesize, meta.mm_psize); + env, header.mm_geo.lower * pagesize, header.mm_geo.now * pagesize, + header.mm_geo.upper * pagesize, + pv2pages(header.mm_geo.grow_pv) * pagesize, + pv2pages(header.mm_geo.shrink_pv) * pagesize, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("%s: err %d", "could not apply preconfigured geometry from db", - err); + ERROR("%s: err %d", "could not apply geometry from db", err); return (err == MDBX_EINVAL) ? MDBX_INCOMPATIBLE : err; } } else if (env->me_dbgeo.now) { @@ -12122,13 +13672,13 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, * - shrink threshold or growth step * But ignore change just a 'now/current' size. */ if (bytes_align2os_bytes(env, env->me_dbgeo.upper) != - pgno2bytes(env, meta.mm_geo.upper) || + pgno2bytes(env, header.mm_geo.upper) || bytes_align2os_bytes(env, env->me_dbgeo.lower) != - pgno2bytes(env, meta.mm_geo.lower) || + pgno2bytes(env, header.mm_geo.lower) || bytes_align2os_bytes(env, env->me_dbgeo.shrink) != - pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)) || + pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)) || bytes_align2os_bytes(env, env->me_dbgeo.grow) != - pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv))) { + pgno2bytes(env, pv2pages(header.mm_geo.grow_pv))) { if (env->me_dbgeo.shrink && env->me_dbgeo.now > used_bytes) /* pre-shrink if enabled */ @@ -12137,84 +13687,82 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, err = mdbx_env_set_geometry(env, env->me_dbgeo.lower, env->me_dbgeo.now, env->me_dbgeo.upper, env->me_dbgeo.grow, - env->me_dbgeo.shrink, meta.mm_psize); + env->me_dbgeo.shrink, header.mm_psize); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("%s: err %d", "could not apply preconfigured db-geometry", - err); + ERROR("%s: err %d", "could not apply preconfigured db-geometry", err); return (err == MDBX_EINVAL) ? 
MDBX_INCOMPATIBLE : err; } /* update meta fields */ - meta.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); - meta.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); - meta.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); - meta.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); - meta.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); + header.mm_geo.now = bytes2pgno(env, env->me_dbgeo.now); + header.mm_geo.lower = bytes2pgno(env, env->me_dbgeo.lower); + header.mm_geo.upper = bytes2pgno(env, env->me_dbgeo.upper); + header.mm_geo.grow_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.grow)); + header.mm_geo.shrink_pv = pages2pv(bytes2pgno(env, env->me_dbgeo.shrink)); - mdbx_verbose("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO - "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO - " +%u -%u, txn_id %" PRIaTXN ", %s", - meta.mm_dbs[MAIN_DBI].md_root, meta.mm_dbs[FREE_DBI].md_root, - meta.mm_geo.lower, meta.mm_geo.next, meta.mm_geo.now, - meta.mm_geo.upper, pv2pages(meta.mm_geo.grow_pv), - pv2pages(meta.mm_geo.shrink_pv), - unaligned_peek_u64(4, meta.mm_txnid_a), - mdbx_durable_str(&meta)); + VERBOSE("amended: root %" PRIaPGNO "/%" PRIaPGNO ", geo %" PRIaPGNO + "/%" PRIaPGNO "-%" PRIaPGNO "/%" PRIaPGNO + " +%u -%u, txn_id %" PRIaTXN ", %s", + header.mm_dbs[MAIN_DBI].md_root, header.mm_dbs[FREE_DBI].md_root, + header.mm_geo.lower, header.mm_geo.next, header.mm_geo.now, + header.mm_geo.upper, pv2pages(header.mm_geo.grow_pv), + pv2pages(header.mm_geo.shrink_pv), + unaligned_peek_u64(4, header.mm_txnid_a), + durable_caption(&header)); } else { /* fetch back 'now/current' size, since it was ignored during comparison * and may differ. 
*/ - env->me_dbgeo.now = pgno_align2os_bytes(env, meta.mm_geo.now); + env->me_dbgeo.now = pgno_align2os_bytes(env, header.mm_geo.now); } - mdbx_ensure(env, meta.mm_geo.now >= meta.mm_geo.next); + ENSURE(env, header.mm_geo.now >= header.mm_geo.next); } else { /* geo-params are not pre-configured by user, * get current values from the meta. */ - env->me_dbgeo.now = pgno2bytes(env, meta.mm_geo.now); - env->me_dbgeo.lower = pgno2bytes(env, meta.mm_geo.lower); - env->me_dbgeo.upper = pgno2bytes(env, meta.mm_geo.upper); - env->me_dbgeo.grow = pgno2bytes(env, pv2pages(meta.mm_geo.grow_pv)); - env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(meta.mm_geo.shrink_pv)); + env->me_dbgeo.now = pgno2bytes(env, header.mm_geo.now); + env->me_dbgeo.lower = pgno2bytes(env, header.mm_geo.lower); + env->me_dbgeo.upper = pgno2bytes(env, header.mm_geo.upper); + env->me_dbgeo.grow = pgno2bytes(env, pv2pages(header.mm_geo.grow_pv)); + env->me_dbgeo.shrink = pgno2bytes(env, pv2pages(header.mm_geo.shrink_pv)); } - mdbx_ensure(env, - pgno_align2os_bytes(env, meta.mm_geo.now) == env->me_dbgeo.now); - mdbx_ensure(env, env->me_dbgeo.now >= used_bytes); + ENSURE(env, pgno_align2os_bytes(env, header.mm_geo.now) == env->me_dbgeo.now); + ENSURE(env, env->me_dbgeo.now >= used_bytes); const uint64_t filesize_before = env->me_dxb_mmap.filesize; if (unlikely(filesize_before != env->me_dbgeo.now)) { if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { - mdbx_verbose("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p), " - "assume other process working", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); + VERBOSE("filesize mismatch (expect %" PRIuPTR "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p), " + "assume other process working", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); } else { - mdbx_warning("filesize 
mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO - "p, have %" PRIu64 "b/%" PRIaPGNO "p)", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), - filesize_before, bytes2pgno(env, (size_t)filesize_before)); + WARNING("filesize mismatch (expect %" PRIuSIZE "b/%" PRIaPGNO + "p, have %" PRIu64 "b/%" PRIaPGNO "p)", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now), + filesize_before, bytes2pgno(env, (size_t)filesize_before)); if (filesize_before < used_bytes) { - mdbx_error("last-page beyond end-of-file (last %" PRIaPGNO - ", have %" PRIaPGNO ")", - meta.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); + ERROR("last-page beyond end-of-file (last %" PRIaPGNO + ", have %" PRIaPGNO ")", + header.mm_geo.next, bytes2pgno(env, (size_t)filesize_before)); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { if (filesize_before & (env->me_os_psize - 1)) { - mdbx_error("%s", "filesize should be rounded-up to system page"); + ERROR("%s", "filesize should be rounded-up to system page"); return MDBX_WANNA_RECOVERY; } - mdbx_warning("%s", "ignore filesize mismatch in readonly-mode"); + WARNING("%s", "ignore filesize mismatch in readonly-mode"); } else { - mdbx_verbose("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO - " pages", - env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); + VERBOSE("will resize datafile to %" PRIuSIZE " bytes, %" PRIaPGNO + " pages", + env->me_dbgeo.now, bytes2pgno(env, env->me_dbgeo.now)); } } } - mdbx_verbose("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", - bootid.x, bootid.y, (bootid.x | bootid.y) ? "" : "not-"); + VERBOSE("current boot-id %" PRIx64 "-%" PRIx64 " (%savailable)", bootid.x, + bootid.y, (bootid.x | bootid.y) ? 
"" : "not-"); #if MDBX_ENABLE_MADVISE /* calculate readahead hint before mmap with zero redundant pages */ @@ -12223,7 +13771,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, mdbx_is_readahead_reasonable(used_bytes, 0) == MDBX_RESULT_TRUE; #endif /* MDBX_ENABLE_MADVISE */ - err = mdbx_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, + err = osal_mmap(env->me_flags, &env->me_dxb_mmap, env->me_dbgeo.now, env->me_dbgeo.upper, lck_rc ? MMAP_OPTION_TRUNCATE : 0); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -12237,7 +13785,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, return err; #endif /* MADV_DONTDUMP */ #if defined(MADV_DODUMP) - if (mdbx_runtime_flags & MDBX_DBG_DUMP) { + if (runtime_flags & MDBX_DBG_DUMP) { const size_t meta_length_aligned2os = pgno_align2os_bytes(env, NUM_METAS); err = madvise(env->me_map, meta_length_aligned2os, MADV_DODUMP) ? ignore_enosys(errno) @@ -12253,14 +13801,14 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, VALGRIND_CREATE_BLOCK(env->me_map, env->me_dxb_mmap.limit, "mdbx"); #endif /* MDBX_USE_VALGRIND */ - mdbx_assert(env, used_bytes >= pgno2bytes(env, NUM_METAS) && - used_bytes <= env->me_dxb_mmap.limit); + eASSERT(env, used_bytes >= pgno2bytes(env, NUM_METAS) && + used_bytes <= env->me_dxb_mmap.limit); #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) if (env->me_dxb_mmap.filesize > used_bytes && env->me_dxb_mmap.filesize < env->me_dxb_mmap.limit) { - VALGRIND_MAKE_MEM_NOACCESS(env->me_map + used_bytes, + VALGRIND_MAKE_MEM_NOACCESS(ptr_disp(env->me_map, used_bytes), env->me_dxb_mmap.filesize - used_bytes); - MDBX_ASAN_POISON_MEMORY_REGION(env->me_map + used_bytes, + MDBX_ASAN_POISON_MEMORY_REGION(ptr_disp(env->me_map, used_bytes), env->me_dxb_mmap.filesize - used_bytes); } env->me_poison_edge = @@ -12269,198 +13817,211 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, : env->me_dxb_mmap.limit); #endif /* MDBX_USE_VALGRIND 
|| __SANITIZE_ADDRESS__ */ + meta_troika_t troika = meta_tap(env); +#if MDBX_DEBUG + meta_troika_dump(env, &troika); +#endif + eASSERT(env, !env->me_txn && !env->me_txn0); //-------------------------------- validate/rollback head & steady meta-pages if (unlikely(env->me_stuck_meta >= 0)) { /* recovery mode */ MDBX_meta clone; MDBX_meta const *const target = METAPAGE(env, env->me_stuck_meta); - err = mdbx_validate_meta_copy(env, target, &clone); + err = validate_meta_copy(env, target, &clone); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("target meta[%u] is corrupted", - bytes2pgno(env, (uint8_t *)data_page(target) - env->me_map)); + ERROR("target meta[%u] is corrupted", + bytes2pgno(env, ptr_dist(data_page(target), env->me_map))); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } } else /* not recovery mode */ while (1) { - const unsigned meta_clash_mask = meta_eq_mask(env); + const unsigned meta_clash_mask = meta_eq_mask(&troika); if (unlikely(meta_clash_mask)) { - mdbx_error("meta-pages are clashed: mask 0x%d", meta_clash_mask); + ERROR("meta-pages are clashed: mask 0x%d", meta_clash_mask); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } if (lck_rc != /* lck exclusive */ MDBX_RESULT_TRUE) { /* non-exclusive mode, * meta-pages should be validated by a first process opened the DB */ - volatile const MDBX_meta *const head = meta_prefer_last(env); - volatile const MDBX_meta *const steady = meta_prefer_steady(env); - const txnid_t head_txnid = meta_txnid(env, head); - const txnid_t steady_txnid = meta_txnid(env, steady); - if (head_txnid == steady_txnid) + if (troika.recent == troika.prefer_steady) break; if (!env->me_lck_mmap.lck) { /* LY: without-lck (read-only) mode, so it is impossible that other * process made weak checkpoint. 
*/ - mdbx_error("%s", "without-lck, unable recovery/rollback"); + ERROR("%s", "without-lck, unable recovery/rollback"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } /* LY: assume just have a collision with other running process, * or someone make a weak checkpoint */ - mdbx_verbose("%s", "assume collision or online weak checkpoint"); + VERBOSE("%s", "assume collision or online weak checkpoint"); break; } - mdbx_assert(env, lck_rc == MDBX_RESULT_TRUE); + eASSERT(env, lck_rc == MDBX_RESULT_TRUE); /* exclusive mode */ + const meta_ptr_t recent = meta_recent(env, &troika); + const meta_ptr_t prefer_steady = meta_prefer_steady(env, &troika); MDBX_meta clone; - const MDBX_meta *const steady = constmeta_prefer_steady(env); - const MDBX_meta *const head = constmeta_prefer_last(env); - const txnid_t steady_txnid = meta_txnid(env, steady); - if (META_IS_STEADY(steady)) { - err = mdbx_validate_meta_copy(env, steady, &clone); + if (prefer_steady.is_steady) { + err = validate_meta_copy(env, prefer_steady.ptr_c, &clone); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("meta[%u] with %s txnid %" PRIaTXN - " is corrupted, %s needed", - bytes2pgno(env, (uint8_t *)steady - env->me_map), "steady", - steady_txnid, "manual recovery"); + ERROR("meta[%u] with %s txnid %" PRIaTXN " is corrupted, %s needed", + bytes2pgno(env, ptr_dist(prefer_steady.ptr_c, env->me_map)), + "steady", prefer_steady.txnid, "manual recovery"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } - if (steady == head) + if (prefer_steady.ptr_c == recent.ptr_c) break; } - const pgno_t pgno = bytes2pgno(env, (uint8_t *)head - env->me_map); - const txnid_t head_txnid = meta_txnid(env, head); - const bool head_valid = - mdbx_validate_meta_copy(env, head, &clone) == MDBX_SUCCESS; - mdbx_assert(env, !META_IS_STEADY(steady) || head_txnid != steady_txnid); - if (unlikely(!head_valid)) { - if (unlikely(!META_IS_STEADY(steady))) { - mdbx_error("%s for open or automatic rollback, %s", - "there 
are no suitable meta-pages", - "manual recovery is required"); + const pgno_t pgno = bytes2pgno(env, ptr_dist(recent.ptr_c, env->me_map)); + const bool last_valid = + validate_meta_copy(env, recent.ptr_c, &clone) == MDBX_SUCCESS; + eASSERT(env, + !prefer_steady.is_steady || recent.txnid != prefer_steady.txnid); + if (unlikely(!last_valid)) { + if (unlikely(!prefer_steady.is_steady)) { + ERROR("%s for open or automatic rollback, %s", + "there are no suitable meta-pages", + "manual recovery is required"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } - mdbx_warning("meta[%u] with last txnid %" PRIaTXN - " is corrupted, rollback needed", - pgno, head_txnid); + WARNING("meta[%u] with last txnid %" PRIaTXN + " is corrupted, rollback needed", + pgno, recent.txnid); + meta_troika_dump(env, &troika); goto purge_meta_head; } - if (meta_bootid_match(head)) { + if (meta_bootid_match(recent.ptr_c)) { if (env->me_flags & MDBX_RDONLY) { - mdbx_error("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ", but unable in read-only mode"); + ERROR("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", bootid.x, bootid.y, + ", but unable in read-only mode"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } - mdbx_warning("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " - "rollback NOT needed, steady-sync NEEDED%s", - "opening after an unclean shutdown", bootid.x, bootid.y, - ""); - meta = clone; - atomic_store32(&env->me_lck->mti_unsynced_pages, meta.mm_geo.next, - mo_Relaxed); + WARNING("%s, but boot-id(%016" PRIx64 "-%016" PRIx64 ") is MATCH: " + "rollback NOT needed, steady-sync NEEDED%s", + "opening after an unclean shutdown", bootid.x, bootid.y, ""); + header = clone; + env->me_lck->mti_unsynced_pages.weak = header.mm_geo.next; + if 
(!env->me_lck->mti_eoos_timestamp.weak) + env->me_lck->mti_eoos_timestamp.weak = osal_monotime(); break; } - if (unlikely(!META_IS_STEADY(steady))) { - mdbx_error("%s, but %s for automatic rollback: %s", - "opening after an unclean shutdown", - "there are no suitable meta-pages", - "manual recovery is required"); + if (unlikely(!prefer_steady.is_steady)) { + ERROR("%s, but %s for automatic rollback: %s", + "opening after an unclean shutdown", + "there are no suitable meta-pages", + "manual recovery is required"); + meta_troika_dump(env, &troika); return MDBX_CORRUPTED; } if (env->me_flags & MDBX_RDONLY) { - mdbx_error("%s and rollback needed: (from head %" PRIaTXN - " to steady %" PRIaTXN ")%s", - "opening after an unclean shutdown", head_txnid, - steady_txnid, ", but unable in read-only mode"); + ERROR("%s and rollback needed: (from head %" PRIaTXN + " to steady %" PRIaTXN ")%s", + "opening after an unclean shutdown", recent.txnid, + prefer_steady.txnid, ", but unable in read-only mode"); + meta_troika_dump(env, &troika); return MDBX_WANNA_RECOVERY; } purge_meta_head: - mdbx_notice("%s and doing automatic rollback: " - "purge%s meta[%u] with%s txnid %" PRIaTXN, - "opening after an unclean shutdown", - head_valid ? "" : " invalid", pgno, head_valid ? " weak" : "", - head_txnid); - mdbx_ensure(env, META_IS_STEADY(steady)); - err = mdbx_override_meta(env, pgno, 0, head_valid ? head : steady); + NOTICE("%s and doing automatic rollback: " + "purge%s meta[%u] with%s txnid %" PRIaTXN, + "opening after an unclean shutdown", last_valid ? "" : " invalid", + pgno, last_valid ? " weak" : "", recent.txnid); + meta_troika_dump(env, &troika); + ENSURE(env, prefer_steady.is_steady); + err = override_meta(env, pgno, 0, + last_valid ? 
recent.ptr_c : prefer_steady.ptr_c); if (err) { - mdbx_error("rollback: overwrite meta[%u] with txnid %" PRIaTXN - ", error %d", - pgno, head_txnid, err); + ERROR("rollback: overwrite meta[%u] with txnid %" PRIaTXN ", error %d", + pgno, recent.txnid, err); return err; } - mdbx_ensure(env, 0 == meta_txnid(env, head)); - mdbx_ensure(env, 0 == meta_eq_mask(env)); + troika = meta_tap(env); + ENSURE(env, 0 == meta_txnid(recent.ptr_v)); + ENSURE(env, 0 == meta_eq_mask(&troika)); } if (lck_rc == /* lck exclusive */ MDBX_RESULT_TRUE) { //-------------------------------------------------- shrink DB & update geo - const MDBX_meta *head = constmeta_prefer_last(env); /* re-check size after mmap */ if ((env->me_dxb_mmap.current & (env->me_os_psize - 1)) != 0 || env->me_dxb_mmap.current < used_bytes) { - mdbx_error("unacceptable/unexpected datafile size %" PRIuPTR, - env->me_dxb_mmap.current); + ERROR("unacceptable/unexpected datafile size %" PRIuPTR, + env->me_dxb_mmap.current); return MDBX_PROBLEM; } if (env->me_dxb_mmap.current != env->me_dbgeo.now) { - meta.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); - mdbx_notice("need update meta-geo to filesize %" PRIuPTR - " bytes, %" PRIaPGNO " pages", - env->me_dxb_mmap.current, meta.mm_geo.now); + header.mm_geo.now = bytes2pgno(env, env->me_dxb_mmap.current); + NOTICE("need update meta-geo to filesize %" PRIuPTR " bytes, %" PRIaPGNO + " pages", + env->me_dxb_mmap.current, header.mm_geo.now); } - if (memcmp(&meta.mm_geo, &head->mm_geo, sizeof(meta.mm_geo))) { + const meta_ptr_t recent = meta_recent(env, &troika); + if (memcmp(&header.mm_geo, &recent.ptr_c->mm_geo, sizeof(header.mm_geo))) { if ((env->me_flags & MDBX_RDONLY) != 0 || /* recovery mode */ env->me_stuck_meta >= 0) { - mdbx_warning( - "skipped update meta.geo in %s mode: from l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO - "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", - (env->me_stuck_meta < 0) ? 
"read-only" : "recovery", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), pv2pages(head->mm_geo.grow_pv), - meta.mm_geo.lower, meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), pv2pages(meta.mm_geo.grow_pv)); + WARNING("skipped update meta.geo in %s mode: from l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u, to l%" PRIaPGNO + "-n%" PRIaPGNO "-u%" PRIaPGNO "/s%u-g%u", + (env->me_stuck_meta < 0) ? "read-only" : "recovery", + recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), header.mm_geo.lower, + header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv)); } else { - const txnid_t txnid = constmeta_txnid(env, head); - const txnid_t next_txnid = safe64_txnid_next(txnid); + const txnid_t next_txnid = safe64_txnid_next(recent.txnid); if (unlikely(next_txnid > MAX_TXNID)) { - mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } - mdbx_notice("updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - head->mm_geo.lower, head->mm_geo.now, head->mm_geo.upper, - pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), - pv2pages(meta.mm_geo.grow_pv), next_txnid); + NOTICE("updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + 
pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, + header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv), next_txnid); - mdbx_ensure(env, meta_eq(env, &meta, head)); - meta_set_txnid(env, &meta, next_txnid); - err = mdbx_sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &meta); + ENSURE(env, header.unsafe_txnid == recent.txnid); + meta_set_txnid(env, &header, next_txnid); + err = sync_locked(env, env->me_flags | MDBX_SHRINK_ALLOWED, &header, + &troika); if (err) { - mdbx_error("error %d, while updating meta.geo: " - "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN "), " - "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO - "/s%u-g%u (txn#%" PRIaTXN ")", - err, head->mm_geo.lower, head->mm_geo.now, - head->mm_geo.upper, pv2pages(head->mm_geo.shrink_pv), - pv2pages(head->mm_geo.grow_pv), txnid, meta.mm_geo.lower, - meta.mm_geo.now, meta.mm_geo.upper, - pv2pages(meta.mm_geo.shrink_pv), - pv2pages(meta.mm_geo.grow_pv), next_txnid); + ERROR("error %d, while updating meta.geo: " + "from l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN "), " + "to l%" PRIaPGNO "-n%" PRIaPGNO "-u%" PRIaPGNO + "/s%u-g%u (txn#%" PRIaTXN ")", + err, recent.ptr_c->mm_geo.lower, recent.ptr_c->mm_geo.now, + recent.ptr_c->mm_geo.upper, + pv2pages(recent.ptr_c->mm_geo.shrink_pv), + pv2pages(recent.ptr_c->mm_geo.grow_pv), recent.txnid, + header.mm_geo.lower, header.mm_geo.now, header.mm_geo.upper, + pv2pages(header.mm_geo.shrink_pv), + pv2pages(header.mm_geo.grow_pv), header.unsafe_txnid); return err; } } @@ -12470,27 +14031,28 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, bytes2pgno(env, used_aligned2os_bytes), mo_Relaxed); if ((env->me_flags & MDBX_RDONLY) == 0 && env->me_stuck_meta < 0 && - (mdbx_runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { + (runtime_flags & MDBX_DBG_DONT_UPGRADE) == 0) { for (int n = 0; n < NUM_METAS; ++n) { - MDBX_meta 
*const pmeta = METAPAGE(env, n); - if (unlikely(unaligned_peek_u64(4, &pmeta->mm_magic_and_version) != + MDBX_meta *const meta = METAPAGE(env, n); + if (unlikely(unaligned_peek_u64(4, &meta->mm_magic_and_version) != MDBX_DATA_MAGIC)) { - const txnid_t txnid = meta_txnid(env, pmeta); - mdbx_notice("%s %s" - "meta[%u], txnid %" PRIaTXN, - "updating db-format signature for", - META_IS_STEADY(pmeta) ? "stead-" : "weak-", n, txnid); - err = mdbx_override_meta(env, n, txnid, pmeta); + const txnid_t txnid = constmeta_txnid(meta); + NOTICE("%s %s" + "meta[%u], txnid %" PRIaTXN, + "updating db-format signature for", + META_IS_STEADY(meta) ? "stead-" : "weak-", n, txnid); + err = override_meta(env, n, txnid, meta); if (unlikely(err != MDBX_SUCCESS) && /* Just ignore the MDBX_PROBLEM error, since here it is * returned only in case of the attempt to upgrade an obsolete * meta-page that is invalid for current state of a DB, * e.g. after shrinking DB file */ err != MDBX_PROBLEM) { - mdbx_error("%s meta[%u], txnid %" PRIaTXN ", error %d", - "updating db-format signature for", n, txnid, err); + ERROR("%s meta[%u], txnid %" PRIaTXN ", error %d", + "updating db-format signature for", n, txnid, err); return err; } + troika = meta_tap(env); } } } @@ -12502,11 +14064,11 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, #if defined(MADV_REMOVE) if (lck_rc && (env->me_flags & MDBX_WRITEMAP) != 0 && /* not recovery mode */ env->me_stuck_meta < 0) { - mdbx_notice("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); + NOTICE("open-MADV_%s %u..%u", "REMOVE (deallocate file space)", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); err = - madvise(env->me_map + used_aligned2os_bytes, + madvise(ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_REMOVE) ? 
ignore_enosys(errno) : MDBX_SUCCESS; @@ -12515,11 +14077,11 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, } #endif /* MADV_REMOVE */ #if defined(MADV_DONTNEED) - mdbx_notice("open-MADV_%s %u..%u", "DONTNEED", - env->me_lck->mti_discarded_tail.weak, - bytes2pgno(env, env->me_dxb_mmap.current)); + NOTICE("open-MADV_%s %u..%u", "DONTNEED", + env->me_lck->mti_discarded_tail.weak, + bytes2pgno(env, env->me_dxb_mmap.current)); err = - madvise(env->me_map + used_aligned2os_bytes, + madvise(ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, MADV_DONTNEED) ? ignore_enosys(errno) : MDBX_SUCCESS; @@ -12527,7 +14089,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, return err; #elif defined(POSIX_MADV_DONTNEED) err = ignore_enosys(posix_madvise( - env->me_map + used_aligned2os_bytes, + ptr_disp(env->me_map, used_aligned2os_bytes), env->me_dxb_mmap.current - used_aligned2os_bytes, POSIX_MADV_DONTNEED)); if (unlikely(MDBX_IS_ERROR(err))) return err; @@ -12540,7 +14102,7 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, #endif /* MADV_DONTNEED */ } - err = mdbx_set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); + err = set_readahead(env, bytes2pgno(env, used_bytes), readahead, true); if (unlikely(err != MDBX_SUCCESS)) return err; #endif /* MDBX_ENABLE_MADVISE */ @@ -12551,12 +14113,12 @@ __cold static int mdbx_setup_dxb(MDBX_env *env, const int lck_rc, /******************************************************************************/ /* Open and/or initialize the lock region for the environment. 
*/ -__cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, - mdbx_mode_t mode) { - mdbx_assert(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); - mdbx_assert(env, env->me_lfd == INVALID_HANDLE_VALUE); +__cold static int setup_lck(MDBX_env *env, pathchar_t *lck_pathname, + mdbx_mode_t mode) { + eASSERT(env, env->me_lazy_fd != INVALID_HANDLE_VALUE); + eASSERT(env, env->me_lfd == INVALID_HANDLE_VALUE); - int err = mdbx_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); + int err = osal_openfile(MDBX_OPEN_LCK, env, lck_pathname, &env->me_lfd, mode); if (err != MDBX_SUCCESS) { switch (err) { default: @@ -12574,8 +14136,8 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } if (err != MDBX_ENOFILE) { - /* ensure the file system is read-only */ - err = mdbx_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); + /* ENSURE the file system is read-only */ + err = osal_check_fs_rdonly(env->me_lazy_fd, lck_pathname, err); if (err != MDBX_SUCCESS && /* ignore ERROR_NOT_SUPPORTED for exclusive mode */ !(err == MDBX_ENOSYS && (env->me_flags & MDBX_EXCLUSIVE))) @@ -12585,12 +14147,12 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, /* LY: without-lck mode (e.g. exclusive or on read-only filesystem) */ /* beginning of a locked section ---------------------------------------- */ lcklist_lock(); - mdbx_assert(env, env->me_lcklist_next == nullptr); + eASSERT(env, env->me_lcklist_next == nullptr); env->me_lfd = INVALID_HANDLE_VALUE; - const int rc = mdbx_lck_seize(env); + const int rc = osal_lck_seize(env); if (MDBX_IS_ERROR(rc)) { /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by mdbx_env_close0(). */ + * and this job will be done by env_close(). 
*/ lcklist_unlock(); return rc; } @@ -12602,23 +14164,23 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, env->me_lck = lckless_stub(env); env->me_maxreaders = UINT_MAX; - mdbx_debug("lck-setup:%s%s%s", " lck-less", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); + DEBUG("lck-setup:%s%s%s", " lck-less", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (rc == MDBX_RESULT_TRUE) ? " exclusive" : " cooperative"); return rc; } /* beginning of a locked section ------------------------------------------ */ lcklist_lock(); - mdbx_assert(env, env->me_lcklist_next == nullptr); + eASSERT(env, env->me_lcklist_next == nullptr); /* Try to get exclusive lock. If we succeed, then * nobody is using the lock region and we should initialize it. */ - err = mdbx_lck_seize(env); + err = osal_lck_seize(env); if (MDBX_IS_ERROR(err)) { bailout: /* Calling lcklist_detach_locked() is required to restore POSIX-filelock - * and this job will be done by mdbx_env_close0(). */ + * and this job will be done by env_close(). */ lcklist_unlock(); return err; } @@ -12629,7 +14191,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (MDBX_IS_ERROR(err)) goto bailout; if (inprocess_neighbor && - ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || + ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 || (inprocess_neighbor->me_flags & MDBX_EXCLUSIVE) != 0)) { err = MDBX_BUSY; goto bailout; @@ -12637,13 +14199,12 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } const int lck_seize_rc = err; - mdbx_debug("lck-setup:%s%s%s", " with-lck", - (env->me_flags & MDBX_RDONLY) ? " readonly" : "", - (lck_seize_rc == MDBX_RESULT_TRUE) ? " exclusive" - : " cooperative"); + DEBUG("lck-setup:%s%s%s", " with-lck", + (env->me_flags & MDBX_RDONLY) ? " readonly" : "", + (lck_seize_rc == MDBX_RESULT_TRUE) ? 
" exclusive" : " cooperative"); uint64_t size = 0; - err = mdbx_filesize(env->me_lfd, &size); + err = osal_filesize(env->me_lfd, &size); if (unlikely(err != MDBX_SUCCESS)) goto bailout; @@ -12651,7 +14212,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, size = ceil_powerof2(env->me_maxreaders * sizeof(MDBX_reader) + sizeof(MDBX_lockinfo), env->me_os_psize); - mdbx_jitter4testing(false); + jitter4testing(false); } else { if (env->me_flags & MDBX_EXCLUSIVE) { err = MDBX_BUSY; @@ -12659,7 +14220,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, } if (size > INT_MAX || (size & (env->me_os_psize - 1)) != 0 || size < env->me_os_psize) { - mdbx_error("lck-file has invalid size %" PRIu64 " bytes", size); + ERROR("lck-file has invalid size %" PRIu64 " bytes", size); err = MDBX_PROBLEM; goto bailout; } @@ -12668,7 +14229,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, const size_t maxreaders = ((size_t)size - sizeof(MDBX_lockinfo)) / sizeof(MDBX_reader); if (maxreaders < 4) { - mdbx_error("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); + ERROR("lck-size too small (up to %" PRIuPTR " readers)", maxreaders); err = MDBX_PROBLEM; goto bailout; } @@ -12676,7 +14237,7 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, ? (unsigned)maxreaders : (unsigned)MDBX_READERS_LIMIT; - err = mdbx_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, + err = osal_mmap((env->me_flags & MDBX_EXCLUSIVE) | MDBX_WRITEMAP, &env->me_lck_mmap, (size_t)size, (size_t)size, lck_seize_rc ? 
MMAP_OPTION_TRUNCATE | MMAP_OPTION_SEMAPHORE : MMAP_OPTION_SEMAPHORE); @@ -12709,55 +14270,50 @@ __cold static int mdbx_setup_lck(MDBX_env *env, char *lck_pathname, if (lck_seize_rc == MDBX_RESULT_TRUE) { /* LY: exclusive mode, check and reset lck content */ memset(lck, 0, (size_t)size); - mdbx_jitter4testing(false); + jitter4testing(false); lck->mti_magic_and_version = MDBX_LOCK_MAGIC; lck->mti_os_and_format = MDBX_LOCK_FORMAT; #if MDBX_ENABLE_PGOP_STAT lck->mti_pgop_stat.wops.weak = 1; #endif /* MDBX_ENABLE_PGOP_STAT */ - err = mdbx_msync(&env->me_lck_mmap, 0, (size_t)size, MDBX_SYNC_NONE); + err = osal_msync(&env->me_lck_mmap, 0, (size_t)size, + MDBX_SYNC_DATA | MDBX_SYNC_SIZE); if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("initial-%s for lck-file failed", "msync"); - goto bailout; - } - err = mdbx_fsync(env->me_lck_mmap.fd, MDBX_SYNC_SIZE); - if (unlikely(err != MDBX_SUCCESS)) { - mdbx_error("initial-%s for lck-file failed", "fsync"); + ERROR("initial-%s for lck-file failed, err %d", "msync/fsync", err); goto bailout; } } else { if (lck->mti_magic_and_version != MDBX_LOCK_MAGIC) { const bool invalid = (lck->mti_magic_and_version >> 8) != MDBX_MAGIC; - mdbx_error( - "lock region has %s", - invalid - ? "invalid magic" - : "incompatible version (only applications with nearly or the " - "same versions of libmdbx can share the same database)"); + ERROR("lock region has %s", + invalid + ? "invalid magic" + : "incompatible version (only applications with nearly or the " + "same versions of libmdbx can share the same database)"); err = invalid ? 
MDBX_INVALID : MDBX_VERSION_MISMATCH; goto bailout; } if (lck->mti_os_and_format != MDBX_LOCK_FORMAT) { - mdbx_error("lock region has os/format signature 0x%" PRIx32 - ", expected 0x%" PRIx32, - lck->mti_os_and_format, MDBX_LOCK_FORMAT); + ERROR("lock region has os/format signature 0x%" PRIx32 + ", expected 0x%" PRIx32, + lck->mti_os_and_format, MDBX_LOCK_FORMAT); err = MDBX_VERSION_MISMATCH; goto bailout; } } - err = mdbx_lck_init(env, inprocess_neighbor, lck_seize_rc); + err = osal_lck_init(env, inprocess_neighbor, lck_seize_rc); if (MDBX_IS_ERROR(err)) goto bailout; - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); /* insert into inprocess lck-list */ env->me_lcklist_next = inprocess_lcklist_head; inprocess_lcklist_head = env; lcklist_unlock(); /* end of a locked section ------------------------------------------------ */ - mdbx_assert(env, !MDBX_IS_ERROR(lck_seize_rc)); + eASSERT(env, !MDBX_IS_ERROR(lck_seize_rc)); env->me_lck = lck; return lck_seize_rc; } @@ -12805,8 +14361,8 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { !F_ISSET(r, MDBX_UTTERLY_NOSYNC)) r = (r - MDBX_DEPRECATED_MAPASYNC) | MDBX_SAFE_NOSYNC; - /* force MDBX_NOMETASYNC if MDBX_SAFE_NOSYNC enabled */ - if (r & MDBX_SAFE_NOSYNC) + /* force MDBX_NOMETASYNC if NOSYNC enabled */ + if (r & (MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC)) r |= MDBX_NOMETASYNC; assert(!(F_ISSET(r, MDBX_UTTERLY_NOSYNC) && @@ -12815,25 +14371,27 @@ static uint32_t merge_sync_flags(const uint32_t a, const uint32_t b) { return r; } -__cold static int __must_check_result mdbx_override_meta( - MDBX_env *env, unsigned target, txnid_t txnid, const MDBX_meta *shape) { +__cold static int __must_check_result override_meta(MDBX_env *env, + size_t target, + txnid_t txnid, + const MDBX_meta *shape) { int rc = alloc_page_buf(env); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_page *const page = env->me_pbuf; - mdbx_meta_model(env, page, target); + 
meta_model(env, page, target); MDBX_meta *const model = page_meta(page); meta_set_txnid(env, model, txnid); if (txnid) - mdbx_assert(env, meta_checktxnid(env, model, true)); + eASSERT(env, coherency_check_meta(env, model, true)); if (shape) { - if (txnid && unlikely(!meta_checktxnid(env, shape, false))) { - mdbx_error("bailout overriding meta-%u since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "pre", constmeta_txnid(env, shape)); + if (txnid && unlikely(!coherency_check_meta(env, shape, false))) { + ERROR("bailout overriding meta-%zu since model failed " + "freedb/maindb %s-check for txnid #%" PRIaTXN, + target, "pre", constmeta_txnid(shape)); return MDBX_PROBLEM; } - if (mdbx_runtime_flags & MDBX_DBG_DONT_UPGRADE) + if (runtime_flags & MDBX_DBG_DONT_UPGRADE) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); model->mm_extra_flags = shape->mm_extra_flags; @@ -12851,48 +14409,60 @@ __cold static int __must_check_result mdbx_override_meta( model->mm_dbs[MAIN_DBI].md_root != P_INVALID)) memcpy(&model->mm_magic_and_version, &shape->mm_magic_and_version, sizeof(model->mm_magic_and_version)); - if (unlikely(!meta_checktxnid(env, model, false))) { - mdbx_error("bailout overriding meta-%u since model failed " - "freedb/maindb %s-check for txnid #%" PRIaTXN, - target, "post", txnid); + if (unlikely(!coherency_check_meta(env, model, false))) { + ERROR("bailout overriding meta-%zu since model failed " + "freedb/maindb %s-check for txnid #%" PRIaTXN, + target, "post", txnid); return MDBX_PROBLEM; } } } - unaligned_poke_u64(4, model->mm_datasync_sign, meta_sign(model)); - rc = mdbx_validate_meta(env, model, page, target, nullptr); + unaligned_poke_u64(4, model->mm_sign, meta_sign(model)); + rc = validate_meta(env, model, page, (pgno_t)target, nullptr); if (unlikely(MDBX_IS_ERROR(rc))) return MDBX_PROBLEM; if (shape && memcmp(model, shape, sizeof(MDBX_meta)) == 0) return MDBX_SUCCESS; -#if 
MDBX_ENABLE_PGOP_STAT - env->me_lck->mti_pgop_stat.wops.weak += 1; -#endif /* MDBX_ENABLE_PGOP_STAT */ if (env->me_flags & MDBX_WRITEMAP) { - rc = mdbx_msync(&env->me_dxb_mmap, 0, +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, model->mm_geo.next), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); if (unlikely(rc != MDBX_SUCCESS)) return rc; - /* mdbx_override_meta() called only while current process have exclusive + /* override_meta() called only while current process have exclusive * lock of a DB file. So meta-page could be updated directly without * clearing consistency flag by mdbx_meta_update_begin() */ memcpy(pgno2page(env, target), page, env->me_psize); - mdbx_flush_incoherent_cpu_writeback(); - rc = mdbx_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), + osal_flush_incoherent_cpu_writeback(); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.msync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_msync(&env->me_dxb_mmap, 0, pgno_align2os_bytes(env, target + 1), MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } else { - const mdbx_filehandle_t fd = (env->me_dsync_fd != INVALID_HANDLE_VALUE) - ? 
env->me_dsync_fd - : env->me_lazy_fd; - rc = mdbx_pwrite(fd, page, env->me_psize, pgno2bytes(env, target)); - if (rc == MDBX_SUCCESS && fd == env->me_lazy_fd) - rc = mdbx_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.wops.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_pwrite(env->me_fd4meta, page, env->me_psize, + pgno2bytes(env, target)); + if (rc == MDBX_SUCCESS && env->me_fd4meta == env->me_lazy_fd) { +#if MDBX_ENABLE_PGOP_STAT + env->me_lck->mti_pgop_stat.fsync.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + rc = osal_fsync(env->me_lazy_fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + } + osal_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), + env->me_os_psize); } - mdbx_flush_incoherent_mmap(env->me_map, pgno2bytes(env, NUM_METAS), - env->me_os_psize); + eASSERT(env, (!env->me_txn && !env->me_txn0) || + (env->me_stuck_meta == (int)target && + (env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) == + MDBX_EXCLUSIVE)); return rc; } @@ -12908,32 +14478,46 @@ __cold int mdbx_env_turn_for_recovery(MDBX_env *env, unsigned target) { return MDBX_EPERM; const MDBX_meta *target_meta = METAPAGE(env, target); - txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(env, target_meta)); - for (unsigned n = 0; n < NUM_METAS; ++n) { - MDBX_page *page = pgno2page(env, n); - MDBX_meta meta = *page_meta(page); + txnid_t new_txnid = safe64_txnid_next(constmeta_txnid(target_meta)); + for (size_t n = 0; n < NUM_METAS; ++n) { if (n == target) continue; - if (mdbx_validate_meta(env, &meta, page, n, nullptr) != MDBX_SUCCESS) { - int err = mdbx_override_meta(env, n, 0, nullptr); + MDBX_meta meta = *METAPAGE(env, target); + if (validate_meta(env, &meta, pgno2page(env, n), (pgno_t)n, nullptr) != + MDBX_SUCCESS) { + int err = override_meta(env, n, 0, nullptr); if (unlikely(err != MDBX_SUCCESS)) return err; } else { - txnid_t txnid = constmeta_txnid(env, &meta); + txnid_t txnid = constmeta_txnid(&meta); if 
(new_txnid <= txnid) new_txnid = safe64_txnid_next(txnid); } } if (unlikely(new_txnid > MAX_TXNID)) { - mdbx_error("txnid overflow, raise %d", MDBX_TXN_FULL); + ERROR("txnid overflow, raise %d", MDBX_TXN_FULL); return MDBX_TXN_FULL; } - return mdbx_override_meta(env, target, new_txnid, target_meta); + return override_meta(env, target, new_txnid, target_meta); } __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, unsigned target_meta, bool writeable) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_open_for_recoveryW(env, pathnameW, target_meta, writeable); + osal_free(pathnameW); + } + return rc; +} + +__cold int mdbx_env_open_for_recoveryW(MDBX_env *env, const wchar_t *pathname, + unsigned target_meta, bool writeable) { +#endif /* Windows */ + if (unlikely(target_meta >= NUM_METAS)) return MDBX_EINVAL; int rc = check_env(env, false); @@ -12943,31 +14527,44 @@ __cold int mdbx_env_open_for_recovery(MDBX_env *env, const char *pathname, return MDBX_EPERM; env->me_stuck_meta = (int8_t)target_meta; - return mdbx_env_open( - env, pathname, writeable ? MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, - 0); + return +#if defined(_WIN32) || defined(_WIN64) + mdbx_env_openW +#else + mdbx_env_open +#endif /* Windows */ + (env, pathname, writeable ? 
MDBX_EXCLUSIVE : MDBX_EXCLUSIVE | MDBX_RDONLY, + 0); } typedef struct { void *buffer_for_free; - char *lck, *dxb; + pathchar_t *lck, *dxb; size_t ent_len; } MDBX_handle_env_pathname; -__cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, - const char *pathname, - MDBX_env_flags_t *flags, - const mdbx_mode_t mode) { - int rc; +__cold static int check_alternative_lck_absent(const pathchar_t *lck_pathname) { + int err = osal_fileexists(lck_pathname); + if (unlikely(err != MDBX_RESULT_FALSE)) { + if (err == MDBX_RESULT_TRUE) + err = MDBX_DUPLICATED_CLK; + ERROR("Alternative/Duplicate LCK-file '%" MDBX_PRIsPATH "' error %d", + lck_pathname, err); + } + return err; +} + +__cold static int handle_env_pathname(MDBX_handle_env_pathname *ctx, + const pathchar_t *pathname, + MDBX_env_flags_t *flags, + const mdbx_mode_t mode) { memset(ctx, 0, sizeof(*ctx)); - if (unlikely(!pathname)) + if (unlikely(!pathname || !*pathname)) return MDBX_EINVAL; + int rc; #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); - - const DWORD dwAttrib = GetFileAttributesW(pathnameW); + const DWORD dwAttrib = GetFileAttributesW(pathname); if (dwAttrib == INVALID_FILE_ATTRIBUTES) { rc = GetLastError(); if (rc != MDBX_ENOFILE) @@ -12977,8 +14574,7 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, return rc; /* auto-create directory if requested */ - if ((*flags & MDBX_NOSUBDIR) == 0 && - !CreateDirectoryW(pathnameW, nullptr)) { + if ((*flags & MDBX_NOSUBDIR) == 0 && !CreateDirectoryW(pathname, nullptr)) { rc = GetLastError(); if (rc != ERROR_ALREADY_EXISTS) return rc; @@ -12991,13 +14587,13 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, } #else struct stat st; - if (stat(pathname, &st)) { + if (stat(pathname, &st) != 0) { rc = errno; if (rc != MDBX_ENOFILE) return rc; if (mode == 0 || (*flags & MDBX_RDONLY) != 0) - /* can't open existing */ - return rc; + /* 
can't open non-existing */ + return rc /* MDBX_ENOFILE */; /* auto-create directory if requested */ const mdbx_mode_t dir_mode = @@ -13019,41 +14615,106 @@ __cold static int mdbx_handle_env_pathname(MDBX_handle_env_pathname *ctx, } #endif - static const char dxb_name[] = MDBX_DATANAME; - static const size_t dxb_name_len = sizeof(dxb_name) - 1; - static const char lck_name[] = MDBX_LOCKNAME; - static const char lock_suffix[] = MDBX_LOCK_SUFFIX; + static const pathchar_t dxb_name[] = MDBX_DATANAME; + static const pathchar_t lck_name[] = MDBX_LOCKNAME; + static const pathchar_t lock_suffix[] = MDBX_LOCK_SUFFIX; - ctx->ent_len = strlen(pathname); - if ((*flags & MDBX_NOSUBDIR) && ctx->ent_len >= dxb_name_len && - !memcmp(dxb_name, pathname + ctx->ent_len - dxb_name_len, dxb_name_len)) { - *flags -= MDBX_NOSUBDIR; - ctx->ent_len -= dxb_name_len; +#if defined(_WIN32) || defined(_WIN64) + assert(dxb_name[0] == '\\' && lck_name[0] == '\\'); + const size_t pathname_len = wcslen(pathname); +#else + assert(dxb_name[0] == '/' && lck_name[0] == '/'); + const size_t pathname_len = strlen(pathname); +#endif + assert(!osal_isdirsep(lock_suffix[0])); + ctx->ent_len = pathname_len; + static const size_t dxb_name_len = ARRAY_LENGTH(dxb_name) - 1; + if (*flags & MDBX_NOSUBDIR) { + if (ctx->ent_len > dxb_name_len && + osal_pathequal(pathname + ctx->ent_len - dxb_name_len, dxb_name, + dxb_name_len)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len; + } else if (ctx->ent_len == dxb_name_len - 1 && osal_isdirsep(dxb_name[0]) && + osal_isdirsep(lck_name[0]) && + osal_pathequal(pathname + ctx->ent_len - dxb_name_len + 1, + dxb_name + 1, dxb_name_len - 1)) { + *flags -= MDBX_NOSUBDIR; + ctx->ent_len -= dxb_name_len - 1; + } } - const size_t bytes_needed = - ctx->ent_len * 2 + ((*flags & MDBX_NOSUBDIR) - ? 
sizeof(lock_suffix) + 1 - : sizeof(lck_name) + sizeof(dxb_name)); - ctx->buffer_for_free = mdbx_malloc(bytes_needed); + const size_t suflen_with_NOSUBDIR = sizeof(lock_suffix) + sizeof(pathchar_t); + const size_t suflen_without_NOSUBDIR = sizeof(lck_name) + sizeof(dxb_name); + const size_t enogh4any = (suflen_with_NOSUBDIR > suflen_without_NOSUBDIR) + ? suflen_with_NOSUBDIR + : suflen_without_NOSUBDIR; + const size_t bytes_needed = sizeof(pathchar_t) * ctx->ent_len * 2 + enogh4any; + ctx->buffer_for_free = osal_malloc(bytes_needed); if (!ctx->buffer_for_free) return MDBX_ENOMEM; - ctx->lck = ctx->buffer_for_free; - if (*flags & MDBX_NOSUBDIR) { - ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lock_suffix); - sprintf(ctx->lck, "%s%s", pathname, lock_suffix); - strcpy(ctx->dxb, pathname); - } else { - ctx->dxb = ctx->lck + ctx->ent_len + sizeof(lck_name); - sprintf(ctx->lck, "%.*s%s", (int)ctx->ent_len, pathname, lck_name); - sprintf(ctx->dxb, "%.*s%s", (int)ctx->ent_len, pathname, dxb_name); - } + ctx->dxb = ctx->buffer_for_free; + ctx->lck = ctx->dxb + ctx->ent_len + dxb_name_len + 1; + pathchar_t *const buf = ctx->buffer_for_free; + rc = MDBX_SUCCESS; + if (ctx->ent_len) { + memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, pathname, + sizeof(pathchar_t) * pathname_len); + if (*flags & MDBX_NOSUBDIR) { + const pathchar_t *const lck_ext = + osal_fileext(lck_name, ARRAY_LENGTH(lck_name)); + if (lck_ext) { + pathchar_t *pathname_ext = osal_fileext(buf, pathname_len); + memcpy(pathname_ext ? 
pathname_ext : buf + pathname_len, lck_ext, + sizeof(pathchar_t) * (ARRAY_END(lck_name) - lck_ext)); + rc = check_alternative_lck_absent(buf); + } + } else { + memcpy(buf + ctx->ent_len, dxb_name, sizeof(dxb_name)); + memcpy(buf + ctx->ent_len + dxb_name_len, lock_suffix, + sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + } - return MDBX_SUCCESS; + memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, pathname, + sizeof(pathchar_t) * (ctx->ent_len + 1)); + memcpy(ctx->lck, pathname, sizeof(pathchar_t) * ctx->ent_len); + if (*flags & MDBX_NOSUBDIR) { + memcpy(ctx->lck + ctx->ent_len, lock_suffix, sizeof(lock_suffix)); + } else { + memcpy(ctx->dxb + ctx->ent_len, dxb_name, sizeof(dxb_name)); + memcpy(ctx->lck + ctx->ent_len, lck_name, sizeof(lck_name)); + } + } else { + assert(!(*flags & MDBX_NOSUBDIR)); + memcpy(buf + /* shutting up goofy MSVC static analyzer */ 0, dxb_name + 1, + sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(buf + dxb_name_len - 1, lock_suffix, sizeof(lock_suffix)); + rc = check_alternative_lck_absent(buf); + + memcpy(ctx->dxb + /* shutting up goofy MSVC static analyzer */ 0, + dxb_name + 1, sizeof(dxb_name) - sizeof(pathchar_t)); + memcpy(ctx->lck, lck_name + 1, sizeof(lck_name) - sizeof(pathchar_t)); + } + return rc; } __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_deleteW(pathnameW, mode); + osal_free(pathnameW); + } + return rc; +} + +__cold int mdbx_env_deleteW(const wchar_t *pathname, + MDBX_env_delete_mode_t mode) { +#endif /* Windows */ + switch (mode) { default: return MDBX_EINVAL; @@ -13071,35 +14732,35 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { memset(dummy_env, 0, sizeof(*dummy_env)); dummy_env->me_flags = (mode == MDBX_ENV_ENSURE_UNUSED) ? 
MDBX_EXCLUSIVE : MDBX_ENV_DEFAULTS; - dummy_env->me_os_psize = (unsigned)mdbx_syspagesize(); + dummy_env->me_os_psize = (unsigned)osal_syspagesize(); dummy_env->me_psize = (unsigned)mdbx_default_pagesize(); - dummy_env->me_pathname = (char *)pathname; + dummy_env->me_pathname = (pathchar_t *)pathname; MDBX_handle_env_pathname env_pathname; STATIC_ASSERT(sizeof(dummy_env->me_flags) == sizeof(MDBX_env_flags_t)); int rc = MDBX_RESULT_TRUE, - err = mdbx_handle_env_pathname( - &env_pathname, pathname, (MDBX_env_flags_t *)&dummy_env->me_flags, 0); + err = handle_env_pathname(&env_pathname, pathname, + (MDBX_env_flags_t *)&dummy_env->me_flags, 0); if (likely(err == MDBX_SUCCESS)) { mdbx_filehandle_t clk_handle = INVALID_HANDLE_VALUE, dxb_handle = INVALID_HANDLE_VALUE; if (mode > MDBX_ENV_JUST_DELETE) { - err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.dxb, &dxb_handle, 0); err = (err == MDBX_ENOFILE) ? MDBX_SUCCESS : err; if (err == MDBX_SUCCESS) { - err = mdbx_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, + err = osal_openfile(MDBX_OPEN_DELETE, dummy_env, env_pathname.lck, &clk_handle, 0); err = (err == MDBX_ENOFILE) ? 
MDBX_SUCCESS : err; } if (err == MDBX_SUCCESS && clk_handle != INVALID_HANDLE_VALUE) - err = mdbx_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + err = osal_lockfile(clk_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); if (err == MDBX_SUCCESS && dxb_handle != INVALID_HANDLE_VALUE) - err = mdbx_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); + err = osal_lockfile(dxb_handle, mode == MDBX_ENV_WAIT_FOR_UNUSED); } if (err == MDBX_SUCCESS) { - err = mdbx_removefile(env_pathname.dxb); + err = osal_removefile(env_pathname.dxb); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -13107,7 +14768,7 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { } if (err == MDBX_SUCCESS) { - err = mdbx_removefile(env_pathname.lck); + err = osal_removefile(env_pathname.lck); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -13115,7 +14776,7 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { } if (err == MDBX_SUCCESS && !(dummy_env->me_flags & MDBX_NOSUBDIR)) { - err = mdbx_removedirectory(pathname); + err = osal_removedirectory(pathname); if (err == MDBX_SUCCESS) rc = MDBX_SUCCESS; else if (err == MDBX_ENOFILE) @@ -13123,18 +14784,35 @@ __cold int mdbx_env_delete(const char *pathname, MDBX_env_delete_mode_t mode) { } if (dxb_handle != INVALID_HANDLE_VALUE) - mdbx_closefile(dxb_handle); + osal_closefile(dxb_handle); if (clk_handle != INVALID_HANDLE_VALUE) - mdbx_closefile(clk_handle); + osal_closefile(clk_handle); } else if (err == MDBX_ENOFILE) err = MDBX_SUCCESS; - mdbx_free(env_pathname.buffer_for_free); + osal_free(env_pathname.buffer_for_free); return (err == MDBX_SUCCESS) ? 
rc : err; } __cold int mdbx_env_open(MDBX_env *env, const char *pathname, MDBX_env_flags_t flags, mdbx_mode_t mode) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *pathnameW = nullptr; + int rc = osal_mb2w(pathname, &pathnameW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_openW(env, pathnameW, flags, mode); + osal_free(pathnameW); + if (rc == MDBX_SUCCESS) + /* force to make cache of the multi-byte pathname representation */ + mdbx_env_get_path(env, &pathname); + } + return rc; +} + +__cold int mdbx_env_openW(MDBX_env *env, const wchar_t *pathname, + MDBX_env_flags_t flags, mdbx_mode_t mode) { +#endif /* Windows */ + int rc = check_env(env, false); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -13142,29 +14820,21 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (unlikely(flags & ~ENV_USABLE_FLAGS)) return MDBX_EINVAL; - if (flags & MDBX_RDONLY) - mode = 0; - - if (env->me_lazy_fd != INVALID_HANDLE_VALUE || - (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map) + if (unlikely(env->me_lazy_fd != INVALID_HANDLE_VALUE || + (env->me_flags & MDBX_ENV_ACTIVE) != 0 || env->me_map)) return MDBX_EPERM; - /* pickup previously mdbx_env_set_flags(), + /* Pickup previously mdbx_env_set_flags(), * but avoid MDBX_UTTERLY_NOSYNC by disjunction */ const uint32_t saved_me_flags = env->me_flags; - flags = merge_sync_flags(flags, env->me_flags); - - MDBX_handle_env_pathname env_pathname; - rc = mdbx_handle_env_pathname(&env_pathname, pathname, &flags, mode); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; + flags = merge_sync_flags(flags | MDBX_DEPRECATED_COALESCE, env->me_flags); if (flags & MDBX_RDONLY) { - /* LY: silently ignore irrelevant flags when - * we're only getting read access */ + /* Silently ignore irrelevant flags when we're only getting read access */ flags &= ~(MDBX_WRITEMAP | MDBX_DEPRECATED_MAPASYNC | MDBX_SAFE_NOSYNC | - MDBX_NOMETASYNC | MDBX_COALESCE | MDBX_LIFORECLAIM | + MDBX_NOMETASYNC | MDBX_DEPRECATED_COALESCE | 
MDBX_LIFORECLAIM | MDBX_NOMEMINIT | MDBX_ACCEDE); + mode = 0; } else { #if MDBX_MMAP_INCOHERENT_FILE_WRITE /* Temporary `workaround` for OpenBSD kernel's flaw. @@ -13173,44 +14843,132 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, if (flags & MDBX_ACCEDE) flags |= MDBX_WRITEMAP; else { - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, - "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " - "of an internal flaw(s) in a file/buffer/page cache.\n"); - rc = 42 /* ENOPROTOOPT */; - goto bailout; + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, + "System (i.e. OpenBSD) requires MDBX_WRITEMAP because " + "of an internal flaw(s) in a file/buffer/page cache.\n"); + return 42 /* ENOPROTOOPT */; } } #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ } + MDBX_handle_env_pathname env_pathname; + rc = handle_env_pathname(&env_pathname, pathname, &flags, mode); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + env->me_flags = (flags & ~MDBX_FATAL_ERROR) | MDBX_ENV_ACTIVE; - env->me_pathname = mdbx_calloc(env_pathname.ent_len + 1, 1); - env->me_dbxs = mdbx_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); - env->me_dbflags = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); - env->me_dbiseqs = mdbx_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); + env->me_pathname = osal_calloc(env_pathname.ent_len + 1, sizeof(pathchar_t)); + env->me_dbxs = osal_calloc(env->me_maxdbs, sizeof(MDBX_dbx)); + env->me_dbflags = osal_calloc(env->me_maxdbs, sizeof(env->me_dbflags[0])); + env->me_dbiseqs = osal_calloc(env->me_maxdbs, sizeof(env->me_dbiseqs[0])); if (!(env->me_dbxs && env->me_pathname && env->me_dbflags && env->me_dbiseqs)) { rc = MDBX_ENOMEM; goto bailout; } - memcpy(env->me_pathname, env_pathname.dxb, env_pathname.ent_len); + memcpy(env->me_pathname, env_pathname.dxb, + env_pathname.ent_len * sizeof(pathchar_t)); env->me_dbxs[FREE_DBI].md_cmp = cmp_int_align4; /* aligned MDBX_INTEGERKEY */ env->me_dbxs[FREE_DBI].md_dcmp = cmp_lenfast; + 
env->me_dbxs[FREE_DBI].md_klen_max = env->me_dbxs[FREE_DBI].md_klen_min = 8; + env->me_dbxs[FREE_DBI].md_vlen_min = 4; + env->me_dbxs[FREE_DBI].md_vlen_max = + mdbx_env_get_maxvalsize_ex(env, MDBX_INTEGERKEY); - rc = mdbx_openfile(F_ISSET(flags, MDBX_RDONLY) ? MDBX_OPEN_DXB_READ - : MDBX_OPEN_DXB_LAZY, + /* Использование O_DSYNC или FILE_FLAG_WRITE_THROUGH: + * + * 0) Если размер страниц БД меньше системной страницы ОЗУ, то ядру ОС + * придется чаще обновлять страницы в unified page cache. + * + * Однако, O_DSYNC не предполагает отключение unified page cache, + * поэтому подобные затруднения будем считать проблемой ОС и/или + * ожидаемым пенальти из-за использования мелких страниц БД. + * + * 1) В режиме MDBX_SYNC_DURABLE - O_DSYNC для записи как данных, + * так и мета-страниц. Однако, на Linux отказ от O_DSYNC с последующим + * fdatasync() может быть выгоднее при использовании HDD, так как + * позволяет io-scheduler переупорядочить запись с учетом актуального + * расположения файла БД на носителе. + * + * 2) В режиме MDBX_NOMETASYNC - O_DSYNC можно использовать для данных, + * но в этом может не быть смысла, так как fdatasync() всё равно + * требуется для гарантии фиксации мета после предыдущей транзакции. + * + * В итоге на нормальных системах (не Windows) есть два варианта: + * - при возможности O_DIRECT и/или io_ring для данных, скорее всего, + * есть смысл вызвать fdatasync() перед записью данных, а затем + * использовать O_DSYNC; + * - не использовать O_DSYNC и вызывать fdatasync() после записи данных. + * + * На Windows же следует минимизировать использование FlushFileBuffers() + * из-за проблем с производительностью. Поэтому на Windows в режиме + * MDBX_NOMETASYNC: + * - мета обновляется через дескриптор без FILE_FLAG_WRITE_THROUGH; + * - перед началом записи данных вызывается FlushFileBuffers(), если + * mti_meta_sync_txnid не совпадает с последней записанной мета; + * - данные записываются через дескриптор с FILE_FLAG_WRITE_THROUGH. 
+ * + * 3) В режиме MDBX_SAFE_NOSYNC - O_DSYNC нет смысла использовать, пока не + * будет реализована возможность полностью асинхронной "догоняющей" + * записи в выделенном процессе-сервере с io-ring очередями внутри. + * + * ----- + * + * Использование O_DIRECT или FILE_FLAG_NO_BUFFERING: + * + * Назначение этих флагов в отключении файлового дескриптора от + * unified page cache, т.е. от отображенных в память данных в случае + * libmdbx. + * + * Поэтому, использование direct i/o в libmdbx без MDBX_WRITEMAP лишено + * смысла и контр-продуктивно, ибо так мы провоцируем ядро ОС на + * не-когерентность отображения в память с содержимым файла на носителе, + * либо требуем дополнительных проверок и действий направленных на + * фактическое отключение O_DIRECT для отображенных в память данных. + * + * В режиме MDBX_WRITEMAP когерентность отображенных данных обеспечивается + * физически. Поэтому использование direct i/o может иметь смысл, если у + * ядра ОС есть какие-то проблемы с msync(), в том числе с + * производительностью: + * - использование io_ring или gather-write может быть дешевле, чем + * просмотр PTE ядром и запись измененных/грязных; + * - но проблема в том, что записываемые из user mode страницы либо не + * будут помечены чистыми (и соответственно будут записаны ядром + * еще раз), либо ядру необходимо искать и чистить PTE при получении + * запроса на запись. + * + * Поэтому O_DIRECT или FILE_FLAG_NO_BUFFERING используется: + * - только в режиме MDBX_SYNC_DURABLE с MDBX_WRITEMAP; + * - когда me_psize >= me_os_psize; + * - опция сборки MDBX_AVOID_MSYNC != 0, которая по-умолчанию включена + * только на Windows (см ниже). + * + * ----- + * + * Использование FILE_FLAG_OVERLAPPED на Windows: + * + * У Windows очень плохо с I/O (за исключением прямых постраничных + * scatter/gather, которые работают в обход проблемного unified page + * cache и поэтому почти бесполезны в libmdbx). 
+ * + * При этом всё еще хуже при использовании FlushFileBuffers(), что также + * требуется после FlushViewOfFile() в режиме MDBX_WRITEMAP. Поэтому + * на Windows вместо FlushViewOfFile() и FlushFileBuffers() следует + * использовать запись через дескриптор с FILE_FLAG_WRITE_THROUGH. + * + * В свою очередь, запись с FILE_FLAG_WRITE_THROUGH дешевле/быстрее + * при использовании FILE_FLAG_OVERLAPPED. В результате, на Windows + * в durable-режимах запись данных всегда в overlapped-режиме, + * при этом для записи мета требуется отдельный не-overlapped дескриптор. + */ + + rc = osal_openfile((flags & MDBX_RDONLY) ? MDBX_OPEN_DXB_READ + : MDBX_OPEN_DXB_LAZY, env, env_pathname.dxb, &env->me_lazy_fd, mode); if (rc != MDBX_SUCCESS) goto bailout; - mdbx_assert(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); - if ((flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC)) == 0) { - rc = mdbx_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, - &env->me_dsync_fd, 0); - mdbx_ensure(env, (rc != MDBX_SUCCESS) == - (env->me_dsync_fd == INVALID_HANDLE_VALUE)); - } - #if MDBX_LOCKING == MDBX_LOCKING_SYSV env->me_sysv_ipc.key = ftok(env_pathname.dxb, 42); if (env->me_sysv_ipc.key == -1) { @@ -13219,7 +14977,65 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } #endif /* MDBX_LOCKING */ -#if !(defined(_WIN32) || defined(_WIN64)) + /* Set the position in files outside of the data to avoid corruption + * due to erroneous use of file descriptors in the application code. 
*/ + const uint64_t safe_parking_lot_offset = UINT64_C(0x7fffFFFF80000000); + osal_fseek(env->me_lazy_fd, safe_parking_lot_offset); + + env->me_fd4meta = env->me_lazy_fd; +#if defined(_WIN32) || defined(_WIN64) + eASSERT(env, env->me_overlapped_fd == 0); + bool ior_direct = false; + if (!(flags & + (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_EXCLUSIVE))) { + if (MDBX_AVOID_MSYNC && (flags & MDBX_WRITEMAP)) { + /* Запрошен режим MDBX_SYNC_DURABLE | MDBX_WRITEMAP при активной опции + * MDBX_AVOID_MSYNC. + * + * 1) В этой комбинации наиболее выгодно использовать WriteFileGather(), + * но для этого необходимо открыть файл с флагом FILE_FLAG_NO_BUFFERING и + * после обеспечивать выравнивание адресов и размера данных на границу + * системной страницы, что в свою очередь возможно если размер страницы БД + * не меньше размера системной страницы ОЗУ. Поэтому для открытия файла в + * нужном режиме требуется знать размер страницы БД. + * + * 2) Кроме этого, в Windows запись в заблокированный регион файла + * возможно только через тот-же дескриптор. Поэтому изначальный захват + * блокировок посредством osal_lck_seize(), захват/освобождение блокировок + * во время пишущих транзакций и запись данных должны выполнятся через + * один дескриптор. + * + * Таким образом, требуется прочитать волатильный заголовок БД, чтобы + * узнать размер страницы, чтобы открыть дескриптор файла в режиме нужном + * для записи данных, чтобы использовать именно этот дескриптор для + * изначального захвата блокировок. 
*/ + MDBX_meta header; + uint64_t dxb_filesize; + int err = read_header(env, &header, MDBX_SUCCESS, true); + if ((err == MDBX_SUCCESS && header.mm_psize >= env->me_os_psize) || + (err == MDBX_ENODATA && mode && env->me_psize >= env->me_os_psize && + osal_filesize(env->me_lazy_fd, &dxb_filesize) == MDBX_SUCCESS && + dxb_filesize == 0)) + /* Может быть коллизия, если два процесса пытаются одновременно создать + * БД с разным размером страницы, который у одного меньше системной + * страницы, а у другого НЕ меньше. Эта допустимая, но очень странная + * ситуация. Поэтому считаем её ошибочной и не пытаемся разрешить. */ + ior_direct = true; + } + + rc = osal_openfile(ior_direct ? MDBX_OPEN_DXB_OVERLAPPED_DIRECT + : MDBX_OPEN_DXB_OVERLAPPED, + env, env_pathname.dxb, &env->me_overlapped_fd, 0); + if (rc != MDBX_SUCCESS) + goto bailout; + env->me_data_lock_event = CreateEventW(nullptr, true, false, nullptr); + if (!env->me_data_lock_event) { + rc = (int)GetLastError(); + goto bailout; + } + osal_fseek(env->me_overlapped_fd, safe_parking_lot_offset); + } +#else if (mode == 0) { /* pickup mode for lck-file */ struct stat st; @@ -13235,39 +15051,52 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, ((mode & S_IRGRP) ? /* +write if readable by group */ S_IWGRP : 0) | ((mode & S_IROTH) ? /* +write if readable by others */ S_IWOTH : 0); #endif /* !Windows */ - const int lck_rc = mdbx_setup_lck(env, env_pathname.lck, mode); + const int lck_rc = setup_lck(env, env_pathname.lck, mode); if (MDBX_IS_ERROR(lck_rc)) { rc = lck_rc; goto bailout; } + osal_fseek(env->me_lfd, safe_parking_lot_offset); - /* Set the position in files outside of the data to avoid corruption - * due to erroneous use of file descriptors in the application code. 
*/ - mdbx_fseek(env->me_lfd, UINT64_C(1) << 63); - mdbx_fseek(env->me_lazy_fd, UINT64_C(1) << 63); - if (env->me_dsync_fd != INVALID_HANDLE_VALUE) - mdbx_fseek(env->me_dsync_fd, UINT64_C(1) << 63); + eASSERT(env, env->me_dsync_fd == INVALID_HANDLE_VALUE); + if (!(flags & (MDBX_RDONLY | MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC +#if defined(_WIN32) || defined(_WIN64) + | MDBX_EXCLUSIVE +#endif /* !Windows */ + ))) { + rc = osal_openfile(MDBX_OPEN_DXB_DSYNC, env, env_pathname.dxb, + &env->me_dsync_fd, 0); + if (MDBX_IS_ERROR(rc)) + goto bailout; + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { + if ((flags & MDBX_NOMETASYNC) == 0) + env->me_fd4meta = env->me_dsync_fd; + osal_fseek(env->me_dsync_fd, safe_parking_lot_offset); + } + } - const MDBX_env_flags_t rigorous_flags = - MDBX_SAFE_NOSYNC | MDBX_DEPRECATED_MAPASYNC; - const MDBX_env_flags_t mode_flags = rigorous_flags | MDBX_NOMETASYNC | - MDBX_LIFORECLAIM | MDBX_COALESCE | - MDBX_NORDAHEAD; + const MDBX_env_flags_t lazy_flags = + MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_NOMETASYNC; + const MDBX_env_flags_t mode_flags = lazy_flags | MDBX_LIFORECLAIM | + MDBX_NORDAHEAD | MDBX_RDONLY | + MDBX_WRITEMAP; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (lck && lck_rc != MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { - while (atomic_load32(&lck->mti_envmode, mo_AcquireRelease) == MDBX_RDONLY) { + MDBX_env_flags_t snap_flags; + while ((snap_flags = atomic_load32(&lck->mti_envmode, mo_AcquireRelease)) == + MDBX_RDONLY) { if (atomic_cas32(&lck->mti_envmode, MDBX_RDONLY, - env->me_flags & mode_flags)) { + (snap_flags = (env->me_flags & mode_flags)))) { /* The case: * - let's assume that for some reason the DB file is smaller * than it should be according to the geometry, * but not smaller than the last page used; - * - the first process that opens the database (lc_rc = true) + * - the first process that opens the database (lck_rc == RESULT_TRUE) * does this in readonly mode and therefore cannot bring 
* the file size back to normal; - * - some next process (lc_rc = false) opens the DB in read-write - * mode and now is here. + * - some next process (lck_rc != RESULT_TRUE) opens the DB in + * read-write mode and now is here. * * FIXME: Should we re-check and set the size of DB-file right here? */ break; @@ -13276,204 +15105,275 @@ __cold int mdbx_env_open(MDBX_env *env, const char *pathname, } if (env->me_flags & MDBX_ACCEDE) { - /* pickup current mode-flags, including MDBX_LIFORECLAIM | - * MDBX_COALESCE | MDBX_NORDAHEAD */ - const unsigned diff = - (lck->mti_envmode.weak ^ env->me_flags) & mode_flags; - mdbx_notice("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, env->me_flags, - env->me_flags ^ diff); + /* Pickup current mode-flags (MDBX_LIFORECLAIM, MDBX_NORDAHEAD, etc). */ + const MDBX_env_flags_t diff = + (snap_flags ^ env->me_flags) & + ((snap_flags & lazy_flags) ? mode_flags + : mode_flags & ~MDBX_WRITEMAP); env->me_flags ^= diff; + NOTICE("accede mode-flags: 0x%X, 0x%X -> 0x%X", diff, + env->me_flags ^ diff, env->me_flags); } - if ((lck->mti_envmode.weak ^ env->me_flags) & rigorous_flags) { - mdbx_error("%s", "current mode/flags incompatible with requested"); + /* Ранее упущенный не очевидный момент: При работе БД в режимах + * не-синхронной/отложенной фиксации на диске, все процессы-писатели должны + * иметь одинаковый режим MDBX_WRITEMAP. + * + * В противном случае, сброс на диск следует выполнять дважды: сначала + * msync(), затем fdatasync(). При этом msync() не обязан отрабатывать + * в процессах без MDBX_WRITEMAP, так как файл в память отображен только + * для чтения. Поэтому, в общем случае, различия по MDBX_WRITEMAP не + * позволяют выполнить фиксацию данных на диск, после их изменения в другом + * процессе. 
+ * + * В режиме MDBX_UTTERLY_NOSYNC позволять совместную работу с MDBX_WRITEMAP + * также не следует, поскольку никакой процесс (в том числе последний) не + * может гарантированно сбросить данные на диск, а следовательно не должен + * помечать какую-либо транзакцию как steady. + * + * В результате, требуется либо запретить совместную работу процессам с + * разным MDBX_WRITEMAP в режиме отложенной записи, либо отслеживать такое + * смешивание и блокировать steady-пометки - что контрпродуктивно. */ + const MDBX_env_flags_t rigorous_flags = + (snap_flags & lazy_flags) + ? MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC | MDBX_WRITEMAP + : MDBX_SAFE_NOSYNC | MDBX_UTTERLY_NOSYNC; + const MDBX_env_flags_t rigorous_diff = + (snap_flags ^ env->me_flags) & rigorous_flags; + if (rigorous_diff) { + ERROR("current mode/flags 0x%X incompatible with requested 0x%X, " + "rigorous diff 0x%X", + env->me_flags, snap_flags, rigorous_diff); rc = MDBX_INCOMPATIBLE; goto bailout; } } - const int dxb_rc = mdbx_setup_dxb(env, lck_rc, mode); + mincore_clean_cache(env); + const int dxb_rc = setup_dxb(env, lck_rc, mode); if (MDBX_IS_ERROR(dxb_rc)) { rc = dxb_rc; goto bailout; } + rc = osal_check_fs_incore(env->me_lazy_fd); + env->me_incore = false; + if (rc == MDBX_RESULT_TRUE) { + env->me_incore = true; + NOTICE("%s", "in-core database"); + } else if (unlikely(rc != MDBX_SUCCESS)) { + ERROR("check_fs_incore(), err %d", rc); + goto bailout; + } + if (unlikely(/* recovery mode */ env->me_stuck_meta >= 0) && (lck_rc != /* exclusive */ MDBX_RESULT_TRUE || (flags & MDBX_EXCLUSIVE) == 0)) { - mdbx_error("%s", "recovery requires exclusive mode"); + ERROR("%s", "recovery requires exclusive mode"); rc = MDBX_BUSY; goto bailout; } - mdbx_debug("opened dbenv %p", (void *)env); + DEBUG("opened dbenv %p", (void *)env); + if (!lck || lck_rc == MDBX_RESULT_TRUE) { + env->me_lck->mti_envmode.weak = env->me_flags & mode_flags; + env->me_lck->mti_meta_sync_txnid.weak = + (uint32_t)recent_committed_txnid(env); + 
env->me_lck->mti_reader_check_timestamp.weak = osal_monotime(); + } if (lck) { if (lck_rc == MDBX_RESULT_TRUE) { - lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - lck->mti_meta_sync_txnid.weak = - (uint32_t)mdbx_recent_committed_txnid(env); - lck->mti_reader_check_timestamp.weak = mdbx_osal_monotime(); - rc = mdbx_lck_downgrade(env); - mdbx_debug("lck-downgrade-%s: rc %i", - (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); + rc = osal_lck_downgrade(env); + DEBUG("lck-downgrade-%s: rc %i", + (env->me_flags & MDBX_EXCLUSIVE) ? "partial" : "full", rc); if (rc != MDBX_SUCCESS) goto bailout; } else { - rc = mdbx_cleanup_dead_readers(env, false, NULL); + rc = cleanup_dead_readers(env, false, NULL); if (MDBX_IS_ERROR(rc)) goto bailout; } if ((env->me_flags & MDBX_NOTLS) == 0) { - rc = mdbx_rthc_alloc(&env->me_txkey, &lck->mti_readers[0], - &lck->mti_readers[env->me_maxreaders]); + rc = rthc_alloc(&env->me_txkey, &lck->mti_readers[0], + &lck->mti_readers[env->me_maxreaders]); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; env->me_flags |= MDBX_ENV_TXKEY; } - } else { - env->me_lck->mti_envmode.weak = env->me_flags & (mode_flags | MDBX_RDONLY); - env->me_lck->mti_meta_sync_txnid.weak = - (uint32_t)mdbx_recent_committed_txnid(env); - env->me_lck->mti_reader_check_timestamp.weak = mdbx_osal_monotime(); } if ((flags & MDBX_RDONLY) == 0) { - const size_t tsize = sizeof(MDBX_txn), + const size_t tsize = sizeof(MDBX_txn) + sizeof(MDBX_cursor), size = tsize + env->me_maxdbs * (sizeof(MDBX_db) + sizeof(MDBX_cursor *) + - sizeof(unsigned) + 1); + sizeof(MDBX_atomic_uint32_t) + 1); rc = alloc_page_buf(env); if (rc == MDBX_SUCCESS) { - memset(env->me_pbuf, -1, env->me_psize * 2); - MDBX_txn *txn = mdbx_calloc(1, size); + memset(env->me_pbuf, -1, env->me_psize * (size_t)2); + memset(ptr_disp(env->me_pbuf, env->me_psize * (size_t)2), 0, + env->me_psize); + MDBX_txn *txn = osal_calloc(1, size); if (txn) { - txn->mt_dbs = (MDBX_db *)((char *)txn + tsize); 
- txn->mt_cursors = (MDBX_cursor **)(txn->mt_dbs + env->me_maxdbs); - txn->mt_dbiseqs = (unsigned *)(txn->mt_cursors + env->me_maxdbs); - txn->mt_dbistate = (uint8_t *)(txn->mt_dbiseqs + env->me_maxdbs); + txn->mt_dbs = ptr_disp(txn, tsize); + txn->mt_cursors = + ptr_disp(txn->mt_dbs, sizeof(MDBX_db) * env->me_maxdbs); + txn->mt_dbiseqs = + ptr_disp(txn->mt_cursors, sizeof(MDBX_cursor *) * env->me_maxdbs); + txn->mt_dbistate = ptr_disp( + txn->mt_dbiseqs, sizeof(MDBX_atomic_uint32_t) * env->me_maxdbs); txn->mt_env = env; txn->mt_dbxs = env->me_dbxs; txn->mt_flags = MDBX_TXN_FINISHED; env->me_txn0 = txn; - txn->tw.retired_pages = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - txn->tw.reclaimed_pglist = mdbx_pnl_alloc(MDBX_PNL_INITIAL); - if (unlikely(!txn->tw.retired_pages || !txn->tw.reclaimed_pglist)) + txn->tw.retired_pages = pnl_alloc(MDBX_PNL_INITIAL); + txn->tw.relist = pnl_alloc(MDBX_PNL_INITIAL); + if (unlikely(!txn->tw.retired_pages || !txn->tw.relist)) rc = MDBX_ENOMEM; } else rc = MDBX_ENOMEM; } + if (rc == MDBX_SUCCESS) + rc = osal_ioring_create(&env->me_ioring +#if defined(_WIN32) || defined(_WIN64) + , + ior_direct, env->me_overlapped_fd +#endif /* Windows */ + ); + if (rc == MDBX_SUCCESS) + adjust_defaults(env); } #if MDBX_DEBUG if (rc == MDBX_SUCCESS) { - const MDBX_meta *meta = constmeta_prefer_last(env); - const MDBX_db *db = &meta->mm_dbs[MAIN_DBI]; + const meta_troika_t troika = meta_tap(env); + const meta_ptr_t head = meta_recent(env, &troika); + const MDBX_db *db = &head.ptr_c->mm_dbs[MAIN_DBI]; - mdbx_debug("opened database version %u, pagesize %u", - (uint8_t)unaligned_peek_u64(4, meta->mm_magic_and_version), - env->me_psize); - mdbx_debug("using meta page %" PRIaPGNO ", txn %" PRIaTXN, - data_page(meta)->mp_pgno, meta_txnid(env, meta)); - mdbx_debug("depth: %u", db->md_depth); - mdbx_debug("entries: %" PRIu64, db->md_entries); - mdbx_debug("branch pages: %" PRIaPGNO, db->md_branch_pages); - mdbx_debug("leaf pages: %" PRIaPGNO, db->md_leaf_pages); - 
mdbx_debug("overflow pages: %" PRIaPGNO, db->md_overflow_pages); - mdbx_debug("root: %" PRIaPGNO, db->md_root); - mdbx_debug("schema_altered: %" PRIaTXN, db->md_mod_txnid); + DEBUG("opened database version %u, pagesize %u", + (uint8_t)unaligned_peek_u64(4, head.ptr_c->mm_magic_and_version), + env->me_psize); + DEBUG("using meta page %" PRIaPGNO ", txn %" PRIaTXN, + data_page(head.ptr_c)->mp_pgno, head.txnid); + DEBUG("depth: %u", db->md_depth); + DEBUG("entries: %" PRIu64, db->md_entries); + DEBUG("branch pages: %" PRIaPGNO, db->md_branch_pages); + DEBUG("leaf pages: %" PRIaPGNO, db->md_leaf_pages); + DEBUG("large/overflow pages: %" PRIaPGNO, db->md_overflow_pages); + DEBUG("root: %" PRIaPGNO, db->md_root); + DEBUG("schema_altered: %" PRIaTXN, db->md_mod_txnid); } #endif bailout: if (rc != MDBX_SUCCESS) { - rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; + rc = env_close(env) ? MDBX_PANIC : rc; env->me_flags = saved_me_flags | ((rc != MDBX_PANIC) ? 0 : MDBX_FATAL_ERROR); } else { #if defined(MDBX_USE_VALGRIND) || defined(__SANITIZE_ADDRESS__) - mdbx_txn_valgrind(env, nullptr); + txn_valgrind(env, nullptr); #endif } - mdbx_free(env_pathname.buffer_for_free); + osal_free(env_pathname.buffer_for_free); return rc; } /* Destroy resources from mdbx_env_open(), clear our readers & DBIs */ -__cold static int mdbx_env_close0(MDBX_env *env) { +__cold static int env_close(MDBX_env *env) { const unsigned flags = env->me_flags; if (!(flags & MDBX_ENV_ACTIVE)) { - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); return MDBX_SUCCESS; } env->me_flags &= ~ENV_INTERNAL_FLAGS; - env->me_lck = nullptr; if (flags & MDBX_ENV_TXKEY) { - mdbx_rthc_remove(env->me_txkey); - env->me_txkey = (mdbx_thread_key_t)0; + rthc_remove(env->me_txkey); + env->me_txkey = (osal_thread_key_t)0; } + munlock_all(env); + if (!(env->me_flags & MDBX_RDONLY)) + osal_ioring_destroy(&env->me_ioring); + lcklist_lock(); const int rc = lcklist_detach_locked(env); 
lcklist_unlock(); + env->me_lck = nullptr; + if (env->me_lck_mmap.lck) + osal_munmap(&env->me_lck_mmap); + if (env->me_map) { - mdbx_munmap(&env->me_dxb_mmap); + osal_munmap(&env->me_dxb_mmap); #ifdef MDBX_USE_VALGRIND VALGRIND_DISCARD(env->me_valgrind_handle); env->me_valgrind_handle = -1; #endif } +#if defined(_WIN32) || defined(_WIN64) + eASSERT(env, !env->me_overlapped_fd || + env->me_overlapped_fd == INVALID_HANDLE_VALUE); + if (env->me_data_lock_event != INVALID_HANDLE_VALUE) { + CloseHandle(env->me_data_lock_event); + env->me_data_lock_event = INVALID_HANDLE_VALUE; + } +#endif /* Windows */ + if (env->me_dsync_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_dsync_fd); + (void)osal_closefile(env->me_dsync_fd); env->me_dsync_fd = INVALID_HANDLE_VALUE; } if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_lazy_fd); + (void)osal_closefile(env->me_lazy_fd); env->me_lazy_fd = INVALID_HANDLE_VALUE; } - if (env->me_lck_mmap.lck) - mdbx_munmap(&env->me_lck_mmap); - if (env->me_lfd != INVALID_HANDLE_VALUE) { - (void)mdbx_closefile(env->me_lfd); + (void)osal_closefile(env->me_lfd); env->me_lfd = INVALID_HANDLE_VALUE; } if (env->me_dbxs) { - for (unsigned i = CORE_DBS; i < env->me_numdbs; ++i) - mdbx_free(env->me_dbxs[i].md_name.iov_base); - mdbx_free(env->me_dbxs); + for (size_t i = CORE_DBS; i < env->me_numdbs; ++i) + if (env->me_dbxs[i].md_name.iov_len) + osal_free(env->me_dbxs[i].md_name.iov_base); + osal_free(env->me_dbxs); env->me_numdbs = CORE_DBS; env->me_dbxs = nullptr; } if (env->me_pbuf) { - mdbx_memalign_free(env->me_pbuf); + osal_memalign_free(env->me_pbuf); env->me_pbuf = nullptr; } if (env->me_dbiseqs) { - mdbx_free(env->me_dbiseqs); + osal_free(env->me_dbiseqs); env->me_dbiseqs = nullptr; } if (env->me_dbflags) { - mdbx_free(env->me_dbflags); + osal_free(env->me_dbflags); env->me_dbflags = nullptr; } if (env->me_pathname) { - mdbx_free(env->me_pathname); + osal_free(env->me_pathname); env->me_pathname = nullptr; } 
+#if defined(_WIN32) || defined(_WIN64) + if (env->me_pathname_char) { + osal_free(env->me_pathname_char); + env->me_pathname_char = nullptr; + } +#endif /* Windows */ if (env->me_txn0) { - mdbx_dpl_free(env->me_txn0); - mdbx_txl_free(env->me_txn0->tw.lifo_reclaimed); - mdbx_pnl_free(env->me_txn0->tw.retired_pages); - mdbx_pnl_free(env->me_txn0->tw.spill_pages); - mdbx_pnl_free(env->me_txn0->tw.reclaimed_pglist); - mdbx_free(env->me_txn0); + dpl_free(env->me_txn0); + txl_free(env->me_txn0->tw.lifo_reclaimed); + pnl_free(env->me_txn0->tw.retired_pages); + pnl_free(env->me_txn0->tw.spilled.list); + pnl_free(env->me_txn0->tw.relist); + osal_free(env->me_txn0); env->me_txn0 = nullptr; } env->me_stuck_meta = -1; @@ -13495,13 +15395,13 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { * platforms (i.e. where fork() is available). * This is required to legitimize a call after fork() * from a child process, that should be allowed to free resources. */ - if (unlikely(env->me_pid != mdbx_getpid())) + if (unlikely(env->me_pid != osal_getpid())) env->me_flags |= MDBX_FATAL_ERROR; #endif /* MDBX_ENV_CHECKPID */ if (env->me_map && (env->me_flags & (MDBX_RDONLY | MDBX_FATAL_ERROR)) == 0 && env->me_txn0) { - if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != mdbx_thread_self()) + if (env->me_txn0->mt_owner && env->me_txn0->mt_owner != osal_thread_self()) return MDBX_BUSY; } else dont_sync = true; @@ -13515,48 +15415,48 @@ __cold int mdbx_env_close_ex(MDBX_env *env, bool dont_sync) { * process is running a writing transaction or not. * Because in the "owner died" condition kernel don't release * file lock immediately. */ - rc = mdbx_env_sync_internal(env, true, false); + rc = env_sync(env, true, false); rc = (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; #else struct stat st; if (unlikely(fstat(env->me_lazy_fd, &st))) rc = errno; else if (st.st_nlink > 0 /* don't sync deleted files */) { - rc = mdbx_env_sync_internal(env, true, true); + rc = env_sync(env, true, true); rc = (rc == MDBX_BUSY || rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; } -#endif +#endif /* Windows */ } - mdbx_assert(env, env->me_signature.weak == 0); - rc = mdbx_env_close0(env) ? MDBX_PANIC : rc; - mdbx_ensure(env, mdbx_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); + eASSERT(env, env->me_signature.weak == 0); + rc = env_close(env) ? MDBX_PANIC : rc; + ENSURE(env, osal_fastmutex_destroy(&env->me_dbi_lock) == MDBX_SUCCESS); #if defined(_WIN32) || defined(_WIN64) /* me_remap_guard don't have destructor (Slim Reader/Writer Lock) */ DeleteCriticalSection(&env->me_windowsbug_lock); #else - mdbx_ensure(env, - mdbx_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_destroy(&env->me_remap_guard) == MDBX_SUCCESS); #endif /* Windows */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV MDBX_lockinfo *const stub = lckless_stub(env); - mdbx_ensure(env, mdbx_ipclock_destroy(&stub->mti_wlock) == 0); + ENSURE(env, osal_ipclock_destroy(&stub->mti_wlock) == 0); #endif /* MDBX_LOCKING */ while ((dp = env->me_dp_reserve) != NULL) { MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dp_reserve = dp->mp_next; - mdbx_free(dp); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + osal_free(ptr); } VALGRIND_DESTROY_MEMPOOL(env); - mdbx_ensure(env, env->me_lcklist_next == nullptr); + ENSURE(env, env->me_lcklist_next == nullptr); env->me_pid = 0; - mdbx_free(env); + osal_free(env); return rc; } @@ -13567,153 +15467,56 @@ __cold int mdbx_env_close(MDBX_env *env) { } #endif /* 
LIBMDBX_NO_EXPORTS_LEGACY_API */ -/* Compare two items pointing at aligned unsigned int's. */ -static int __hot cmp_int_align4(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(4, a->iov_base), - unaligned_peek_u32(4, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(4, a->iov_base), - unaligned_peek_u64(4, b->iov_base)); - default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, - __LINE__); - return 0; - } -} - -/* Compare two items pointing at 2-byte aligned unsigned int's. */ -static int __hot cmp_int_align2(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(2, a->iov_base), - unaligned_peek_u32(2, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(2, a->iov_base), - unaligned_peek_u64(2, b->iov_base)); - default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, - __LINE__); - return 0; - } -} - -/* Compare two items pointing at unsigned values with unknown alignment. - * - * This is also set as MDBX_INTEGERDUP|MDBX_DUPFIXED's MDBX_dbx.md_dcmp. */ -static int __hot cmp_int_unaligned(const MDBX_val *a, const MDBX_val *b) { - mdbx_assert(NULL, a->iov_len == b->iov_len); - switch (a->iov_len) { - case 4: - return CMP2INT(unaligned_peek_u32(1, a->iov_base), - unaligned_peek_u32(1, b->iov_base)); - case 8: - return CMP2INT(unaligned_peek_u64(1, a->iov_base), - unaligned_peek_u64(1, b->iov_base)); - default: - mdbx_assert_fail(NULL, "invalid size for INTEGERKEY/INTEGERDUP", __func__, - __LINE__); - return 0; - } -} - -/* Compare two items lexically */ -static int __hot cmp_lexical(const MDBX_val *a, const MDBX_val *b) { - if (a->iov_len == b->iov_len) - return a->iov_len ? memcmp(a->iov_base, b->iov_base, a->iov_len) : 0; - - const int diff_len = (a->iov_len < b->iov_len) ? 
-1 : 1; - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - int diff_data = shortest ? memcmp(a->iov_base, b->iov_base, shortest) : 0; - return likely(diff_data) ? diff_data : diff_len; -} - -/* Compare two items in reverse byte order */ -static int __hot cmp_reverse(const MDBX_val *a, const MDBX_val *b) { - const size_t shortest = (a->iov_len < b->iov_len) ? a->iov_len : b->iov_len; - if (likely(shortest)) { - const uint8_t *pa = (const uint8_t *)a->iov_base + a->iov_len; - const uint8_t *pb = (const uint8_t *)b->iov_base + b->iov_len; - const uint8_t *const end = pa - shortest; - do { - int diff = *--pa - *--pb; - if (likely(diff)) - return diff; - } while (pa != end); - } - return CMP2INT(a->iov_len, b->iov_len); -} - -/* Fast non-lexically comparator */ -static int __hot cmp_lenfast(const MDBX_val *a, const MDBX_val *b) { - int diff = CMP2INT(a->iov_len, b->iov_len); - return likely(diff || a->iov_len == 0) - ? diff - : memcmp(a->iov_base, b->iov_base, a->iov_len); -} - -static bool unsure_equal(MDBX_cmp_func cmp, const MDBX_val *a, - const MDBX_val *b) { - /* checking for the use of a known good comparator - * or/otherwise for a full byte-to-byte match */ - return cmp == cmp_lenfast || cmp == cmp_lexical || cmp == cmp_reverse || - cmp == cmp_int_unaligned || cmp_lenfast(a, b) == 0; -} - /* Search for key within a page, using binary search. * Returns the smallest entry larger or equal to the key. * Updates the cursor index with the index of the found entry. * If no entry larger or equal to the key is found, returns NULL. */ -static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, - const MDBX_val *key) { +__hot static struct node_result node_search(MDBX_cursor *mc, + const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const int nkeys = page_numkeys(mp); + const intptr_t nkeys = page_numkeys(mp); DKBUF_DEBUG; - mdbx_debug("searching %u keys in %s %spage %" PRIaPGNO, nkeys, - IS_LEAF(mp) ? 
"leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", - mp->mp_pgno); + DEBUG("searching %zu keys in %s %spage %" PRIaPGNO, nkeys, + IS_LEAF(mp) ? "leaf" : "branch", IS_SUBP(mp) ? "sub-" : "", + mp->mp_pgno); struct node_result ret; ret.exact = false; STATIC_ASSERT(P_BRANCH == 1); - int low = mp->mp_flags & P_BRANCH; - int high = nkeys - 1; + intptr_t low = mp->mp_flags & P_BRANCH; + intptr_t high = nkeys - 1; if (unlikely(high < low)) { mc->mc_ki[mc->mc_top] = 0; ret.node = NULL; return ret; } - int cr = 0, i = 0; + intptr_t i; MDBX_cmp_func *cmp = mc->mc_dbx->md_cmp; MDBX_val nodekey; if (unlikely(IS_LEAF2(mp))) { - mdbx_cassert(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); + cASSERT(mc, mp->mp_leaf2_ksize == mc->mc_db->md_xsize); nodekey.iov_len = mp->mp_leaf2_ksize; do { i = (low + high) >> 1; nodekey.iov_base = page_leaf2key(mp, i, nodekey.iov_len); - mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); - cr = cmp(key, &nodekey); - mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), - cr); - if (unlikely(cr == 0)) { + cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); + int cr = cmp(key, &nodekey); + DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); + if (cr > 0) + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. */ + low = ++i; + else if (cr < 0) + high = i - 1; + else { ret.exact = true; break; } - low = (cr < 0) ? low : i + 1; - high = (cr < 0) ? i - 1 : high; } while (likely(low <= high)); - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. 
*/ - i += cr > 0; - /* store the key index */ mc->mc_ki[mc->mc_top] = (indx_t)i; ret.node = (i < nkeys) @@ -13730,32 +15533,29 @@ static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, MDBX_node *node; do { i = (low + high) >> 1; - node = page_node(mp, i); nodekey.iov_len = node_ks(node); nodekey.iov_base = node_key(node); - mdbx_cassert(mc, (char *)mp + mc->mc_txn->mt_env->me_psize >= - (char *)nodekey.iov_base + nodekey.iov_len); - - cr = cmp(key, &nodekey); + cASSERT(mc, ptr_disp(mp, mc->mc_txn->mt_env->me_psize) >= + ptr_disp(nodekey.iov_base, nodekey.iov_len)); + int cr = cmp(key, &nodekey); if (IS_LEAF(mp)) - mdbx_debug("found leaf index %u [%s], rc = %i", i, DKEY_DEBUG(&nodekey), - cr); + DEBUG("found leaf index %zu [%s], rc = %i", i, DKEY_DEBUG(&nodekey), cr); else - mdbx_debug("found branch index %u [%s -> %" PRIaPGNO "], rc = %i", i, - DKEY_DEBUG(&nodekey), node_pgno(node), cr); - if (unlikely(cr == 0)) { + DEBUG("found branch index %zu [%s -> %" PRIaPGNO "], rc = %i", i, + DKEY_DEBUG(&nodekey), node_pgno(node), cr); + if (cr > 0) + /* Found entry is less than the key. */ + /* Skip to get the smallest entry larger than key. */ + low = ++i; + else if (cr < 0) + high = i - 1; + else { ret.exact = true; break; } - low = (cr < 0) ? low : i + 1; - high = (cr < 0) ? i - 1 : high; } while (likely(low <= high)); - /* Found entry is less than the key. */ - /* Skip to get the smallest entry larger than key. */ - i += cr > 0; - /* store the key index */ mc->mc_ki[mc->mc_top] = (indx_t)i; ret.node = (i < nkeys) @@ -13765,11 +15565,11 @@ static struct node_result __hot mdbx_node_search(MDBX_cursor *mc, } /* Pop a page off the top of the cursor's stack. 
*/ -static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { - if (mc->mc_snum) { - mdbx_debug("popped page %" PRIaPGNO " off db %d cursor %p", - mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); - if (--mc->mc_snum) { +static __inline void cursor_pop(MDBX_cursor *mc) { + if (likely(mc->mc_snum)) { + DEBUG("popped page %" PRIaPGNO " off db %d cursor %p", + mc->mc_pg[mc->mc_top]->mp_pgno, DDBI(mc), (void *)mc); + if (likely(--mc->mc_snum)) { mc->mc_top--; } else { mc->mc_flags &= ~C_INITIALIZED; @@ -13779,42 +15579,109 @@ static __inline void mdbx_cursor_pop(MDBX_cursor *mc) { /* Push a page onto the top of the cursor's stack. * Set MDBX_TXN_ERROR on failure. */ -static __inline int mdbx_cursor_push(MDBX_cursor *mc, MDBX_page *mp) { - mdbx_debug("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, - DDBI(mc), (void *)mc); +static __inline int cursor_push(MDBX_cursor *mc, MDBX_page *mp) { + DEBUG("pushing page %" PRIaPGNO " on db %d cursor %p", mp->mp_pgno, DDBI(mc), + (void *)mc); if (unlikely(mc->mc_snum >= CURSOR_STACK)) { mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_CURSOR_FULL; } - mdbx_cassert(mc, mc->mc_snum < UINT16_MAX); mc->mc_top = mc->mc_snum++; mc->mc_pg[mc->mc_top] = mp; mc->mc_ki[mc->mc_top] = 0; - return MDBX_SUCCESS; } -__hot static struct page_result -mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, - /* TODO: use parent-page ptr */ txnid_t front) { - struct page_result ret; - MDBX_txn *const txn = mc->mc_txn; - mdbx_tassert(txn, front <= txn->mt_front); - if (unlikely(pgno >= txn->mt_next_pgno)) { - mdbx_error("page #%" PRIaPGNO " beyond next-pgno", pgno); - notfound: - ret.page = nullptr; - ret.err = MDBX_PAGE_NOTFOUND; - bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return ret; +__hot static __always_inline int page_get_checker_lite(const uint16_t ILL, + const MDBX_page *page, + MDBX_txn *const txn, + const txnid_t front) { + if (unlikely(page->mp_flags & ILL)) { + if (ILL == P_ILL_BITS || (page->mp_flags & 
P_ILL_BITS)) + return bad_page(page, "invalid page's flags (%u)\n", page->mp_flags); + else if (ILL & P_OVERFLOW) { + assert((ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0); + assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "large/overflow", "branch/leaf/leaf2", page->mp_flags); + } else if (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) { + assert((ILL & P_BRANCH) && (ILL & P_LEAF) && (ILL & P_LEAF2)); + assert(page->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2)); + return bad_page(page, "unexpected %s instead of %s (%u)\n", + "branch/leaf/leaf2", "large/overflow", page->mp_flags); + } else { + assert(false); + } } - MDBX_env *const env = txn->mt_env; - mdbx_assert(env, ((txn->mt_flags ^ env->me_flags) & MDBX_WRITEMAP) == 0); - if (unlikely((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0)) { + if (unlikely(page->mp_txnid > front) && + unlikely(page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) + return bad_page( + page, + "invalid page' txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", + page->mp_txnid, + (front == txn->mt_front && front != txn->mt_txnid) ? 
"front-txn" + : "parent-page", + front); + + if (((ILL & P_OVERFLOW) || !IS_OVERFLOW(page)) && + (ILL & (P_BRANCH | P_LEAF | P_LEAF2)) == 0) { + if (unlikely(page->mp_upper < page->mp_lower || + ((page->mp_lower | page->mp_upper) & 1) || + PAGEHDRSZ + page->mp_upper > txn->mt_env->me_psize)) + return bad_page(page, + "invalid page' lower(%u)/upper(%u) with limit %zu\n", + page->mp_lower, page->mp_upper, page_space(txn->mt_env)); + + } else if ((ILL & P_OVERFLOW) == 0) { + const pgno_t npages = page->mp_pages; + if (unlikely(npages < 1) || unlikely(npages >= MAX_PAGENO / 2)) + return bad_page(page, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(page->mp_pgno + npages > txn->mt_next_pgno)) + return bad_page( + page, + "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + page->mp_pgno + npages, txn->mt_next_pgno); + } else { + assert(false); + } + return MDBX_SUCCESS; +} + +__cold static __noinline pgr_t +page_get_checker_full(const uint16_t ILL, MDBX_page *page, + const MDBX_cursor *const mc, const txnid_t front) { + pgr_t r = {page, page_get_checker_lite(ILL, page, mc->mc_txn, front)}; + if (likely(r.err == MDBX_SUCCESS)) + r.err = page_check(mc, page); + if (unlikely(r.err != MDBX_SUCCESS)) + mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; + return r; +} + +__hot static __always_inline pgr_t page_get_inline(const uint16_t ILL, + const MDBX_cursor *const mc, + const pgno_t pgno, + const txnid_t front) { + MDBX_txn *const txn = mc->mc_txn; + tASSERT(txn, front <= txn->mt_front); + + pgr_t r; + if (unlikely(pgno >= txn->mt_next_pgno)) { + ERROR("page #%" PRIaPGNO " beyond next-pgno", pgno); + r.page = nullptr; + r.err = MDBX_PAGE_NOTFOUND; + bailout: + txn->mt_flags |= MDBX_TXN_ERROR; + return r; + } + + eASSERT(txn->mt_env, + ((txn->mt_flags ^ txn->mt_env->me_flags) & MDBX_WRITEMAP) == 0); + r.page = pgno2page(txn->mt_env, pgno); + if ((txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_WRITEMAP)) == 0) { const MDBX_txn *spiller = txn; do { /* Spilled 
pages were dirtied in this txn and flushed @@ -13822,89 +15689,59 @@ mdbx_page_get_ex(MDBX_cursor *const mc, const pgno_t pgno, * back in from the map (but don't unspill it here, * leave that unless page_touch happens again). */ if (unlikely(spiller->mt_flags & MDBX_TXN_SPILLS) && - mdbx_search_spilled(spiller, pgno)) { - goto spilled; - } + search_spilled(spiller, pgno)) + break; - const unsigned i = mdbx_dpl_search(spiller, pgno); - assert((int)i > 0); + const size_t i = dpl_search(spiller, pgno); + tASSERT(txn, (intptr_t)i > 0); if (spiller->tw.dirtylist->items[i].pgno == pgno) { - ret.page = spiller->tw.dirtylist->items[i].ptr; - spiller->tw.dirtylist->items[i].lru = txn->tw.dirtylru++; - goto dirty; + r.page = spiller->tw.dirtylist->items[i].ptr; + break; } spiller = spiller->mt_parent; - } while (spiller != NULL); + } while (spiller); } -spilled: - ret.page = pgno2page(env, pgno); - -dirty: - if (unlikely(ret.page->mp_pgno != pgno)) { - bad_page(ret.page, - "mismatch actual pgno (%" PRIaPGNO ") != expected (%" PRIaPGNO - ")\n", - ret.page->mp_pgno, pgno); - goto notfound; - } - -#if !MDBX_DISABLE_PAGECHECKS - if (unlikely(ret.page->mp_flags & P_ILL_BITS)) { - ret.err = - bad_page(ret.page, "invalid page's flags (%u)\n", ret.page->mp_flags); + if (unlikely(r.page->mp_pgno != pgno)) { + r.err = bad_page( + r.page, "pgno mismatch (%" PRIaPGNO ") != expected (%" PRIaPGNO ")\n", + r.page->mp_pgno, pgno); goto bailout; } - if (unlikely(ret.page->mp_txnid > front) && - unlikely(ret.page->mp_txnid > txn->mt_front || front < txn->mt_txnid)) { - ret.err = bad_page( - ret.page, - "invalid page txnid (%" PRIaTXN ") for %s' txnid (%" PRIaTXN ")\n", - ret.page->mp_txnid, - (front == txn->mt_front && front != txn->mt_txnid) ? 
"front-txn" - : "parent-page", - front); - goto bailout; - } + if (unlikely(mc->mc_checking & CC_PAGECHECK)) + return page_get_checker_full(ILL, r.page, mc, front); - if (unlikely((ret.page->mp_upper < ret.page->mp_lower || - ((ret.page->mp_lower | ret.page->mp_upper) & 1) || - PAGEHDRSZ + ret.page->mp_upper > env->me_psize) && - !IS_OVERFLOW(ret.page))) { - ret.err = - bad_page(ret.page, "invalid page lower(%u)/upper(%u) with limit (%u)\n", - ret.page->mp_lower, ret.page->mp_upper, page_space(env)); +#if MDBX_DISABLE_VALIDATION + r.err = MDBX_SUCCESS; +#else + r.err = page_get_checker_lite(ILL, r.page, txn, front); + if (unlikely(r.err != MDBX_SUCCESS)) goto bailout; - } -#endif /* !MDBX_DISABLE_PAGECHECKS */ - - ret.err = MDBX_SUCCESS; - if (mdbx_audit_enabled()) - ret.err = mdbx_page_check(mc, ret.page, C_UPDATING); - return ret; +#endif /* MDBX_DISABLE_VALIDATION */ + return r; } /* Finish mdbx_page_search() / mdbx_page_search_lowest(). * The cursor is at the root page, set up the rest of it. */ -__hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, - int flags) { +__hot __noinline static int page_search_root(MDBX_cursor *mc, + const MDBX_val *key, int flags) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; int rc; DKBUF_DEBUG; while (IS_BRANCH(mp)) { MDBX_node *node; - int i; + intptr_t i; - mdbx_debug("branch page %" PRIaPGNO " has %u keys", mp->mp_pgno, - page_numkeys(mp)); + DEBUG("branch page %" PRIaPGNO " has %zu keys", mp->mp_pgno, + page_numkeys(mp)); /* Don't assert on branch pages in the GC. We can get here * while in the process of rebalancing a GC branch page; we must * let that proceed. 
ITS#8336 */ - mdbx_cassert(mc, !mc->mc_dbi || page_numkeys(mp) > 1); - mdbx_debug("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); + cASSERT(mc, !mc->mc_dbi || page_numkeys(mp) > 1); + DEBUG("found index 0 to page %" PRIaPGNO, node_pgno(page_node(mp, 0))); if (flags & (MDBX_PS_FIRST | MDBX_PS_LAST)) { i = 0; @@ -13920,51 +15757,50 @@ __hot static int mdbx_page_search_root(MDBX_cursor *mc, const MDBX_val *key, } } } else { - const struct node_result nsr = mdbx_node_search(mc, key); - if (nsr.node) - i = mc->mc_ki[mc->mc_top] + nsr.exact - 1; + const struct node_result nsr = node_search(mc, key); + if (likely(nsr.node)) + i = mc->mc_ki[mc->mc_top] + (intptr_t)nsr.exact - 1; else i = page_numkeys(mp) - 1; - mdbx_debug("following index %u for key [%s]", i, DKEY_DEBUG(key)); + DEBUG("following index %zu for key [%s]", i, DKEY_DEBUG(key)); } - mdbx_cassert(mc, i >= 0 && i < (int)page_numkeys(mp)); + cASSERT(mc, i >= 0 && i < (int)page_numkeys(mp)); node = page_node(mp, i); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = (indx_t)i; - if (unlikely(rc = mdbx_cursor_push(mc, mp))) + if (unlikely(rc = cursor_push(mc, mp))) return rc; ready: if (flags & MDBX_PS_MODIFY) { - if (unlikely((rc = mdbx_page_touch(mc)) != 0)) + rc = page_touch(mc); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mp = mc->mc_pg[mc->mc_top]; } } -#if !MDBX_DISABLE_PAGECHECKS - if (unlikely(!IS_LEAF(mp))) { - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; - return bad_page(mp, "index points to a page with 0x%02x flags\n", - mp->mp_flags); + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ - mdbx_debug("found leaf page %" PRIaPGNO " 
for key [%s]", mp->mp_pgno, - DKEY_DEBUG(key)); + DEBUG("found leaf page %" PRIaPGNO " for key [%s]", mp->mp_pgno, + DKEY_DEBUG(key)); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; return MDBX_SUCCESS; } -static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, - const unsigned pagesize) { +static int setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, + const unsigned pagesize) { if (unlikely(!dbx->md_cmp)) { dbx->md_cmp = get_default_keycmp(db->md_flags); dbx->md_dcmp = get_default_datacmp(db->md_flags); @@ -13979,13 +15815,13 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, ? 4 /* sizeof(uint32_t) */ : ((db->md_flags & MDBX_DUPFIXED) ? 1 : 0); dbx->md_vlen_max = valsize_max(pagesize, db->md_flags); - assert(dbx->md_vlen_max != (unsigned)-1); + assert(dbx->md_vlen_max != (size_t)-1); if ((db->md_flags & (MDBX_DUPFIXED | MDBX_INTEGERDUP)) != 0 && db->md_xsize) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely(db->md_xsize < dbx->md_vlen_min || + if (!MDBX_DISABLE_VALIDATION && unlikely(db->md_xsize < dbx->md_vlen_min || db->md_xsize > dbx->md_vlen_max)) { - mdbx_error("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", - db->md_xsize, dbx->md_vlen_min, dbx->md_vlen_max); + ERROR("db.md_xsize (%u) <> min/max value-length (%zu/%zu)", db->md_xsize, + dbx->md_vlen_min, dbx->md_vlen_max); return MDBX_CORRUPTED; } dbx->md_vlen_min = dbx->md_vlen_max = db->md_xsize; @@ -13993,52 +15829,49 @@ static int mdbx_setup_dbx(MDBX_dbx *const dbx, const MDBX_db *const db, return MDBX_SUCCESS; } -static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { +static int fetch_sdb(MDBX_txn *txn, size_t dbi) { MDBX_cursor_couple couple; - if (unlikely(TXN_DBI_CHANGED(txn, dbi))) { - mdbx_notice("dbi %u was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); + if (unlikely(dbi_changed(txn, dbi))) { + NOTICE("dbi %zu was changed for txn %" PRIaTXN, dbi, txn->mt_txnid); return MDBX_BAD_DBI; } - int rc = mdbx_cursor_init(&couple.outer, txn, 
MAIN_DBI); + int rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_dbx *const dbx = &txn->mt_dbxs[dbi]; - rc = mdbx_page_search(&couple.outer, &dbx->md_name, 0); + rc = page_search(&couple.outer, &dbx->md_name, 0); if (unlikely(rc != MDBX_SUCCESS)) { notfound: - mdbx_notice("dbi %u refs to inaccessible subDB `%*s` for txn %" PRIaTXN - " (err %d)", - dbi, (int)dbx->md_name.iov_len, - (const char *)dbx->md_name.iov_base, txn->mt_txnid, rc); + NOTICE("dbi %zu refs to inaccessible subDB `%*s` for txn %" PRIaTXN + " (err %d)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, rc); return (rc == MDBX_NOTFOUND) ? MDBX_BAD_DBI : rc; } MDBX_val data; - struct node_result nsr = mdbx_node_search(&couple.outer, &dbx->md_name); + struct node_result nsr = node_search(&couple.outer, &dbx->md_name); if (unlikely(!nsr.exact)) { rc = MDBX_NOTFOUND; goto notfound; } if (unlikely((node_flags(nsr.node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { - mdbx_notice( - "dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, - (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong flags"); + NOTICE("dbi %zu refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, "wrong flags"); return MDBX_INCOMPATIBLE; /* not a named DB */ } - const txnid_t pp_txnid = - pp_txnid4chk(couple.outer.mc_pg[couple.outer.mc_top], txn); - rc = mdbx_node_read(&couple.outer, nsr.node, &data, pp_txnid); + rc = node_read(&couple.outer, nsr.node, &data, + couple.outer.mc_pg[couple.outer.mc_top]); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (unlikely(data.iov_len != sizeof(MDBX_db))) { - mdbx_notice( - "dbi %u refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", dbi, - (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, - txn->mt_txnid, "wrong rec-size"); + NOTICE("dbi %zu 
refs to not a named subDB `%*s` for txn %" PRIaTXN " (%s)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, "wrong rec-size"); return MDBX_INCOMPATIBLE; /* not a named DB */ } @@ -14047,24 +15880,24 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * have dropped and recreated the DB with other flags. */ MDBX_db *const db = &txn->mt_dbs[dbi]; if (unlikely((db->md_flags & DB_PERSISTENT_FLAGS) != md_flags)) { - mdbx_notice("dbi %u refs to the re-created subDB `%*s` for txn %" PRIaTXN - " with different flags (present 0x%X != wanna 0x%X)", - dbi, (int)dbx->md_name.iov_len, - (const char *)dbx->md_name.iov_base, txn->mt_txnid, - db->md_flags & DB_PERSISTENT_FLAGS, md_flags); + NOTICE("dbi %zu refs to the re-created subDB `%*s` for txn %" PRIaTXN + " with different flags (present 0x%X != wanna 0x%X)", + dbi, (int)dbx->md_name.iov_len, (const char *)dbx->md_name.iov_base, + txn->mt_txnid, db->md_flags & DB_PERSISTENT_FLAGS, md_flags); return MDBX_INCOMPATIBLE; } memcpy(db, data.iov_base, sizeof(MDBX_db)); -#if !MDBX_DISABLE_PAGECHECKS - mdbx_tassert(txn, txn->mt_front >= pp_txnid); +#if !MDBX_DISABLE_VALIDATION + const txnid_t pp_txnid = couple.outer.mc_pg[couple.outer.mc_top]->mp_txnid; + tASSERT(txn, txn->mt_front >= pp_txnid); if (unlikely(db->md_mod_txnid > pp_txnid)) { - mdbx_error("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", - db->md_mod_txnid, pp_txnid); + ERROR("db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + db->md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } -#endif /* !MDBX_DISABLE_PAGECHECKS */ - rc = mdbx_setup_dbx(dbx, db, txn->mt_env->me_psize); +#endif /* !MDBX_DISABLE_VALIDATION */ + rc = setup_dbx(dbx, db, txn->mt_env->me_psize); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -14077,20 +15910,19 @@ static int mdbx_fetch_sdb(MDBX_txn *txn, MDBX_dbi dbi) { * before calling mdbx_page_search_root(), because the callers * are all in situations where the current page is known 
to * be underfilled. */ -__hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { +__hot static int page_search_lowest(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_BRANCH(mp)); + cASSERT(mc, IS_BRANCH(mp)); MDBX_node *node = page_node(mp, 0); - int rc; - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) + int rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = 0; - if (unlikely(rc = mdbx_cursor_push(mc, mp))) + if (unlikely(rc = cursor_push(mc, mp))) return rc; - return mdbx_page_search_root(mc, NULL, MDBX_PS_FIRST); + return page_search_root(mc, NULL, MDBX_PS_FIRST); } /* Search for the page a given key should be in. @@ -14107,33 +15939,33 @@ __hot static int mdbx_page_search_lowest(MDBX_cursor *mc) { * lookups. * * Returns 0 on success, non-zero on failure. */ -__hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, - int flags) { +__hot static int page_search(MDBX_cursor *mc, const MDBX_val *key, int flags) { int rc; pgno_t root; /* Make sure the txn is still viable, then find the root from * the txn's db table and set it as the root of the cursor's stack. */ if (unlikely(mc->mc_txn->mt_flags & MDBX_TXN_BLOCKED)) { - mdbx_debug("%s", "transaction has failed, must abort"); + DEBUG("%s", "transaction has failed, must abort"); return MDBX_BAD_TXN; } /* Make sure we're using an up-to-date root */ if (unlikely(*mc->mc_dbistate & DBI_STALE)) { - rc = mdbx_fetch_sdb(mc->mc_txn, mc->mc_dbi); + rc = fetch_sdb(mc->mc_txn, mc->mc_dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } root = mc->mc_db->md_root; if (unlikely(root == P_INVALID)) { /* Tree is empty. 
*/ - mdbx_debug("%s", "tree is empty"); + DEBUG("%s", "tree is empty"); return MDBX_NOTFOUND; } - mdbx_cassert(mc, root >= NUM_METAS); - if (!mc->mc_pg[0] || mc->mc_pg[0]->mp_pgno != root) { + cASSERT(mc, root >= NUM_METAS); + if (!mc->mc_snum || !(mc->mc_flags & C_INITIALIZED) || + mc->mc_pg[0]->mp_pgno != root) { txnid_t pp_txnid = mc->mc_db->md_mod_txnid; pp_txnid = /* mc->mc_db->md_mod_txnid maybe zero in a legacy DB */ pp_txnid ? pp_txnid @@ -14144,63 +15976,80 @@ __hot static int mdbx_page_search(MDBX_cursor *mc, const MDBX_val *key, if ((scan->mt_flags & MDBX_TXN_DIRTY) && (mc->mc_dbi == MAIN_DBI || (scan->mt_dbistate[mc->mc_dbi] & DBI_DIRTY))) { + /* После коммита вложенных тразакций может быть mod_txnid > front */ pp_txnid = scan->mt_front; break; } while (unlikely((scan = scan->mt_parent) != nullptr)); } - if (unlikely((rc = mdbx_page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) + if (unlikely((rc = page_get(mc, root, &mc->mc_pg[0], pp_txnid)) != 0)) return rc; } mc->mc_snum = 1; mc->mc_top = 0; - mdbx_debug("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, - mc->mc_pg[0]->mp_flags); + DEBUG("db %d root page %" PRIaPGNO " has flags 0x%X", DDBI(mc), root, + mc->mc_pg[0]->mp_flags); if (flags & MDBX_PS_MODIFY) { - if (!(*mc->mc_dbistate & DBI_DIRTY) && unlikely(rc = mdbx_touch_dbi(mc))) - return rc; - if (unlikely(rc = mdbx_page_touch(mc))) + if (unlikely(rc = page_touch(mc))) return rc; } if (flags & MDBX_PS_ROOTONLY) return MDBX_SUCCESS; - return mdbx_page_search_root(mc, key, flags); + return page_search_root(mc, key, flags); } -/* Return the data associated with a given node. - * - * [in] mc The cursor for this operation. - * [in] leaf The node being read. - * [out] data Updated to point to the node's data. - * - * Returns 0 on success, non-zero on failure. 
*/ -static __always_inline int mdbx_node_read(MDBX_cursor *mc, - const MDBX_node *node, MDBX_val *data, - const txnid_t front) { - data->iov_len = node_ds(node); - data->iov_base = node_data(node); - if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { - /* Read overflow data. */ - MDBX_page *omp; /* overflow page */ - int rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, front); - if (unlikely((rc != MDBX_SUCCESS))) { - mdbx_debug("read overflow page %" PRIaPGNO " failed", - node_largedata_pgno(node)); - return rc; +/* Read large/overflow node data. */ +static __noinline int node_read_bigdata(MDBX_cursor *mc, const MDBX_node *node, + MDBX_val *data, const MDBX_page *mp) { + cASSERT(mc, node_flags(node) == F_BIGDATA && data->iov_len == node_ds(node)); + + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((lp.err != MDBX_SUCCESS))) { + DEBUG("read large/overflow page %" PRIaPGNO " failed", + node_largedata_pgno(node)); + return lp.err; + } + + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + data->iov_base = page_data(lp.page); + if (!MDBX_DISABLE_VALIDATION) { + const MDBX_env *env = mc->mc_txn->mt_env; + const size_t dsize = data->iov_len; + if (unlikely(node_size_len(node_ks(node), dsize) <= env->me_leaf_nodemax) && + mc->mc_dbi != FREE_DBI) + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + const unsigned npages = number_of_ovpages(env, dsize); + if (unlikely(lp.page->mp_pages != npages)) { + if (lp.page->mp_pages < npages) + return bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + else if (mc->mc_dbi != FREE_DBI) + poor_page(lp.page, "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } - data->iov_base = page_data(omp); } return MDBX_SUCCESS; } +/* Return the data associated with a given node. 
*/ +static __always_inline int node_read(MDBX_cursor *mc, const MDBX_node *node, + MDBX_val *data, const MDBX_page *mp) { + data->iov_len = node_ds(node); + data->iov_base = node_data(node); + if (likely(node_flags(node) != F_BIGDATA)) + return MDBX_SUCCESS; + return node_read_bigdata(mc, node, data, mp); +} + int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { DKBUF_DEBUG; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -14213,11 +16062,11 @@ int mdbx_get(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data) { return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; + return cursor_set(&cx.outer, (MDBX_val *)key, data, MDBX_SET).err; } int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, @@ -14236,17 +16085,17 @@ int mdbx_get_equal_or_great(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, return MDBX_BAD_TXN; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - return mdbx_cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); + return cursor_get(&cx.outer, key, data, MDBX_SET_LOWERBOUND); } int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, size_t *values_count) { DKBUF_DEBUG; - mdbx_debug("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); + DEBUG("===> get db %u key [%s]", dbi, DKEY_DEBUG(key)); int rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) @@ -14259,11 +16108,11 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = 
mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; + rc = cursor_set(&cx.outer, key, data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND && values_count) *values_count = 0; @@ -14275,10 +16124,10 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, if (cx.outer.mc_xcursor != NULL) { MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { // coverity[uninit_use : FALSE] - mdbx_tassert(txn, cx.outer.mc_xcursor == &cx.inner && - (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); + tASSERT(txn, cx.outer.mc_xcursor == &cx.inner && + (cx.inner.mx_cursor.mc_flags & C_INITIALIZED)); // coverity[uninit_use : FALSE] *values_count = (sizeof(*values_count) >= sizeof(cx.inner.mx_db.md_entries) || @@ -14299,7 +16148,7 @@ int mdbx_get_ex(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, * [in] dir SIBLING_LEFT or SIBLING_RIGHT. * * Returns 0 on success, non-zero on failure. */ -static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { +static int cursor_sibling(MDBX_cursor *mc, int dir) { int rc; MDBX_node *node; MDBX_page *mp; @@ -14308,16 +16157,16 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { if (unlikely(mc->mc_snum < 2)) return MDBX_NOTFOUND; /* root has no siblings */ - mdbx_cursor_pop(mc); - mdbx_debug("parent page is page %" PRIaPGNO ", index %u", - mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); + cursor_pop(mc); + DEBUG("parent page is page %" PRIaPGNO ", index %u", + mc->mc_pg[mc->mc_top]->mp_pgno, mc->mc_ki[mc->mc_top]); - if ((dir == SIBLING_RIGHT) - ? 
(mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mc->mc_pg[mc->mc_top])) - : (mc->mc_ki[mc->mc_top] == 0)) { - mdbx_debug("no more keys aside, moving to next %s sibling", - dir ? "right" : "left"); - if (unlikely((rc = mdbx_cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { + if ((dir == SIBLING_RIGHT) ? (mc->mc_ki[mc->mc_top] + (size_t)1 >= + page_numkeys(mc->mc_pg[mc->mc_top])) + : (mc->mc_ki[mc->mc_top] == 0)) { + DEBUG("no more keys aside, moving to next %s sibling", + dir ? "right" : "left"); + if (unlikely((rc = cursor_sibling(mc, dir)) != MDBX_SUCCESS)) { /* undo cursor_pop before returning */ mc->mc_top++; mc->mc_snum++; @@ -14326,32 +16175,31 @@ static int mdbx_cursor_sibling(MDBX_cursor *mc, int dir) { } else { assert((dir - 1) == -1 || (dir - 1) == 1); mc->mc_ki[mc->mc_top] += (indx_t)(dir - 1); - mdbx_debug("just moving to %s index key %u", - (dir == SIBLING_RIGHT) ? "right" : "left", - mc->mc_ki[mc->mc_top]); + DEBUG("just moving to %s index key %u", + (dir == SIBLING_RIGHT) ? "right" : "left", mc->mc_ki[mc->mc_top]); } - mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_BRANCH(mc->mc_pg[mc->mc_top])); node = page_node(mp = mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (unlikely((rc = mdbx_page_get(mc, node_pgno(node), &mp, - pp_txnid4chk(mp, mc->mc_txn))) != 0)) { + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) { /* mc will be inconsistent if caller does mc_snum++ as above */ mc->mc_flags &= ~(C_INITIALIZED | C_EOF); return rc; } - rc = mdbx_cursor_push(mc, mp); + rc = cursor_push(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_ki[mc->mc_top] = - (indx_t)((dir == SIBLING_LEFT) ? page_numkeys(mp) - 1 : 0); + (dir == SIBLING_LEFT) ? (indx_t)page_numkeys(mp) - 1 : 0; return MDBX_SUCCESS; } /* Move the cursor to the next data item. 
*/ -static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +static int cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node; int rc; @@ -14360,21 +16208,20 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) - return mdbx_cursor_first(mc, key, data); + return cursor_first(mc, key, data); mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { - if (mc->mc_ki[mc->mc_top] + 1u >= page_numkeys(mp)) + if (mc->mc_ki[mc->mc_top] + (size_t)1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } if (mc->mc_db->md_flags & MDBX_DUPSORT) { node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (op == MDBX_NEXT || op == MDBX_NEXT_DUP) { - rc = - mdbx_cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); + rc = cursor_next(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_NEXT); if (op != MDBX_NEXT || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) get_key_optional(node, key); @@ -14388,43 +16235,41 @@ static int mdbx_cursor_next(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } } - mdbx_debug("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); + DEBUG("cursor_next: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); if (mc->mc_flags & C_DEL) { mc->mc_flags ^= C_DEL; goto skip; } - int ki = mc->mc_ki[mc->mc_top]; + intptr_t ki = mc->mc_ki[mc->mc_top]; mc->mc_ki[mc->mc_top] = (indx_t)++ki; - const int numkeys = page_numkeys(mp); + const intptr_t numkeys = page_numkeys(mp); if (unlikely(ki >= numkeys)) { - mdbx_debug("%s", "=====> move to next sibling page"); + DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - if (unlikely((rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT)) != - 
MDBX_SUCCESS)) { + rc = cursor_sibling(mc, SIBLING_RIGHT); + if (unlikely(rc != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return rc; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); } skip: - mdbx_debug("==> cursor points to page %" PRIaPGNO - " with %u keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -14432,17 +16277,16 @@ skip: } node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mp, mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14451,8 +16295,8 @@ skip: } /* Move the cursor to the 
previous data item. */ -static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { +static int cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node; int rc; @@ -14461,7 +16305,7 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_NOTFOUND; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) { - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); if (unlikely(rc)) return rc; mc->mc_ki[mc->mc_top]++; @@ -14471,10 +16315,9 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, if ((mc->mc_db->md_flags & MDBX_DUPSORT) && mc->mc_ki[mc->mc_top] < page_numkeys(mp)) { node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (op == MDBX_PREV || op == MDBX_PREV_DUP) { - rc = - mdbx_cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); + rc = cursor_prev(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_PREV); if (op != MDBX_PREV || rc != MDBX_NOTFOUND) { if (likely(rc == MDBX_SUCCESS)) { get_key_optional(node, key); @@ -14490,8 +16333,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } } - mdbx_debug("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, - (void *)mc); + DEBUG("cursor_prev: top page is %" PRIaPGNO " in cursor %p", mp->mp_pgno, + (void *)mc); mc->mc_flags &= ~(C_EOF | C_DEL); @@ -14499,26 +16342,24 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)--ki; if (unlikely(ki < 0)) { mc->mc_ki[mc->mc_top] = 0; - mdbx_debug("%s", "=====> move to prev sibling page"); - if ((rc = mdbx_cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) + DEBUG("%s", "=====> move to prev sibling page"); + if ((rc = cursor_sibling(mc, SIBLING_LEFT)) != MDBX_SUCCESS) return rc; mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("prev page is 
%" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("prev page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); } - mdbx_debug("==> cursor points to page %" PRIaPGNO - " with %u keys, key index %u", - mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); + DEBUG("==> cursor points to page %" PRIaPGNO " with %zu keys, key index %u", + mp->mp_pgno, page_numkeys(mp), mc->mc_ki[mc->mc_top]); - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } @@ -14527,17 +16368,16 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mp); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read(mc, node, data, - pp_txnid4chk(mp, mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14546,9 +16386,8 @@ static int mdbx_cursor_prev(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, } /* Set the cursor on a specific data item. 
*/ -static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, - MDBX_val *data, - MDBX_cursor_op op) { +__hot static struct cursor_set_result +cursor_set(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, MDBX_cursor_op op) { MDBX_page *mp; MDBX_node *node = NULL; DKBUF_DEBUG; @@ -14557,7 +16396,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, ret.exact = false; if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || key->iov_len > mc->mc_dbx->md_klen_max)) { - mdbx_cassert(mc, !"Invalid key-size"); + cASSERT(mc, !"Invalid key-size"); ret.err = MDBX_BAD_VALSIZE; return ret; } @@ -14567,7 +16406,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { switch (aligned_key.iov_len) { default: - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); ret.err = MDBX_BAD_VALSIZE; return ret; case 4: @@ -14592,7 +16431,7 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (mc->mc_flags & C_INITIALIZED) { MDBX_val nodekey; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mp = mc->mc_pg[mc->mc_top]; if (unlikely(!page_numkeys(mp))) { mc->mc_ki[mc->mc_top] = 0; @@ -14613,13 +16452,12 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, * was the one we wanted. 
*/ mc->mc_ki[mc->mc_top] = 0; ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } if (cmp > 0) { - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); if (nkeys > 1) { if (IS_LEAF2(mp)) { nodekey.iov_base = page_leaf2key(mp, nkeys - 1, nodekey.iov_len); @@ -14630,12 +16468,12 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, cmp = mc->mc_dbx->md_cmp(&aligned_key, &nodekey); if (cmp == 0) { /* last node was the one we wanted */ - mdbx_cassert(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); + cASSERT(mc, nkeys >= 1 && nkeys <= UINT16_MAX + 1); mc->mc_ki[mc->mc_top] = (indx_t)(nkeys - 1); ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } if (cmp < 0) { @@ -14652,9 +16490,9 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (cmp == 0) { /* current node was the one we wanted */ ret.exact = true; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < + page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); goto got_node; } } @@ -14664,13 +16502,13 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, } /* If any parents have right-sibs, search. * Otherwise, there's nothing further. 
*/ - unsigned i; + size_t i; for (i = 0; i < mc->mc_top; i++) if (mc->mc_ki[i] < page_numkeys(mc->mc_pg[i]) - 1) break; if (i == mc->mc_top) { /* There are no other pages */ - mdbx_cassert(mc, nkeys <= UINT16_MAX); + cASSERT(mc, nkeys <= UINT16_MAX); mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; mc->mc_flags |= C_EOF; ret.err = MDBX_NOTFOUND; @@ -14683,25 +16521,25 @@ static struct cursor_set_result mdbx_cursor_set(MDBX_cursor *mc, MDBX_val *key, if (op == MDBX_SET_RANGE) goto got_node; - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); ret.err = MDBX_NOTFOUND; return ret; } } else { - mc->mc_pg[0] = 0; + mc->mc_pg[0] = nullptr; } - ret.err = mdbx_page_search(mc, &aligned_key, 0); + ret.err = page_search(mc, &aligned_key, 0); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mp)); + MDBX_ANALYSIS_ASSUME(mp != nullptr); + cASSERT(mc, IS_LEAF(mp)); search_node:; - struct node_result nsr = mdbx_node_search(mc, &aligned_key); + struct node_result nsr = node_search(mc, &aligned_key); node = nsr.node; ret.exact = nsr.exact; if (!ret.exact) { @@ -14715,52 +16553,53 @@ search_node:; } if (node == NULL) { - mdbx_debug("%s", "===> inexact leaf not found, goto sibling"); - ret.err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + DEBUG("%s", "===> inexact leaf not found, goto sibling"); + ret.err = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(ret.err != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return ret; /* no entries matched */ } mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mp)); + cASSERT(mc, IS_LEAF(mp)); if (!IS_LEAF2(mp)) node = page_node(mp, 0); } } - mdbx_cassert(mc, - mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & 
C_EOF)); got_node: mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; - if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - ret.err = MDBX_CORRUPTED; - } else { - if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { - key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); - } - ret.err = MDBX_SUCCESS; - } + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + ret.err = MDBX_CORRUPTED; return ret; } - if (F_ISSET(node_flags(node), F_DUPDATA)) { - ret.err = mdbx_xcursor_init1(mc, node, mp); + if (IS_LEAF2(mp)) { + if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) { + key->iov_len = mc->mc_db->md_xsize; + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); + } + ret.err = MDBX_SUCCESS; + return ret; + } + + if (node_flags(node) & F_DUPDATA) { + ret.err = cursor_xinit1(mc, node, mp); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_SET || op == MDBX_SET_KEY || op == MDBX_SET_RANGE) { - ret.err = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); + ret.err = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } else { - ret = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE); + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); + ret = cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; if (op == MDBX_GET_BOTH && !ret.exact) { @@ -14772,7 +16611,7 @@ got_node: if (op == MDBX_GET_BOTH || op == MDBX_GET_BOTH_RANGE) { if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || data->iov_len > mc->mc_dbx->md_vlen_max)) { - mdbx_cassert(mc, 
!"Invalid data-size"); + cASSERT(mc, !"Invalid data-size"); ret.err = MDBX_BAD_VALSIZE; return ret; } @@ -14781,7 +16620,7 @@ got_node: if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { switch (aligned_data.iov_len) { default: - mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERDUP"); + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERDUP"); ret.err = MDBX_BAD_VALSIZE; return ret; case 4: @@ -14799,15 +16638,14 @@ got_node: } } MDBX_val actual_data; - ret.err = mdbx_node_read(mc, node, &actual_data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + ret.err = node_read(mc, node, &actual_data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; const int cmp = mc->mc_dbx->md_dcmp(&aligned_data, &actual_data); if (cmp) { - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, + mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); if (op != MDBX_GET_BOTH_RANGE || cmp > 0) { ret.err = MDBX_NOTFOUND; return ret; @@ -14815,8 +16653,7 @@ got_node: } *data = actual_data; } else { - ret.err = mdbx_node_read(mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + ret.err = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; } @@ -14826,57 +16663,56 @@ got_node: if (op == MDBX_SET_RANGE || op == MDBX_SET_KEY) get_key_optional(node, key); - mdbx_debug("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), - DVAL_DEBUG(data)); + DEBUG("==> cursor placed on key [%s], data [%s]", DKEY_DEBUG(key), + DVAL_DEBUG(data)); ret.err = MDBX_SUCCESS; return ret; } /* Move the cursor to the first item in the database. 
*/ -static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + rc = page_search(mc, NULL, MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; mc->mc_ki[mc->mc_top] = 0; - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mc->mc_pg[mc->mc_top]->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (IS_LEAF2(mp)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], 0, key->iov_len); + key->iov_base = page_leaf2key(mp, 0, key->iov_len); } return MDBX_SUCCESS; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], 0); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + MDBX_node *node = page_node(mp, 0); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - 
pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14885,50 +16721,48 @@ static int mdbx_cursor_first(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { } /* Move the cursor to the last item in the database. */ -static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { +static int cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { int rc; if (mc->mc_xcursor) mc->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - rc = mdbx_page_search(mc, NULL, MDBX_PS_LAST); + rc = page_search(mc, NULL, MDBX_PS_LAST); if (unlikely(rc != MDBX_SUCCESS)) return rc; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mc->mc_pg[mc->mc_top]))) + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; + } - mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]) - 1; + mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mp) - 1; mc->mc_flags |= C_INITIALIZED | C_EOF; - if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mc->mc_pg[mc->mc_top]->mp_pgno); - return MDBX_CORRUPTED; - } else if (likely(key)) { + if (IS_LEAF2(mp)) { + if (likely(key)) { key->iov_len = mc->mc_db->md_xsize; - key->iov_base = page_leaf2key(mc->mc_pg[mc->mc_top], - mc->mc_ki[mc->mc_top], key->iov_len); + key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } return MDBX_SUCCESS; } - MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - rc = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + MDBX_node 
*node = page_node(mp, mc->mc_ki[mc->mc_top]); + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); + MDBX_ANALYSIS_ASSUME(mc->mc_xcursor != nullptr); + rc = cursor_last(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else if (likely(data)) { - if (unlikely((rc = mdbx_node_read( - mc, node, data, - pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn))) != - MDBX_SUCCESS)) + rc = node_read(mc, node, data, mp); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -14936,65 +16770,56 @@ static int mdbx_cursor_last(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data) { return MDBX_SUCCESS; } -int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - MDBX_cursor_op op) { - if (unlikely(mc == NULL)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - +static __hot int cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { int (*mfunc)(MDBX_cursor * mc, MDBX_val * key, MDBX_val * data); + int rc; + switch (op) { case MDBX_GET_CURRENT: { if (unlikely(!(mc->mc_flags & C_INITIALIZED))) return MDBX_ENODATA; - MDBX_page *mp = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = page_numkeys(mp); + const MDBX_page *mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } + const size_t nkeys = page_numkeys(mp); if (unlikely(mc->mc_ki[mc->mc_top] >= nkeys)) { - mdbx_cassert(mc, nkeys <= UINT16_MAX); + cASSERT(mc, nkeys <= UINT16_MAX); if (mc->mc_flags & C_EOF) return MDBX_ENODATA; mc->mc_ki[mc->mc_top] = (uint16_t)nkeys; 
mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; } - mdbx_cassert(mc, nkeys > 0); + cASSERT(mc, nkeys > 0); rc = MDBX_SUCCESS; if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } key->iov_len = mc->mc_db->md_xsize; key->iov_base = page_leaf2key(mp, mc->mc_ki[mc->mc_top], key->iov_len); } else { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); get_key_optional(node, key); if (data) { - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (unlikely(!(mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED))) { - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); + rc = cursor_first(&mc->mc_xcursor->mx_cursor, data, NULL); if (unlikely(rc)) return rc; } else { - rc = mdbx_cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_GET_CURRENT); + rc = cursor_get(&mc->mc_xcursor->mx_cursor, data, NULL, + MDBX_GET_CURRENT); if (unlikely(rc)) return rc; } } else { - rc = mdbx_node_read(mc, node, data, pp_txnid4chk(mp, mc->mc_txn)); + rc = node_read(mc, node, data, mp); if (unlikely(rc)) return rc; } @@ -15015,12 +16840,11 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_SET_RANGE: if (unlikely(key == NULL)) return MDBX_EINVAL; - rc = mdbx_cursor_set(mc, key, data, op).err; + rc = cursor_set(mc, key, data, op).err; if (mc->mc_flags & C_INITIALIZED) { - mdbx_cassert(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); - mdbx_cassert(mc, mc->mc_ki[mc->mc_top] < - page_numkeys(mc->mc_pg[mc->mc_top]) || - (mc->mc_flags & C_EOF)); + cASSERT(mc, mc->mc_snum > 0 && mc->mc_top < mc->mc_snum); + cASSERT(mc, mc->mc_ki[mc->mc_top] < page_numkeys(mc->mc_pg[mc->mc_top]) || + (mc->mc_flags & C_EOF)); } break; case MDBX_GET_MULTIPLE: @@ 
-15038,7 +16862,7 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_EINVAL; if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) return MDBX_INCOMPATIBLE; - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_DUP); + rc = cursor_next(mc, key, data, MDBX_NEXT_DUP); if (rc == MDBX_SUCCESS) { if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { MDBX_cursor *mx; @@ -15060,11 +16884,11 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return MDBX_INCOMPATIBLE; rc = MDBX_SUCCESS; if (!(mc->mc_flags & C_INITIALIZED)) - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); if (rc == MDBX_SUCCESS) { MDBX_cursor *mx = &mc->mc_xcursor->mx_cursor; if (mx->mc_flags & C_INITIALIZED) { - rc = mdbx_cursor_sibling(mx, SIBLING_LEFT); + rc = cursor_sibling(mx, SIBLING_LEFT); if (rc == MDBX_SUCCESS) goto fetchm; } else { @@ -15075,18 +16899,18 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, case MDBX_NEXT: case MDBX_NEXT_DUP: case MDBX_NEXT_NODUP: - rc = mdbx_cursor_next(mc, key, data, op); + rc = cursor_next(mc, key, data, op); break; case MDBX_PREV: case MDBX_PREV_DUP: case MDBX_PREV_NODUP: - rc = mdbx_cursor_prev(mc, key, data, op); + rc = cursor_prev(mc, key, data, op); break; case MDBX_FIRST: - rc = mdbx_cursor_first(mc, key, data); + rc = cursor_first(mc, key, data); break; case MDBX_FIRST_DUP: - mfunc = mdbx_cursor_first; + mfunc = cursor_first; move: if (unlikely(data == NULL || !(mc->mc_flags & C_INITIALIZED))) return MDBX_EINVAL; @@ -15096,13 +16920,11 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, mc->mc_ki[mc->mc_top] = (indx_t)page_numkeys(mc->mc_pg[mc->mc_top]); mc->mc_flags |= C_EOF; return MDBX_NOTFOUND; - } - { + } else { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (!F_ISSET(node_flags(node), F_DUPDATA)) { + if (!(node_flags(node) & F_DUPDATA)) { get_key_optional(node, key); - rc = mdbx_node_read(mc, node, data, - 
pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); + rc = node_read(mc, node, data, mc->mc_pg[mc->mc_top]); break; } } @@ -15111,18 +16933,17 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = mfunc(&mc->mc_xcursor->mx_cursor, data, NULL); break; case MDBX_LAST: - rc = mdbx_cursor_last(mc, key, data); + rc = cursor_last(mc, key, data); break; case MDBX_LAST_DUP: - mfunc = mdbx_cursor_last; + mfunc = cursor_last; goto move; case MDBX_SET_UPPERBOUND: /* mostly same as MDBX_SET_LOWERBOUND */ case MDBX_SET_LOWERBOUND: { if (unlikely(key == NULL || data == NULL)) return MDBX_EINVAL; MDBX_val save_data = *data; - struct cursor_set_result csr = - mdbx_cursor_set(mc, key, data, MDBX_SET_RANGE); + struct cursor_set_result csr = cursor_set(mc, key, data, MDBX_SET_RANGE); rc = csr.err; if (rc == MDBX_SUCCESS && csr.exact && mc->mc_xcursor) { mc->mc_flags &= ~C_DEL; @@ -15133,18 +16954,18 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, * returning MDBX_BAD_VALSIZE. 
*/ } else if (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { *data = save_data; - csr = mdbx_cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, - MDBX_SET_RANGE); + csr = + cursor_set(&mc->mc_xcursor->mx_cursor, data, NULL, MDBX_SET_RANGE); rc = csr.err; if (rc == MDBX_NOTFOUND) { - mdbx_cassert(mc, !csr.exact); - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + cASSERT(mc, !csr.exact); + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); } } else { int cmp = mc->mc_dbx->md_dcmp(&save_data, data); csr.exact = (cmp == 0); if (cmp > 0) - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT_NODUP); + rc = cursor_next(mc, key, data, MDBX_NEXT_NODUP); } } if (rc == MDBX_SUCCESS && !csr.exact) @@ -15156,12 +16977,12 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, rc = MDBX_SUCCESS; else if (rc == MDBX_SUCCESS) /* exactly match, going next */ - rc = mdbx_cursor_next(mc, key, data, MDBX_NEXT); + rc = cursor_next(mc, key, data, MDBX_NEXT); } break; } default: - mdbx_debug("unhandled/unimplemented cursor operation %u", op); + DEBUG("unhandled/unimplemented cursor operation %u", op); return MDBX_EINVAL; } @@ -15169,13 +16990,29 @@ int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, return rc; } +int mdbx_cursor_get(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, + MDBX_cursor_op op) { + if (unlikely(mc == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? 
MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + return cursor_get(mc, key, data, op); +} + static int cursor_first_batch(MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED) || mc->mc_top) { - int err = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); + int err = page_search(mc, NULL, MDBX_PS_FIRST); if (unlikely(err != MDBX_SUCCESS)) return err; } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); mc->mc_flags |= C_INITIALIZED; mc->mc_flags &= ~C_EOF; @@ -15189,25 +17026,30 @@ static int cursor_next_batch(MDBX_cursor *mc) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; if (unlikely(mc->mc_flags & C_EOF)) { - if ((unsigned)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp)) + if ((size_t)mc->mc_ki[mc->mc_top] + 1 >= page_numkeys(mp)) return MDBX_NOTFOUND; mc->mc_flags ^= C_EOF; } - int ki = mc->mc_ki[mc->mc_top]; + intptr_t ki = mc->mc_ki[mc->mc_top]; mc->mc_ki[mc->mc_top] = (indx_t)++ki; - const int numkeys = page_numkeys(mp); + const intptr_t numkeys = page_numkeys(mp); if (likely(ki >= numkeys)) { - mdbx_debug("%s", "=====> move to next sibling page"); + DEBUG("%s", "=====> move to next sibling page"); mc->mc_ki[mc->mc_top] = (indx_t)(numkeys - 1); - int err = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + int err = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(err != MDBX_SUCCESS)) { mc->mc_flags |= C_EOF; return err; } mp = mc->mc_pg[mc->mc_top]; - mdbx_debug("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, - mc->mc_ki[mc->mc_top]); + DEBUG("next page is %" PRIaPGNO ", key index %u", mp->mp_pgno, + mc->mc_ki[mc->mc_top]); + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } } return MDBX_SUCCESS; } @@ -15239,7 +17081,7 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val 
*pairs, rc = likely(mc->mc_flags & C_INITIALIZED) ? MDBX_SUCCESS : MDBX_ENODATA; break; default: - mdbx_debug("unhandled/unimplemented cursor operation %u", op); + DEBUG("unhandled/unimplemented cursor operation %u", op); rc = MDBX_EINVAL; break; } @@ -15249,15 +17091,20 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return rc; } - const MDBX_page *const page = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = page_numkeys(page); - unsigned i = mc->mc_ki[mc->mc_top], n = 0; + const MDBX_page *const mp = mc->mc_pg[mc->mc_top]; + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); + return MDBX_CORRUPTED; + } + const size_t nkeys = page_numkeys(mp); + size_t i = mc->mc_ki[mc->mc_top], n = 0; if (unlikely(i >= nkeys)) { - mdbx_cassert(mc, op == MDBX_GET_CURRENT); - mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + cASSERT(mc, op == MDBX_GET_CURRENT); + cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); *count = 0; if (mc->mc_flags & C_EOF) { - mdbx_cassert(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); + cASSERT(mc, mdbx_cursor_on_last(mc) == MDBX_RESULT_TRUE); return MDBX_ENODATA; } if (mdbx_cursor_on_last(mc) != MDBX_RESULT_TRUE) @@ -15266,15 +17113,14 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return MDBX_NOTFOUND; } - const txnid_t pp_txnid = pp_txnid4chk(page, mc->mc_txn); do { if (unlikely(n + 2 > limit)) { rc = MDBX_RESULT_TRUE; break; } - const MDBX_node *leaf = page_node(page, i); + const MDBX_node *leaf = page_node(mp, i); get_key(leaf, &pairs[n]); - rc = mdbx_node_read(mc, leaf, &pairs[n + 1], pp_txnid); + rc = node_read(mc, leaf, &pairs[n + 1], mp); if (unlikely(rc != MDBX_SUCCESS)) break; n += 2; @@ -15285,176 +17131,95 @@ int mdbx_cursor_get_batch(MDBX_cursor *mc, size_t *count, MDBX_val *pairs, return rc; } -static int mdbx_touch_dbi(MDBX_cursor 
*mc) { - mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); +static int touch_dbi(MDBX_cursor *mc) { + cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) == 0); *mc->mc_dbistate |= DBI_DIRTY; mc->mc_txn->mt_flags |= MDBX_TXN_DIRTY; if (mc->mc_dbi >= CORE_DBS) { - mdbx_cassert(mc, (mc->mc_flags & C_RECLAIMING) == 0); /* Touch DB record of named DB */ MDBX_cursor_couple cx; - int rc = mdbx_cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); + int rc = cursor_init(&cx.outer, mc->mc_txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; - rc = mdbx_page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); + rc = page_search(&cx.outer, &mc->mc_dbx->md_name, MDBX_PS_MODIFY); if (unlikely(rc != MDBX_SUCCESS)) return rc; } return MDBX_SUCCESS; } -/* Touch all the pages in the cursor stack. Set mc_top. - * Makes sure all the pages are writable, before attempting a write operation. - * [in] mc The cursor to operate on. */ -static int mdbx_cursor_touch(MDBX_cursor *mc) { - int rc = MDBX_SUCCESS; - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - rc = mdbx_touch_dbi(mc); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; +static __hot int cursor_touch(MDBX_cursor *const mc, const MDBX_val *key, + const MDBX_val *data) { + cASSERT(mc, (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) == 0); + cASSERT(mc, (mc->mc_flags & C_INITIALIZED) || mc->mc_snum == 0); + cASSERT(mc, cursor_is_tracked(mc)); + + if ((mc->mc_flags & C_SUB) == 0) { + MDBX_txn *const txn = mc->mc_txn; + txn_lru_turn(txn); + + if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { + int err = touch_dbi(mc); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + + /* Estimate how much space this operation will take: */ + /* 1) Max b-tree height, reasonable enough with including dups' sub-tree */ + size_t need = CURSOR_STACK + 3; + /* 2) GC/FreeDB for any payload */ + if (mc->mc_dbi > FREE_DBI) { + need += txn->mt_dbs[FREE_DBI].md_depth + (size_t)3; + /* 3) Named DBs also 
dirty the main DB */ + if (mc->mc_dbi > MAIN_DBI) + need += txn->mt_dbs[MAIN_DBI].md_depth + (size_t)3; + } +#if xMDBX_DEBUG_SPILLING != 2 + /* production mode */ + /* 4) Double the page chain estimation + * for extensively splitting, rebalance and merging */ + need += need; + /* 5) Factor the key+data which to be put in */ + need += bytes2pgno(txn->mt_env, node_size(key, data)) + (size_t)1; +#else + /* debug mode */ + (void)key; + (void)data; + txn->mt_env->debug_dirtied_est = ++need; + txn->mt_env->debug_dirtied_act = 0; +#endif /* xMDBX_DEBUG_SPILLING == 2 */ + + int err = txn_spill(txn, mc, need); + if (unlikely(err != MDBX_SUCCESS)) + return err; } + + int rc = MDBX_SUCCESS; if (likely(mc->mc_snum)) { mc->mc_top = 0; do { - rc = mdbx_page_touch(mc); - } while (!rc && ++(mc->mc_top) < mc->mc_snum); + rc = page_touch(mc); + if (unlikely(rc != MDBX_SUCCESS)) + break; + mc->mc_top += 1; + } while (mc->mc_top < mc->mc_snum); mc->mc_top = mc->mc_snum - 1; } return rc; } -int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, - unsigned flags) { - MDBX_env *env; - MDBX_page *sub_root = NULL; +static __hot int cursor_put_nochecklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + MDBX_page *sub_root = nullptr; MDBX_val xdata, *rdata, dkey, olddata; MDBX_db nested_dupdb; int err; DKBUF_DEBUG; - - if (unlikely(mc == NULL || key == NULL || data == NULL)) - return MDBX_EINVAL; - - if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) - return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL - : MDBX_EBADSIGN; - - int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) - return MDBX_BAD_DBI; - - mdbx_cassert(mc, cursor_is_tracked(mc)); - env = mc->mc_txn->mt_env; - - /* Check this first so counter will always be zero on any early failures. 
*/ - size_t mcount = 0, dcount = 0; - if (unlikely(flags & MDBX_MULTIPLE)) { - if (unlikely(flags & MDBX_RESERVE)) - return MDBX_EINVAL; - if (unlikely(!F_ISSET(mc->mc_db->md_flags, MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - dcount = data[1].iov_len; - if (unlikely(dcount < 2 || data->iov_len == 0)) - return MDBX_BAD_VALSIZE; - if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) - return MDBX_BAD_VALSIZE; - if (unlikely(dcount > MAX_MAPSIZE / 2 / - (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { - /* checking for multiplication overflow */ - if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) - return MDBX_TOO_LARGE; - } - data[1].iov_len = 0 /* reset done item counter */; - } - - if (flags & MDBX_RESERVE) { - if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | - MDBX_INTEGERDUP | MDBX_DUPFIXED))) - return MDBX_INCOMPATIBLE; - data->iov_base = nullptr; - } - - const unsigned nospill = flags & MDBX_NOSPILL; - flags -= nospill; - - if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS - : MDBX_BAD_TXN; - - uint64_t aligned_keybytes, aligned_databytes; - MDBX_val aligned_key, aligned_data; - if (likely((mc->mc_flags & C_SUB) == 0)) { - if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || - key->iov_len > mc->mc_dbx->md_klen_max)) { - mdbx_cassert(mc, !"Invalid key-size"); - return MDBX_BAD_VALSIZE; - } - if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || - data->iov_len > mc->mc_dbx->md_vlen_max)) { - mdbx_cassert(mc, !"Invalid data-size"); - return MDBX_BAD_VALSIZE; - } - - if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { - switch (key->iov_len) { - default: - mdbx_cassert(mc, !"key-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); - key = &aligned_key; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)key->iov_base)) { - /* copy instead of return error to avoid break compatibility */ - aligned_key.iov_base = - memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); - key = &aligned_key; - } - break; - } - } - if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { - switch (data->iov_len) { - default: - mdbx_cassert(mc, !"data-size is invalid for MDBX_INTEGERKEY"); - return MDBX_BAD_VALSIZE; - case 4: - if (unlikely(3 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - aligned_data.iov_len = 4); - data = &aligned_data; - } - break; - case 8: - if (unlikely(7 & (uintptr_t)data->iov_base)) { - if (unlikely(flags & MDBX_MULTIPLE)) - return MDBX_BAD_VALSIZE; - /* copy instead of return error to avoid break compatibility */ - aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, - aligned_data.iov_len = 8); - 
data = &aligned_data; - } - break; - } - } - } - - mdbx_debug( - "==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, - DDBI(mc), DKEY_DEBUG(key), key->iov_len, - DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); + MDBX_env *const env = mc->mc_txn->mt_env; + DEBUG("==> put db %d key [%s], size %" PRIuPTR ", data [%s] size %" PRIuPTR, + DDBI(mc), DKEY_DEBUG(key), key->iov_len, + DVAL_DEBUG((flags & MDBX_RESERVE) ? nullptr : data), data->iov_len); int dupdata_flag = 0; if ((flags & MDBX_CURRENT) != 0 && (mc->mc_flags & C_SUB) == 0) { @@ -15463,40 +17228,39 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, /* Опция MDBX_CURRENT означает, что запрошено обновление текущей записи, * на которой сейчас стоит курсор. Проверяем что переданный ключ совпадает * со значением в текущей позиции курсора. - * Здесь проще вызвать mdbx_cursor_get(), так как для обслуживания таблиц + * Здесь проще вызвать cursor_get(), так как для обслуживания таблиц * с MDBX_DUPSORT также требуется текущий размер данных. 
*/ MDBX_val current_key, current_data; - rc = mdbx_cursor_get(mc, ¤t_key, ¤t_data, MDBX_GET_CURRENT); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_get(mc, ¤t_key, ¤t_data, MDBX_GET_CURRENT); + if (unlikely(err != MDBX_SUCCESS)) + return err; if (mc->mc_dbx->md_cmp(key, ¤t_key) != 0) return MDBX_EKEYMISMATCH; if (unlikely((flags & MDBX_MULTIPLE))) goto drop_current; - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, - mc->mc_xcursor != NULL && - (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); + if (node_flags(node) & F_DUPDATA) { + cASSERT(mc, mc->mc_xcursor != NULL && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); /* Если за ключом более одного значения, либо если размер данных * отличается, то вместо обновления требуется удаление и * последующая вставка. */ if (mc->mc_xcursor->mx_db.md_entries > 1 || current_data.iov_len != data->iov_len) { drop_current: - rc = mdbx_cursor_del(mc, flags & MDBX_ALLDUPS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_del(mc, flags & MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } } else if (unlikely(node_size(key, data) > env->me_leaf_nodemax)) { - rc = mdbx_cursor_del(mc, 0); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_del(mc, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; flags -= MDBX_CURRENT; goto skip_check_samedata; } @@ -15507,6 +17271,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, skip_check_samedata:; } + int rc = MDBX_SUCCESS; if (mc->mc_db->md_root == P_INVALID) { /* new database, cursor has nothing to point to */ mc->mc_snum = 0; @@ -15516,101 +17281,94 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else if ((flags & 
MDBX_CURRENT) == 0) { bool exact = false; if ((flags & MDBX_APPEND) && mc->mc_db->md_entries > 0) { - rc = mdbx_cursor_last(mc, &dkey, &olddata); + rc = cursor_last(mc, &dkey, &olddata); if (likely(rc == MDBX_SUCCESS)) { - rc = mc->mc_dbx->md_cmp(key, &dkey); - if (likely(rc > 0)) { + const int cmp = mc->mc_dbx->md_cmp(key, &dkey); + if (likely(cmp > 0)) { mc->mc_ki[mc->mc_top]++; /* step forward for appending */ rc = MDBX_NOTFOUND; + } else if (unlikely(cmp != 0)) { + /* new-key < last-key */ + return MDBX_EKEYMISMATCH; } else { - if (unlikely(rc != MDBX_SUCCESS)) - /* new-key < last-key - * or new-key == last-key without MDBX_APPENDDUP */ - return MDBX_EKEYMISMATCH; + rc = MDBX_SUCCESS; exact = true; } } } else { struct cursor_set_result csr = /* olddata may not be updated in case LEAF2-page of dupfixed-subDB */ - mdbx_cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET); + cursor_set(mc, (MDBX_val *)key, &olddata, MDBX_SET); rc = csr.err; exact = csr.exact; } if (likely(rc == MDBX_SUCCESS)) { if (exact) { if (unlikely(flags & MDBX_NOOVERWRITE)) { - mdbx_debug("duplicate key [%s]", DKEY_DEBUG(key)); + DEBUG("duplicate key [%s]", DKEY_DEBUG(key)); *data = olddata; return MDBX_KEYEXIST; } if (unlikely(mc->mc_flags & C_SUB)) { /* nested subtree of DUPSORT-database with the same key, * nothing to update */ - mdbx_assert(env, data->iov_len == 0 && - (olddata.iov_len == 0 || - /* olddata may not be updated in case LEAF2-page - of dupfixed-subDB */ - (mc->mc_db->md_flags & MDBX_DUPFIXED))); + eASSERT(env, data->iov_len == 0 && + (olddata.iov_len == 0 || + /* olddata may not be updated in case LEAF2-page + of dupfixed-subDB */ + (mc->mc_db->md_flags & MDBX_DUPFIXED))); return MDBX_SUCCESS; } if (unlikely(flags & MDBX_ALLDUPS) && mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)) { - rc = mdbx_cursor_del(mc, MDBX_ALLDUPS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = cursor_del(mc, MDBX_ALLDUPS); + if (unlikely(err != MDBX_SUCCESS)) + 
return err; flags -= MDBX_ALLDUPS; rc = mc->mc_snum ? MDBX_NOTFOUND : MDBX_NO_ROOT; exact = false; - } else /* checking for early exit without dirtying pages */ - if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE)) && - unlikely(mc->mc_dbx->md_dcmp(data, &olddata) == 0)) { - if (!mc->mc_xcursor) - /* the same data, nothing to update */ - return MDBX_SUCCESS; - if (flags & MDBX_NODUPDATA) - return MDBX_KEYEXIST; - if (flags & MDBX_APPENDDUP) - return MDBX_EKEYMISMATCH; - if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) - /* data is match exactly byte-to-byte, nothing to update */ - return MDBX_SUCCESS; - else { - /* The data has differences, but the user-provided comparator - * considers them equal. So continue update since called without. - * Continue to update since was called without MDBX_NODUPDATA. */ + } else if (!(flags & (MDBX_RESERVE | MDBX_MULTIPLE))) { + /* checking for early exit without dirtying pages */ + if (unlikely(eq_fast(data, &olddata))) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); + if (mc->mc_xcursor) { + if (flags & MDBX_NODUPDATA) + return MDBX_KEYEXIST; + if (flags & MDBX_APPENDDUP) + return MDBX_EKEYMISMATCH; } + /* the same data, nothing to update */ + return MDBX_SUCCESS; } + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) != 0); + } } } else if (unlikely(rc != MDBX_NOTFOUND)) return rc; } mc->mc_flags &= ~C_DEL; + rdata = data; + size_t mcount = 0, dcount = 0; + if (unlikely(flags & MDBX_MULTIPLE)) { + dcount = data[1].iov_len; + data[1].iov_len = 0 /* reset done item counter */; + rdata = &xdata; + xdata.iov_len = data->iov_len * dcount; + } /* Cursor is positioned, check for room in the dirty list */ - if (!nospill) { - rdata = data; - if (unlikely(flags & MDBX_MULTIPLE)) { - rdata = &xdata; - xdata.iov_len = data->iov_len * dcount; - } - if (unlikely(err = mdbx_cursor_spill(mc, key, rdata))) - return err; - } + err = cursor_touch(mc, key, rdata); + if (unlikely(err)) + return err; if (unlikely(rc == MDBX_NO_ROOT)) 
{ /* new database, write a root leaf page */ - mdbx_debug("%s", "allocating new root leaf page"); - if (unlikely((*mc->mc_dbistate & DBI_DIRTY) == 0)) { - err = mdbx_touch_dbi(mc); - if (unlikely(err != MDBX_SUCCESS)) - return err; - } - struct page_result npr = mdbx_page_new(mc, P_LEAF, 1); + DEBUG("%s", "allocating new root leaf page"); + pgr_t npr = page_new(mc, P_LEAF); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; - npr.err = mdbx_cursor_push(mc, npr.page); + npr.err = cursor_push(mc, npr.page); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; mc->mc_db->md_root = npr.page->mp_pgno; @@ -15633,11 +17391,6 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, if ((mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_DUPFIXED)) == MDBX_DUPFIXED) npr.page->mp_flags |= P_LEAF2; mc->mc_flags |= C_INITIALIZED; - } else { - /* make sure all cursor pages are writable */ - err = mdbx_cursor_touch(mc); - if (unlikely(err)) - return err; } bool insert_key, insert_data, do_sub = false; @@ -15647,7 +17400,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, fp->mp_txnid = mc->mc_txn->mt_front; if (insert_key) { /* The key does not exist */ - mdbx_debug("inserting key at index %i", mc->mc_ki[mc->mc_top]); + DEBUG("inserting key at index %i", mc->mc_ki[mc->mc_top]); if ((mc->mc_db->md_flags & MDBX_DUPSORT) && node_size(key, data) > env->me_leaf_nodemax) { /* Too big for a node, insert in sub-DB. 
Set up an empty @@ -15661,17 +17414,17 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } else { /* there's only a key anyway, so this is a no-op */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - char *ptr; - unsigned ksize = mc->mc_db->md_xsize; + size_t ksize = mc->mc_db->md_xsize; if (unlikely(key->iov_len != ksize)) return MDBX_BAD_VALSIZE; - ptr = page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); + void *ptr = + page_leaf2key(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top], ksize); memcpy(ptr, key->iov_base, ksize); fix_parent: /* if overwriting slot 0 of leaf, need to * update branch key if there is a parent page */ if (mc->mc_top && !mc->mc_ki[mc->mc_top]) { - unsigned dtop = 1; + size_t dtop = 1; mc->mc_top--; /* slot 0 is always an empty key, find real slot */ while (mc->mc_top && !mc->mc_ki[mc->mc_top]) { @@ -15680,15 +17433,15 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } err = MDBX_SUCCESS; if (mc->mc_ki[mc->mc_top]) - err = mdbx_update_key(mc, key); - mdbx_cassert(mc, mc->mc_top + dtop < UINT16_MAX); - mc->mc_top += (uint16_t)dtop; + err = update_key(mc, key); + cASSERT(mc, mc->mc_top + dtop < UINT16_MAX); + mc->mc_top += (uint8_t)dtop; if (unlikely(err != MDBX_SUCCESS)) return err; } - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -15696,133 +17449,123 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, } more:; - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); /* Large/Overflow page overwrites need special handling */ - if (unlikely(F_ISSET(node_flags(node), F_BIGDATA))) { - int dpages = (node_size(key, data) > env->me_leaf_nodemax) - ? 
number_of_ovpages(env, data->iov_len) - : 0; + if (unlikely(node_flags(node) & F_BIGDATA)) { + const size_t dpages = (node_size(key, data) > env->me_leaf_nodemax) + ? number_of_ovpages(env, data->iov_len) + : 0; const pgno_t pgno = node_largedata_pgno(node); - struct page_result pgr = mdbx_page_get_ex( - mc, pgno, pp_txnid4chk(mc->mc_pg[mc->mc_top], mc->mc_txn)); - if (unlikely(pgr.err != MDBX_SUCCESS)) - return pgr.err; - if (unlikely(!IS_OVERFLOW(pgr.page))) - return MDBX_CORRUPTED; + pgr_t lp = page_get_large(mc, pgno, mc->mc_pg[mc->mc_top]->mp_txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); /* Is the ov page from this txn (or a parent) and big enough? */ - int ovpages = pgr.page->mp_pages; - if (!IS_FROZEN(mc->mc_txn, pgr.page) && - (unlikely(mc->mc_flags & C_GCFREEZE) - ? (ovpages >= dpages) - : (ovpages == - /* LY: add configurable threshold to keep reserve space */ - dpages))) { + const size_t ovpages = lp.page->mp_pages; + const size_t extra_threshold = + (mc->mc_dbi == FREE_DBI) + ? 1 + : /* LY: add configurable threshold to keep reserve space */ 0; + if (!IS_FROZEN(mc->mc_txn, lp.page) && ovpages >= dpages && + ovpages <= dpages + extra_threshold) { /* yes, overwrite it. 
*/ - if (!IS_MODIFIABLE(mc->mc_txn, pgr.page)) { - if (IS_SPILLED(mc->mc_txn, pgr.page)) { - pgr = /* TODO: avoid search and get txn & spill-index from + if (!IS_MODIFIABLE(mc->mc_txn, lp.page)) { + if (IS_SPILLED(mc->mc_txn, lp.page)) { + lp = /* TODO: avoid search and get txn & spill-index from page_result */ - mdbx_page_unspill(mc->mc_txn, pgr.page); - if (unlikely(pgr.err)) - return pgr.err; + page_unspill(mc->mc_txn, lp.page); + if (unlikely(lp.err)) + return lp.err; } else { if (unlikely(!mc->mc_txn->mt_parent)) { - mdbx_error( - "Unexpected not frozen/modifiable/spilled but shadowed %s " - "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," - " without parent transaction, current txn %" PRIaTXN - " front %" PRIaTXN, - "overflow/large", pgno, pgr.page->mp_txnid, - mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); + ERROR("Unexpected not frozen/modifiable/spilled but shadowed %s " + "page %" PRIaPGNO " mod-txnid %" PRIaTXN "," + " without parent transaction, current txn %" PRIaTXN + " front %" PRIaTXN, + "overflow/large", pgno, lp.page->mp_txnid, + mc->mc_txn->mt_txnid, mc->mc_txn->mt_front); return MDBX_PROBLEM; } /* It is writable only in a parent txn */ - MDBX_page *np = mdbx_page_malloc(mc->mc_txn, ovpages); + MDBX_page *np = page_malloc(mc->mc_txn, ovpages); if (unlikely(!np)) return MDBX_ENOMEM; - memcpy(np, pgr.page, PAGEHDRSZ); /* Copy header of page */ - err = mdbx_page_dirty(mc->mc_txn, pgr.page = np, ovpages); + memcpy(np, lp.page, PAGEHDRSZ); /* Copy header of page */ + err = page_dirty(mc->mc_txn, lp.page = np, ovpages); if (unlikely(err != MDBX_SUCCESS)) return err; #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.clone.weak += ovpages; #endif /* MDBX_ENABLE_PGOP_STAT */ - mdbx_cassert(mc, mdbx_dirtylist_check(mc->mc_txn)); + cASSERT(mc, dirtylist_check(mc->mc_txn)); } } node_set_ds(node, data->iov_len); - if (F_ISSET(flags, MDBX_RESERVE)) - data->iov_base = page_data(pgr.page); + if (flags & MDBX_RESERVE) + data->iov_base = 
page_data(lp.page); else - memcpy(page_data(pgr.page), data->iov_base, data->iov_len); + memcpy(page_data(lp.page), data->iov_base, data->iov_len); - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } return MDBX_SUCCESS; } - if ((err = mdbx_page_retire(mc, pgr.page)) != MDBX_SUCCESS) + if ((err = page_retire(mc, lp.page)) != MDBX_SUCCESS) return err; } else { olddata.iov_len = node_ds(node); olddata.iov_base = node_data(node); - mdbx_cassert(mc, (char *)olddata.iov_base + olddata.iov_len <= - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, ptr_disp(olddata.iov_base, olddata.iov_len) <= + ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); /* DB has dups? */ - if (F_ISSET(mc->mc_db->md_flags, MDBX_DUPSORT)) { + if (mc->mc_db->md_flags & MDBX_DUPSORT) { /* Prepare (sub-)page/sub-DB to accept the new item, if needed. * fp: old sub-page or a header faking it. * mp: new (sub-)page. offset: growth in page size. * xdata: node data with new page or DB. */ - unsigned i; + size_t i; size_t offset = 0; MDBX_page *mp = fp = xdata.iov_base = env->me_pbuf; mp->mp_pgno = mc->mc_pg[mc->mc_top]->mp_pgno; /* Was a single item before, must convert now */ - if (!F_ISSET(node_flags(node), F_DUPDATA)) { - + if (!(node_flags(node) & F_DUPDATA)) { /* does data match? 
*/ - const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); - if ((flags & MDBX_APPENDDUP) && unlikely(cmp <= 0)) - return MDBX_EKEYMISMATCH; - if (cmp == 0) { + if (flags & MDBX_APPENDDUP) { + const int cmp = mc->mc_dbx->md_dcmp(data, &olddata); + cASSERT(mc, cmp != 0 || eq_fast(data, &olddata)); + if (unlikely(cmp <= 0)) + return MDBX_EKEYMISMATCH; + } else if (eq_fast(data, &olddata)) { + cASSERT(mc, mc->mc_dbx->md_dcmp(data, &olddata) == 0); if (flags & MDBX_NODUPDATA) return MDBX_KEYEXIST; - if (likely(unsure_equal(mc->mc_dbx->md_dcmp, data, &olddata))) { - /* data is match exactly byte-to-byte, nothing to update */ - if (unlikely(flags & MDBX_MULTIPLE)) { - rc = MDBX_SUCCESS; - goto continue_multiple; - } - return MDBX_SUCCESS; - } else { - /* The data has differences, but the user-provided comparator - * considers them equal. So continue update since called without. - * Continue to update since was called without MDBX_NODUPDATA. */ - } - mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); - goto current; + /* data is match exactly byte-to-byte, nothing to update */ + rc = MDBX_SUCCESS; + if (likely((flags & MDBX_MULTIPLE) == 0)) + return rc; + goto continue_multiple; } /* Just overwrite the current item */ if (flags & MDBX_CURRENT) { - mdbx_cassert(mc, node_size(key, data) <= env->me_leaf_nodemax); + cASSERT(mc, node_size(key, data) <= env->me_leaf_nodemax); goto current; } @@ -15839,11 +17582,11 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, fp->mp_flags |= P_LEAF2; fp->mp_leaf2_ksize = (uint16_t)data->iov_len; xdata.iov_len += 2 * data->iov_len; /* leave space for 2 more */ - mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + cASSERT(mc, xdata.iov_len <= env->me_psize); } else { xdata.iov_len += 2 * (sizeof(indx_t) + NODESIZE) + (dkey.iov_len & 1) + (data->iov_len & 1); - mdbx_cassert(mc, xdata.iov_len <= env->me_psize); + cASSERT(mc, xdata.iov_len <= env->me_psize); } fp->mp_upper = (uint16_t)(xdata.iov_len - 
PAGEHDRSZ); olddata.iov_len = xdata.iov_len; /* pretend olddata is fp */ @@ -15897,12 +17640,12 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, nested_dupdb.md_entries = page_numkeys(fp); xdata.iov_len = sizeof(nested_dupdb); xdata.iov_base = &nested_dupdb; - const struct page_result par = mdbx_page_alloc(mc, 1, MDBX_ALLOC_ALL); + const pgr_t par = page_alloc(mc); mp = par.page; if (unlikely(par.err != MDBX_SUCCESS)) return par.err; mc->mc_db->md_leaf_pages += 1; - mdbx_cassert(mc, env->me_psize > olddata.iov_len); + cASSERT(mc, env->me_psize > olddata.iov_len); offset = env->me_psize - (unsigned)olddata.iov_len; flags |= F_DUPDATA | F_SUBDATA; nested_dupdb.md_root = mp->mp_pgno; @@ -15915,19 +17658,19 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, mp->mp_txnid = mc->mc_txn->mt_front; mp->mp_leaf2_ksize = fp->mp_leaf2_ksize; mp->mp_lower = fp->mp_lower; - mdbx_cassert(mc, fp->mp_upper + offset <= UINT16_MAX); + cASSERT(mc, fp->mp_upper + offset <= UINT16_MAX); mp->mp_upper = (indx_t)(fp->mp_upper + offset); if (unlikely(fp_flags & P_LEAF2)) { memcpy(page_data(mp), page_data(fp), page_numkeys(fp) * fp->mp_leaf2_ksize); } else { - memcpy((char *)mp + mp->mp_upper + PAGEHDRSZ, - (char *)fp + fp->mp_upper + PAGEHDRSZ, + memcpy(ptr_disp(mp, mp->mp_upper + PAGEHDRSZ), + ptr_disp(fp, fp->mp_upper + PAGEHDRSZ), olddata.iov_len - fp->mp_upper - PAGEHDRSZ); - memcpy((char *)(&mp->mp_ptrs), (char *)(&fp->mp_ptrs), + memcpy(mp->mp_ptrs, fp->mp_ptrs, page_numkeys(fp) * sizeof(mp->mp_ptrs[0])); for (i = 0; i < page_numkeys(fp); i++) { - mdbx_cassert(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); + cASSERT(mc, mp->mp_ptrs[i] + offset <= UINT16_MAX); mp->mp_ptrs[i] += (indx_t)offset; } } @@ -15937,7 +17680,7 @@ int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, flags |= F_DUPDATA; do_sub = true; if (!insert_key) - mdbx_node_del(mc, 0); + node_del(mc, 0); goto new_sub; } @@ -15947,62 +17690,61 @@ int 
mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, current: if (data->iov_len == olddata.iov_len) { - mdbx_cassert(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); + cASSERT(mc, EVEN(key->iov_len) == EVEN(node_ks(node))); /* same size, just replace it. Note that we could * also reuse this node if the new data is smaller, * but instead we opt to shrink the node in that case. */ - if (F_ISSET(flags, MDBX_RESERVE)) + if (flags & MDBX_RESERVE) data->iov_base = olddata.iov_base; else if (!(mc->mc_flags & C_SUB)) memcpy(olddata.iov_base, data->iov_base, data->iov_len); else { - mdbx_cassert(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); - mdbx_cassert(mc, PAGETYPE(mc->mc_pg[mc->mc_top]) == P_LEAF); - mdbx_cassert(mc, node_ds(node) == 0); - mdbx_cassert(mc, node_flags(node) == 0); - mdbx_cassert(mc, key->iov_len < UINT16_MAX); + cASSERT(mc, page_numkeys(mc->mc_pg[mc->mc_top]) == 1); + cASSERT(mc, PAGETYPE_COMPAT(mc->mc_pg[mc->mc_top]) == P_LEAF); + cASSERT(mc, node_ds(node) == 0); + cASSERT(mc, node_flags(node) == 0); + cASSERT(mc, key->iov_len < UINT16_MAX); node_set_ks(node, key->iov_len); memcpy(node_key(node), key->iov_base, key->iov_len); - mdbx_cassert(mc, (char *)node_key(node) + node_ds(node) < - (char *)(mc->mc_pg[mc->mc_top]) + env->me_psize); + cASSERT(mc, ptr_disp(node_key(node), node_ds(node)) < + ptr_disp(mc->mc_pg[mc->mc_top], env->me_psize)); goto fix_parent; } - if (mdbx_audit_enabled()) { - err = mdbx_cursor_check(mc, 0); + if (AUDIT_ENABLED()) { + err = cursor_check(mc); if (unlikely(err != MDBX_SUCCESS)) return err; } return MDBX_SUCCESS; } } - mdbx_node_del(mc, 0); + node_del(mc, 0); } rdata = data; new_sub:; - unsigned nflags = flags & NODE_ADD_FLAGS; + const unsigned naf = flags & NODE_ADD_FLAGS; size_t nsize = IS_LEAF2(mc->mc_pg[mc->mc_top]) ? 
key->iov_len : leaf_size(env, key, rdata); if (page_room(mc->mc_pg[mc->mc_top]) < nsize) { - if (!insert_key) - nflags |= MDBX_SPLIT_REPLACE; - rc = mdbx_page_split(mc, key, rdata, P_INVALID, nflags); - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + rc = page_split(mc, key, rdata, P_INVALID, + insert_key ? naf : naf | MDBX_SPLIT_REPLACE); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = insert_key ? cursor_check(mc) : cursor_check_updating(mc); } else { /* There is room already in this leaf page. */ if (IS_LEAF2(mc->mc_pg[mc->mc_top])) { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0 && - rdata->iov_len == 0); - rc = mdbx_node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); + cASSERT(mc, !(naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) && + rdata->iov_len == 0); + rc = node_add_leaf2(mc, mc->mc_ki[mc->mc_top], key); } else - rc = mdbx_node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, nflags); + rc = node_add_leaf(mc, mc->mc_ki[mc->mc_top], key, rdata, naf); if (likely(rc == 0)) { /* Adjust other cursors pointing to mp */ const MDBX_dbi dbi = mc->mc_dbi; - const unsigned i = mc->mc_top; + const size_t i = mc->mc_top; MDBX_page *const mp = mc->mc_pg[i]; for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -16034,12 +17776,11 @@ new_sub:; STATIC_ASSERT( (MDBX_NODUPDATA >> SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE) == MDBX_NOOVERWRITE); - xflags = MDBX_CURRENT | MDBX_NOSPILL | - ((flags & MDBX_NODUPDATA) >> - SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); + xflags = MDBX_CURRENT | ((flags & MDBX_NODUPDATA) >> + SHIFT_MDBX_NODUPDATA_TO_MDBX_NOOVERWRITE); if ((flags & MDBX_CURRENT) == 0) { xflags -= MDBX_CURRENT; - err = mdbx_xcursor_init1(mc, node, mc->mc_pg[mc->mc_top]); + err = cursor_xinit1(mc, node, mc->mc_pg[mc->mc_top]); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -16047,7 +17788,8 @@ new_sub:; mc->mc_xcursor->mx_cursor.mc_pg[0] = sub_root; /* converted, write the original data 
first */ if (dupdata_flag) { - rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, xflags); + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, &dkey, &xdata, + xflags); if (unlikely(rc)) goto bad_sub; /* we've done our job */ @@ -16057,9 +17799,9 @@ new_sub:; /* Adjust other cursors pointing to mp */ MDBX_cursor *m2; MDBX_xcursor *mx = mc->mc_xcursor; - unsigned i = mc->mc_top; + size_t i = mc->mc_top; MDBX_page *mp = mc->mc_pg[i]; - const int nkeys = page_numkeys(mp); + const intptr_t nkeys = page_numkeys(mp); for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) @@ -16068,7 +17810,7 @@ new_sub:; continue; if (m2->mc_pg[i] == mp) { if (m2->mc_ki[i] == mc->mc_ki[i]) { - err = mdbx_xcursor_init2(m2, mx, dupdata_flag); + err = cursor_xinit2(m2, mx, dupdata_flag); if (unlikely(err != MDBX_SUCCESS)) return err; } else if (!insert_key && m2->mc_ki[i] < nkeys) { @@ -16077,13 +17819,14 @@ new_sub:; } } } - mdbx_cassert(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); + cASSERT(mc, mc->mc_xcursor->mx_db.md_entries < PTRDIFF_MAX); ecount = (size_t)mc->mc_xcursor->mx_db.md_entries; #define SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND 1 STATIC_ASSERT((MDBX_APPENDDUP >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND) == MDBX_APPEND); xflags |= (flags & MDBX_APPENDDUP) >> SHIFT_MDBX_APPENDDUP_TO_MDBX_APPEND; - rc = mdbx_cursor_put(&mc->mc_xcursor->mx_cursor, data, &xdata, xflags); + rc = cursor_put_nochecklen(&mc->mc_xcursor->mx_cursor, data, &xdata, + xflags); if (flags & F_SUBDATA) { void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; @@ -16109,19 +17852,19 @@ new_sub:; /* let caller know how many succeeded, if any */ data[1].iov_len = mcount; if (mcount < dcount) { - data[0].iov_base = (char *)data[0].iov_base + data[0].iov_len; + data[0].iov_base = ptr_disp(data[0].iov_base, data[0].iov_len); insert_key = insert_data = false; goto more; } } } - if (rc == MDBX_SUCCESS && 
mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); + if (rc == MDBX_SUCCESS && AUDIT_ENABLED()) + rc = cursor_check(mc); return rc; bad_sub: if (unlikely(rc == MDBX_KEYEXIST)) { /* should not happen, we deleted that item */ - mdbx_error("Unexpected %i error while put to nested dupsort's hive", rc); + ERROR("Unexpected %i error while put to nested dupsort's hive", rc); rc = MDBX_PROBLEM; } } @@ -16129,6 +17872,126 @@ new_sub:; return rc; } +static __hot int cursor_put_checklen(MDBX_cursor *mc, const MDBX_val *key, + MDBX_val *data, unsigned flags) { + cASSERT(mc, (mc->mc_flags & C_SUB) == 0); + uint64_t aligned_keybytes, aligned_databytes; + MDBX_val aligned_key, aligned_data; + if (unlikely(key->iov_len < mc->mc_dbx->md_klen_min || + key->iov_len > mc->mc_dbx->md_klen_max)) { + cASSERT(mc, !"Invalid key-size"); + return MDBX_BAD_VALSIZE; + } + if (unlikely(data->iov_len < mc->mc_dbx->md_vlen_min || + data->iov_len > mc->mc_dbx->md_vlen_max)) { + cASSERT(mc, !"Invalid data-size"); + return MDBX_BAD_VALSIZE; + } + + if (mc->mc_db->md_flags & MDBX_INTEGERKEY) { + switch (key->iov_len) { + default: + cASSERT(mc, !"key-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + case 4: + if (unlikely(3 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 4); + key = &aligned_key; + } + break; + case 8: + if (unlikely(7 & (uintptr_t)key->iov_base)) { + /* copy instead of return error to avoid break compatibility */ + aligned_key.iov_base = + memcpy(&aligned_keybytes, key->iov_base, aligned_key.iov_len = 8); + key = &aligned_key; + } + break; + } + } + if (mc->mc_db->md_flags & MDBX_INTEGERDUP) { + switch (data->iov_len) { + default: + cASSERT(mc, !"data-size is invalid for MDBX_INTEGERKEY"); + return MDBX_BAD_VALSIZE; + case 4: + if (unlikely(3 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return 
MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, + aligned_data.iov_len = 4); + data = &aligned_data; + } + break; + case 8: + if (unlikely(7 & (uintptr_t)data->iov_base)) { + if (unlikely(flags & MDBX_MULTIPLE)) + return MDBX_BAD_VALSIZE; + /* copy instead of return error to avoid break compatibility */ + aligned_data.iov_base = memcpy(&aligned_databytes, data->iov_base, + aligned_data.iov_len = 8); + data = &aligned_data; + } + break; + } + } + return cursor_put_nochecklen(mc, key, data, flags); +} + +int mdbx_cursor_put(MDBX_cursor *mc, const MDBX_val *key, MDBX_val *data, + unsigned flags) { + if (unlikely(mc == NULL || key == NULL || data == NULL)) + return MDBX_EINVAL; + + if (unlikely(mc->mc_signature != MDBX_MC_LIVE)) + return (mc->mc_signature == MDBX_MC_READY4CLOSE) ? MDBX_EINVAL + : MDBX_EBADSIGN; + + int rc = check_txn_rw(mc->mc_txn, MDBX_TXN_BLOCKED); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) + return MDBX_BAD_DBI; + + cASSERT(mc, cursor_is_tracked(mc)); + + /* Check this first so counter will always be zero on any early failures. 
*/ + if (unlikely(flags & MDBX_MULTIPLE)) { + if (unlikely(flags & MDBX_RESERVE)) + return MDBX_EINVAL; + if (unlikely(!(mc->mc_db->md_flags & MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + const size_t dcount = data[1].iov_len; + if (unlikely(dcount < 2 || data->iov_len == 0)) + return MDBX_BAD_VALSIZE; + if (unlikely(mc->mc_db->md_xsize != data->iov_len) && mc->mc_db->md_xsize) + return MDBX_BAD_VALSIZE; + if (unlikely(dcount > MAX_MAPSIZE / 2 / + (BRANCH_NODE_MAX(MAX_PAGESIZE) - NODESIZE))) { + /* checking for multiplication overflow */ + if (unlikely(dcount > MAX_MAPSIZE / 2 / data->iov_len)) + return MDBX_TOO_LARGE; + } + } + + if (flags & MDBX_RESERVE) { + if (unlikely(mc->mc_db->md_flags & (MDBX_DUPSORT | MDBX_REVERSEDUP | + MDBX_INTEGERDUP | MDBX_DUPFIXED))) + return MDBX_INCOMPATIBLE; + data->iov_base = nullptr; + } + + if (unlikely(mc->mc_txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) + return (mc->mc_txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS + : MDBX_BAD_TXN; + + return cursor_put_checklen(mc, key, data, flags); +} + int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(!mc)) return MDBX_EINVAL; @@ -16141,7 +18004,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (unlikely(TXN_DBI_CHANGED(mc->mc_txn, mc->mc_dbi))) + if (unlikely(dbi_changed(mc->mc_txn, mc->mc_dbi))) return MDBX_BAD_DBI; if (unlikely(!(mc->mc_flags & C_INITIALIZED))) @@ -16150,53 +18013,52 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (unlikely(mc->mc_ki[mc->mc_top] >= page_numkeys(mc->mc_pg[mc->mc_top]))) return MDBX_NOTFOUND; - if (likely((flags & MDBX_NOSPILL) == 0) && - unlikely(rc = mdbx_cursor_spill(mc, NULL, NULL))) - return rc; + return cursor_del(mc, flags); +} - rc = mdbx_cursor_touch(mc); +static __hot int cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { + cASSERT(mc, mc->mc_flags & C_INITIALIZED); + cASSERT(mc, mc->mc_ki[mc->mc_top] < 
page_numkeys(mc->mc_pg[mc->mc_top])); + + int rc = cursor_touch(mc, nullptr, nullptr); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_page *mp = mc->mc_pg[mc->mc_top]; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(!IS_LEAF(mp))) + if (!MDBX_DISABLE_VALIDATION && unlikely(!CHECK_LEAF_TYPE(mc, mp))) { + ERROR("unexpected leaf-page #%" PRIaPGNO " type 0x%x seen by cursor", + mp->mp_pgno, mp->mp_flags); return MDBX_CORRUPTED; - if (IS_LEAF2(mp)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely((mc->mc_flags & C_SUB) == 0)) { - mdbx_error("unexpected LEAF2-page %" PRIaPGNO "for non-dupsort cursor", - mp->mp_pgno); - return MDBX_CORRUPTED; - } - goto del_key; } + if (IS_LEAF2(mp)) + goto del_key; MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { if (flags & (MDBX_ALLDUPS | /* for compatibility */ MDBX_NODUPDATA)) { - /* mdbx_cursor_del0() will subtract the final entry */ + /* will subtract the final entry later */ mc->mc_db->md_entries -= mc->mc_xcursor->mx_db.md_entries - 1; mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; } else { - if (!F_ISSET(node_flags(node), F_SUBDATA)) + if (!(node_flags(node) & F_SUBDATA)) mc->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - rc = mdbx_cursor_del(&mc->mc_xcursor->mx_cursor, MDBX_NOSPILL); + rc = cursor_del(&mc->mc_xcursor->mx_cursor, 0); if (unlikely(rc)) return rc; /* If sub-DB still has entries, we're done */ if (mc->mc_xcursor->mx_db.md_entries) { if (node_flags(node) & F_SUBDATA) { /* update subDB info */ - void *db = node_data(node); mc->mc_xcursor->mx_db.md_mod_txnid = mc->mc_txn->mt_txnid; - memcpy(db, &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); + memcpy(node_data(node), &mc->mc_xcursor->mx_db, sizeof(MDBX_db)); } else { - MDBX_cursor *m2; /* shrink fake page */ - mdbx_node_shrink(mp, mc->mc_ki[mc->mc_top]); + node_shrink(mp, mc->mc_ki[mc->mc_top]); node = page_node(mp, mc->mc_ki[mc->mc_top]); mc->mc_xcursor->mx_cursor.mc_pg[0] = 
node_data(node); /* fix other sub-DB cursors pointed at fake pages on this page */ - for (m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; + m2 = m2->mc_next) { if (m2 == mc || m2->mc_snum < mc->mc_snum) continue; if (!(m2->mc_flags & C_INITIALIZED)) @@ -16215,8 +18077,8 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { } } mc->mc_db->md_entries--; - mdbx_cassert(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && - mc->mc_db->md_root != P_INVALID); + cASSERT(mc, mc->mc_db->md_entries > 0 && mc->mc_db->md_depth > 0 && + mc->mc_db->md_root != P_INVALID); return rc; } else { mc->mc_xcursor->mx_cursor.mc_flags &= ~C_INITIALIZED; @@ -16226,7 +18088,7 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { if (node_flags(node) & F_SUBDATA) { /* add all the child DB's pages to the free list */ - rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); if (unlikely(rc)) goto fail; } @@ -16235,17 +18097,117 @@ int mdbx_cursor_del(MDBX_cursor *mc, MDBX_put_flags_t flags) { else if (unlikely((node_flags(node) ^ flags) & F_SUBDATA)) return MDBX_INCOMPATIBLE; - /* add overflow pages to free list */ - if (F_ISSET(node_flags(node), F_BIGDATA)) { - MDBX_page *omp; - if (unlikely((rc = mdbx_page_get(mc, node_largedata_pgno(node), &omp, - pp_txnid4chk(mp, mc->mc_txn))) || - (rc = mdbx_page_retire(mc, omp)))) + /* add large/overflow pages to free list */ + if (node_flags(node) & F_BIGDATA) { + pgr_t lp = page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((rc = lp.err) || (rc = page_retire(mc, lp.page)))) goto fail; } del_key: - return mdbx_cursor_del0(mc); + mc->mc_db->md_entries--; + const MDBX_dbi dbi = mc->mc_dbi; + indx_t ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_LEAF(mp)); + node_del(mc, mc->mc_db->md_xsize); + + /* Adjust other cursors pointing to mp */ + 
for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; + if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + if (m3->mc_ki[mc->mc_top] == ki) { + m3->mc_flags |= C_DEL; + if (mc->mc_db->md_flags & MDBX_DUPSORT) { + /* Sub-cursor referred into dataset which is gone */ + m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); + } + continue; + } else if (m3->mc_ki[mc->mc_top] > ki) { + m3->mc_ki[mc->mc_top]--; + } + if (XCURSOR_INITED(m3)) + XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); + } + } + + rc = rebalance(mc); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + + if (unlikely(!mc->mc_snum)) { + /* DB is totally empty now, just bail out. + * Other cursors adjustments were already done + * by rebalance and aren't needed here. */ + cASSERT(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && + mc->mc_db->md_root == P_INVALID); + mc->mc_flags |= C_EOF; + return MDBX_SUCCESS; + } + + ki = mc->mc_ki[mc->mc_top]; + mp = mc->mc_pg[mc->mc_top]; + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); + size_t nkeys = page_numkeys(mp); + cASSERT(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || + ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && + nkeys == 0)); + + /* Adjust this and other cursors pointing to mp */ + for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { + MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? 
&m2->mc_xcursor->mx_cursor : m2; + if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) + continue; + if (m3->mc_snum < mc->mc_snum) + continue; + if (m3->mc_pg[mc->mc_top] == mp) { + /* if m3 points past last node in page, find next sibling */ + if (m3->mc_ki[mc->mc_top] >= nkeys) { + rc = cursor_sibling(m3, SIBLING_RIGHT); + if (rc == MDBX_NOTFOUND) { + m3->mc_flags |= C_EOF; + rc = MDBX_SUCCESS; + continue; + } + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + if (m3->mc_ki[mc->mc_top] >= ki || + /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { + if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { + node = page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); + /* If this node has dupdata, it may need to be reinited + * because its data has moved. + * If the xcursor was not inited it must be reinited. + * Else if node points to a subDB, nothing is needed. */ + if (node_flags(node) & F_DUPDATA) { + if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { + if (!(node_flags(node) & F_SUBDATA)) + m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); + } else { + rc = cursor_xinit1(m3, node, m3->mc_pg[m3->mc_top]); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + rc = cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); + if (unlikely(rc != MDBX_SUCCESS)) + goto fail; + } + } + m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; + } + m3->mc_flags |= C_DEL; + } + } + } + + cASSERT(mc, rc == MDBX_SUCCESS); + if (AUDIT_ENABLED()) + rc = cursor_check(mc); + return rc; fail: mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; @@ -16253,67 +18215,74 @@ fail: } /* Allocate and initialize new pages for a database. - * Set MDBX_TXN_ERROR on failure. - * - * [in] mc a cursor on the database being added to. - * [in] flags flags defining what type of page is being allocated. - * [in] num the number of pages to allocate. This is usually 1, - * unless allocating overflow pages for a large record. - * [out] mp Address of a page, or NULL on failure. 
- * - * Returns 0 on success, non-zero on failure. */ -static struct page_result mdbx_page_new(MDBX_cursor *mc, const unsigned flags, - const unsigned npages) { - struct page_result ret = mdbx_page_alloc(mc, npages, MDBX_ALLOC_ALL); + * Set MDBX_TXN_ERROR on failure. */ +static pgr_t page_new(MDBX_cursor *mc, const unsigned flags) { + cASSERT(mc, (flags & P_OVERFLOW) == 0); + pgr_t ret = page_alloc(mc); if (unlikely(ret.err != MDBX_SUCCESS)) return ret; - mdbx_debug("db %u allocated new page %" PRIaPGNO ", num %u", mc->mc_dbi, - ret.page->mp_pgno, npages); + DEBUG("db %u allocated new page %" PRIaPGNO, mc->mc_dbi, ret.page->mp_pgno); ret.page->mp_flags = (uint16_t)flags; - ret.page->mp_txnid = mc->mc_txn->mt_front; - mdbx_cassert(mc, *mc->mc_dbistate & DBI_DIRTY); - mdbx_cassert(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); + cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); +#if MDBX_ENABLE_PGOP_STAT + mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += 1; +#endif /* MDBX_ENABLE_PGOP_STAT */ + + STATIC_ASSERT(P_BRANCH == 1); + const unsigned is_branch = flags & P_BRANCH; + + ret.page->mp_lower = 0; + ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); + mc->mc_db->md_branch_pages += is_branch; + mc->mc_db->md_leaf_pages += 1 - is_branch; + if (unlikely(mc->mc_flags & C_SUB)) { + MDBX_db *outer = outer_db(mc); + outer->md_branch_pages += is_branch; + outer->md_leaf_pages += 1 - is_branch; + } + return ret; +} + +static pgr_t page_new_large(MDBX_cursor *mc, const size_t npages) { + pgr_t ret = likely(npages == 1) + ? 
page_alloc(mc) + : page_alloc_slowpath(mc, npages, MDBX_ALLOC_DEFAULT); + if (unlikely(ret.err != MDBX_SUCCESS)) + return ret; + + DEBUG("db %u allocated new large-page %" PRIaPGNO ", num %zu", mc->mc_dbi, + ret.page->mp_pgno, npages); + ret.page->mp_flags = P_OVERFLOW; + cASSERT(mc, *mc->mc_dbistate & DBI_DIRTY); + cASSERT(mc, mc->mc_txn->mt_flags & MDBX_TXN_DIRTY); #if MDBX_ENABLE_PGOP_STAT mc->mc_txn->mt_env->me_lck->mti_pgop_stat.newly.weak += npages; #endif /* MDBX_ENABLE_PGOP_STAT */ - if (likely((flags & P_OVERFLOW) == 0)) { - STATIC_ASSERT(P_BRANCH == 1); - const bool is_branch = flags & P_BRANCH; - ret.page->mp_lower = 0; - ret.page->mp_upper = (indx_t)(mc->mc_txn->mt_env->me_psize - PAGEHDRSZ); - mc->mc_db->md_branch_pages += is_branch; - mc->mc_db->md_leaf_pages += 1 - is_branch; - if (unlikely(mc->mc_flags & C_SUB)) { - MDBX_db *outer = mdbx_outer_db(mc); - outer->md_branch_pages += is_branch; - outer->md_leaf_pages += 1 - is_branch; - } - } else { - mc->mc_db->md_overflow_pages += npages; - ret.page->mp_pages = npages; - mdbx_cassert(mc, !(mc->mc_flags & C_SUB)); - } - + mc->mc_db->md_overflow_pages += (pgno_t)npages; + ret.page->mp_pages = (pgno_t)npages; + cASSERT(mc, !(mc->mc_flags & C_SUB)); return ret; } -static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key) { +__hot static int __must_check_result node_add_leaf2(MDBX_cursor *mc, + size_t indx, + const MDBX_val *key) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; + MDBX_ANALYSIS_ASSUME(key != nullptr); DKBUF_DEBUG; - mdbx_debug("add to leaf2-%spage %" PRIaPGNO " index %i, " - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + DEBUG("add to leaf2-%spage %" PRIaPGNO " index %zi, " + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, key ? 
key->iov_len : 0, + DKEY_DEBUG(key)); - mdbx_cassert(mc, key); - mdbx_cassert(mc, PAGETYPE(mp) == (P_LEAF | P_LEAF2)); - const unsigned ksize = mc->mc_db->md_xsize; - mdbx_cassert(mc, ksize == key->iov_len); - const unsigned nkeys = page_numkeys(mp); + cASSERT(mc, key); + cASSERT(mc, PAGETYPE_COMPAT(mp) == (P_LEAF | P_LEAF2)); + const size_t ksize = mc->mc_db->md_xsize; + cASSERT(mc, ksize == key->iov_len); + const size_t nkeys = page_numkeys(mp); /* Just using these for counting */ const intptr_t lower = mp->mp_lower + sizeof(indx_t); @@ -16325,35 +18294,34 @@ static int __must_check_result mdbx_node_add_leaf2(MDBX_cursor *mc, mp->mp_lower = (indx_t)lower; mp->mp_upper = (indx_t)upper; - char *const ptr = page_leaf2key(mp, indx, ksize); - mdbx_cassert(mc, nkeys >= indx); - const unsigned diff = nkeys - indx; + void *const ptr = page_leaf2key(mp, indx, ksize); + cASSERT(mc, nkeys >= indx); + const size_t diff = nkeys - indx; if (likely(diff > 0)) /* Move higher keys up one slot. */ - memmove(ptr + ksize, ptr, diff * ksize); + memmove(ptr_disp(ptr, ksize), ptr, diff * ksize); /* insert new key */ memcpy(ptr, key->iov_base, ksize); return MDBX_SUCCESS; } -static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - pgno_t pgno) { +static int __must_check_result node_add_branch(MDBX_cursor *mc, size_t indx, + const MDBX_val *key, + pgno_t pgno) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to branch-%spage %" PRIaPGNO " index %i, node-pgno %" PRIaPGNO - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, - key ? key->iov_len : 0, DKEY_DEBUG(key)); + DEBUG("add to branch-%spage %" PRIaPGNO " index %zi, node-pgno %" PRIaPGNO + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, pgno, + key ? 
key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, PAGETYPE(mp) == P_BRANCH); + cASSERT(mc, PAGETYPE_WHOLE(mp) == P_BRANCH); STATIC_ASSERT(NODESIZE % 2 == 0); /* Move higher pointers up one slot. */ - const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, nkeys >= indx); - for (unsigned i = nkeys; i > indx; --i) + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, nkeys >= indx); + for (size_t i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. */ @@ -16380,61 +18348,65 @@ static int __must_check_result mdbx_node_add_branch(MDBX_cursor *mc, return MDBX_SUCCESS; } -static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, - unsigned indx, - const MDBX_val *key, - MDBX_val *data, - unsigned flags) { +__hot static int __must_check_result node_add_leaf(MDBX_cursor *mc, size_t indx, + const MDBX_val *key, + MDBX_val *data, + unsigned flags) { + MDBX_ANALYSIS_ASSUME(key != nullptr); + MDBX_ANALYSIS_ASSUME(data != nullptr); MDBX_page *mp = mc->mc_pg[mc->mc_top]; DKBUF_DEBUG; - mdbx_debug("add to leaf-%spage %" PRIaPGNO " index %i, data size %" PRIuPTR - " key size %" PRIuPTR " [%s]", - IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, - data ? data->iov_len : 0, key ? key->iov_len : 0, DKEY_DEBUG(key)); - mdbx_cassert(mc, key != NULL && data != NULL); - mdbx_cassert(mc, PAGETYPE(mp) == P_LEAF); - mdbx_cassert(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); + DEBUG("add to leaf-%spage %" PRIaPGNO " index %zi, data size %" PRIuPTR + " key size %" PRIuPTR " [%s]", + IS_SUBP(mp) ? "sub-" : "", mp->mp_pgno, indx, data ? data->iov_len : 0, + key ? key->iov_len : 0, DKEY_DEBUG(key)); + cASSERT(mc, key != NULL && data != NULL); + cASSERT(mc, PAGETYPE_COMPAT(mp) == P_LEAF); MDBX_page *largepage = NULL; size_t node_bytes; if (unlikely(flags & F_BIGDATA)) { - /* Data already on overflow page. */ + /* Data already on large/overflow page. 
*/ STATIC_ASSERT(sizeof(pgno_t) % 2 == 0); node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, page_room(mp) >= node_bytes); } else if (unlikely(node_size(key, data) > mc->mc_txn->mt_env->me_leaf_nodemax)) { - /* Put data on overflow page. */ + /* Put data on large/overflow page. */ if (unlikely(mc->mc_db->md_flags & MDBX_DUPSORT)) { - mdbx_error("Unexpected target %s flags 0x%x for large data-item", - "dupsort-db", mc->mc_db->md_flags); + ERROR("Unexpected target %s flags 0x%x for large data-item", "dupsort-db", + mc->mc_db->md_flags); return MDBX_PROBLEM; } if (unlikely(flags & (F_DUPDATA | F_SUBDATA))) { - mdbx_error("Unexpected target %s flags 0x%x for large data-item", "node", - flags); + ERROR("Unexpected target %s flags 0x%x for large data-item", "node", + flags); return MDBX_PROBLEM; } + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); const pgno_t ovpages = number_of_ovpages(mc->mc_txn->mt_env, data->iov_len); - const struct page_result npr = mdbx_page_new(mc, P_OVERFLOW, ovpages); + const pgr_t npr = page_new_large(mc, ovpages); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; largepage = npr.page; - mdbx_debug("allocated %u overflow page(s) %" PRIaPGNO "for %" PRIuPTR - " data bytes", - largepage->mp_pages, largepage->mp_pgno, data->iov_len); + DEBUG("allocated %u large/overflow page(s) %" PRIaPGNO "for %" PRIuPTR + " data bytes", + largepage->mp_pages, largepage->mp_pgno, data->iov_len); flags |= F_BIGDATA; node_bytes = node_size_len(key->iov_len, 0) + sizeof(pgno_t) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); } else { + cASSERT(mc, page_room(mp) >= leaf_size(mc->mc_txn->mt_env, key, data)); node_bytes = node_size(key, data) + sizeof(indx_t); + cASSERT(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); } - mdbx_cassert(mc, node_bytes == leaf_size(mc->mc_txn->mt_env, key, data)); /* Move higher pointers up one slot. 
*/ - const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, nkeys >= indx); - for (unsigned i = nkeys; i > indx; --i) + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, nkeys >= indx); + for (size_t i = nkeys; i > indx; --i) mp->mp_ptrs[i] = mp->mp_ptrs[i - 1]; /* Adjust free space offsets. */ @@ -16457,22 +18429,19 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, void *nodedata = node_data(node); if (likely(largepage == NULL)) { - if (unlikely(flags & F_BIGDATA)) + if (unlikely(flags & F_BIGDATA)) { memcpy(nodedata, data->iov_base, sizeof(pgno_t)); - else if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(nodedata, data->iov_base, data->iov_len); + return MDBX_SUCCESS; + } } else { poke_pgno(nodedata, largepage->mp_pgno); nodedata = page_data(largepage); - if (unlikely(flags & MDBX_RESERVE)) - data->iov_base = nodedata; - else if (likely(nodedata != data->iov_base && - data->iov_len /* to avoid UBSAN traps*/ != 0)) - memcpy(nodedata, data->iov_base, data->iov_len); } + if (unlikely(flags & MDBX_RESERVE)) + data->iov_base = nodedata; + else if (likely(nodedata != data->iov_base && + data->iov_len /* to avoid UBSAN traps*/ != 0)) + memcpy(nodedata, data->iov_base, data->iov_len); return MDBX_SUCCESS; } @@ -16480,80 +18449,69 @@ static int __must_check_result mdbx_node_add_leaf(MDBX_cursor *mc, * [in] mc Cursor pointing to the node to delete. * [in] ksize The size of a node. Only used if the page is * part of a MDBX_DUPFIXED database. 
*/ -static void mdbx_node_del(MDBX_cursor *mc, size_t ksize) { +__hot static void node_del(MDBX_cursor *mc, size_t ksize) { MDBX_page *mp = mc->mc_pg[mc->mc_top]; - int indx = mc->mc_ki[mc->mc_top]; - int i, j, nkeys, ptr; - MDBX_node *node; - char *base; + const size_t hole = mc->mc_ki[mc->mc_top]; + const size_t nkeys = page_numkeys(mp); - mdbx_debug("delete node %u on %s page %" PRIaPGNO, indx, - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, indx < nkeys); + DEBUG("delete node %zu on %s page %" PRIaPGNO, hole, + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno); + cASSERT(mc, hole < nkeys); if (IS_LEAF2(mp)) { - mdbx_cassert(mc, ksize >= sizeof(indx_t)); - unsigned diff = nkeys - 1 - indx; - base = page_leaf2key(mp, indx, ksize); + cASSERT(mc, ksize >= sizeof(indx_t)); + size_t diff = nkeys - 1 - hole; + void *const base = page_leaf2key(mp, hole, ksize); if (diff) - memmove(base, base + ksize, diff * ksize); - mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + memmove(base, ptr_disp(base, ksize), diff * ksize); + cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, - (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper += (indx_t)(ksize - sizeof(indx_t)); return; } - node = page_node(mp, indx); - mdbx_cassert(mc, !IS_BRANCH(mp) || indx || node_ks(node) == 0); - size_t sz = NODESIZE + node_ks(node); - if (IS_LEAF(mp)) { - if (F_ISSET(node_flags(node), F_BIGDATA)) - sz += sizeof(pgno_t); - else - sz += node_ds(node); - } - sz = EVEN(sz); + MDBX_node *node = page_node(mp, hole); + cASSERT(mc, !IS_BRANCH(mp) || hole || node_ks(node) == 0); + size_t hole_size = NODESIZE + node_ks(node); + if (IS_LEAF(mp)) + hole_size += + (node_flags(node) & F_BIGDATA) ? 
sizeof(pgno_t) : node_ds(node); + hole_size = EVEN(hole_size); - ptr = mp->mp_ptrs[indx]; - for (i = j = 0; i < nkeys; i++) { - if (i != indx) { - mp->mp_ptrs[j] = mp->mp_ptrs[i]; - if (mp->mp_ptrs[i] < ptr) { - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_ptrs[j] >= sz); - mp->mp_ptrs[j] += (indx_t)sz; - } - j++; - } - } + const indx_t hole_offset = mp->mp_ptrs[hole]; + size_t r, w; + for (r = w = 0; r < nkeys; r++) + if (r != hole) + mp->mp_ptrs[w++] = (mp->mp_ptrs[r] < hole_offset) + ? mp->mp_ptrs[r] + (indx_t)hole_size + : mp->mp_ptrs[r]; - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + sz, base, ptr - mp->mp_upper); + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); + memmove(ptr_disp(base, hole_size), base, hole_offset - mp->mp_upper); - mdbx_cassert(mc, mp->mp_lower >= sizeof(indx_t)); + cASSERT(mc, mp->mp_lower >= sizeof(indx_t)); mp->mp_lower -= sizeof(indx_t); - mdbx_cassert(mc, (size_t)UINT16_MAX - mp->mp_upper >= sz); - mp->mp_upper += (indx_t)sz; + cASSERT(mc, (size_t)UINT16_MAX - mp->mp_upper >= hole_size); + mp->mp_upper += (indx_t)hole_size; -#if MDBX_DEBUG > 0 - if (mdbx_audit_enabled()) { - int page_check_err = mdbx_page_check(mc, mp, C_UPDATING); - mdbx_cassert(mc, page_check_err == MDBX_SUCCESS); + if (AUDIT_ENABLED()) { + const uint8_t checking = mc->mc_checking; + mc->mc_checking |= CC_UPDATING; + const int page_check_err = page_check(mc, mp); + mc->mc_checking = checking; + cASSERT(mc, page_check_err == MDBX_SUCCESS); } -#endif } /* Compact the main page after deleting a node on a subpage. * [in] mp The main page to operate on. * [in] indx The index of the subpage on the main page. 
*/ -static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { +static void node_shrink(MDBX_page *mp, size_t indx) { MDBX_node *node; MDBX_page *sp, *xp; - char *base; size_t nsize, delta, len, ptr; - int i; + intptr_t i; node = page_node(mp, indx); sp = (MDBX_page *)node_data(node); @@ -16569,7 +18527,7 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { assert(nsize % 1 == 0); len = nsize; } else { - xp = (MDBX_page *)((char *)sp + delta); /* destination subpage */ + xp = ptr_disp(sp, delta); /* destination subpage */ for (i = page_numkeys(sp); --i >= 0;) { assert(sp->mp_ptrs[i] >= delta); xp->mp_ptrs[i] = (indx_t)(sp->mp_ptrs[i] - delta); @@ -16582,8 +18540,8 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { node_set_ds(node, nsize); /* Shift upward */ - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; - memmove(base + delta, base, (char *)sp + len - base); + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); + memmove(ptr_disp(base, delta), base, ptr_dist(sp, base) + len); ptr = mp->mp_ptrs[indx]; for (i = page_numkeys(mp); --i >= 0;) { @@ -16606,11 +18564,11 @@ static void mdbx_node_shrink(MDBX_page *mp, unsigned indx) { * depend only on the parent DB. * * [in] mc The main cursor whose sorted-dups cursor is to be initialized. 
*/ -static int mdbx_xcursor_init0(MDBX_cursor *mc) { +static int cursor_xinit0(MDBX_cursor *mc) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } @@ -16623,7 +18581,11 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { mx->mx_cursor.mc_dbistate = mc->mc_dbistate; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB; + STATIC_ASSERT(MDBX_DUPFIXED * 2 == P_LEAF2); + cASSERT(mc, (mc->mc_checking & (P_BRANCH | P_LEAF | P_LEAF2)) == P_LEAF); + mx->mx_cursor.mc_checking = + mc->mc_checking + ((mc->mc_db->md_flags & MDBX_DUPFIXED) << 1); mx->mx_dbx.md_name.iov_len = 0; mx->mx_dbx.md_name.iov_base = NULL; mx->mx_dbx.md_cmp = mc->mc_dbx->md_dcmp; @@ -16638,43 +18600,42 @@ static int mdbx_xcursor_init0(MDBX_cursor *mc) { * [in] mc The main cursor whose sorted-dups cursor is to be initialized. * [in] node The data containing the MDBX_db record for the sorted-dup database. 
*/ -static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, - const MDBX_page *mp) { +static int cursor_xinit1(MDBX_cursor *mc, MDBX_node *node, + const MDBX_page *mp) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } const uint8_t flags = node_flags(node); switch (flags) { default: - mdbx_error("invalid node flags %u", flags); + ERROR("invalid node flags %u", flags); return MDBX_CORRUPTED; case F_DUPDATA | F_SUBDATA: - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { - mdbx_error("invalid nested-db record size %zu", node_ds(node)); + ERROR("invalid nested-db record size %zu", node_ds(node)); return MDBX_CORRUPTED; } memcpy(&mx->mx_db, node_data(node), sizeof(MDBX_db)); const txnid_t pp_txnid = mp->mp_txnid; - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_mod_txnid > pp_txnid)) { - mdbx_error("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN - ")", - mx->mx_db.md_mod_txnid, pp_txnid); + ERROR("nested-db.md_mod_txnid (%" PRIaTXN ") > page-txnid (%" PRIaTXN ")", + mx->mx_db.md_mod_txnid, pp_txnid); return MDBX_CORRUPTED; } mx->mx_cursor.mc_pg[0] = 0; mx->mx_cursor.mc_snum = 0; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB; break; case F_DUPDATA: - if (!MDBX_DISABLE_PAGECHECKS && unlikely(node_ds(node) <= PAGEHDRSZ)) { - mdbx_error("invalid nested-page size %zu", node_ds(node)); + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) <= PAGEHDRSZ)) { + ERROR("invalid nested-page size %zu", node_ds(node)); return MDBX_CORRUPTED; } MDBX_page *fp = 
node_data(node); @@ -16687,8 +18648,7 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_db.md_mod_txnid = mp->mp_txnid; mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags = - C_INITIALIZED | C_SUB | (mc->mc_flags & (C_COPYING | C_SKIPORD)); + mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; mx->mx_cursor.mc_pg[0] = fp; mx->mx_cursor.mc_ki[0] = 0; mx->mx_db.md_flags = flags_db2sub(mc->mc_db->md_flags); @@ -16698,23 +18658,22 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, } if (unlikely(mx->mx_db.md_xsize != mc->mc_db->md_xsize)) { - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mc->mc_db->md_xsize != 0)) { - mdbx_error("cursor mismatched nested-db md_xsize %u", - mc->mc_db->md_xsize); + if (!MDBX_DISABLE_VALIDATION && unlikely(mc->mc_db->md_xsize != 0)) { + ERROR("cursor mismatched nested-db md_xsize %u", mc->mc_db->md_xsize); return MDBX_CORRUPTED; } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely((mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) { - mdbx_error("mismatched nested-db md_flags %u", mc->mc_db->md_flags); + ERROR("mismatched nested-db md_flags %u", mc->mc_db->md_flags); return MDBX_CORRUPTED; } - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(mx->mx_db.md_xsize < mc->mc_dbx->md_vlen_min || mx->mx_db.md_xsize > mc->mc_dbx->md_vlen_max)) { - mdbx_error("mismatched nested-db.md_xsize (%u) <> min/max value-length " - "(%zu/%zu)", - mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); + ERROR("mismatched nested-db.md_xsize (%u) <> min/max value-length " + "(%zu/%zu)", + mx->mx_db.md_xsize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); return MDBX_CORRUPTED; } mc->mc_db->md_xsize = mx->mx_db.md_xsize; @@ -16723,8 +18682,8 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, mx->mx_dbx.md_klen_min = mc->mc_dbx->md_vlen_min; mx->mx_dbx.md_klen_max = mc->mc_dbx->md_vlen_max; - mdbx_debug("Sub-db -%u root 
page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); + DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); return MDBX_SUCCESS; } @@ -16735,19 +18694,19 @@ static int mdbx_xcursor_init1(MDBX_cursor *mc, MDBX_node *node, * [in] mc The main cursor whose sorted-dups cursor is to be fixed up. * [in] src_mx The xcursor of an up-to-date cursor. * [in] new_dupdata True if converting from a non-F_DUPDATA item. */ -static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, - bool new_dupdata) { +static int cursor_xinit2(MDBX_cursor *mc, MDBX_xcursor *src_mx, + bool new_dupdata) { MDBX_xcursor *mx = mc->mc_xcursor; - if (!MDBX_DISABLE_PAGECHECKS && unlikely(mx == nullptr)) { - mdbx_error("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", - mc->mc_dbi); + if (!MDBX_DISABLE_VALIDATION && unlikely(mx == nullptr)) { + ERROR("unexpected dupsort-page for non-dupsort db/cursor (dbi %u)", + mc->mc_dbi); return MDBX_CORRUPTED; } if (new_dupdata) { mx->mx_cursor.mc_snum = 1; mx->mx_cursor.mc_top = 0; - mx->mx_cursor.mc_flags |= C_INITIALIZED; + mx->mx_cursor.mc_flags = C_SUB | C_INITIALIZED; mx->mx_cursor.mc_ki[0] = 0; } @@ -16757,20 +18716,19 @@ static int mdbx_xcursor_init2(MDBX_cursor *mc, MDBX_xcursor *src_mx, mx->mx_db = src_mx->mx_db; mx->mx_cursor.mc_pg[0] = src_mx->mx_cursor.mc_pg[0]; if (mx->mx_cursor.mc_flags & C_INITIALIZED) { - mdbx_debug("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, - mx->mx_db.md_root); + DEBUG("Sub-db -%u root page %" PRIaPGNO, mx->mx_cursor.mc_dbi, + mx->mx_db.md_root); } return MDBX_SUCCESS; } -static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, - const MDBX_dbi dbi, MDBX_txn *const txn, - MDBX_db *const db, MDBX_dbx *const dbx, - uint8_t *const dbstate) { +static __inline int couple_init(MDBX_cursor_couple *couple, const size_t dbi, + MDBX_txn *const txn, MDBX_db *const db, + MDBX_dbx *const dbx, uint8_t *const dbstate) { couple->outer.mc_signature = MDBX_MC_LIVE; 
couple->outer.mc_next = NULL; couple->outer.mc_backup = NULL; - couple->outer.mc_dbi = dbi; + couple->outer.mc_dbi = (MDBX_dbi)dbi; couple->outer.mc_txn = txn; couple->outer.mc_db = db; couple->outer.mc_dbx = dbx; @@ -16779,22 +18737,27 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, couple->outer.mc_top = 0; couple->outer.mc_pg[0] = 0; couple->outer.mc_flags = 0; + STATIC_ASSERT(CC_BRANCH == P_BRANCH && CC_LEAF == P_LEAF && + CC_OVERFLOW == P_OVERFLOW && CC_LEAF2 == P_LEAF2); + couple->outer.mc_checking = + (AUDIT_ENABLED() || (txn->mt_env->me_flags & MDBX_VALIDATION)) + ? CC_PAGECHECK | CC_LEAF + : CC_LEAF; couple->outer.mc_ki[0] = 0; couple->outer.mc_xcursor = NULL; int rc = MDBX_SUCCESS; if (unlikely(*couple->outer.mc_dbistate & DBI_STALE)) { - rc = mdbx_page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); + rc = page_search(&couple->outer, NULL, MDBX_PS_ROOTONLY); rc = (rc != MDBX_NOTFOUND) ? rc : MDBX_SUCCESS; - } else if (unlikely(couple->outer.mc_dbx->md_klen_max == 0)) { - rc = mdbx_setup_dbx(couple->outer.mc_dbx, couple->outer.mc_db, - txn->mt_env->me_psize); + } else if (unlikely(dbx->md_klen_max == 0)) { + rc = setup_dbx(dbx, db, txn->mt_env->me_psize); } if (couple->outer.mc_db->md_flags & MDBX_DUPSORT) { couple->inner.mx_cursor.mc_signature = MDBX_MC_LIVE; couple->outer.mc_xcursor = &couple->inner; - rc = mdbx_xcursor_init0(&couple->outer); + rc = cursor_xinit0(&couple->outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; couple->inner.mx_dbx.md_klen_min = couple->outer.mc_dbx->md_vlen_min; @@ -16804,15 +18767,15 @@ static __inline int mdbx_couple_init(MDBX_cursor_couple *couple, } /* Initialize a cursor for a given transaction and database. 
*/ -static int mdbx_cursor_init(MDBX_cursor *mc, MDBX_txn *txn, MDBX_dbi dbi) { +static int cursor_init(MDBX_cursor *mc, MDBX_txn *txn, size_t dbi) { STATIC_ASSERT(offsetof(MDBX_cursor_couple, outer) == 0); - return mdbx_couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, - &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], - &txn->mt_dbistate[dbi]); + return couple_init(container_of(mc, MDBX_cursor_couple, outer), dbi, txn, + &txn->mt_dbs[dbi], &txn->mt_dbxs[dbi], + &txn->mt_dbistate[dbi]); } MDBX_cursor *mdbx_cursor_create(void *context) { - MDBX_cursor_couple *couple = mdbx_calloc(1, sizeof(MDBX_cursor_couple)); + MDBX_cursor_couple *couple = osal_calloc(1, sizeof(MDBX_cursor_couple)); if (unlikely(!couple)) return nullptr; @@ -16862,11 +18825,11 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (unlikely(!check_dbi(txn, dbi, DBI_VALID))) return MDBX_BAD_DBI; - if (unlikely(dbi == FREE_DBI && !F_ISSET(txn->mt_flags, MDBX_TXN_RDONLY))) + if (unlikely(dbi == FREE_DBI && !(txn->mt_flags & MDBX_TXN_RDONLY))) return MDBX_EACCESS; if (unlikely(mc->mc_backup)) /* Cursor from parent transaction */ { - mdbx_cassert(mc, mc->mc_signature == MDBX_MC_LIVE); + cASSERT(mc, mc->mc_signature == MDBX_MC_LIVE); if (unlikely(mc->mc_dbi != dbi || /* paranoia */ mc->mc_signature != MDBX_MC_LIVE || mc->mc_txn != txn)) @@ -16886,16 +18849,16 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { if (mc->mc_signature == MDBX_MC_LIVE) { if (unlikely(!mc->mc_txn || mc->mc_txn->mt_signature != MDBX_MT_SIGNATURE)) { - mdbx_error("Wrong cursor's transaction %p 0x%x", - __Wpedantic_format_voidptr(mc->mc_txn), - mc->mc_txn ? mc->mc_txn->mt_signature : 0); + ERROR("Wrong cursor's transaction %p 0x%x", + __Wpedantic_format_voidptr(mc->mc_txn), + mc->mc_txn ? 
mc->mc_txn->mt_signature : 0); return MDBX_PROBLEM; } if (mc->mc_flags & C_UNTRACK) { MDBX_cursor **prev = &mc->mc_txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_cassert(mc, *prev == mc); + cASSERT(mc, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = MDBX_MC_READY4CLOSE; @@ -16906,9 +18869,9 @@ int mdbx_cursor_bind(MDBX_txn *txn, MDBX_cursor *mc, MDBX_dbi dbi) { mc->mc_dbx = NULL; mc->mc_dbistate = NULL; } - mdbx_cassert(mc, !(mc->mc_flags & C_UNTRACK)); + cASSERT(mc, !(mc->mc_flags & C_UNTRACK)); - rc = mdbx_cursor_init(mc, txn, dbi); + rc = cursor_init(mc, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -16962,7 +18925,7 @@ again: dest->mc_flags ^= (dest->mc_flags ^ src->mc_flags) & ~C_UNTRACK; dest->mc_top = src->mc_top; dest->mc_snum = src->mc_snum; - for (unsigned i = 0; i < src->mc_snum; ++i) { + for (size_t i = 0; i < src->mc_snum; ++i) { dest->mc_ki[i] = src->mc_ki[i]; dest->mc_pg[i] = src->mc_pg[i]; } @@ -16980,27 +18943,27 @@ again: void mdbx_cursor_close(MDBX_cursor *mc) { if (likely(mc)) { - mdbx_ensure(NULL, mc->mc_signature == MDBX_MC_LIVE || - mc->mc_signature == MDBX_MC_READY4CLOSE); + ENSURE(NULL, mc->mc_signature == MDBX_MC_LIVE || + mc->mc_signature == MDBX_MC_READY4CLOSE); MDBX_txn *const txn = mc->mc_txn; if (!mc->mc_backup) { mc->mc_txn = NULL; /* Unlink from txn, if tracked. 
*/ if (mc->mc_flags & C_UNTRACK) { - mdbx_ensure(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); + ENSURE(txn->mt_env, check_txn(txn, 0) == MDBX_SUCCESS); MDBX_cursor **prev = &txn->mt_cursors[mc->mc_dbi]; while (*prev && *prev != mc) prev = &(*prev)->mc_next; - mdbx_tassert(txn, *prev == mc); + tASSERT(txn, *prev == mc); *prev = mc->mc_next; } mc->mc_signature = 0; mc->mc_next = mc; - mdbx_free(mc); + osal_free(mc); } else { /* Cursor closed before nested txn ends */ - mdbx_tassert(txn, mc->mc_signature == MDBX_MC_LIVE); - mdbx_ensure(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); + tASSERT(txn, mc->mc_signature == MDBX_MC_LIVE); + ENSURE(txn->mt_env, check_txn_rw(txn, 0) == MDBX_SUCCESS); mc->mc_signature = MDBX_MC_WAIT4EOT; } } @@ -17053,9 +19016,9 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { *countp = 1; if (mc->mc_xcursor != NULL) { MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_cassert(mc, mc->mc_xcursor && (mc->mc_xcursor->mx_cursor.mc_flags & - C_INITIALIZED)); + if (node_flags(node) & F_DUPDATA) { + cASSERT(mc, mc->mc_xcursor && + (mc->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED)); *countp = unlikely(mc->mc_xcursor->mx_db.md_entries > PTRDIFF_MAX) ? PTRDIFF_MAX : (size_t)mc->mc_xcursor->mx_db.md_entries; @@ -17069,16 +19032,15 @@ int mdbx_cursor_count(const MDBX_cursor *mc, size_t *countp) { * [in] mc Cursor pointing to the node to operate on. * [in] key The new key to use. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { +static int update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_page *mp; MDBX_node *node; - char *base; size_t len; ptrdiff_t delta, ksize, oksize; - int ptr, i, nkeys, indx; + intptr_t ptr, i, nkeys, indx; DKBUF_DEBUG; - mdbx_cassert(mc, cursor_is_tracked(mc)); + cASSERT(mc, cursor_is_tracked(mc)); indx = mc->mc_ki[mc->mc_top]; mp = mc->mc_pg[mc->mc_top]; node = page_node(mp, indx); @@ -17087,8 +19049,8 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { MDBX_val k2; k2.iov_base = node_key(node); k2.iov_len = node_ks(node); - mdbx_debug("update key %u (offset %u) [%s] to [%s] on page %" PRIaPGNO, indx, - ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); + DEBUG("update key %zi (offset %zu) [%s] to [%s] on page %" PRIaPGNO, indx, + ptr, DVAL_DEBUG(&k2), DKEY_DEBUG(key), mp->mp_pgno); #endif /* MDBX_DEBUG */ /* Sizes must be 2-byte aligned. */ @@ -17100,27 +19062,27 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { if (delta) { if (delta > (int)page_room(mp)) { /* not enough space left, do a delete and split */ - mdbx_debug("Not enough room, delta = %zd, splitting...", delta); + DEBUG("Not enough room, delta = %zd, splitting...", delta); pgno_t pgno = node_pgno(node); - mdbx_node_del(mc, 0); - int rc = mdbx_page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); - if (rc == MDBX_SUCCESS && mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, C_UPDATING); - return rc; + node_del(mc, 0); + int err = page_split(mc, key, NULL, pgno, MDBX_SPLIT_REPLACE); + if (err == MDBX_SUCCESS && AUDIT_ENABLED()) + err = cursor_check_updating(mc); + return err; } nkeys = page_numkeys(mp); for (i = 0; i < nkeys; i++) { if (mp->mp_ptrs[i] <= ptr) { - mdbx_cassert(mc, mp->mp_ptrs[i] >= delta); + cASSERT(mc, mp->mp_ptrs[i] >= delta); mp->mp_ptrs[i] -= (indx_t)delta; } } - base = (char *)mp + mp->mp_upper + PAGEHDRSZ; + void *const base = ptr_disp(mp, mp->mp_upper + PAGEHDRSZ); 
len = ptr - mp->mp_upper + NODESIZE; - memmove(base - delta, base, len); - mdbx_cassert(mc, mp->mp_upper >= delta); + memmove(ptr_disp(base, -delta), base, len); + cASSERT(mc, mp->mp_upper >= delta); mp->mp_upper -= (indx_t)delta; node = page_node(mp, indx); @@ -17135,41 +19097,41 @@ static int mdbx_update_key(MDBX_cursor *mc, const MDBX_val *key) { } /* Move a node from csrc to cdst. */ -static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { +static int node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { int rc; DKBUF_DEBUG; MDBX_page *psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); - mdbx_cassert(csrc, csrc->mc_dbi == cdst->mc_dbi); - mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); - if (unlikely(PAGETYPE(psrc) != PAGETYPE(pdst))) { + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi); + cASSERT(csrc, csrc->mc_top == cdst->mc_top); + if (unlikely(PAGETYPE_WHOLE(psrc) != PAGETYPE_WHOLE(pdst))) { bailout: - mdbx_error("Wrong or mismatch pages's types (src %d, dst %d) to move node", - PAGETYPE(psrc), PAGETYPE(pdst)); + ERROR("Wrong or mismatch pages's types (src %d, dst %d) to move node", + PAGETYPE_WHOLE(psrc), PAGETYPE_WHOLE(pdst)); csrc->mc_txn->mt_flags |= MDBX_TXN_ERROR; return MDBX_PROBLEM; } MDBX_val key4move; - switch (PAGETYPE(psrc)) { + switch (PAGETYPE_WHOLE(psrc)) { case P_BRANCH: { const MDBX_node *srcnode = page_node(psrc, csrc->mc_ki[csrc->mc_top]); - mdbx_cassert(csrc, node_flags(srcnode) == 0); + cASSERT(csrc, node_flags(srcnode) == 0); const pgno_t srcpg = node_pgno(srcnode); key4move.iov_len = node_ks(srcnode); key4move.iov_base = node_key(srcnode); if (csrc->mc_ki[csrc->mc_top] == 0) { - const unsigned snum = csrc->mc_snum; - mdbx_cassert(csrc, snum > 0); + const size_t snum = csrc->mc_snum; + cASSERT(csrc, snum > 0); /* must find the lowest key below src */ - rc = 
mdbx_page_search_lowest(csrc); + rc = page_search_lowest(csrc); MDBX_page *lowest_page = csrc->mc_pg[csrc->mc_top]; if (unlikely(rc)) return rc; - mdbx_cassert(csrc, IS_LEAF(lowest_page)); + cASSERT(csrc, IS_LEAF(lowest_page)); if (unlikely(!IS_LEAF(lowest_page))) goto bailout; if (IS_LEAF2(lowest_page)) { @@ -17182,28 +19144,28 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { } /* restore cursor after mdbx_page_search_lowest() */ - csrc->mc_snum = snum; - csrc->mc_top = snum - 1; + csrc->mc_snum = (uint8_t)snum; + csrc->mc_top = (uint8_t)snum - 1; csrc->mc_ki[csrc->mc_top] = 0; /* paranoia */ - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(csrc, IS_BRANCH(psrc)); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(csrc, IS_BRANCH(psrc)); if (unlikely(!IS_BRANCH(psrc))) goto bailout; } if (cdst->mc_ki[cdst->mc_top] == 0) { - const unsigned snum = cdst->mc_snum; - mdbx_cassert(csrc, snum > 0); + const size_t snum = cdst->mc_snum; + cASSERT(csrc, snum > 0); MDBX_cursor mn; cursor_copy(cdst, &mn); /* must find the lowest key below dst */ - rc = mdbx_page_search_lowest(&mn); + rc = page_search_lowest(&mn); if (unlikely(rc)) return rc; MDBX_page *const lowest_page = mn.mc_pg[mn.mc_top]; - mdbx_cassert(cdst, IS_LEAF(lowest_page)); + cASSERT(cdst, IS_LEAF(lowest_page)); if (unlikely(!IS_LEAF(lowest_page))) goto bailout; MDBX_val key; @@ -17217,8 +19179,8 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { } /* restore cursor after mdbx_page_search_lowest() */ - mn.mc_snum = snum; - mn.mc_top = snum - 1; + mn.mc_snum = (uint8_t)snum; + mn.mc_top = (uint8_t)snum - 1; mn.mc_ki[mn.mc_top] = 0; const intptr_t delta = @@ -17229,13 +19191,12 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (unlikely(needed > have)) return MDBX_RESULT_TRUE; - if (unlikely((rc = mdbx_page_touch(csrc)) || - (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = 
page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc)) return rc; } else { @@ -17244,25 +19205,23 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (unlikely(needed > have)) return MDBX_RESULT_TRUE; - if (unlikely((rc = mdbx_page_touch(csrc)) || - (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; } - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "branch", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = - mdbx_node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); + rc = node_add_branch(cdst, cdst->mc_ki[cdst->mc_top], &key4move, srcpg); } break; case P_LEAF: { /* Mark src and dst as dirty. 
*/ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; @@ -17272,30 +19231,30 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { data.iov_base = node_data(srcnode); key4move.iov_len = node_ks(srcnode); key4move.iov_base = node_key(srcnode); - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. */ - rc = mdbx_node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, - node_flags(srcnode)); + rc = node_add_leaf(cdst, cdst->mc_ki[cdst->mc_top], &key4move, &data, + node_flags(srcnode)); } break; case P_LEAF | P_LEAF2: { /* Mark src and dst as dirty. */ - if (unlikely((rc = mdbx_page_touch(csrc)) || (rc = mdbx_page_touch(cdst)))) + if (unlikely((rc = page_touch(csrc)) || (rc = page_touch(cdst)))) return rc; psrc = csrc->mc_pg[csrc->mc_top]; pdst = cdst->mc_pg[cdst->mc_top]; key4move.iov_len = csrc->mc_db->md_xsize; key4move.iov_base = page_leaf2key(psrc, csrc->mc_ki[csrc->mc_top], key4move.iov_len); - mdbx_debug("moving %s-node %u [%s] on page %" PRIaPGNO - " to node %u on page %" PRIaPGNO, - "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), - psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); + DEBUG("moving %s-node %u [%s] on page %" PRIaPGNO + " to node %u on page %" PRIaPGNO, + "leaf2", csrc->mc_ki[csrc->mc_top], DKEY_DEBUG(&key4move), + psrc->mp_pgno, cdst->mc_ki[cdst->mc_top], pdst->mp_pgno); /* Add the node to the destination page. 
*/ - rc = mdbx_node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); + rc = node_add_leaf2(cdst, cdst->mc_ki[cdst->mc_top], &key4move); } break; default: @@ -17307,17 +19266,17 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { return rc; /* Delete the node from the source page. */ - mdbx_node_del(csrc, key4move.iov_len); + node_del(csrc, key4move.iov_len); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; const MDBX_dbi dbi = csrc->mc_dbi; - mdbx_cassert(csrc, csrc->mc_top == cdst->mc_top); + cASSERT(csrc, csrc->mc_top == cdst->mc_top); if (fromleft) { /* If we're adding on the left, bump others up */ for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { @@ -17332,7 +19291,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { m3->mc_ki[csrc->mc_top] == csrc->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); m3->mc_ki[csrc->mc_top - 1]++; } if (XCURSOR_INITED(m3) && IS_LEAF(psrc)) @@ -17350,7 +19309,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { if (!m3->mc_ki[csrc->mc_top]) { m3->mc_pg[csrc->mc_top] = pdst; m3->mc_ki[csrc->mc_top] = cdst->mc_ki[cdst->mc_top]; - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); m3->mc_ki[csrc->mc_top - 1]--; } else { m3->mc_ki[csrc->mc_top]--; @@ -17365,7 +19324,7 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { /* Update the parent separators. 
*/ if (csrc->mc_ki[csrc->mc_top] == 0) { - mdbx_cassert(csrc, csrc->mc_top > 0); + cASSERT(csrc, csrc->mc_top > 0); if (csrc->mc_ki[csrc->mc_top - 1] != 0) { MDBX_val key; if (IS_LEAF2(psrc)) { @@ -17376,15 +19335,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key.iov_len = node_ks(srcnode); key.iov_base = node_key(srcnode); } - mdbx_debug("update separator for source page %" PRIaPGNO " to [%s]", - psrc->mp_pgno, DKEY_DEBUG(&key)); + DEBUG("update separator for source page %" PRIaPGNO " to [%s]", + psrc->mp_pgno, DKEY_DEBUG(&key)); MDBX_cursor mn; cursor_copy(csrc, &mn); - mdbx_cassert(csrc, mn.mc_snum > 0); + cASSERT(csrc, mn.mc_snum > 0); mn.mc_snum--; mn.mc_top--; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -17392,14 +19351,14 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { const MDBX_val nullkey = {0, 0}; const indx_t ix = csrc->mc_ki[csrc->mc_top]; csrc->mc_ki[csrc->mc_top] = 0; - rc = mdbx_update_key(csrc, &nullkey); + rc = update_key(csrc, &nullkey); csrc->mc_ki[csrc->mc_top] = ix; - mdbx_cassert(csrc, rc == MDBX_SUCCESS); + cASSERT(csrc, rc == MDBX_SUCCESS); } } if (cdst->mc_ki[cdst->mc_top] == 0) { - mdbx_cassert(cdst, cdst->mc_top > 0); + cASSERT(cdst, cdst->mc_top > 0); if (cdst->mc_ki[cdst->mc_top - 1] != 0) { MDBX_val key; if (IS_LEAF2(pdst)) { @@ -17410,15 +19369,15 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { key.iov_len = node_ks(srcnode); key.iov_base = node_key(srcnode); } - mdbx_debug("update separator for destination page %" PRIaPGNO " to [%s]", - pdst->mp_pgno, DKEY_DEBUG(&key)); + DEBUG("update separator for destination page %" PRIaPGNO " to [%s]", + pdst->mp_pgno, DKEY_DEBUG(&key)); MDBX_cursor 
mn; cursor_copy(cdst, &mn); - mdbx_cassert(cdst, mn.mc_snum > 0); + cASSERT(cdst, mn.mc_snum > 0); mn.mc_snum--; mn.mc_top--; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_update_key(&mn, &key)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = update_key(&mn, &key)); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -17426,9 +19385,9 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { const MDBX_val nullkey = {0, 0}; const indx_t ix = cdst->mc_ki[cdst->mc_top]; cdst->mc_ki[cdst->mc_top] = 0; - rc = mdbx_update_key(cdst, &nullkey); + rc = update_key(cdst, &nullkey); cdst->mc_ki[cdst->mc_top] = ix; - mdbx_cassert(cdst, rc == MDBX_SUCCESS); + cASSERT(cdst, rc == MDBX_SUCCESS); } } @@ -17444,49 +19403,48 @@ static int mdbx_node_move(MDBX_cursor *csrc, MDBX_cursor *cdst, bool fromleft) { * [in] cdst Cursor pointing to the destination page. * * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { +static int page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_val key; int rc; - mdbx_cassert(csrc, csrc != cdst); - mdbx_cassert(csrc, cursor_is_tracked(csrc)); - mdbx_cassert(cdst, cursor_is_tracked(cdst)); + cASSERT(csrc, csrc != cdst); + cASSERT(csrc, cursor_is_tracked(csrc)); + cASSERT(cdst, cursor_is_tracked(cdst)); const MDBX_page *const psrc = csrc->mc_pg[csrc->mc_top]; MDBX_page *pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, - pdst->mp_pgno); + DEBUG("merging page %" PRIaPGNO " into %" PRIaPGNO, psrc->mp_pgno, + pdst->mp_pgno); - mdbx_cassert(csrc, PAGETYPE(psrc) == PAGETYPE(pdst)); - mdbx_cassert(csrc, - csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); - mdbx_cassert(csrc, csrc->mc_snum > 1); /* can't merge root page */ - mdbx_cassert(cdst, cdst->mc_snum > 1); - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(csrc, csrc->mc_snum < csrc->mc_db->md_depth || - IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); - mdbx_cassert(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); - const int pagetype = PAGETYPE(psrc); + cASSERT(csrc, PAGETYPE_WHOLE(psrc) == PAGETYPE_WHOLE(pdst)); + cASSERT(csrc, csrc->mc_dbi == cdst->mc_dbi && csrc->mc_db == cdst->mc_db); + cASSERT(csrc, csrc->mc_snum > 1); /* can't merge root page */ + cASSERT(cdst, cdst->mc_snum > 1); + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(csrc, csrc->mc_snum < csrc->mc_db->md_depth || + IS_LEAF(csrc->mc_pg[csrc->mc_db->md_depth - 1])); + cASSERT(cdst, page_room(pdst) >= page_used(cdst->mc_txn->mt_env, psrc)); + const int pagetype = PAGETYPE_WHOLE(psrc); /* Move all nodes from src to dst */ - const unsigned dst_nkeys = page_numkeys(pdst); - const unsigned src_nkeys = page_numkeys(psrc); - 
mdbx_cassert(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); + const size_t dst_nkeys = page_numkeys(pdst); + const size_t src_nkeys = page_numkeys(psrc); + cASSERT(cdst, dst_nkeys + src_nkeys >= (IS_LEAF(psrc) ? 1u : 2u)); if (likely(src_nkeys)) { - unsigned j = dst_nkeys; + size_t j = dst_nkeys; if (unlikely(pagetype & P_LEAF2)) { /* Mark dst as dirty. */ - if (unlikely(rc = mdbx_page_touch(cdst))) + if (unlikely(rc = page_touch(cdst))) return rc; key.iov_len = csrc->mc_db->md_xsize; key.iov_base = page_data(psrc); - unsigned i = 0; + size_t i = 0; do { - rc = mdbx_node_add_leaf2(cdst, j++, &key); + rc = node_add_leaf2(cdst, j++, &key); if (unlikely(rc != MDBX_SUCCESS)) return rc; - key.iov_base = (char *)key.iov_base + key.iov_len; + key.iov_base = ptr_disp(key.iov_base, key.iov_len); } while (++i != src_nkeys); } else { MDBX_node *srcnode = page_node(psrc, 0); @@ -17496,23 +19454,23 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { MDBX_cursor mn; cursor_copy(csrc, &mn); /* must find the lowest key below src */ - rc = mdbx_page_search_lowest(&mn); + rc = page_search_lowest(&mn); if (unlikely(rc)) return rc; const MDBX_page *mp = mn.mc_pg[mn.mc_top]; if (likely(!IS_LEAF2(mp))) { - mdbx_cassert(&mn, IS_LEAF(mp)); + cASSERT(&mn, IS_LEAF(mp)); const MDBX_node *lowest = page_node(mp, 0); key.iov_len = node_ks(lowest); key.iov_base = node_key(lowest); } else { - mdbx_cassert(&mn, mn.mc_top > csrc->mc_top); + cASSERT(&mn, mn.mc_top > csrc->mc_top); key.iov_len = mp->mp_leaf2_ksize; key.iov_base = page_leaf2key(mp, mn.mc_ki[mn.mc_top], key.iov_len); } - mdbx_cassert(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); - mdbx_cassert(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); + cASSERT(&mn, key.iov_len >= csrc->mc_dbx->md_klen_min); + cASSERT(&mn, key.iov_len <= csrc->mc_dbx->md_klen_max); const size_t dst_room = page_room(pdst); const size_t src_used = page_used(cdst->mc_txn->mt_env, psrc); @@ -17522,19 +19480,19 @@ static int 
mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } /* Mark dst as dirty. */ - if (unlikely(rc = mdbx_page_touch(cdst))) + if (unlikely(rc = page_touch(cdst))) return rc; - unsigned i = 0; + size_t i = 0; while (true) { if (pagetype & P_LEAF) { MDBX_val data; data.iov_len = node_ds(srcnode); data.iov_base = node_data(srcnode); - rc = mdbx_node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); + rc = node_add_leaf(cdst, j++, &key, &data, node_flags(srcnode)); } else { - mdbx_cassert(csrc, node_flags(srcnode) == 0); - rc = mdbx_node_add_branch(cdst, j++, &key, node_pgno(srcnode)); + cASSERT(csrc, node_flags(srcnode) == 0); + rc = node_add_branch(cdst, j++, &key, node_pgno(srcnode)); } if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -17548,20 +19506,20 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } pdst = cdst->mc_pg[cdst->mc_top]; - mdbx_debug("dst page %" PRIaPGNO " now has %u keys (%.1f%% filled)", - pdst->mp_pgno, page_numkeys(pdst), - page_fill(cdst->mc_txn->mt_env, pdst)); + DEBUG("dst page %" PRIaPGNO " now has %zu keys (%.1f%% filled)", + pdst->mp_pgno, page_numkeys(pdst), + page_fill(cdst->mc_txn->mt_env, pdst)); - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); } /* Unlink the src page from parent and add to free list. 
*/ csrc->mc_top--; - mdbx_node_del(csrc, 0); + node_del(csrc, 0); if (csrc->mc_ki[csrc->mc_top] == 0) { const MDBX_val nullkey = {0, 0}; - rc = mdbx_update_key(csrc, &nullkey); + rc = update_key(csrc, &nullkey); if (unlikely(rc)) { csrc->mc_top++; return rc; @@ -17569,14 +19527,14 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } csrc->mc_top++; - mdbx_cassert(csrc, psrc == csrc->mc_pg[csrc->mc_top]); - mdbx_cassert(cdst, pdst == cdst->mc_pg[cdst->mc_top]); + cASSERT(csrc, psrc == csrc->mc_pg[csrc->mc_top]); + cASSERT(cdst, pdst == cdst->mc_pg[cdst->mc_top]); { /* Adjust other cursors pointing to mp */ MDBX_cursor *m2, *m3; const MDBX_dbi dbi = csrc->mc_dbi; - const unsigned top = csrc->mc_top; + const size_t top = csrc->mc_top; for (m2 = csrc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { m3 = (csrc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; @@ -17584,7 +19542,7 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { continue; if (m3->mc_pg[top] == psrc) { m3->mc_pg[top] = pdst; - mdbx_cassert(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); + cASSERT(m3, dst_nkeys + m3->mc_ki[top] <= UINT16_MAX); m3->mc_ki[top] += (indx_t)dst_nkeys; m3->mc_ki[top - 1] = cdst->mc_ki[top - 1]; } else if (m3->mc_pg[top - 1] == csrc->mc_pg[top - 1] && @@ -17596,28 +19554,26 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } } - /* If not operating on GC, allow this page to be reused - * in this txn. Otherwise just add to free list. 
*/ - rc = mdbx_page_retire(csrc, (MDBX_page *)psrc); + rc = page_retire(csrc, (MDBX_page *)psrc); if (unlikely(rc)) return rc; - mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); - mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - mdbx_cassert(cdst, cdst->mc_top > 0); - mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + cASSERT(cdst, cdst->mc_db->md_entries > 0); + cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + cASSERT(cdst, cdst->mc_top > 0); + cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); MDBX_page *const top_page = cdst->mc_pg[cdst->mc_top]; const indx_t top_indx = cdst->mc_ki[cdst->mc_top]; const unsigned save_snum = cdst->mc_snum; const uint16_t save_depth = cdst->mc_db->md_depth; - mdbx_cursor_pop(cdst); - rc = mdbx_rebalance(cdst); + cursor_pop(cdst); + rc = rebalance(cdst); if (unlikely(rc)) return rc; - mdbx_cassert(cdst, cdst->mc_db->md_entries > 0); - mdbx_cassert(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); - mdbx_cassert(cdst, cdst->mc_snum == cdst->mc_top + 1); + cASSERT(cdst, cdst->mc_db->md_entries > 0); + cASSERT(cdst, cdst->mc_snum <= cdst->mc_db->md_depth); + cASSERT(cdst, cdst->mc_snum == cdst->mc_top + 1); #if MDBX_ENABLE_PGOP_STAT cdst->mc_txn->mt_env->me_lck->mti_pgop_stat.merge.weak += 1; @@ -17625,23 +19581,23 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { if (IS_LEAF(cdst->mc_pg[cdst->mc_top])) { /* LY: don't touch cursor if top-page is a LEAF */ - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } - mdbx_cassert(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); + cASSERT(cdst, page_numkeys(top_page) == dst_nkeys + src_nkeys); - if (unlikely(pagetype != PAGETYPE(top_page))) { + if (unlikely(pagetype != PAGETYPE_WHOLE(top_page))) { /* LY: LEAF-page becomes BRANCH, unable restore cursor's 
stack */ goto bailout; } if (top_page == cdst->mc_pg[cdst->mc_top]) { /* LY: don't touch cursor if prev top-page already on the top */ - mdbx_cassert(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cASSERT(cdst, cdst->mc_ki[cdst->mc_top] == top_indx); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17652,14 +19608,14 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { } if (top_page == cdst->mc_pg[new_snum - 1]) { - mdbx_cassert(cdst, cdst->mc_ki[new_snum - 1] == top_indx); + cASSERT(cdst, cdst->mc_ki[new_snum - 1] == top_indx); /* LY: restore cursor stack */ - cdst->mc_snum = (uint16_t)new_snum; - cdst->mc_top = (uint16_t)new_snum - 1; - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cdst->mc_snum = (uint8_t)new_snum; + cdst->mc_top = (uint8_t)new_snum - 1; + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17675,12 +19631,12 @@ static int mdbx_page_merge(MDBX_cursor *csrc, MDBX_cursor *cdst) { cdst->mc_ki[new_snum - 1] = top_indx; cdst->mc_pg[new_snum] = (MDBX_page *)(~(uintptr_t)cdst->mc_pg[new_snum]); cdst->mc_ki[new_snum] = ~cdst->mc_ki[new_snum]; - cdst->mc_snum = (uint16_t)new_snum; - cdst->mc_top = (uint16_t)new_snum - 1; - mdbx_cassert(cdst, cdst->mc_snum < cdst->mc_db->md_depth || - IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); - mdbx_cassert(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || - PAGETYPE(cdst->mc_pg[cdst->mc_top]) == pagetype); + cdst->mc_snum = 
(uint8_t)new_snum; + cdst->mc_top = (uint8_t)new_snum - 1; + cASSERT(cdst, cdst->mc_snum < cdst->mc_db->md_depth || + IS_LEAF(cdst->mc_pg[cdst->mc_db->md_depth - 1])); + cASSERT(cdst, IS_LEAF(cdst->mc_pg[cdst->mc_top]) || + PAGETYPE_WHOLE(cdst->mc_pg[cdst->mc_top]) == pagetype); return MDBX_SUCCESS; } @@ -17691,16 +19647,17 @@ bailout: } static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - mdbx_cassert(cdst, cdst->mc_dbi == csrc->mc_dbi); - mdbx_cassert(cdst, cdst->mc_txn == csrc->mc_txn); - mdbx_cassert(cdst, cdst->mc_db == csrc->mc_db); - mdbx_cassert(cdst, cdst->mc_dbx == csrc->mc_dbx); - mdbx_cassert(cdst, cdst->mc_dbistate == csrc->mc_dbistate); + cASSERT(cdst, cdst->mc_dbi == csrc->mc_dbi); + cASSERT(cdst, cdst->mc_txn == csrc->mc_txn); + cASSERT(cdst, cdst->mc_db == csrc->mc_db); + cASSERT(cdst, cdst->mc_dbx == csrc->mc_dbx); + cASSERT(cdst, cdst->mc_dbistate == csrc->mc_dbistate); cdst->mc_snum = csrc->mc_snum; cdst->mc_top = csrc->mc_top; cdst->mc_flags = csrc->mc_flags; + cdst->mc_checking = csrc->mc_checking; - for (unsigned i = 0; i < csrc->mc_snum; i++) { + for (size_t i = 0; i < csrc->mc_snum; i++) { cdst->mc_pg[i] = csrc->mc_pg[i]; cdst->mc_ki[i] = csrc->mc_ki[i]; } @@ -17710,8 +19667,8 @@ static void cursor_restore(const MDBX_cursor *csrc, MDBX_cursor *cdst) { * [in] csrc The cursor to copy from. * [out] cdst The cursor to copy to. */ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { - mdbx_cassert(csrc, csrc->mc_txn->mt_txnid >= - csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); + cASSERT(csrc, csrc->mc_txn->mt_txnid >= + csrc->mc_txn->mt_env->me_lck->mti_oldest_reader.weak); cdst->mc_dbi = csrc->mc_dbi; cdst->mc_next = NULL; cdst->mc_backup = NULL; @@ -17726,63 +19683,63 @@ static void cursor_copy(const MDBX_cursor *csrc, MDBX_cursor *cdst) { /* Rebalance the tree after a delete operation. * [in] mc Cursor pointing to the page where rebalancing should begin. 
* Returns 0 on success, non-zero on failure. */ -static int mdbx_rebalance(MDBX_cursor *mc) { - mdbx_cassert(mc, cursor_is_tracked(mc)); - mdbx_cassert(mc, mc->mc_snum > 0); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - const int pagetype = PAGETYPE(mc->mc_pg[mc->mc_top]); +static int rebalance(MDBX_cursor *mc) { + cASSERT(mc, cursor_is_tracked(mc)); + cASSERT(mc, mc->mc_snum > 0); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + const int pagetype = PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]); STATIC_ASSERT(P_BRANCH == 1); - const unsigned minkeys = (pagetype & P_BRANCH) + 1; + const size_t minkeys = (pagetype & P_BRANCH) + (size_t)1; /* Pages emptier than this are candidates for merging. */ - unsigned room_threshold = likely(mc->mc_dbi != FREE_DBI) - ? mc->mc_txn->mt_env->me_merge_threshold - : mc->mc_txn->mt_env->me_merge_threshold_gc; + size_t room_threshold = likely(mc->mc_dbi != FREE_DBI) + ? mc->mc_txn->mt_env->me_merge_threshold + : mc->mc_txn->mt_env->me_merge_threshold_gc; const MDBX_page *const tp = mc->mc_pg[mc->mc_top]; - const unsigned numkeys = page_numkeys(tp); - const unsigned room = page_room(tp); - mdbx_debug("rebalancing %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), - page_used(mc->mc_txn->mt_env, tp), room); + const size_t numkeys = page_numkeys(tp); + const size_t room = page_room(tp); + DEBUG("rebalancing %s page %" PRIaPGNO + " (has %zu keys, full %.1f%%, used %zu, room %zu bytes )", + (pagetype & P_LEAF) ? 
"leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), + room); + cASSERT(mc, IS_MODIFIABLE(mc->mc_txn, tp)); if (unlikely(numkeys < minkeys)) { - mdbx_debug("page %" PRIaPGNO " must be merged due keys < %u threshold", - tp->mp_pgno, minkeys); + DEBUG("page %" PRIaPGNO " must be merged due keys < %zu threshold", + tp->mp_pgno, minkeys); } else if (unlikely(room > room_threshold)) { - mdbx_debug("page %" PRIaPGNO " should be merged due room %u > %u threshold", - tp->mp_pgno, room, room_threshold); + DEBUG("page %" PRIaPGNO " should be merged due room %zu > %zu threshold", + tp->mp_pgno, room, room_threshold); } else { - mdbx_debug("no need to rebalance page %" PRIaPGNO - ", room %u < %u threshold", - tp->mp_pgno, room, room_threshold); - mdbx_cassert(mc, mc->mc_db->md_entries > 0); + DEBUG("no need to rebalance page %" PRIaPGNO ", room %zu < %zu threshold", + tp->mp_pgno, room, room_threshold); + cASSERT(mc, mc->mc_db->md_entries > 0); return MDBX_SUCCESS; } int rc; if (mc->mc_snum < 2) { MDBX_page *const mp = mc->mc_pg[0]; - const unsigned nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); + const size_t nkeys = page_numkeys(mp); + cASSERT(mc, (mc->mc_db->md_entries == 0) == (nkeys == 0)); if (IS_SUBP(mp)) { - mdbx_debug("%s", "Can't rebalance a subpage, ignoring"); - mdbx_cassert(mc, pagetype & P_LEAF); + DEBUG("%s", "Can't rebalance a subpage, ignoring"); + cASSERT(mc, pagetype & P_LEAF); return MDBX_SUCCESS; } if (nkeys == 0) { - mdbx_cassert(mc, IS_LEAF(mp)); - mdbx_debug("%s", "tree is completely empty"); - mdbx_cassert(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); + cASSERT(mc, IS_LEAF(mp)); + DEBUG("%s", "tree is completely empty"); + cASSERT(mc, (*mc->mc_dbistate & DBI_DIRTY) != 0); mc->mc_db->md_root = P_INVALID; mc->mc_db->md_depth = 0; - mdbx_cassert(mc, mc->mc_db->md_branch_pages == 0 && - mc->mc_db->md_overflow_pages == 0 && - mc->mc_db->md_leaf_pages == 1); 
+ cASSERT(mc, mc->mc_db->md_branch_pages == 0 && + mc->mc_db->md_overflow_pages == 0 && + mc->mc_db->md_leaf_pages == 1); /* Adjust cursors pointing to mp */ for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[mc->mc_dbi]; m2; m2 = m2->mc_next) { @@ -17800,14 +19757,13 @@ static int mdbx_rebalance(MDBX_cursor *mc) { mc->mc_top = 0; mc->mc_flags &= ~C_INITIALIZED; - rc = mdbx_page_retire(mc, mp); + rc = page_retire(mc, mp); if (unlikely(rc != MDBX_SUCCESS)) return rc; } else if (IS_BRANCH(mp) && nkeys == 1) { - mdbx_debug("%s", "collapsing root page!"); + DEBUG("%s", "collapsing root page!"); mc->mc_db->md_root = node_pgno(page_node(mp, 0)); - rc = mdbx_page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], - pp_txnid4chk(mp, mc->mc_txn)); + rc = page_get(mc, mc->mc_db->md_root, &mc->mc_pg[0], mp->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; mc->mc_db->md_depth--; @@ -17833,28 +19789,28 @@ static int mdbx_rebalance(MDBX_cursor *mc) { m3->mc_top--; } } - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || - PAGETYPE(mc->mc_pg[mc->mc_top]) == pagetype); - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth || - IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); + cASSERT(mc, IS_LEAF(mc->mc_pg[mc->mc_top]) || + PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top]) == pagetype); + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth || + IS_LEAF(mc->mc_pg[mc->mc_db->md_depth - 1])); - rc = mdbx_page_retire(mc, mp); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + rc = page_retire(mc, mp); + if (likely(rc == MDBX_SUCCESS)) + rc = page_touch(mc); + return rc; } else { - mdbx_debug("root page %" PRIaPGNO - " doesn't need rebalancing (flags 0x%x)", - mp->mp_pgno, mp->mp_flags); + DEBUG("root page %" PRIaPGNO " doesn't need rebalancing (flags 0x%x)", + mp->mp_pgno, mp->mp_flags); } return MDBX_SUCCESS; } /* The parent (branch page) must have at least 2 pointers, * otherwise the tree is invalid. 
*/ - const unsigned pre_top = mc->mc_top - 1; - mdbx_cassert(mc, IS_BRANCH(mc->mc_pg[pre_top])); - mdbx_cassert(mc, !IS_SUBP(mc->mc_pg[0])); - mdbx_cassert(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); + const size_t pre_top = mc->mc_top - 1; + cASSERT(mc, IS_BRANCH(mc->mc_pg[pre_top])); + cASSERT(mc, !IS_SUBP(mc->mc_pg[0])); + cASSERT(mc, page_numkeys(mc->mc_pg[pre_top]) > 1); /* Leaf page fill factor is below the threshold. * Try to move keys from left or right neighbor, or @@ -17866,221 +19822,309 @@ static int mdbx_rebalance(MDBX_cursor *mc) { MDBX_page *left = nullptr, *right = nullptr; if (mn.mc_ki[pre_top] > 0) { - rc = mdbx_page_get( + rc = page_get( &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] - 1)), - &left, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + &left, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(left) == PAGETYPE(mc->mc_pg[mc->mc_top])); + cASSERT(mc, PAGETYPE_WHOLE(left) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } - if (mn.mc_ki[pre_top] + 1u < page_numkeys(mn.mc_pg[pre_top])) { - rc = mdbx_page_get( - &mn, node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + 1)), - &right, pp_txnid4chk(mn.mc_pg[pre_top], mc->mc_txn)); + if (mn.mc_ki[pre_top] + (size_t)1 < page_numkeys(mn.mc_pg[pre_top])) { + rc = page_get( + &mn, + node_pgno(page_node(mn.mc_pg[pre_top], mn.mc_ki[pre_top] + (size_t)1)), + &right, mc->mc_pg[mc->mc_top]->mp_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_cassert(mc, PAGETYPE(right) == PAGETYPE(mc->mc_pg[mc->mc_top])); + cASSERT(mc, PAGETYPE_WHOLE(right) == PAGETYPE_WHOLE(mc->mc_pg[mc->mc_top])); } - mdbx_cassert(mc, left || right); + cASSERT(mc, left || right); - const unsigned ki_top = mc->mc_ki[mc->mc_top]; - const unsigned ki_pre_top = mn.mc_ki[pre_top]; - const unsigned nkeys = page_numkeys(mn.mc_pg[mn.mc_top]); + const size_t ki_top = mc->mc_ki[mc->mc_top]; + const size_t ki_pre_top = mn.mc_ki[pre_top]; + const size_t nkeys = 
page_numkeys(mn.mc_pg[mn.mc_top]); - const unsigned left_room = left ? page_room(left) : 0; - const unsigned right_room = right ? page_room(right) : 0; - const unsigned left_nkeys = left ? page_numkeys(left) : 0; - const unsigned right_nkeys = right ? page_numkeys(right) : 0; + const size_t left_room = left ? page_room(left) : 0; + const size_t right_room = right ? page_room(right) : 0; + const size_t left_nkeys = left ? page_numkeys(left) : 0; + const size_t right_nkeys = right ? page_numkeys(right) : 0; + bool involve = false; retry: - if (left_room > room_threshold && left_room >= right_room) { + if (left_room > room_threshold && left_room >= right_room && + (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try merge with left */ - mdbx_cassert(mc, left_nkeys >= minkeys); + cASSERT(mc, left_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = left; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; - const unsigned new_ki = ki_top + left_nkeys; + const size_t new_ki = ki_top + left_nkeys; mn.mc_ki[mn.mc_top] += mc->mc_ki[mn.mc_top] + 1; - /* We want mdbx_rebalance to find mn when doing fixups */ - WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(mc, &mn)); + /* We want rebalance to find mn when doing fixups */ + WITH_CURSOR_TRACKING(mn, rc = page_merge(mc, &mn)); if (likely(rc != MDBX_RESULT_TRUE)) { cursor_restore(&mn, mc); mc->mc_ki[mc->mc_top] = (indx_t)new_ki; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } - if (right_room > room_threshold) { + if (right_room > room_threshold && + (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { /* try merge with right */ - mdbx_cassert(mc, right_nkeys >= minkeys); + cASSERT(mc, right_nkeys >= minkeys); mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - 
WITH_CURSOR_TRACKING(mn, rc = mdbx_page_merge(&mn, mc)); + WITH_CURSOR_TRACKING(mn, rc = page_merge(&mn, mc)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } if (left_nkeys > minkeys && - (right_nkeys <= left_nkeys || right_room >= left_room)) { + (right_nkeys <= left_nkeys || right_room >= left_room) && + (IS_MODIFIABLE(mc->mc_txn, left) || involve)) { /* try move from left */ mn.mc_pg[mn.mc_top] = left; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top - 1); mn.mc_ki[mn.mc_top] = (indx_t)(left_nkeys - 1); mc->mc_ki[mc->mc_top] = 0; - WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, true)); + WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, true)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)(ki_top + 1); - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } - if (right_nkeys > minkeys) { + if (right_nkeys > minkeys && (IS_MODIFIABLE(mc->mc_txn, right) || involve)) { /* try move from right */ mn.mc_pg[mn.mc_top] = right; mn.mc_ki[mn.mc_top - 1] = (indx_t)(ki_pre_top + 1); mn.mc_ki[mn.mc_top] = 0; mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - WITH_CURSOR_TRACKING(mn, rc = mdbx_node_move(&mn, mc, false)); + WITH_CURSOR_TRACKING(mn, rc = node_move(&mn, mc, false)); if (likely(rc != MDBX_RESULT_TRUE)) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - mdbx_cassert(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); + cASSERT(mc, rc || page_numkeys(mc->mc_pg[mc->mc_top]) >= minkeys); return rc; } } if (nkeys >= minkeys) { mc->mc_ki[mc->mc_top] = (indx_t)ki_top; - if (!mdbx_audit_enabled()) - return MDBX_SUCCESS; - return mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) + return cursor_check_updating(mc); + return MDBX_SUCCESS; } + if 
(likely(!involve)) { + involve = true; + goto retry; + } if (likely(room_threshold > 0)) { room_threshold = 0; goto retry; } - mdbx_error("Unable to merge/rebalance %s page %" PRIaPGNO - " (has %u keys, full %.1f%%, used %u, room %u bytes )", - (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, - page_fill(mc->mc_txn->mt_env, tp), - page_used(mc->mc_txn->mt_env, tp), room); + ERROR("Unable to merge/rebalance %s page %" PRIaPGNO + " (has %zu keys, full %.1f%%, used %zu, room %zu bytes )", + (pagetype & P_LEAF) ? "leaf" : "branch", tp->mp_pgno, numkeys, + page_fill(mc->mc_txn->mt_env, tp), page_used(mc->mc_txn->mt_env, tp), + room); return MDBX_PROBLEM; } -__cold static int mdbx_page_check(MDBX_cursor *const mc, - const MDBX_page *const mp, unsigned options) { +__cold static int page_check(const MDBX_cursor *const mc, + const MDBX_page *const mp) { DKBUF; - options |= mc->mc_flags; - MDBX_env *const env = mc->mc_txn->mt_env; - const unsigned nkeys = page_numkeys(mp); - char *const end_of_page = (char *)mp + env->me_psize; + int rc = MDBX_SUCCESS; if (unlikely(mp->mp_pgno < MIN_PAGENO || mp->mp_pgno > MAX_PAGENO)) - return bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); - if (IS_OVERFLOW(mp)) { - if (unlikely(mp->mp_pages < 1 && mp->mp_pages >= MAX_PAGENO / 2)) - return bad_page(mp, "invalid overflow n-pages (%u)\n", mp->mp_pages); - if (unlikely(mp->mp_pgno + mp->mp_pages > mc->mc_txn->mt_next_pgno)) - return bad_page(mp, "overflow page beyond (%u) next-pgno\n", - mp->mp_pgno + mp->mp_pages); - if (unlikely((options & (C_SUB | C_COPYING)) == C_SUB)) - return bad_page(mp, - "unexpected overflow-page for dupsort db (flags 0x%x)\n", - mc->mc_db->md_flags); - return MDBX_SUCCESS; + rc = bad_page(mp, "invalid pgno (%u)\n", mp->mp_pgno); + + MDBX_env *const env = mc->mc_txn->mt_env; + const ptrdiff_t offset = ptr_dist(mp, env->me_map); + unsigned flags_mask = P_ILL_BITS; + unsigned flags_expected = 0; + if (offset < 0 || + offset > (ptrdiff_t)(pgno2bytes(env, 
mc->mc_txn->mt_next_pgno) - + ((mp->mp_flags & P_SUBP) ? PAGEHDRSZ + 1 + : env->me_psize))) { + /* should be dirty page without MDBX_WRITEMAP, or a subpage of. */ + flags_mask -= P_SUBP; + if ((env->me_flags & MDBX_WRITEMAP) != 0 || + (!IS_SHADOWED(mc->mc_txn, mp) && !(mp->mp_flags & P_SUBP))) + rc = bad_page(mp, "invalid page-address %p, offset %zi\n", + __Wpedantic_format_voidptr(mp), offset); + } else if (offset & (env->me_psize - 1)) + flags_expected = P_SUBP; + + if (unlikely((mp->mp_flags & flags_mask) != flags_expected)) + rc = bad_page(mp, "unknown/extra page-flags (have 0x%x, expect 0x%x)\n", + mp->mp_flags & flags_mask, flags_expected); + + cASSERT(mc, (mc->mc_checking & CC_LEAF2) == 0 || (mc->mc_flags & C_SUB) != 0); + const uint8_t type = PAGETYPE_WHOLE(mp); + switch (type) { + default: + return bad_page(mp, "invalid type (%u)\n", type); + case P_OVERFLOW: + if (unlikely(mc->mc_flags & C_SUB)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", "large", + "nested dupsort tree", mc->mc_db->md_flags); + const pgno_t npages = mp->mp_pages; + if (unlikely(npages < 1 || npages >= MAX_PAGENO / 2)) + rc = bad_page(mp, "invalid n-pages (%u) for large-page\n", npages); + if (unlikely(mp->mp_pgno + npages > mc->mc_txn->mt_next_pgno)) + rc = bad_page( + mp, "end of large-page beyond (%u) allocated space (%u next-pgno)\n", + mp->mp_pgno + npages, mc->mc_txn->mt_next_pgno); + return rc; //-------------------------- end of large/overflow page handling + case P_LEAF | P_SUBP: + if (unlikely(mc->mc_db->md_depth != 1)) + rc = bad_page(mp, "unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf-sub", "nested dupsort db", mc->mc_db->md_flags); + /* fall through */ + __fallthrough; + case P_LEAF: + if (unlikely((mc->mc_checking & CC_LEAF2) != 0)) + rc = bad_page( + mp, "unexpected leaf-page for dupfixed subtree (db-lags 0x%x)\n", + mc->mc_db->md_flags); + break; + case P_LEAF | P_LEAF2 | P_SUBP: + if (unlikely(mc->mc_db->md_depth != 1)) + rc = bad_page(mp, 
"unexpected %s-page for %s (db-flags 0x%x)\n", + "leaf2-sub", "nested dupsort db", mc->mc_db->md_flags); + /* fall through */ + __fallthrough; + case P_LEAF | P_LEAF2: + if (unlikely((mc->mc_checking & CC_LEAF2) == 0)) + rc = bad_page( + mp, + "unexpected leaf2-page for non-dupfixed (sub)tree (db-flags 0x%x)\n", + mc->mc_db->md_flags); + break; + case P_BRANCH: + break; } - int rc = MDBX_SUCCESS; - if ((options & C_UPDATING) == 0 || !IS_MODIFIABLE(mc->mc_txn, mp)) { - if (unlikely(nkeys < 2 && IS_BRANCH(mp))) - rc = bad_page(mp, "branch-page nkey (%u) < 2\n", nkeys); + if (unlikely(mp->mp_upper < mp->mp_lower || + ((mp->mp_lower | mp->mp_upper) & 1) || + PAGEHDRSZ + mp->mp_upper > env->me_psize)) + rc = bad_page(mp, "invalid page lower(%u)/upper(%u) with limit %zu\n", + mp->mp_lower, mp->mp_upper, page_space(env)); + + const char *const end_of_page = ptr_disp(mp, env->me_psize); + const size_t nkeys = page_numkeys(mp); + STATIC_ASSERT(P_BRANCH == 1); + if (unlikely(nkeys <= (uint8_t)(mp->mp_flags & P_BRANCH))) { + if ((!(mc->mc_flags & C_SUB) || mc->mc_db->md_entries) && + (!(mc->mc_checking & CC_UPDATING) || + !(IS_MODIFIABLE(mc->mc_txn, mp) || (mp->mp_flags & P_SUBP)))) + rc = + bad_page(mp, "%s-page nkeys (%zu) < %u\n", + IS_BRANCH(mp) ? 
"branch" : "leaf", nkeys, 1 + IS_BRANCH(mp)); + } + if (!IS_LEAF2(mp) && unlikely(PAGEHDRSZ + mp->mp_upper + + nkeys * sizeof(MDBX_node) + nkeys - 1 > + env->me_psize)) + rc = bad_page(mp, "invalid page upper (%u) for nkeys %zu with limit %zu\n", + mp->mp_upper, nkeys, page_space(env)); + + const size_t ksize_max = keysize_max(env->me_psize, 0); + const size_t leaf2_ksize = mp->mp_leaf2_ksize; + if (IS_LEAF2(mp)) { + if (unlikely((mc->mc_flags & C_SUB) == 0 || + (mc->mc_db->md_flags & MDBX_DUPFIXED) == 0)) + rc = bad_page(mp, "unexpected leaf2-page (db-flags 0x%x)\n", + mc->mc_db->md_flags); + if (unlikely(leaf2_ksize < 1 || leaf2_ksize > ksize_max)) + rc = bad_page(mp, "invalid leaf2-key length (%zu)\n", leaf2_ksize); } - if (IS_LEAF2(mp) && unlikely((options & (C_SUB | C_COPYING)) == 0)) - rc = bad_page(mp, "unexpected leaf2-page (db flags 0x%x)\n", - mc->mc_db->md_flags); MDBX_val here, prev = {0, 0}; - for (unsigned i = 0; i < nkeys; ++i) { + for (size_t i = 0; i < nkeys; ++i) { if (IS_LEAF2(mp)) { - const size_t ksize = mp->mp_leaf2_ksize; - char *const key = page_leaf2key(mp, i, ksize); - if (unlikely(end_of_page < key + ksize)) { + const char *const key = page_leaf2key(mp, i, leaf2_ksize); + if (unlikely(end_of_page < key + leaf2_ksize)) { rc = bad_page(mp, "leaf2-key beyond (%zu) page-end\n", - key + ksize - end_of_page); + key + leaf2_ksize - end_of_page); continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(ksize != mc->mc_dbx->md_klen_min)) { - if (unlikely(ksize < mc->mc_dbx->md_klen_min || - ksize > mc->mc_dbx->md_klen_max)) - rc = bad_page( - mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", - ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - else - mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = ksize; - } - if ((options & C_SKIPORD) == 0) { - here.iov_len = ksize; - here.iov_base = key; - if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "leaf2-key #%u wrong order (%s >= 
%s)\n", i, - DKEY(&prev), DVAL(&here)); - prev = here; - } + if (unlikely(leaf2_ksize != mc->mc_dbx->md_klen_min)) { + if (unlikely(leaf2_ksize < mc->mc_dbx->md_klen_min || + leaf2_ksize > mc->mc_dbx->md_klen_max)) + rc = bad_page( + mp, "leaf2-key size (%zu) <> min/max key-length (%zu/%zu)\n", + leaf2_ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); + else + mc->mc_dbx->md_klen_min = mc->mc_dbx->md_klen_max = leaf2_ksize; + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + here.iov_base = (void *)key; + here.iov_len = leaf2_ksize; + if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) + rc = bad_page(mp, "leaf2-key #%zu wrong order (%s >= %s)\n", i, + DKEY(&prev), DVAL(&here)); + prev = here; } } else { const MDBX_node *const node = page_node(mp, i); - const char *node_end = (char *)node + NODESIZE; + const char *const node_end = ptr_disp(node, NODESIZE); if (unlikely(node_end > end_of_page)) { - rc = bad_page(mp, "node[%u] (%zu) beyond page-end\n", i, + rc = bad_page(mp, "node[%zu] (%zu) beyond page-end\n", i, node_end - end_of_page); continue; } - size_t ksize = node_ks(node); - char *key = node_key(node); + const size_t ksize = node_ks(node); + if (unlikely(ksize > ksize_max)) + rc = bad_page(mp, "node[%zu] too long key (%zu)\n", i, ksize); + const char *const key = node_key(node); if (unlikely(end_of_page < key + ksize)) { - rc = bad_page(mp, "node[%u] key (%zu) beyond page-end\n", i, + rc = bad_page(mp, "node[%zu] key (%zu) beyond page-end\n", i, key + ksize - end_of_page); continue; } - if ((IS_LEAF(mp) || i > 0) && (options & C_COPYING) == 0) { + if ((IS_LEAF(mp) || i > 0)) { if (unlikely(ksize < mc->mc_dbx->md_klen_min || ksize > mc->mc_dbx->md_klen_max)) rc = bad_page( - mp, "node[%u] key size (%zu) <> min/max key-length (%zu/%zu)\n", + mp, "node[%zu] key size (%zu) <> min/max key-length (%zu/%zu)\n", i, ksize, mc->mc_dbx->md_klen_min, mc->mc_dbx->md_klen_max); - if ((options & C_SKIPORD) == 0) { - here.iov_base = key; + if 
((mc->mc_checking & CC_SKIPORD) == 0) { + here.iov_base = (void *)key; here.iov_len = ksize; if (prev.iov_base && unlikely(mc->mc_dbx->md_cmp(&prev, &here) >= 0)) - rc = bad_page(mp, "node[%u] key wrong order (%s >= %s)\n", i, + rc = bad_page(mp, "node[%zu] key wrong order (%s >= %s)\n", i, DKEY(&prev), DVAL(&here)); prev = here; } } if (IS_BRANCH(mp)) { - if ((options & C_UPDATING) == 0 && i == 0 && unlikely(ksize != 0)) - rc = bad_page(mp, "branch-node[%u] wrong 0-node key-length (%zu)\n", + if ((mc->mc_checking & CC_UPDATING) == 0 && i == 0 && + unlikely(ksize != 0)) + rc = bad_page(mp, "branch-node[%zu] wrong 0-node key-length (%zu)\n", i, ksize); - if ((options & C_RETIRING) == 0) { - const pgno_t ref = node_pgno(node); - if (unlikely(ref < MIN_PAGENO || ref >= mc->mc_txn->mt_next_pgno)) - rc = bad_page(mp, "branch-node[%u] wrong pgno (%u)\n", i, ref); - } + const pgno_t ref = node_pgno(node); + if (unlikely(ref < MIN_PAGENO) || + (unlikely(ref >= mc->mc_txn->mt_next_pgno) && + (unlikely(ref >= mc->mc_txn->mt_geo.now) || + !(mc->mc_checking & CC_RETIRING)))) + rc = bad_page(mp, "branch-node[%zu] wrong pgno (%u)\n", i, ref); if (unlikely(node_flags(node))) - rc = bad_page(mp, "branch-node[%u] wrong flags (%u)\n", i, + rc = bad_page(mp, "branch-node[%zu] wrong flags (%u)\n", i, node_flags(node)); continue; } switch (node_flags(node)) { default: - rc = bad_page(mp, "invalid node[%u] flags (%u)\n", i, node_flags(node)); + rc = + bad_page(mp, "invalid node[%zu] flags (%u)\n", i, node_flags(node)); break; case F_BIGDATA /* data on large-page */: case 0 /* usual */: @@ -18095,41 +20139,46 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, if (node_flags(node) & F_BIGDATA) { if (unlikely(end_of_page < data + sizeof(pgno_t))) { rc = bad_page( - mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n", + mp, "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", "bigdata-pgno", i, nkeys, dsize, data + dsize - end_of_page); continue; } - if ((options & 
C_COPYING) == 0) { - if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) - rc = bad_page( - mp, - "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - } - if ((options & C_RETIRING) == 0) { - MDBX_page *lp; - int err = mdbx_page_get(mc, node_largedata_pgno(node), &lp, - pp_txnid4chk(mp, mc->mc_txn)); - if (unlikely(err != MDBX_SUCCESS)) - return err; - if (unlikely(!IS_OVERFLOW(lp))) { - rc = bad_page(mp, "big-node refs to non-overflow page (%u)\n", - lp->mp_pgno); - continue; + if (unlikely(dsize <= mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) + rc = bad_page( + mp, + "big-node data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + if (unlikely(node_size_len(node_ks(node), dsize) <= + mc->mc_txn->mt_env->me_leaf_nodemax) && + mc->mc_dbi != FREE_DBI) + poor_page(mp, "too small data (%zu bytes) for bigdata-node", dsize); + + if ((mc->mc_checking & CC_RETIRING) == 0) { + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely(lp.err != MDBX_SUCCESS)) + return lp.err; + cASSERT(mc, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + const unsigned npages = number_of_ovpages(env, dsize); + if (unlikely(lp.page->mp_pages != npages)) { + if (lp.page->mp_pages < npages) + rc = bad_page(lp.page, + "too less n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); + else if (mc->mc_dbi != FREE_DBI) + poor_page(lp.page, + "extra n-pages %u for bigdata-node (%zu bytes)", + lp.page->mp_pages, dsize); } - if (unlikely(number_of_ovpages(env, dsize) > lp->mp_pages)) - rc = - bad_page(mp, "big-node size (%zu) mismatch n-pages size (%u)\n", - dsize, lp->mp_pages); } continue; } if (unlikely(end_of_page < data + dsize)) { - rc = - bad_page(mp, "node-%s(%u of %u, %zu bytes) beyond (%zu) page-end\n", - "data", i, nkeys, dsize, data + dsize - end_of_page); + rc 
= bad_page(mp, + "node-%s(%zu of %zu, %zu bytes) beyond (%zu) page-end\n", + "data", i, nkeys, dsize, data + dsize - end_of_page); continue; } @@ -18138,14 +20187,12 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, /* wrong, but already handled */ continue; case 0 /* usual */: - if ((options & C_COPYING) == 0) { - if (unlikely(dsize < mc->mc_dbx->md_vlen_min || - dsize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page( - mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", - dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); - continue; - } + if (unlikely(dsize < mc->mc_dbx->md_vlen_min || + dsize > mc->mc_dbx->md_vlen_max)) { + rc = bad_page( + mp, "node-data size (%zu) <> min/max value-length (%zu/%zu)\n", + dsize, mc->mc_dbx->md_vlen_min, mc->mc_dbx->md_vlen_max); + continue; } break; case F_SUBDATA /* sub-db */: @@ -18167,9 +20214,8 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } else { const MDBX_page *const sp = (MDBX_page *)data; - const char *const end_of_subpage = data + dsize; - const int nsubkeys = page_numkeys(sp); - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { + switch (sp->mp_flags & + /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { case P_LEAF | P_SUBP: case P_LEAF | P_LEAF2 | P_SUBP: break; @@ -18179,45 +20225,49 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, continue; } + const char *const end_of_subpage = data + dsize; + const intptr_t nsubkeys = page_numkeys(sp); + if (unlikely(nsubkeys == 0) && !(mc->mc_checking & CC_UPDATING) && + mc->mc_db->md_entries) + rc = bad_page(mp, "no keys on a %s-page\n", + IS_LEAF2(sp) ? 
"leaf2-sub" : "leaf-sub"); + MDBX_val sub_here, sub_prev = {0, 0}; for (int j = 0; j < nsubkeys; j++) { if (IS_LEAF2(sp)) { /* LEAF2 pages have no mp_ptrs[] or node headers */ - size_t sub_ksize = sp->mp_leaf2_ksize; - char *sub_key = page_leaf2key(sp, j, sub_ksize); + const size_t sub_ksize = sp->mp_leaf2_ksize; + const char *const sub_key = page_leaf2key(sp, j, sub_ksize); if (unlikely(end_of_subpage < sub_key + sub_ksize)) { rc = bad_page(mp, "nested-leaf2-key beyond (%zu) nested-page\n", sub_key + sub_ksize - end_of_subpage); continue; } - if ((options & C_COPYING) == 0) { - if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) { - rc = bad_page(mp, - "nested-leaf2-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - continue; - } + if (unlikely(sub_ksize != mc->mc_dbx->md_vlen_min)) { + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-leaf2-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + else mc->mc_dbx->md_vlen_min = mc->mc_dbx->md_vlen_max = sub_ksize; - } - if ((options & C_SKIPORD) == 0) { - sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page( - mp, "nested-leaf2-key #%u wrong order (%s >= %s)\n", j, - DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } + } + if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_base = (void *)sub_key; + sub_here.iov_len = sub_ksize; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) + rc = bad_page(mp, + "nested-leaf2-key #%u wrong order (%s >= %s)\n", + j, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; } } else { const MDBX_node *const 
sub_node = page_node(sp, j); - const char *sub_node_end = (char *)sub_node + NODESIZE; + const char *const sub_node_end = ptr_disp(sub_node, NODESIZE); if (unlikely(sub_node_end > end_of_subpage)) { rc = bad_page(mp, "nested-node beyond (%zu) nested-page\n", end_of_subpage - sub_node_end); @@ -18227,30 +20277,27 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, rc = bad_page(mp, "nested-node invalid flags (%u)\n", node_flags(sub_node)); - size_t sub_ksize = node_ks(sub_node); - char *sub_key = node_key(sub_node); - size_t sub_dsize = node_ds(sub_node); + const size_t sub_ksize = node_ks(sub_node); + const char *const sub_key = node_key(sub_node); + const size_t sub_dsize = node_ds(sub_node); /* char *sub_data = node_data(sub_node); */ - if ((options & C_COPYING) == 0) { - if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || - sub_ksize > mc->mc_dbx->md_vlen_max)) + if (unlikely(sub_ksize < mc->mc_dbx->md_vlen_min || + sub_ksize > mc->mc_dbx->md_vlen_max)) + rc = bad_page(mp, + "nested-node-key size (%zu) <> min/max " + "value-length (%zu/%zu)\n", + sub_ksize, mc->mc_dbx->md_vlen_min, + mc->mc_dbx->md_vlen_max); + if ((mc->mc_checking & CC_SKIPORD) == 0) { + sub_here.iov_base = (void *)sub_key; + sub_here.iov_len = sub_ksize; + if (sub_prev.iov_base && + unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) rc = bad_page(mp, - "nested-node-key size (%zu) <> min/max " - "value-length (%zu/%zu)\n", - sub_ksize, mc->mc_dbx->md_vlen_min, - mc->mc_dbx->md_vlen_max); - - if ((options & C_SKIPORD) == 0) { - sub_here.iov_len = sub_ksize; - sub_here.iov_base = sub_key; - if (sub_prev.iov_base && - unlikely(mc->mc_dbx->md_dcmp(&sub_prev, &sub_here) >= 0)) - rc = bad_page( - mp, "nested-node-key #%u wrong order (%s >= %s)\n", j, - DKEY(&sub_prev), DVAL(&sub_here)); - sub_prev = sub_here; - } + "nested-node-key #%u wrong order (%s >= %s)\n", + j, DKEY(&sub_prev), DVAL(&sub_here)); + sub_prev = sub_here; } if (unlikely(sub_dsize != 0)) rc = bad_page(mp, "nested-node 
non-empty data size (%zu)\n", @@ -18268,66 +20315,74 @@ __cold static int mdbx_page_check(MDBX_cursor *const mc, return rc; } -__cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { - mdbx_cassert(mc, - mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == - (mc->mc_txn->mt_parent - ? mc->mc_txn->mt_parent->tw.dirtyroom - : mc->mc_txn->mt_env->me_options.dp_limit)); - mdbx_cassert(mc, mc->mc_top == mc->mc_snum - 1 || (options & C_UPDATING)); - if (unlikely(mc->mc_top != mc->mc_snum - 1) && (options & C_UPDATING) == 0) +__cold static int cursor_check(const MDBX_cursor *mc) { + if (!mc->mc_txn->tw.dirtylist) { + cASSERT(mc, + (mc->mc_txn->mt_flags & MDBX_WRITEMAP) != 0 && !MDBX_AVOID_MSYNC); + } else { + cASSERT(mc, + (mc->mc_txn->mt_flags & MDBX_WRITEMAP) == 0 || MDBX_AVOID_MSYNC); + cASSERT(mc, mc->mc_txn->tw.dirtyroom + mc->mc_txn->tw.dirtylist->length == + (mc->mc_txn->mt_parent + ? mc->mc_txn->mt_parent->tw.dirtyroom + : mc->mc_txn->mt_env->me_options.dp_limit)); + } + cASSERT(mc, mc->mc_top == mc->mc_snum - 1 || (mc->mc_checking & CC_UPDATING)); + if (unlikely(mc->mc_top != mc->mc_snum - 1) && + (mc->mc_checking & CC_UPDATING) == 0) return MDBX_CURSOR_FULL; - mdbx_cassert(mc, (options & C_UPDATING) ? mc->mc_snum <= mc->mc_db->md_depth - : mc->mc_snum == mc->mc_db->md_depth); - if (unlikely((options & C_UPDATING) ? mc->mc_snum > mc->mc_db->md_depth - : mc->mc_snum != mc->mc_db->md_depth)) + cASSERT(mc, (mc->mc_checking & CC_UPDATING) + ? mc->mc_snum <= mc->mc_db->md_depth + : mc->mc_snum == mc->mc_db->md_depth); + if (unlikely((mc->mc_checking & CC_UPDATING) + ? mc->mc_snum > mc->mc_db->md_depth + : mc->mc_snum != mc->mc_db->md_depth)) return MDBX_CURSOR_FULL; for (int n = 0; n < (int)mc->mc_snum; ++n) { MDBX_page *mp = mc->mc_pg[n]; - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); const bool expect_branch = (n < mc->mc_db->md_depth - 1) ? 
true : false; const bool expect_nested_leaf = (n + 1 == mc->mc_db->md_depth - 1) ? true : false; const bool branch = IS_BRANCH(mp) ? true : false; - mdbx_cassert(mc, branch == expect_branch); + cASSERT(mc, branch == expect_branch); if (unlikely(branch != expect_branch)) return MDBX_CURSOR_FULL; - if ((options & C_UPDATING) == 0) { - mdbx_cassert(mc, - nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && - (mc->mc_flags & C_EOF) != 0)); + if ((mc->mc_checking & CC_UPDATING) == 0) { + cASSERT(mc, nkeys > mc->mc_ki[n] || (!branch && nkeys == mc->mc_ki[n] && + (mc->mc_flags & C_EOF) != 0)); if (unlikely(nkeys <= mc->mc_ki[n] && !(!branch && nkeys == mc->mc_ki[n] && (mc->mc_flags & C_EOF) != 0))) return MDBX_CURSOR_FULL; } else { - mdbx_cassert(mc, nkeys + 1 >= mc->mc_ki[n]); + cASSERT(mc, nkeys + 1 >= mc->mc_ki[n]); if (unlikely(nkeys + 1 < mc->mc_ki[n])) return MDBX_CURSOR_FULL; } - int err = mdbx_page_check(mc, mp, options); + int err = page_check(mc, mp); if (unlikely(err != MDBX_SUCCESS)) return err; - for (unsigned i = 0; i < nkeys; ++i) { + for (size_t i = 0; i < nkeys; ++i) { if (branch) { MDBX_node *node = page_node(mp, i); - mdbx_cassert(mc, node_flags(node) == 0); + cASSERT(mc, node_flags(node) == 0); if (unlikely(node_flags(node) != 0)) return MDBX_CURSOR_FULL; pgno_t pgno = node_pgno(node); MDBX_page *np; - int rc = mdbx_page_get(mc, pgno, &np, pp_txnid4chk(mp, mc->mc_txn)); - mdbx_cassert(mc, rc == MDBX_SUCCESS); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + err = page_get(mc, pgno, &np, mp->mp_txnid); + cASSERT(mc, err == MDBX_SUCCESS); + if (unlikely(err != MDBX_SUCCESS)) + return err; const bool nested_leaf = IS_LEAF(np) ? 
true : false; - mdbx_cassert(mc, nested_leaf == expect_nested_leaf); + cASSERT(mc, nested_leaf == expect_nested_leaf); if (unlikely(nested_leaf != expect_nested_leaf)) return MDBX_CURSOR_FULL; - err = mdbx_page_check(mc, np, options); + err = page_check(mc, np); if (unlikely(err != MDBX_SUCCESS)) return err; } @@ -18336,121 +20391,11 @@ __cold static int mdbx_cursor_check(MDBX_cursor *mc, unsigned options) { return MDBX_SUCCESS; } -/* Complete a delete operation started by mdbx_cursor_del(). */ -static int mdbx_cursor_del0(MDBX_cursor *mc) { - int rc; - MDBX_page *mp; - indx_t ki; - unsigned nkeys; - MDBX_dbi dbi = mc->mc_dbi; - - mdbx_cassert(mc, cursor_is_tracked(mc)); - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdbx_node_del(mc, mc->mc_db->md_xsize); - mc->mc_db->md_entries--; - - /* Adjust other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (m3 == mc || !(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - if (m3->mc_ki[mc->mc_top] == ki) { - m3->mc_flags |= C_DEL; - if (mc->mc_db->md_flags & MDBX_DUPSORT) { - /* Sub-cursor referred into dataset which is gone */ - m3->mc_xcursor->mx_cursor.mc_flags &= ~(C_INITIALIZED | C_EOF); - } - continue; - } else if (m3->mc_ki[mc->mc_top] > ki) { - m3->mc_ki[mc->mc_top]--; - } - if (XCURSOR_INITED(m3)) - XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); - } - } - - rc = mdbx_rebalance(mc); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - - if (unlikely(!mc->mc_snum)) { - /* DB is totally empty now, just bail out. - * Other cursors adjustments were already done - * by mdbx_rebalance and aren't needed here. 
*/ - mdbx_cassert(mc, mc->mc_db->md_entries == 0 && mc->mc_db->md_depth == 0 && - mc->mc_db->md_root == P_INVALID); - mc->mc_flags |= C_EOF; - return MDBX_SUCCESS; - } - - ki = mc->mc_ki[mc->mc_top]; - mp = mc->mc_pg[mc->mc_top]; - mdbx_cassert(mc, IS_LEAF(mc->mc_pg[mc->mc_top])); - nkeys = page_numkeys(mp); - mdbx_cassert(mc, (mc->mc_db->md_entries > 0 && nkeys > 0) || - ((mc->mc_flags & C_SUB) && mc->mc_db->md_entries == 0 && - nkeys == 0)); - - /* Adjust this and other cursors pointing to mp */ - for (MDBX_cursor *m2 = mc->mc_txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) { - MDBX_cursor *m3 = (mc->mc_flags & C_SUB) ? &m2->mc_xcursor->mx_cursor : m2; - if (!(m2->mc_flags & m3->mc_flags & C_INITIALIZED)) - continue; - if (m3->mc_snum < mc->mc_snum) - continue; - if (m3->mc_pg[mc->mc_top] == mp) { - /* if m3 points past last node in page, find next sibling */ - if (m3->mc_ki[mc->mc_top] >= nkeys) { - rc = mdbx_cursor_sibling(m3, SIBLING_RIGHT); - if (rc == MDBX_NOTFOUND) { - m3->mc_flags |= C_EOF; - rc = MDBX_SUCCESS; - continue; - } - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - if (m3->mc_ki[mc->mc_top] >= ki || - /* moved to right sibling */ m3->mc_pg[mc->mc_top] != mp) { - if (m3->mc_xcursor && !(m3->mc_flags & C_EOF)) { - MDBX_node *node = - page_node(m3->mc_pg[m3->mc_top], m3->mc_ki[m3->mc_top]); - /* If this node has dupdata, it may need to be reinited - * because its data has moved. - * If the xcursor was not inited it must be reinited. - * Else if node points to a subDB, nothing is needed. 
*/ - if (node_flags(node) & F_DUPDATA) { - if (m3->mc_xcursor->mx_cursor.mc_flags & C_INITIALIZED) { - if (!(node_flags(node) & F_SUBDATA)) - m3->mc_xcursor->mx_cursor.mc_pg[0] = node_data(node); - } else { - rc = mdbx_xcursor_init1(m3, node, m3->mc_pg[m3->mc_top]); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - rc = mdbx_cursor_first(&m3->mc_xcursor->mx_cursor, NULL, NULL); - if (unlikely(rc != MDBX_SUCCESS)) - goto bailout; - } - } - m3->mc_xcursor->mx_cursor.mc_flags |= C_DEL; - } - m3->mc_flags |= C_DEL; - } - } - } - - mdbx_cassert(mc, rc == MDBX_SUCCESS); - if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, 0); - return rc; - -bailout: - mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; +__cold static int cursor_check_updating(MDBX_cursor *mc) { + const uint8_t checking = mc->mc_checking; + mc->mc_checking |= CC_UPDATING; + const int rc = cursor_check(mc); + mc->mc_checking = checking; return rc; } @@ -18469,21 +20414,21 @@ int mdbx_del(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; - return mdbx_del0(txn, dbi, key, data, 0); + return delete (txn, dbi, key, data, 0); } -static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, - const MDBX_val *data, unsigned flags) { +static int delete(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, + const MDBX_val *data, unsigned flags) { MDBX_cursor_couple cx; MDBX_cursor_op op; MDBX_val rdata; int rc; DKBUF_DEBUG; - mdbx_debug("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), - DVAL_DEBUG(data)); + DEBUG("====> delete db %u key [%s], data [%s]", dbi, DKEY_DEBUG(key), + DVAL_DEBUG(data)); - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -18495,7 +20440,7 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, op = MDBX_SET; flags |= MDBX_ALLDUPS; } - rc = mdbx_cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; + rc = cursor_set(&cx.outer, (MDBX_val *)key, (MDBX_val *)data, op).err; if (likely(rc == MDBX_SUCCESS)) { /* let mdbx_page_split know about this cursor if needed: * delete will trigger a rebalance; if it needs to move @@ -18506,7 +20451,7 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * cursor to be consistent until the end of the rebalance. */ cx.outer.mc_next = txn->mt_cursors[dbi]; txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_del(&cx.outer, flags); + rc = cursor_del(&cx.outer, flags); txn->mt_cursors[dbi] = cx.outer.mc_next; } return rc; @@ -18520,57 +20465,57 @@ static int mdbx_del0(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, * [in] newkey The key for the newly inserted node. * [in] newdata The data for the newly inserted node. * [in] newpgno The page number, if the new node is a branch node. - * [in] nflags The NODE_ADD_FLAGS for the new node. + * [in] naf The NODE_ADD_FLAGS for the new node. * Returns 0 on success, non-zero on failure. 
*/ -static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, - MDBX_val *const newdata, pgno_t newpgno, - unsigned nflags) { +static int page_split(MDBX_cursor *mc, const MDBX_val *const newkey, + MDBX_val *const newdata, pgno_t newpgno, + const unsigned naf) { unsigned flags; int rc = MDBX_SUCCESS, foliage = 0; - unsigned i, ptop; + size_t i, ptop; MDBX_env *const env = mc->mc_txn->mt_env; - MDBX_val sepkey, rkey, xdata; + MDBX_val rkey, xdata; MDBX_page *tmp_ki_copy = NULL; DKBUF; MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - const unsigned newindx = mc->mc_ki[mc->mc_top]; - unsigned nkeys = page_numkeys(mp); - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + const size_t newindx = mc->mc_ki[mc->mc_top]; + size_t nkeys = page_numkeys(mp); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) return rc; } STATIC_ASSERT(P_BRANCH == 1); - const unsigned minkeys = (mp->mp_flags & P_BRANCH) + 1; + const size_t minkeys = (mp->mp_flags & P_BRANCH) + (size_t)1; - mdbx_debug(">> splitting %s-page %" PRIaPGNO - " and adding %zu+%zu [%s] at %i, nkeys %i", - IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, - newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), - mc->mc_ki[mc->mc_top], nkeys); - mdbx_cassert(mc, nkeys + 1 >= minkeys * 2); + DEBUG(">> splitting %s-page %" PRIaPGNO + " and adding %zu+%zu [%s] at %i, nkeys %zi", + IS_LEAF(mp) ? "leaf" : "branch", mp->mp_pgno, newkey->iov_len, + newdata ? newdata->iov_len : 0, DKEY_DEBUG(newkey), + mc->mc_ki[mc->mc_top], nkeys); + cASSERT(mc, nkeys + 1 >= minkeys * 2); /* Create a new sibling page. 
*/ - struct page_result npr = mdbx_page_new(mc, mp->mp_flags, 1); + pgr_t npr = page_new(mc, mp->mp_flags); if (unlikely(npr.err != MDBX_SUCCESS)) return npr.err; MDBX_page *const sister = npr.page; sister->mp_leaf2_ksize = mp->mp_leaf2_ksize; - mdbx_debug("new sibling: page %" PRIaPGNO, sister->mp_pgno); + DEBUG("new sibling: page %" PRIaPGNO, sister->mp_pgno); /* Usually when splitting the root page, the cursor - * height is 1. But when called from mdbx_update_key, + * height is 1. But when called from update_key, * the cursor height may be greater because it walks * up the stack while finding the branch slot to update. */ if (mc->mc_top < 1) { - npr = mdbx_page_new(mc, P_BRANCH, 1); + npr = page_new(mc, P_BRANCH); rc = npr.err; if (unlikely(rc != MDBX_SUCCESS)) goto done; MDBX_page *const pp = npr.page; /* shift current top to make room for new parent */ - mdbx_cassert(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); + cASSERT(mc, mc->mc_snum < 2 && mc->mc_db->md_depth > 0); #if MDBX_DEBUG memset(mc->mc_pg + 3, 0, sizeof(mc->mc_pg) - sizeof(mc->mc_pg[0]) * 3); memset(mc->mc_ki + 3, -1, sizeof(mc->mc_ki) - sizeof(mc->mc_ki[0]) * 3); @@ -18582,11 +20527,11 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_pg[0] = pp; mc->mc_ki[0] = 0; mc->mc_db->md_root = pp->mp_pgno; - mdbx_debug("root split! new root = %" PRIaPGNO, pp->mp_pgno); + DEBUG("root split! new root = %" PRIaPGNO, pp->mp_pgno); foliage = mc->mc_db->md_depth++; /* Add left (implicit) pointer. 
*/ - rc = mdbx_node_add_branch(mc, 0, NULL, mp->mp_pgno); + rc = node_add_branch(mc, 0, NULL, mp->mp_pgno); if (unlikely(rc != MDBX_SUCCESS)) { /* undo the pre-push */ mc->mc_pg[0] = mc->mc_pg[1]; @@ -18598,14 +20543,14 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mc->mc_snum++; mc->mc_top++; ptop = 0; - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { ptop = mc->mc_top - 1; - mdbx_debug("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); + DEBUG("parent branch page is %" PRIaPGNO, mc->mc_pg[ptop]->mp_pgno); } MDBX_cursor mn; @@ -18614,17 +20559,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, mn.mc_ki[mn.mc_top] = 0; mn.mc_ki[ptop] = mc->mc_ki[ptop] + 1; - unsigned split_indx = + size_t split_indx = (newindx < nkeys) - ? /* split at the middle */ (nkeys + 1) / 2 + ? /* split at the middle */ (nkeys + 1) >> 1 : /* split at the end (i.e. 
like append-mode ) */ nkeys - minkeys + 1; - mdbx_assert(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); - mdbx_cassert(mc, !IS_BRANCH(mp) || newindx > 0); + cASSERT(mc, !IS_BRANCH(mp) || newindx > 0); + MDBX_val sepkey = {nullptr, 0}; /* It is reasonable and possible to split the page at the begin */ if (unlikely(newindx < minkeys)) { split_indx = minkeys; - if (newindx == 0 && foliage == 0 && !(nflags & MDBX_SPLIT_REPLACE)) { + if (newindx == 0 && foliage == 0 && !(naf & MDBX_SPLIT_REPLACE)) { split_indx = 0; /* Checking for ability of splitting by the left-side insertion * of a pure page with the new key */ @@ -18643,7 +20589,7 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, sepkey.iov_base = page_leaf2key(mp, 0, sepkey.iov_len); } else get_key(page_node(mp, 0), &sepkey); - mdbx_cassert(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); + cASSERT(mc, mc->mc_dbx->md_cmp(newkey, &sepkey) < 0); /* Avoiding rare complex cases of split the parent page */ if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) split_indx = minkeys; @@ -18655,84 +20601,84 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, const bool pure_left = split_indx == 0; if (unlikely(pure_right)) { /* newindx == split_indx == nkeys */ - mdbx_trace("no-split, but add new pure page at the %s", "right/after"); - mdbx_cassert(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); + TRACE("no-split, but add new pure page at the %s", "right/after"); + cASSERT(mc, newindx == nkeys && split_indx == nkeys && minkeys == 1); sepkey = *newkey; } else if (unlikely(pure_left)) { /* newindx == split_indx == 0 */ - mdbx_trace("no-split, but add new pure page at the %s", "left/before"); - mdbx_cassert(mc, newindx == 0 && split_indx == 0 && minkeys == 1); - mdbx_trace("old-first-key is %s", DKEY_DEBUG(&sepkey)); + TRACE("no-split, but add new pure page at the 
%s", "left/before"); + cASSERT(mc, newindx == 0 && split_indx == 0 && minkeys == 1); + TRACE("old-first-key is %s", DKEY_DEBUG(&sepkey)); } else { if (IS_LEAF2(sister)) { - char *split, *ins; - unsigned lsize, rsize, ksize; /* Move half of the keys to the right sibling */ - const int x = mc->mc_ki[mc->mc_top] - split_indx; - ksize = mc->mc_db->md_xsize; - split = page_leaf2key(mp, split_indx, ksize); - rsize = (nkeys - split_indx) * ksize; - lsize = (nkeys - split_indx) * sizeof(indx_t); - mdbx_cassert(mc, mp->mp_lower >= lsize); + const intptr_t distance = mc->mc_ki[mc->mc_top] - split_indx; + size_t ksize = mc->mc_db->md_xsize; + void *const split = page_leaf2key(mp, split_indx, ksize); + size_t rsize = (nkeys - split_indx) * ksize; + size_t lsize = (nkeys - split_indx) * sizeof(indx_t); + cASSERT(mc, mp->mp_lower >= lsize); mp->mp_lower -= (indx_t)lsize; - mdbx_cassert(mc, sister->mp_lower + lsize <= UINT16_MAX); + cASSERT(mc, sister->mp_lower + lsize <= UINT16_MAX); sister->mp_lower += (indx_t)lsize; - mdbx_cassert(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); + cASSERT(mc, mp->mp_upper + rsize - lsize <= UINT16_MAX); mp->mp_upper += (indx_t)(rsize - lsize); - mdbx_cassert(mc, sister->mp_upper >= rsize - lsize); + cASSERT(mc, sister->mp_upper >= rsize - lsize); sister->mp_upper -= (indx_t)(rsize - lsize); sepkey.iov_len = ksize; sepkey.iov_base = (newindx != split_indx) ? 
split : newkey->iov_base; - if (x < 0) { - mdbx_cassert(mc, ksize >= sizeof(indx_t)); - ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); + if (distance < 0) { + cASSERT(mc, ksize >= sizeof(indx_t)); + void *const ins = page_leaf2key(mp, mc->mc_ki[mc->mc_top], ksize); memcpy(sister->mp_ptrs, split, rsize); sepkey.iov_base = sister->mp_ptrs; - memmove(ins + ksize, ins, (split_indx - mc->mc_ki[mc->mc_top]) * ksize); + memmove(ptr_disp(ins, ksize), ins, + (split_indx - mc->mc_ki[mc->mc_top]) * ksize); memcpy(ins, newkey->iov_base, ksize); - mdbx_cassert(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); + cASSERT(mc, UINT16_MAX - mp->mp_lower >= (int)sizeof(indx_t)); mp->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, mp->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, mp->mp_upper >= ksize - sizeof(indx_t)); mp->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); } else { - memcpy(sister->mp_ptrs, split, x * ksize); - ins = page_leaf2key(sister, x, ksize); + memcpy(sister->mp_ptrs, split, distance * ksize); + void *const ins = page_leaf2key(sister, distance, ksize); memcpy(ins, newkey->iov_base, ksize); - memcpy(ins + ksize, split + x * ksize, rsize - x * ksize); - mdbx_cassert(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); + memcpy(ptr_disp(ins, ksize), ptr_disp(split, distance * ksize), + rsize - distance * ksize); + cASSERT(mc, UINT16_MAX - sister->mp_lower >= (int)sizeof(indx_t)); sister->mp_lower += sizeof(indx_t); - mdbx_cassert(mc, sister->mp_upper >= ksize - sizeof(indx_t)); + cASSERT(mc, sister->mp_upper >= ksize - sizeof(indx_t)); sister->mp_upper -= (indx_t)(ksize - sizeof(indx_t)); - mdbx_cassert(mc, x <= (int)UINT16_MAX); - mc->mc_ki[mc->mc_top] = (indx_t)x; + cASSERT(mc, distance <= (int)UINT16_MAX); + mc->mc_ki[mc->mc_top] = (indx_t)distance; } - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; - rc = 
mdbx_cursor_check(&mn, C_UPDATING); + rc = cursor_check_updating(&mn); if (unlikely(rc != MDBX_SUCCESS)) goto done; } } else { /* grab a page to hold a temporary copy */ - tmp_ki_copy = mdbx_page_malloc(mc->mc_txn, 1); + tmp_ki_copy = page_malloc(mc->mc_txn, 1); if (unlikely(tmp_ki_copy == NULL)) { rc = MDBX_ENOMEM; goto done; } - const unsigned max_space = page_space(env); + const size_t max_space = page_space(env); const size_t new_size = IS_LEAF(mp) ? leaf_size(env, newkey, newdata) : branch_size(env, newkey); /* prepare to insert */ - for (unsigned j = i = 0; i < nkeys; ++i, ++j) { - tmp_ki_copy->mp_ptrs[j] = 0; - j += (i == newindx); - tmp_ki_copy->mp_ptrs[j] = mp->mp_ptrs[i]; - } + for (i = 0; i < newindx; ++i) + tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i]; + tmp_ki_copy->mp_ptrs[i] = (indx_t)-1; + while (++i <= nkeys) + tmp_ki_copy->mp_ptrs[i] = mp->mp_ptrs[i - 1]; tmp_ki_copy->mp_pgno = mp->mp_pgno; tmp_ki_copy->mp_flags = mp->mp_flags; tmp_ki_copy->mp_txnid = INVALID_TXNID; @@ -18754,30 +20700,30 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, * будет в каждом ключе, в худшем случае кроме одного, который может быть * нулевого размера. */ - if (newindx == split_indx && split_indx + minkeys <= nkeys) - split_indx += 1; - mdbx_assert(env, - split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); - const unsigned dim_nodes = + if (newindx == split_indx && nkeys >= 5) { + STATIC_ASSERT(P_BRANCH == 1); + split_indx += mp->mp_flags & P_BRANCH; + } + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); + const size_t dim_nodes = (newindx >= split_indx) ? split_indx : nkeys - split_indx; - const unsigned dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; + const size_t dim_used = (sizeof(indx_t) + NODESIZE + 1) * dim_nodes; if (new_size >= dim_used) { - /* Find split point */ + /* Search for best acceptable split point */ i = (newindx < split_indx) ? 0 : nkeys; - int dir = (newindx < split_indx) ? 
1 : -1; + intptr_t dir = (newindx < split_indx) ? 1 : -1; size_t before = 0, after = new_size + page_used(env, mp); - unsigned best_split = split_indx; - unsigned best_offset = INT_MAX; + size_t best_split = split_indx; + size_t best_shift = INT_MAX; - mdbx_trace("seek separator from %u, step %i, default %u, new-idx %u, " - "new-size %zu", - i, dir, split_indx, newindx, new_size); + TRACE("seek separator from %zu, step %zi, default %zu, new-idx %zu, " + "new-size %zu", + i, dir, split_indx, newindx, new_size); do { - mdbx_cassert(mc, i <= nkeys); + cASSERT(mc, i <= nkeys); size_t size = new_size; if (i != newindx) { - MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); size = NODESIZE + node_ks(node) + sizeof(indx_t); if (IS_LEAF(mp)) size += (node_flags(node) & F_BIGDATA) ? sizeof(pgno_t) @@ -18787,47 +20733,46 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, before += size; after -= size; - mdbx_trace("step %u, size %zu, before %zu, after %zu, max %u", i, - size, before, after, max_space); + TRACE("step %zu, size %zu, before %zu, after %zu, max %zu", i, size, + before, after, max_space); if (before <= max_space && after <= max_space) { - const unsigned split = i + (dir > 0); + const size_t split = i + (dir > 0); if (split >= minkeys && split <= nkeys + 1 - minkeys) { - const unsigned offset = branchless_abs(split_indx - split); - if (offset >= best_offset) + const size_t shift = branchless_abs(split_indx - split); + if (shift >= best_shift) break; - best_offset = offset; + best_shift = shift; best_split = split; + if (!best_shift) + break; } } i += dir; } while (i < nkeys); split_indx = best_split; - mdbx_trace("chosen %u", split_indx); + TRACE("chosen %zu", split_indx); } - mdbx_assert(env, - split_indx >= minkeys && split_indx <= nkeys - minkeys + 1); + eASSERT(env, split_indx >= minkeys && split_indx <= nkeys + 1 - minkeys); - 
sepkey.iov_len = newkey->iov_len; - sepkey.iov_base = newkey->iov_base; + sepkey = *newkey; if (split_indx != newindx) { MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[split_indx] + - PAGEHDRSZ); + ptr_disp(mp, tmp_ki_copy->mp_ptrs[split_indx] + PAGEHDRSZ); sepkey.iov_len = node_ks(node); sepkey.iov_base = node_key(node); } } } - mdbx_debug("separator is %d [%s]", split_indx, DKEY_DEBUG(&sepkey)); + DEBUG("separator is %zd [%s]", split_indx, DKEY_DEBUG(&sepkey)); bool did_split_parent = false; /* Copy separator key to the parent. */ if (page_room(mn.mc_pg[ptop]) < branch_size(env, &sepkey)) { - mdbx_trace("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); - mdbx_cassert(mc, page_numkeys(mn.mc_pg[ptop]) > 2); - mdbx_cassert(mc, !pure_left); + TRACE("need split parent branch-page for key %s", DKEY_DEBUG(&sepkey)); + cASSERT(mc, page_numkeys(mn.mc_pg[ptop]) > 2); + cASSERT(mc, !pure_left); const int snum = mc->mc_snum; const int depth = mc->mc_db->md_depth; mn.mc_snum--; @@ -18835,18 +20780,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, did_split_parent = true; /* We want other splits to find mn when doing fixups */ WITH_CURSOR_TRACKING( - mn, rc = mdbx_page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); + mn, rc = page_split(&mn, &sepkey, NULL, sister->mp_pgno, 0)); if (unlikely(rc != MDBX_SUCCESS)) goto done; - mdbx_cassert(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); - if (mdbx_audit_enabled()) { - rc = mdbx_cursor_check(mc, C_UPDATING); + cASSERT(mc, (int)mc->mc_snum - snum == mc->mc_db->md_depth - depth); + if (AUDIT_ENABLED()) { + rc = cursor_check_updating(mc); if (unlikely(rc != MDBX_SUCCESS)) goto done; } /* root split? */ - ptop += mc->mc_snum - snum; + ptop += mc->mc_snum - (size_t)snum; /* Right page might now have changed parent. * Check if left page also changed parent. 
*/ @@ -18862,10 +20807,10 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } else { /* find right page's left sibling */ mc->mc_ki[ptop] = mn.mc_ki[ptop]; - rc = mdbx_cursor_sibling(mc, SIBLING_LEFT); + rc = cursor_sibling(mc, SIBLING_LEFT); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_NOTFOUND) /* improper mdbx_cursor_sibling() result */ { - mdbx_error("unexpected %i error going left sibling", rc); + ERROR("unexpected %i error going left sibling", rc); rc = MDBX_PROBLEM; } goto done; @@ -18874,25 +20819,24 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, } } else if (unlikely(pure_left)) { MDBX_page *ptop_page = mc->mc_pg[ptop]; - mdbx_debug("adding to parent page %u node[%u] left-leaf page #%u key %s", - ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, - DKEY(mc->mc_ki[ptop] ? newkey : NULL)); + DEBUG("adding to parent page %u node[%u] left-leaf page #%u key %s", + ptop_page->mp_pgno, mc->mc_ki[ptop], sister->mp_pgno, + DKEY(mc->mc_ki[ptop] ? newkey : NULL)); mc->mc_top--; - rc = mdbx_node_add_branch(mc, mc->mc_ki[ptop], - mc->mc_ki[ptop] ? newkey : NULL, sister->mp_pgno); - mdbx_cassert(mc, mp == mc->mc_pg[ptop + 1] && - newindx == mc->mc_ki[ptop + 1] && ptop == mc->mc_top); + rc = node_add_branch(mc, mc->mc_ki[ptop], mc->mc_ki[ptop] ? 
newkey : NULL, + sister->mp_pgno); + cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1] && + ptop == mc->mc_top); if (likely(rc == MDBX_SUCCESS) && mc->mc_ki[ptop] == 0) { - mdbx_debug("update prev-first key on parent %s", DKEY(&sepkey)); + DEBUG("update prev-first key on parent %s", DKEY(&sepkey)); MDBX_node *node = page_node(mc->mc_pg[ptop], 1); - mdbx_cassert(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); - mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); + cASSERT(mc, node_ks(node) == 0 && node_pgno(node) == mp->mp_pgno); + cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 0); mc->mc_ki[ptop] = 1; - rc = mdbx_update_key(mc, &sepkey); - mdbx_cassert(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); - mdbx_cassert(mc, - mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); + rc = update_key(mc, &sepkey); + cASSERT(mc, mc->mc_top == ptop && mc->mc_ki[ptop] == 1); + cASSERT(mc, mp == mc->mc_pg[ptop + 1] && newindx == mc->mc_ki[ptop + 1]); mc->mc_ki[ptop] = 0; } @@ -18900,14 +20844,13 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (unlikely(rc != MDBX_SUCCESS)) goto done; - MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + 1); - mdbx_cassert(mc, node_pgno(node) == mp->mp_pgno && - mc->mc_pg[ptop] == ptop_page); + MDBX_node *node = page_node(mc->mc_pg[ptop], mc->mc_ki[ptop] + (size_t)1); + cASSERT(mc, node_pgno(node) == mp->mp_pgno && mc->mc_pg[ptop] == ptop_page); } else { mn.mc_top--; - mdbx_trace("add-to-parent the right-entry[%u] for new sibling-page", - mn.mc_ki[ptop]); - rc = mdbx_node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); + TRACE("add-to-parent the right-entry[%u] for new sibling-page", + mn.mc_ki[ptop]); + rc = node_add_branch(&mn, mn.mc_ki[ptop], &sepkey, sister->mp_pgno); mn.mc_top++; if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -18916,18 +20859,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if 
(unlikely(pure_left | pure_right)) { mc->mc_pg[mc->mc_top] = sister; mc->mc_ki[mc->mc_top] = 0; - switch (PAGETYPE(sister)) { + switch (PAGETYPE_WHOLE(sister)) { case P_LEAF: { - mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf(mc, 0, newkey, newdata, nflags); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); + rc = node_add_leaf(mc, 0, newkey, newdata, naf); } break; case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - mdbx_cassert(mc, newpgno == 0 || newpgno == P_INVALID); - rc = mdbx_node_add_leaf2(mc, 0, newkey); + cASSERT(mc, (naf & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + cASSERT(mc, newpgno == 0 || newpgno == P_INVALID); + rc = node_add_leaf2(mc, 0, newkey); } break; default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -18942,41 +20885,38 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, page_node(mc->mc_pg[mc->mc_top - i], mc->mc_ki[mc->mc_top - i]), &sepkey); if (mc->mc_dbx->md_cmp(newkey, &sepkey) < 0) { - mc->mc_top -= i; - mdbx_debug("update new-first on parent [%i] page %u key %s", - mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, - DKEY(newkey)); - rc = mdbx_update_key(mc, newkey); - mc->mc_top += i; + mc->mc_top -= (uint8_t)i; + DEBUG("update new-first on parent [%i] page %u key %s", + mc->mc_ki[mc->mc_top], mc->mc_pg[mc->mc_top]->mp_pgno, + DKEY(newkey)); + rc = update_key(mc, newkey); + mc->mc_top += (uint8_t)i; if (unlikely(rc != MDBX_SUCCESS)) goto done; } break; } } - } else if (!IS_LEAF2(mp)) { + } else if (tmp_ki_copy /* !IS_LEAF2(mp) */) { /* Move nodes */ mc->mc_pg[mc->mc_top] = sister; i = split_indx; - unsigned n = 0; - pgno_t pgno = 0; + size_t n = 0; do { - mdbx_trace("i %u, nkeys %u => n %u, rp #%u", i, nkeys, n, - sister->mp_pgno); + TRACE("i %zu, nkeys %zu => n %zu, rp 
#%u", i, nkeys, n, sister->mp_pgno); + pgno_t pgno = 0; MDBX_val *rdata = NULL; if (i == newindx) { - rkey.iov_base = newkey->iov_base; - rkey.iov_len = newkey->iov_len; + rkey = *newkey; if (IS_LEAF(mp)) rdata = newdata; else pgno = newpgno; - flags = nflags; + flags = naf; /* Update index for the new key. */ mc->mc_ki[mc->mc_top] = (indx_t)n; } else { - MDBX_node *node = - (MDBX_node *)((char *)mp + tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); + MDBX_node *node = ptr_disp(mp, tmp_ki_copy->mp_ptrs[i] + PAGEHDRSZ); rkey.iov_base = node_key(node); rkey.iov_len = node_ks(node); if (IS_LEAF(mp)) { @@ -18988,24 +20928,24 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, flags = node_flags(node); } - switch (PAGETYPE(sister)) { + switch (PAGETYPE_WHOLE(sister)) { case P_BRANCH: { - mdbx_cassert(mc, 0 == (uint16_t)flags); + cASSERT(mc, 0 == (uint16_t)flags); /* First branch index doesn't need key data. */ - rc = mdbx_node_add_branch(mc, n, n ? &rkey : NULL, pgno); + rc = node_add_branch(mc, n, n ? 
&rkey : NULL, pgno); } break; case P_LEAF: { - mdbx_cassert(mc, pgno == 0); - mdbx_cassert(mc, rdata != NULL); - rc = mdbx_node_add_leaf(mc, n, &rkey, rdata, flags); + cASSERT(mc, pgno == 0); + cASSERT(mc, rdata != NULL); + rc = node_add_leaf(mc, n, &rkey, rdata, flags); } break; /* case P_LEAF | P_LEAF2: { - mdbx_cassert(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); - mdbx_cassert(mc, gno == 0); + cASSERT(mc, (nflags & (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + cASSERT(mc, gno == 0); rc = mdbx_node_add_leaf2(mc, n, &rkey); } break; */ default: - rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE(sister)); + rc = bad_page(sister, "wrong page-type %u\n", PAGETYPE_WHOLE(sister)); } if (unlikely(rc != MDBX_SUCCESS)) goto done; @@ -19015,12 +20955,12 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, i = 0; n = 0; mc->mc_pg[mc->mc_top] = tmp_ki_copy; - mdbx_trace("switch to mp #%u", tmp_ki_copy->mp_pgno); + TRACE("switch to mp #%u", tmp_ki_copy->mp_pgno); } } while (i != split_indx); - mdbx_trace("i %u, nkeys %u, n %u, pgno #%u", i, nkeys, n, - mc->mc_pg[mc->mc_top]->mp_pgno); + TRACE("i %zu, nkeys %zu, n %zu, pgno #%u", i, nkeys, n, + mc->mc_pg[mc->mc_top]->mp_pgno); nkeys = page_numkeys(tmp_ki_copy); for (i = 0; i < nkeys; i++) @@ -19076,18 +21016,18 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, m3->mc_ki[k + 1] = m3->mc_ki[k]; m3->mc_pg[k + 1] = m3->mc_pg[k]; } - m3->mc_ki[0] = (m3->mc_ki[0] >= nkeys) ? 
1 : 0; + m3->mc_ki[0] = m3->mc_ki[0] >= nkeys; m3->mc_pg[0] = mc->mc_pg[0]; m3->mc_snum++; m3->mc_top++; } if (m3->mc_top >= mc->mc_top && m3->mc_pg[mc->mc_top] == mp && !pure_left) { - if (m3->mc_ki[mc->mc_top] >= newindx && !(nflags & MDBX_SPLIT_REPLACE)) + if (m3->mc_ki[mc->mc_top] >= newindx && !(naf & MDBX_SPLIT_REPLACE)) m3->mc_ki[mc->mc_top]++; if (m3->mc_ki[mc->mc_top] >= nkeys) { m3->mc_pg[mc->mc_top] = sister; - mdbx_cassert(mc, m3->mc_ki[mc->mc_top] >= nkeys); + cASSERT(mc, m3->mc_ki[mc->mc_top] >= nkeys); m3->mc_ki[mc->mc_top] -= (indx_t)nkeys; for (i = 0; i < mc->mc_top; i++) { m3->mc_ki[i] = mn.mc_ki[i]; @@ -19102,19 +21042,19 @@ static int mdbx_page_split(MDBX_cursor *mc, const MDBX_val *const newkey, if (XCURSOR_INITED(m3) && IS_LEAF(mp)) XCURSOR_REFRESH(m3, m3->mc_pg[mc->mc_top], m3->mc_ki[mc->mc_top]); } - mdbx_trace("mp #%u left: %d, sister #%u left: %d", mp->mp_pgno, page_room(mp), - sister->mp_pgno, page_room(sister)); + TRACE("mp #%u left: %zd, sister #%u left: %zd", mp->mp_pgno, page_room(mp), + sister->mp_pgno, page_room(sister)); done: if (tmp_ki_copy) - mdbx_dpage_free(env, tmp_ki_copy, 1); + dpage_free(env, tmp_ki_copy, 1); if (unlikely(rc != MDBX_SUCCESS)) mc->mc_txn->mt_flags |= MDBX_TXN_ERROR; else { - if (mdbx_audit_enabled()) - rc = mdbx_cursor_check(mc, C_UPDATING); - if (unlikely(nflags & MDBX_RESERVE)) { + if (AUDIT_ENABLED()) + rc = cursor_check_updating(mc); + if (unlikely(naf & MDBX_RESERVE)) { MDBX_node *node = page_node(mc->mc_pg[mc->mc_top], mc->mc_ki[mc->mc_top]); if (!(node_flags(node) & F_BIGDATA)) newdata->iov_base = node_data(node); @@ -19124,7 +21064,7 @@ done: #endif /* MDBX_ENABLE_PGOP_STAT */ } - mdbx_debug("<< mp #%u, rc %d", mp->mp_pgno, rc); + DEBUG("<< mp #%u, rc %d", mp->mp_pgno, rc); return rc; } @@ -19149,7 +21089,7 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, return (txn->mt_flags & MDBX_TXN_RDONLY) ? 
MDBX_EACCESS : MDBX_BAD_TXN; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -19157,23 +21097,23 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /* LY: support for update (explicit overwrite) */ if (flags & MDBX_CURRENT) { - rc = mdbx_cursor_get(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET); + rc = cursor_set(&cx.outer, (MDBX_val *)key, NULL, MDBX_SET).err; if (likely(rc == MDBX_SUCCESS) && (txn->mt_dbs[dbi].md_flags & MDBX_DUPSORT) && (flags & MDBX_ALLDUPS) == 0) { /* LY: allows update (explicit overwrite) only for unique keys */ MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); + if (node_flags(node) & F_DUPDATA) { + tASSERT(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); rc = MDBX_EMULTIVAL; } } } if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_cursor_put(&cx.outer, key, data, flags); + rc = cursor_put_checklen(&cx.outer, key, data, flags); txn->mt_cursors[dbi] = cx.outer.mc_next; return rc; @@ -19182,14 +21122,12 @@ int mdbx_put(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, MDBX_val *data, /**** COPYING *****************************************************************/ /* State needed for a double-buffering compacting copy. */ -typedef struct mdbx_copy { +typedef struct mdbx_compacting_ctx { MDBX_env *mc_env; MDBX_txn *mc_txn; - mdbx_condpair_t mc_condpair; + osal_condpair_t mc_condpair; uint8_t *mc_wbuf[2]; - uint8_t *mc_over[2]; size_t mc_wlen[2]; - size_t mc_olen[2]; mdbx_filehandle_t mc_fd; /* Error code. Never cleared if set. Both threads can set nonzero * to fail the copy. Not mutex-protected, MDBX expects atomic int. 
*/ @@ -19197,39 +21135,38 @@ typedef struct mdbx_copy { pgno_t mc_next_pgno; volatile unsigned mc_head; volatile unsigned mc_tail; -} mdbx_copy; +} mdbx_compacting_ctx; /* Dedicated writer thread for compacting copy. */ -__cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) { - mdbx_copy *my = arg; +__cold static THREAD_RESULT THREAD_CALL compacting_write_thread(void *arg) { + mdbx_compacting_ctx *const ctx = arg; #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) sigset_t sigset; sigemptyset(&sigset); sigaddset(&sigset, SIGPIPE); - my->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); + ctx->mc_error = pthread_sigmask(SIG_BLOCK, &sigset, NULL); #endif /* EPIPE */ - mdbx_condpair_lock(&my->mc_condpair); - while (!my->mc_error) { - while (my->mc_tail == my->mc_head && !my->mc_error) { - int err = mdbx_condpair_wait(&my->mc_condpair, true); + osal_condpair_lock(&ctx->mc_condpair); + while (!ctx->mc_error) { + while (ctx->mc_tail == ctx->mc_head && !ctx->mc_error) { + int err = osal_condpair_wait(&ctx->mc_condpair, true); if (err != MDBX_SUCCESS) { - my->mc_error = err; + ctx->mc_error = err; goto bailout; } } - const unsigned toggle = my->mc_tail & 1; - size_t wsize = my->mc_wlen[toggle]; + const unsigned toggle = ctx->mc_tail & 1; + size_t wsize = ctx->mc_wlen[toggle]; if (wsize == 0) { - my->mc_tail += 1; + ctx->mc_tail += 1; break /* EOF */; } - my->mc_wlen[toggle] = 0; - uint8_t *ptr = my->mc_wbuf[toggle]; - again: - if (!my->mc_error) { - int err = mdbx_write(my->mc_fd, ptr, wsize); + ctx->mc_wlen[toggle] = 0; + uint8_t *ptr = ctx->mc_wbuf[toggle]; + if (!ctx->mc_error) { + int err = osal_write(ctx->mc_fd, ptr, wsize); if (err != MDBX_SUCCESS) { #if defined(EPIPE) && !(defined(_WIN32) || defined(_WIN64)) if (err == EPIPE) { @@ -19239,134 +21176,157 @@ __cold static THREAD_RESULT THREAD_CALL mdbx_env_copythr(void *arg) { sigwait(&sigset, &unused); } #endif /* EPIPE */ - my->mc_error = err; + ctx->mc_error = err; goto bailout; } } - - /* 
If there's an overflow page tail, write it too */ - wsize = my->mc_olen[toggle]; - if (wsize) { - my->mc_olen[toggle] = 0; - ptr = my->mc_over[toggle]; - goto again; - } - my->mc_tail += 1; - mdbx_condpair_signal(&my->mc_condpair, false); + ctx->mc_tail += 1; + osal_condpair_signal(&ctx->mc_condpair, false); } bailout: - mdbx_condpair_unlock(&my->mc_condpair); + osal_condpair_unlock(&ctx->mc_condpair); return (THREAD_RESULT)0; } /* Give buffer and/or MDBX_EOF to writer thread, await unused buffer. */ -__cold static int mdbx_env_cthr_toggle(mdbx_copy *my) { - mdbx_condpair_lock(&my->mc_condpair); - mdbx_assert(my->mc_env, my->mc_head - my->mc_tail < 2 || my->mc_error); - my->mc_head += 1; - mdbx_condpair_signal(&my->mc_condpair, true); - while (!my->mc_error && - my->mc_head - my->mc_tail == 2 /* both buffers in use */) { - int err = mdbx_condpair_wait(&my->mc_condpair, false); +__cold static int compacting_toggle_write_buffers(mdbx_compacting_ctx *ctx) { + osal_condpair_lock(&ctx->mc_condpair); + eASSERT(ctx->mc_env, ctx->mc_head - ctx->mc_tail < 2 || ctx->mc_error); + ctx->mc_head += 1; + osal_condpair_signal(&ctx->mc_condpair, true); + while (!ctx->mc_error && + ctx->mc_head - ctx->mc_tail == 2 /* both buffers in use */) { + int err = osal_condpair_wait(&ctx->mc_condpair, false); if (err != MDBX_SUCCESS) - my->mc_error = err; + ctx->mc_error = err; } - mdbx_condpair_unlock(&my->mc_condpair); - return my->mc_error; + osal_condpair_unlock(&ctx->mc_condpair); + return ctx->mc_error; } -/* Depth-first tree traversal for compacting copy. - * [in] my control structure. - * [in,out] pg database root. - * [in] flags includes F_DUPDATA if it is a sorted-duplicate sub-DB. 
*/ -__cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { - MDBX_cursor_couple couple; - MDBX_page *mo, *mp, *leaf; - char *buf, *ptr; - int rc; - unsigned i; +__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb); - /* Empty DB, nothing to do */ - if (*pg == P_INVALID) - return MDBX_SUCCESS; +static int compacting_put_bytes(mdbx_compacting_ctx *ctx, const void *src, + size_t bytes, pgno_t pgno, pgno_t npages) { + assert(pgno == 0 || bytes > PAGEHDRSZ); + while (bytes > 0) { + const size_t side = ctx->mc_head & 1; + const size_t left = MDBX_ENVCOPY_WRITEBUF - ctx->mc_wlen[side]; + if (left < (pgno ? PAGEHDRSZ : 1)) { + int err = compacting_toggle_write_buffers(ctx); + if (unlikely(err != MDBX_SUCCESS)) + return err; + continue; + } + const size_t chunk = (bytes < left) ? bytes : left; + void *const dst = ctx->mc_wbuf[side] + ctx->mc_wlen[side]; + if (src) { + memcpy(dst, src, chunk); + if (pgno) { + assert(chunk > PAGEHDRSZ); + MDBX_page *mp = dst; + mp->mp_pgno = pgno; + if (mp->mp_txnid == 0) + mp->mp_txnid = ctx->mc_txn->mt_txnid; + if (mp->mp_flags == P_OVERFLOW) { + assert(bytes <= pgno2bytes(ctx->mc_env, npages)); + mp->mp_pages = npages; + } + pgno = 0; + } + src = ptr_disp(src, chunk); + } else + memset(dst, 0, chunk); + bytes -= chunk; + ctx->mc_wlen[side] += chunk; + } + return MDBX_SUCCESS; +} - memset(&couple, 0, sizeof(couple)); - couple.outer.mc_snum = 1; - couple.outer.mc_txn = my->mc_txn; - couple.outer.mc_flags = couple.inner.mx_cursor.mc_flags = - C_COPYING | C_SKIPORD; +static int compacting_put_page(mdbx_compacting_ctx *ctx, const MDBX_page *mp, + const size_t head_bytes, const size_t tail_bytes, + const pgno_t npages) { + if (tail_bytes) { + assert(head_bytes + tail_bytes <= ctx->mc_env->me_psize); + assert(npages == 1 && + (PAGETYPE_WHOLE(mp) == P_BRANCH || PAGETYPE_WHOLE(mp) == P_LEAF)); + } else { + assert(head_bytes <= pgno2bytes(ctx->mc_env, npages)); + assert((npages == 1 && PAGETYPE_WHOLE(mp) == 
(P_LEAF | P_LEAF2)) || + PAGETYPE_WHOLE(mp) == P_OVERFLOW); + } - rc = mdbx_page_get(&couple.outer, *pg, &couple.outer.mc_pg[0], - my->mc_txn->mt_txnid); + const pgno_t pgno = ctx->mc_next_pgno; + ctx->mc_next_pgno += npages; + int err = compacting_put_bytes(ctx, mp, head_bytes, pgno, npages); + if (unlikely(err != MDBX_SUCCESS)) + return err; + err = compacting_put_bytes( + ctx, nullptr, pgno2bytes(ctx->mc_env, npages) - (head_bytes + tail_bytes), + 0, 0); + if (unlikely(err != MDBX_SUCCESS)) + return err; + return compacting_put_bytes( + ctx, ptr_disp(mp, ctx->mc_env->me_psize - tail_bytes), tail_bytes, 0, 0); +} + +__cold static int compacting_walk_tree(mdbx_compacting_ctx *ctx, + MDBX_cursor *mc, pgno_t *root, + txnid_t parent_txnid) { + mc->mc_snum = 1; + int rc = page_get(mc, *root, &mc->mc_pg[0], parent_txnid); if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_page_search_root(&couple.outer, NULL, MDBX_PS_FIRST); + + rc = page_search_root(mc, nullptr, MDBX_PS_FIRST); if (unlikely(rc != MDBX_SUCCESS)) return rc; /* Make cursor pages writable */ - buf = ptr = mdbx_malloc(pgno2bytes(my->mc_env, couple.outer.mc_snum)); + void *const buf = osal_malloc(pgno2bytes(ctx->mc_env, mc->mc_snum)); if (buf == NULL) return MDBX_ENOMEM; - for (i = 0; i < couple.outer.mc_top; i++) { - mdbx_page_copy((MDBX_page *)ptr, couple.outer.mc_pg[i], - my->mc_env->me_psize); - couple.outer.mc_pg[i] = (MDBX_page *)ptr; - ptr += my->mc_env->me_psize; + void *ptr = buf; + for (size_t i = 0; i < mc->mc_top; i++) { + page_copy(ptr, mc->mc_pg[i], ctx->mc_env->me_psize); + mc->mc_pg[i] = ptr; + ptr = ptr_disp(ptr, ctx->mc_env->me_psize); } - /* This is writable space for a leaf page. Usually not needed. 
*/ - leaf = (MDBX_page *)ptr; + MDBX_page *const leaf = ptr; - while (couple.outer.mc_snum > 0) { - mp = couple.outer.mc_pg[couple.outer.mc_top]; - unsigned n = page_numkeys(mp); + while (mc->mc_snum > 0) { + MDBX_page *mp = mc->mc_pg[mc->mc_top]; + size_t n = page_numkeys(mp); if (IS_LEAF(mp)) { - if (!IS_LEAF2(mp) && !(flags & F_DUPDATA)) { - for (i = 0; i < n; i++) { + if (!(mc->mc_flags & + C_SUB) /* may have nested F_SUBDATA or F_BIGDATA nodes */) { + for (size_t i = 0; i < n; i++) { MDBX_node *node = page_node(mp, i); - if (node_flags(node) & F_BIGDATA) { - MDBX_page *omp; - + if (node_flags(node) == F_BIGDATA) { /* Need writable leaf */ if (mp != leaf) { - couple.outer.mc_pg[couple.outer.mc_top] = leaf; - mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mc->mc_pg[mc->mc_top] = leaf; + page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } - const pgno_t pgno = node_largedata_pgno(node); - poke_pgno(node_data(node), my->mc_next_pgno); - rc = mdbx_page_get(&couple.outer, pgno, &omp, - pp_txnid4chk(mp, my->mc_txn)); + const pgr_t lp = + page_get_large(mc, node_largedata_pgno(node), mp->mp_txnid); + if (unlikely((rc = lp.err) != MDBX_SUCCESS)) + goto done; + const size_t datasize = node_ds(node); + const pgno_t npages = number_of_ovpages(ctx->mc_env, datasize); + poke_pgno(node_data(node), ctx->mc_next_pgno); + rc = compacting_put_page(ctx, lp.page, PAGEHDRSZ + datasize, 0, + npages); if (unlikely(rc != MDBX_SUCCESS)) goto done; - unsigned toggle = my->mc_head & 1; - if (my->mc_wlen[toggle] + my->mc_env->me_psize > - ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { - rc = mdbx_env_cthr_toggle(my); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - toggle = my->mc_head & 1; - } - mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - memcpy(mo, omp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno; - my->mc_next_pgno += omp->mp_pages; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (omp->mp_pages > 1) { - my->mc_olen[toggle] = 
pgno2bytes(my->mc_env, omp->mp_pages - 1); - my->mc_over[toggle] = (uint8_t *)omp + my->mc_env->me_psize; - rc = mdbx_env_cthr_toggle(my); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - toggle = my->mc_head & 1; - } } else if (node_flags(node) & F_SUBDATA) { - if (!MDBX_DISABLE_PAGECHECKS && + if (!MDBX_DISABLE_VALIDATION && unlikely(node_ds(node) != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; goto done; @@ -19374,79 +21334,122 @@ __cold static int mdbx_env_cwalk(mdbx_copy *my, pgno_t *pg, int flags) { /* Need writable leaf */ if (mp != leaf) { - couple.outer.mc_pg[couple.outer.mc_top] = leaf; - mdbx_page_copy(leaf, mp, my->mc_env->me_psize); + mc->mc_pg[mc->mc_top] = leaf; + page_copy(leaf, mp, ctx->mc_env->me_psize); mp = leaf; node = page_node(mp, i); } - MDBX_db db; - memcpy(&db, node_data(node), sizeof(MDBX_db)); - rc = mdbx_env_cwalk(my, &db.md_root, node_flags(node) & F_DUPDATA); - if (rc) + MDBX_db *nested = nullptr; + if (node_flags(node) & F_DUPDATA) { + rc = cursor_xinit1(mc, node, mp); + if (likely(rc == MDBX_SUCCESS)) { + nested = &mc->mc_xcursor->mx_db; + rc = compacting_walk_tree(ctx, &mc->mc_xcursor->mx_cursor, + &nested->md_root, mp->mp_txnid); + } + } else { + cASSERT(mc, (mc->mc_flags & C_SUB) == 0 && mc->mc_xcursor == 0); + MDBX_cursor_couple *couple = + container_of(mc, MDBX_cursor_couple, outer); + cASSERT(mc, + couple->inner.mx_cursor.mc_signature == ~MDBX_MC_LIVE && + !couple->inner.mx_cursor.mc_flags && + !couple->inner.mx_cursor.mc_db && + !couple->inner.mx_cursor.mc_dbx); + nested = &couple->inner.mx_db; + memcpy(nested, node_data(node), sizeof(MDBX_db)); + rc = compacting_walk_sdb(ctx, nested); + } + if (unlikely(rc != MDBX_SUCCESS)) goto done; - memcpy(node_data(node), &db, sizeof(MDBX_db)); + memcpy(node_data(node), nested, sizeof(MDBX_db)); } } } } else { - couple.outer.mc_ki[couple.outer.mc_top]++; - if (couple.outer.mc_ki[couple.outer.mc_top] < n) { - again: - rc = mdbx_page_get( - &couple.outer, - node_pgno(page_node(mp, 
couple.outer.mc_ki[couple.outer.mc_top])), - &mp, pp_txnid4chk(mp, my->mc_txn)); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - couple.outer.mc_top++; - couple.outer.mc_snum++; - couple.outer.mc_ki[couple.outer.mc_top] = 0; - if (IS_BRANCH(mp)) { + mc->mc_ki[mc->mc_top]++; + if (mc->mc_ki[mc->mc_top] < n) { + while (1) { + const MDBX_node *node = page_node(mp, mc->mc_ki[mc->mc_top]); + rc = page_get(mc, node_pgno(node), &mp, mp->mp_txnid); + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + mc->mc_top++; + mc->mc_snum++; + mc->mc_ki[mc->mc_top] = 0; + if (!IS_BRANCH(mp)) { + mc->mc_pg[mc->mc_top] = mp; + break; + } /* Whenever we advance to a sibling branch page, * we must proceed all the way down to its first leaf. */ - mdbx_page_copy(couple.outer.mc_pg[couple.outer.mc_top], mp, - my->mc_env->me_psize); - goto again; - } else - couple.outer.mc_pg[couple.outer.mc_top] = mp; + page_copy(mc->mc_pg[mc->mc_top], mp, ctx->mc_env->me_psize); + } continue; } } - unsigned toggle = my->mc_head & 1; - if (my->mc_wlen[toggle] + my->mc_wlen[toggle] > - ((size_t)(MDBX_ENVCOPY_WRITEBUF))) { - rc = mdbx_env_cthr_toggle(my); - if (unlikely(rc != MDBX_SUCCESS)) - goto done; - toggle = my->mc_head & 1; + + const pgno_t pgno = ctx->mc_next_pgno; + if (likely(!IS_LEAF2(mp))) { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + mp->mp_lower, + ctx->mc_env->me_psize - (PAGEHDRSZ + mp->mp_upper), 1); + } else { + rc = compacting_put_page( + ctx, mp, PAGEHDRSZ + page_numkeys(mp) * mp->mp_leaf2_ksize, 0, 1); } - mo = (MDBX_page *)(my->mc_wbuf[toggle] + my->mc_wlen[toggle]); - mdbx_page_copy(mo, mp, my->mc_env->me_psize); - mo->mp_pgno = my->mc_next_pgno++; - my->mc_wlen[toggle] += my->mc_env->me_psize; - if (couple.outer.mc_top) { + if (unlikely(rc != MDBX_SUCCESS)) + goto done; + + if (mc->mc_top) { /* Update parent if there is one */ - node_set_pgno(page_node(couple.outer.mc_pg[couple.outer.mc_top - 1], - couple.outer.mc_ki[couple.outer.mc_top - 1]), - mo->mp_pgno); - 
mdbx_cursor_pop(&couple.outer); + node_set_pgno( + page_node(mc->mc_pg[mc->mc_top - 1], mc->mc_ki[mc->mc_top - 1]), + pgno); + cursor_pop(mc); } else { /* Otherwise we're done */ - *pg = mo->mp_pgno; + *root = pgno; break; } } done: - mdbx_free(buf); + osal_free(buf); return rc; } -__cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { +__cold static int compacting_walk_sdb(mdbx_compacting_ctx *ctx, MDBX_db *sdb) { + if (unlikely(sdb->md_root == P_INVALID)) + return MDBX_SUCCESS; /* empty db */ + + MDBX_cursor_couple couple; + memset(&couple, 0, sizeof(couple)); + couple.inner.mx_cursor.mc_signature = ~MDBX_MC_LIVE; + MDBX_dbx dbx = {.md_klen_min = INT_MAX}; + uint8_t dbistate = DBI_VALID | DBI_AUDITED; + int rc = couple_init(&couple, ~0u, ctx->mc_txn, sdb, &dbx, &dbistate); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + couple.outer.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + couple.inner.mx_cursor.mc_checking |= CC_SKIPORD | CC_PAGECHECK; + if (!sdb->md_mod_txnid) + sdb->md_mod_txnid = ctx->mc_txn->mt_txnid; + return compacting_walk_tree(ctx, &couple.outer, &sdb->md_root, + sdb->md_mod_txnid); +} + +__cold static void compacting_fixup_meta(MDBX_env *env, MDBX_meta *meta) { + eASSERT(env, meta->mm_dbs[FREE_DBI].md_mod_txnid || + meta->mm_dbs[FREE_DBI].md_root == P_INVALID); + eASSERT(env, meta->mm_dbs[MAIN_DBI].md_mod_txnid || + meta->mm_dbs[MAIN_DBI].md_root == P_INVALID); + /* Calculate filesize taking in account shrink/growing thresholds */ if (meta->mm_geo.next != meta->mm_geo.now) { meta->mm_geo.now = meta->mm_geo.next; - const pgno_t aligner = pv2pages( + const size_t aligner = pv2pages( meta->mm_geo.grow_pv ? 
meta->mm_geo.grow_pv : meta->mm_geo.shrink_pv); if (aligner) { const pgno_t aligned = pgno_align2os_pgno( @@ -19462,11 +21465,11 @@ __cold static void compact_fixup_meta(MDBX_env *env, MDBX_meta *meta) { /* Update signature */ assert(meta->mm_geo.now >= meta->mm_geo.next); - unaligned_poke_u64(4, meta->mm_datasync_sign, meta_sign(meta)); + unaligned_poke_u64(4, meta->mm_sign, meta_sign(meta)); } /* Make resizable */ -__cold static void make_resizable(MDBX_meta *meta) { +__cold static void meta_make_sizeable(MDBX_meta *meta) { meta->mm_geo.lower = MIN_PAGENO; if (meta->mm_geo.grow_pv == 0) { const pgno_t step = 1 + (meta->mm_geo.upper - meta->mm_geo.lower) / 42; @@ -19479,91 +21482,120 @@ __cold static void make_resizable(MDBX_meta *meta) { } /* Copy environment with compaction. */ -__cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { +__cold static int env_compact(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { const size_t meta_bytes = pgno2bytes(env, NUM_METAS); uint8_t *const data_buffer = buffer + ceil_powerof2(meta_bytes, env->me_os_psize); - MDBX_meta *const meta = mdbx_init_metas(env, buffer); + MDBX_meta *const meta = init_metas(env, buffer); meta_set_txnid(env, meta, read_txn->mt_txnid); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - make_resizable(meta); + meta_make_sizeable(meta); /* copy canary sequences if present */ if (read_txn->mt_canary.v) { meta->mm_canary = read_txn->mt_canary; - meta->mm_canary.v = constmeta_txnid(env, meta); + meta->mm_canary.v = constmeta_txnid(meta); } - /* Set metapage 1 with current main DB */ - pgno_t new_root, root = read_txn->mt_dbs[MAIN_DBI].md_root; - if ((new_root = root) == P_INVALID) { + if (read_txn->mt_dbs[MAIN_DBI].md_root == P_INVALID) { /* When the DB is empty, handle it specially to * fix any breakage like page leaks from ITS#8174. 
*/ meta->mm_dbs[MAIN_DBI].md_flags = read_txn->mt_dbs[MAIN_DBI].md_flags; - compact_fixup_meta(env, meta); + compacting_fixup_meta(env, meta); if (dest_is_pipe) { - int rc = mdbx_write(fd, buffer, meta_bytes); - if (rc != MDBX_SUCCESS) + int rc = osal_write(fd, buffer, meta_bytes); + if (unlikely(rc != MDBX_SUCCESS)) return rc; } } else { - /* Count free pages + GC pages. Subtract from last_pg - * to find the new last_pg, which also becomes the new root. */ - pgno_t freecount = 0; + /* Count free pages + GC pages. */ MDBX_cursor_couple couple; - MDBX_val key, data; - - int rc = mdbx_cursor_init(&couple.outer, read_txn, FREE_DBI); + int rc = cursor_init(&couple.outer, read_txn, FREE_DBI); if (unlikely(rc != MDBX_SUCCESS)) return rc; - while ((rc = mdbx_cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == 0) - freecount += *(pgno_t *)data.iov_base; + pgno_t gc = read_txn->mt_dbs[FREE_DBI].md_branch_pages + + read_txn->mt_dbs[FREE_DBI].md_leaf_pages + + read_txn->mt_dbs[FREE_DBI].md_overflow_pages; + MDBX_val key, data; + while ((rc = cursor_get(&couple.outer, &key, &data, MDBX_NEXT)) == + MDBX_SUCCESS) { + const MDBX_PNL pnl = data.iov_base; + if (unlikely(data.iov_len % sizeof(pgno_t) || + data.iov_len < MDBX_PNL_SIZEOF(pnl) || + !(pnl_check(pnl, read_txn->mt_next_pgno)))) + return MDBX_CORRUPTED; + gc += MDBX_PNL_GETSIZE(pnl); + } if (unlikely(rc != MDBX_NOTFOUND)) return rc; - freecount += read_txn->mt_dbs[FREE_DBI].md_branch_pages + - read_txn->mt_dbs[FREE_DBI].md_leaf_pages + - read_txn->mt_dbs[FREE_DBI].md_overflow_pages; - - new_root = read_txn->mt_next_pgno - 1 - freecount; - meta->mm_geo.next = new_root + 1; + /* Substract GC-pages from mt_next_pgno to find the new mt_next_pgno. 
*/ + meta->mm_geo.next = read_txn->mt_next_pgno - gc; + /* Set with current main DB */ meta->mm_dbs[MAIN_DBI] = read_txn->mt_dbs[MAIN_DBI]; - meta->mm_dbs[MAIN_DBI].md_root = new_root; - mdbx_copy ctx; + mdbx_compacting_ctx ctx; memset(&ctx, 0, sizeof(ctx)); - rc = mdbx_condpair_init(&ctx.mc_condpair); + rc = osal_condpair_init(&ctx.mc_condpair); if (unlikely(rc != MDBX_SUCCESS)) return rc; - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2); + memset(data_buffer, 0, 2 * (size_t)MDBX_ENVCOPY_WRITEBUF); ctx.mc_wbuf[0] = data_buffer; - ctx.mc_wbuf[1] = data_buffer + ((size_t)(MDBX_ENVCOPY_WRITEBUF)); + ctx.mc_wbuf[1] = data_buffer + (size_t)MDBX_ENVCOPY_WRITEBUF; ctx.mc_next_pgno = NUM_METAS; ctx.mc_env = env; ctx.mc_fd = fd; ctx.mc_txn = read_txn; - mdbx_thread_t thread; - int thread_err = mdbx_thread_create(&thread, mdbx_env_copythr, &ctx); + osal_thread_t thread; + int thread_err = osal_thread_create(&thread, compacting_write_thread, &ctx); if (likely(thread_err == MDBX_SUCCESS)) { if (dest_is_pipe) { - compact_fixup_meta(env, meta); - rc = mdbx_write(fd, buffer, meta_bytes); + if (!meta->mm_dbs[MAIN_DBI].md_mod_txnid) + meta->mm_dbs[MAIN_DBI].md_mod_txnid = read_txn->mt_txnid; + compacting_fixup_meta(env, meta); + rc = osal_write(fd, buffer, meta_bytes); } - if (rc == MDBX_SUCCESS) - rc = mdbx_env_cwalk(&ctx, &root, 0); + if (likely(rc == MDBX_SUCCESS)) + rc = compacting_walk_sdb(&ctx, &meta->mm_dbs[MAIN_DBI]); if (ctx.mc_wlen[ctx.mc_head & 1]) - mdbx_env_cthr_toggle(&ctx); - mdbx_env_cthr_toggle(&ctx); - thread_err = mdbx_thread_join(thread); - mdbx_assert(env, (ctx.mc_tail == ctx.mc_head && - ctx.mc_wlen[ctx.mc_head & 1] == 0) || - ctx.mc_error); - mdbx_condpair_destroy(&ctx.mc_condpair); + /* toggle to flush non-empty buffers */ + compacting_toggle_write_buffers(&ctx); + + if (likely(rc == MDBX_SUCCESS) && + unlikely(meta->mm_geo.next != ctx.mc_next_pgno)) { + if (ctx.mc_next_pgno > meta->mm_geo.next) { + ERROR("the source DB %s: 
post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has double-used pages or other corruption", ctx.mc_next_pgno, + '>', meta->mm_geo.next); + rc = MDBX_CORRUPTED; /* corrupted DB */ + } + if (ctx.mc_next_pgno < meta->mm_geo.next) { + WARNING( + "the source DB %s: post-compactification used pages %" PRIaPGNO + " %c expected %" PRIaPGNO, + "has page leak(s)", ctx.mc_next_pgno, '<', meta->mm_geo.next); + if (dest_is_pipe) + /* the root within already written meta-pages is wrong */ + rc = MDBX_CORRUPTED; + } + /* fixup meta */ + meta->mm_geo.next = ctx.mc_next_pgno; + } + + /* toggle with empty buffers to exit thread's loop */ + eASSERT(env, (ctx.mc_wlen[ctx.mc_head & 1]) == 0); + compacting_toggle_write_buffers(&ctx); + thread_err = osal_thread_join(thread); + eASSERT(env, (ctx.mc_tail == ctx.mc_head && + ctx.mc_wlen[ctx.mc_head & 1] == 0) || + ctx.mc_error); + osal_condpair_destroy(&ctx.mc_condpair); } if (unlikely(thread_err != MDBX_SUCCESS)) return thread_err; @@ -19571,49 +21603,23 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, return rc; if (unlikely(ctx.mc_error != MDBX_SUCCESS)) return ctx.mc_error; - - if (dest_is_pipe) { - if (unlikely(root != new_root)) { - mdbx_error("post-compactification root %" PRIaPGNO - " NE expected %" PRIaPGNO - " (source DB corrupted or has a page leak(s))", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - } else { - if (unlikely(root > new_root)) { - mdbx_error("post-compactification root %" PRIaPGNO - " GT expected %" PRIaPGNO " (source DB corrupted)", - root, new_root); - return MDBX_CORRUPTED; /* page leak or corrupt DB */ - } - if (unlikely(root < new_root)) { - mdbx_warning("post-compactification root %" PRIaPGNO - " LT expected %" PRIaPGNO " (page leak(s) in source DB)", - root, new_root); - /* fixup meta */ - meta->mm_dbs[MAIN_DBI].md_root = root; - meta->mm_geo.next = root + 1; - } - compact_fixup_meta(env, meta); - } + if (!dest_is_pipe) + 
compacting_fixup_meta(env, meta); } /* Extend file if required */ if (meta->mm_geo.now != meta->mm_geo.next) { const size_t whole_size = pgno2bytes(env, meta->mm_geo.now); if (!dest_is_pipe) - return mdbx_ftruncate(fd, whole_size); + return osal_ftruncate(fd, whole_size); const size_t used_size = pgno2bytes(env, meta->mm_geo.next); - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; offset < whole_size;) { - const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) - : whole_size - offset; - /* copy to avoid EFAULT in case swapped-out */ - int rc = mdbx_write(fd, data_buffer, chunk); + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : whole_size - offset; + int rc = osal_write(fd, data_buffer, chunk); if (unlikely(rc != MDBX_SUCCESS)) return rc; offset += chunk; @@ -19623,11 +21629,11 @@ __cold static int mdbx_env_compact(MDBX_env *env, MDBX_txn *read_txn, } /* Copy environment as-is. 
*/ -__cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, - mdbx_filehandle_t fd, uint8_t *buffer, - const bool dest_is_pipe, const int flags) { +__cold static int env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, + mdbx_filehandle_t fd, uint8_t *buffer, + const bool dest_is_pipe, const int flags) { /* We must start the actual read txn after blocking writers */ - int rc = mdbx_txn_end(read_txn, MDBX_END_RESET_TMP); + int rc = txn_end(read_txn, MDBX_END_RESET_TMP); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19636,33 +21642,34 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_txn_renew0(read_txn, MDBX_TXN_RDONLY); + rc = txn_renew(read_txn, MDBX_TXN_RDONLY); if (unlikely(rc != MDBX_SUCCESS)) { mdbx_txn_unlock(env); return rc; } - mdbx_jitter4testing(false); + jitter4testing(false); const size_t meta_bytes = pgno2bytes(env, NUM_METAS); + const meta_troika_t troika = meta_tap(env); /* Make a snapshot of meta-pages, * but writing ones after the data was flushed */ memcpy(buffer, env->me_map, meta_bytes); MDBX_meta *const headcopy = /* LY: get pointer to the snapshot copy */ - (MDBX_meta *)(buffer + ((uint8_t *)meta_prefer_last(env) - env->me_map)); + ptr_disp(buffer, ptr_dist(meta_recent(env, &troika).ptr_c, env->me_map)); mdbx_txn_unlock(env); if (flags & MDBX_CP_FORCE_DYNAMIC_SIZE) - make_resizable(headcopy); + meta_make_sizeable(headcopy); /* Update signature to steady */ - unaligned_poke_u64(4, headcopy->mm_datasync_sign, meta_sign(headcopy)); + unaligned_poke_u64(4, headcopy->mm_sign, meta_sign(headcopy)); /* Copy the data */ const size_t whole_size = pgno_align2os_bytes(env, read_txn->mt_end_pgno); const size_t used_size = pgno2bytes(env, read_txn->mt_next_pgno); - mdbx_jitter4testing(false); + jitter4testing(false); if (dest_is_pipe) - rc = mdbx_write(fd, buffer, meta_bytes); + rc = osal_write(fd, buffer, meta_bytes); uint8_t *const data_buffer = buffer + 
ceil_powerof2(meta_bytes, env->me_os_psize); @@ -19719,30 +21726,28 @@ __cold static int mdbx_env_copy_asis(MDBX_env *env, MDBX_txn *read_txn, #endif /* MDBX_USE_COPYFILERANGE */ /* fallback to portable */ - const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < used_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) - : used_size - offset; + const size_t chunk = ((size_t)MDBX_ENVCOPY_WRITEBUF < used_size - offset) + ? (size_t)MDBX_ENVCOPY_WRITEBUF + : used_size - offset; /* copy to avoid EFAULT in case swapped-out */ - memcpy(data_buffer, env->me_map + offset, chunk); - rc = mdbx_write(fd, data_buffer, chunk); + memcpy(data_buffer, ptr_disp(env->me_map, offset), chunk); + rc = osal_write(fd, data_buffer, chunk); offset += chunk; } /* Extend file if required */ if (likely(rc == MDBX_SUCCESS) && whole_size != used_size) { if (!dest_is_pipe) - rc = mdbx_ftruncate(fd, whole_size); + rc = osal_ftruncate(fd, whole_size); else { - memset(data_buffer, 0, ((size_t)(MDBX_ENVCOPY_WRITEBUF))); + memset(data_buffer, 0, (size_t)MDBX_ENVCOPY_WRITEBUF); for (size_t offset = used_size; rc == MDBX_SUCCESS && offset < whole_size;) { const size_t chunk = - (((size_t)(MDBX_ENVCOPY_WRITEBUF)) < whole_size - offset) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) + ((size_t)MDBX_ENVCOPY_WRITEBUF < whole_size - offset) + ? 
(size_t)MDBX_ENVCOPY_WRITEBUF : whole_size - offset; - /* copy to avoid EFAULT in case swapped-out */ - rc = mdbx_write(fd, data_buffer, chunk); + rc = osal_write(fd, data_buffer, chunk); offset += chunk; } } @@ -19757,12 +21762,12 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, if (unlikely(rc != MDBX_SUCCESS)) return rc; - const int dest_is_pipe = mdbx_is_pipe(fd); + const int dest_is_pipe = osal_is_pipe(fd); if (MDBX_IS_ERROR(dest_is_pipe)) return dest_is_pipe; if (!dest_is_pipe) { - rc = mdbx_fseek(fd, 0); + rc = osal_fseek(fd, 0); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -19770,12 +21775,12 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, const size_t buffer_size = pgno_align2os_bytes(env, NUM_METAS) + ceil_powerof2(((flags & MDBX_CP_COMPACT) - ? ((size_t)(MDBX_ENVCOPY_WRITEBUF)) * 2 - : ((size_t)(MDBX_ENVCOPY_WRITEBUF))), + ? 2 * (size_t)MDBX_ENVCOPY_WRITEBUF + : (size_t)MDBX_ENVCOPY_WRITEBUF), env->me_os_psize); uint8_t *buffer = NULL; - rc = mdbx_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); + rc = osal_memalign_alloc(env->me_os_psize, buffer_size, (void **)&buffer); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19784,7 +21789,7 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, * write txn. Otherwise other read txns could block writers. */ rc = mdbx_txn_begin(env, NULL, MDBX_TXN_RDONLY, &read_txn); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_memalign_free(buffer); + osal_memalign_free(buffer); return rc; } @@ -19792,34 +21797,48 @@ __cold int mdbx_env_copy2fd(MDBX_env *env, mdbx_filehandle_t fd, /* Firstly write a stub to meta-pages. * Now we sure to incomplete copy will not be used. */ memset(buffer, -1, pgno2bytes(env, NUM_METAS)); - rc = mdbx_write(fd, buffer, pgno2bytes(env, NUM_METAS)); + rc = osal_write(fd, buffer, pgno2bytes(env, NUM_METAS)); } if (likely(rc == MDBX_SUCCESS)) { memset(buffer, 0, pgno2bytes(env, NUM_METAS)); - rc = ((flags & MDBX_CP_COMPACT) ? 
mdbx_env_compact : mdbx_env_copy_asis)( + rc = ((flags & MDBX_CP_COMPACT) ? env_compact : env_copy_asis)( env, read_txn, fd, buffer, dest_is_pipe, flags); } mdbx_txn_abort(read_txn); if (!dest_is_pipe) { if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_SIZE); /* Write actual meta */ if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); + rc = osal_pwrite(fd, buffer, pgno2bytes(env, NUM_METAS), 0); if (likely(rc == MDBX_SUCCESS)) - rc = mdbx_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); + rc = osal_fsync(fd, MDBX_SYNC_DATA | MDBX_SYNC_IODQ); } - mdbx_memalign_free(buffer); + osal_memalign_free(buffer); return rc; } __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, MDBX_copy_flags_t flags) { +#if defined(_WIN32) || defined(_WIN64) + wchar_t *dest_pathW = nullptr; + int rc = osal_mb2w(dest_path, &dest_pathW); + if (likely(rc == MDBX_SUCCESS)) { + rc = mdbx_env_copyW(env, dest_pathW, flags); + osal_free(dest_pathW); + } + return rc; +} + +LIBMDBX_API int mdbx_env_copyW(MDBX_env *env, const wchar_t *dest_path, + MDBX_copy_flags_t flags) { +#endif /* Windows */ + int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19831,7 +21850,7 @@ __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, * We don't want the OS to cache the writes, since the source data is * already in the OS cache. 
*/ mdbx_filehandle_t newfd; - rc = mdbx_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, + rc = osal_openfile(MDBX_OPEN_COPY, env, dest_path, &newfd, #if defined(_WIN32) || defined(_WIN64) (mdbx_mode_t)-1 #else @@ -19863,11 +21882,11 @@ __cold int mdbx_env_copy(MDBX_env *env, const char *dest_path, rc = mdbx_env_copy2fd(env, newfd, flags); if (newfd != INVALID_HANDLE_VALUE) { - int err = mdbx_closefile(newfd); + int err = osal_closefile(newfd); if (rc == MDBX_SUCCESS && err != rc) rc = err; if (rc != MDBX_SUCCESS) - (void)mdbx_removefile(dest_path); + (void)osal_removefile(dest_path); } return rc; @@ -19890,11 +21909,11 @@ __cold int mdbx_env_set_flags(MDBX_env *env, MDBX_env_flags_t flags, return MDBX_EACCESS; if ((env->me_flags & MDBX_ENV_ACTIVE) && - unlikely(env->me_txn0->mt_owner == mdbx_thread_self())) + unlikely(env->me_txn0->mt_owner == osal_thread_self())) return MDBX_BUSY; const bool lock_needed = (env->me_flags & MDBX_ENV_ACTIVE) && - env->me_txn0->mt_owner != mdbx_thread_self(); + env->me_txn0->mt_owner != osal_thread_self(); bool should_unlock = false; if (lock_needed) { rc = mdbx_txn_lock(env, false); @@ -19952,7 +21971,8 @@ __cold int mdbx_env_set_assert(MDBX_env *env, MDBX_assert_func *func) { #endif } -__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { +#if defined(_WIN32) || defined(_WIN64) +__cold int mdbx_env_get_pathW(const MDBX_env *env, const wchar_t **arg) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -19963,6 +21983,52 @@ __cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { *arg = env->me_pathname; return MDBX_SUCCESS; } +#endif /* Windows */ + +__cold int mdbx_env_get_path(const MDBX_env *env, const char **arg) { + int rc = check_env(env, true); + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + if (unlikely(!arg)) + return MDBX_EINVAL; + +#if defined(_WIN32) || defined(_WIN64) + if (!env->me_pathname_char) { + *arg = nullptr; + DWORD flags = /* WC_ERR_INVALID_CHARS 
*/ 0x80; + size_t mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags, env->me_pathname, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); + if (rc == ERROR_INVALID_FLAGS) { + mb_len = WideCharToMultiByte(CP_THREAD_ACP, flags = 0, env->me_pathname, + -1, nullptr, 0, nullptr, nullptr); + rc = mb_len ? MDBX_SUCCESS : (int)GetLastError(); + } + if (unlikely(rc != MDBX_SUCCESS)) + return rc; + + char *const mb_pathname = osal_malloc(mb_len); + if (!mb_pathname) + return MDBX_ENOMEM; + if (mb_len != (size_t)WideCharToMultiByte(CP_THREAD_ACP, flags, + env->me_pathname, -1, mb_pathname, + (int)mb_len, nullptr, nullptr)) { + rc = (int)GetLastError(); + osal_free(mb_pathname); + return rc; + } + if (env->me_pathname_char || + InterlockedCompareExchangePointer( + (PVOID volatile *)&env->me_pathname_char, mb_pathname, nullptr)) + osal_free(mb_pathname); + } + *arg = env->me_pathname_char; +#else + *arg = env->me_pathname; +#endif /* Windows */ + return MDBX_SUCCESS; +} __cold int mdbx_env_get_fd(const MDBX_env *env, mdbx_filehandle_t *arg) { int rc = check_env(env, true); @@ -20028,15 +22094,15 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { if (!(txn->mt_dbs[MAIN_DBI].md_flags & (MDBX_DUPSORT | MDBX_INTEGERKEY)) && txn->mt_dbs[MAIN_DBI].md_entries /* TODO: use `md_subs` field */) { MDBX_cursor_couple cx; - err = mdbx_cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); + err = cursor_init(&cx.outer, (MDBX_txn *)txn, MAIN_DBI); if (unlikely(err != MDBX_SUCCESS)) return err; /* scan and account not opened named subDBs */ - err = mdbx_page_search(&cx.outer, NULL, MDBX_PS_FIRST); + err = page_search(&cx.outer, NULL, MDBX_PS_FIRST); while (err == MDBX_SUCCESS) { const MDBX_page *mp = cx.outer.mc_pg[cx.outer.mc_top]; - for (unsigned i = 0; i < page_numkeys(mp); i++) { + for (size_t i = 0; i < page_numkeys(mp); i++) { const MDBX_node *node = page_node(mp, i); if (node_flags(node) != F_SUBDATA) continue; @@ 
-20059,7 +22125,7 @@ __cold static int stat_acc(const MDBX_txn *txn, MDBX_stat *st, size_t bytes) { stat_add(&db, st, bytes); } } - err = mdbx_cursor_sibling(&cx.outer, SIBLING_RIGHT); + err = cursor_sibling(&cx.outer, SIBLING_RIGHT); } if (unlikely(err != MDBX_NOTFOUND)) return err; @@ -20086,7 +22152,7 @@ __cold int mdbx_env_stat_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(err != MDBX_SUCCESS)) return err; - if (env->me_txn0 && env->me_txn0->mt_owner == mdbx_thread_self()) + if (env->me_txn0 && env->me_txn0->mt_owner == osal_thread_self()) /* inside write-txn */ return stat_acc(env->me_txn, dest, bytes); @@ -20115,14 +22181,14 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, return MDBX_BAD_DBI; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; if ((cx.outer.mc_db->md_flags & MDBX_DUPSORT) == 0) return MDBX_RESULT_TRUE; MDBX_val key, data; - rc = mdbx_cursor_first(&cx.outer, &key, &data); + rc = cursor_first(&cx.outer, &key, &data); *mask = 0; while (rc == MDBX_SUCCESS) { const MDBX_node *node = page_node(cx.outer.mc_pg[cx.outer.mc_top], @@ -20144,10 +22210,10 @@ __cold int mdbx_dbi_dupsort_depthmask(MDBX_txn *txn, MDBX_dbi dbi, *mask |= 1 << UNALIGNED_PEEK_16(db, MDBX_db, md_depth); break; default: - mdbx_error("wrong node-flags %u", flags); + ERROR("wrong node-flags %u", flags); return MDBX_CORRUPTED; } - rc = mdbx_cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); + rc = cursor_next(&cx.outer, &key, &data, MDBX_NEXT_NODUP); } return (rc == MDBX_NOTFOUND) ? 
MDBX_SUCCESS : rc; @@ -20198,21 +22264,30 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, if (unlikely(env->me_flags & MDBX_FATAL_ERROR)) return MDBX_PANIC; - volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); - arg->mi_recent_txnid = meta_txnid(env, recent_meta); - arg->mi_meta0_txnid = meta_txnid(env, meta0); - arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_datasync_sign); - arg->mi_meta1_txnid = meta_txnid(env, meta1); - arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_datasync_sign); - arg->mi_meta2_txnid = meta_txnid(env, meta2); - arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_datasync_sign); + meta_troika_t holder; + meta_troika_t const *troika; + if (txn && !(txn->mt_flags & MDBX_TXN_RDONLY)) + troika = &txn->tw.troika; + else { + holder = meta_tap(env); + troika = &holder; + } + + const meta_ptr_t head = meta_recent(env, troika); + arg->mi_recent_txnid = head.txnid; + arg->mi_meta0_txnid = troika->txnid[0]; + arg->mi_meta0_sign = unaligned_peek_u64(4, meta0->mm_sign); + arg->mi_meta1_txnid = troika->txnid[1]; + arg->mi_meta1_sign = unaligned_peek_u64(4, meta1->mm_sign); + arg->mi_meta2_txnid = troika->txnid[2]; + arg->mi_meta2_sign = unaligned_peek_u64(4, meta2->mm_sign); if (likely(bytes > size_before_bootid)) { memcpy(&arg->mi_bootid.meta0, &meta0->mm_bootid, 16); memcpy(&arg->mi_bootid.meta1, &meta1->mm_bootid, 16); memcpy(&arg->mi_bootid.meta2, &meta2->mm_bootid, 16); } - volatile const MDBX_meta *txn_meta = recent_meta; + const volatile MDBX_meta *txn_meta = head.ptr_v; arg->mi_last_pgno = txn_meta->mm_geo.next - 1; arg->mi_geo.current = pgno2bytes(env, txn_meta->mm_geo.now); if (txn) { @@ -20230,8 +22305,8 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_geo.upper = pgno2bytes(env, txn_meta->mm_geo.upper); arg->mi_geo.shrink = pgno2bytes(env, pv2pages(txn_meta->mm_geo.shrink_pv)); arg->mi_geo.grow = pgno2bytes(env, 
pv2pages(txn_meta->mm_geo.grow_pv)); - const pgno_t unsynced_pages = - atomic_load32(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + + const uint64_t unsynced_pages = + atomic_load64(&env->me_lck->mti_unsynced_pages, mo_Relaxed) + (atomic_load32(&env->me_lck->mti_meta_sync_txnid, mo_Relaxed) != (uint32_t)arg->mi_recent_txnid); @@ -20246,18 +22321,19 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_sys_pagesize = env->me_os_psize; if (likely(bytes > size_before_bootid)) { - arg->mi_unsync_volume = pgno2bytes(env, unsynced_pages); - const uint64_t monotime_now = mdbx_osal_monotime(); - uint64_t ts = atomic_load64(&lck->mti_sync_timestamp, mo_Relaxed); + arg->mi_unsync_volume = pgno2bytes(env, (size_t)unsynced_pages); + const uint64_t monotime_now = osal_monotime(); + uint64_t ts = atomic_load64(&lck->mti_eoos_timestamp, mo_Relaxed); arg->mi_since_sync_seconds16dot16 = - ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; ts = atomic_load64(&lck->mti_reader_check_timestamp, mo_Relaxed); arg->mi_since_reader_check_seconds16dot16 = - ts ? mdbx_osal_monotime_to_16dot16(monotime_now - ts) : 0; + ts ? osal_monotime_to_16dot16_noUnderflow(monotime_now - ts) : 0; arg->mi_autosync_threshold = pgno2bytes( env, atomic_load32(&lck->mti_autosync_threshold, mo_Relaxed)); - arg->mi_autosync_period_seconds16dot16 = mdbx_osal_monotime_to_16dot16( - atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); + arg->mi_autosync_period_seconds16dot16 = + osal_monotime_to_16dot16_noUnderflow( + atomic_load64(&lck->mti_autosync_period, mo_Relaxed)); arg->mi_bootid.current.x = bootid.x; arg->mi_bootid.current.y = bootid.y; arg->mi_mode = env->me_lck_mmap.lck ? 
lck->mti_envmode.weak : env->me_flags; @@ -20280,6 +22356,14 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, atomic_load64(&lck->mti_pgop_stat.unspill, mo_Relaxed); arg->mi_pgop_stat.wops = atomic_load64(&lck->mti_pgop_stat.wops, mo_Relaxed); + arg->mi_pgop_stat.prefault = + atomic_load64(&lck->mti_pgop_stat.prefault, mo_Relaxed); + arg->mi_pgop_stat.mincore = + atomic_load64(&lck->mti_pgop_stat.mincore, mo_Relaxed); + arg->mi_pgop_stat.msync = + atomic_load64(&lck->mti_pgop_stat.msync, mo_Relaxed); + arg->mi_pgop_stat.fsync = + atomic_load64(&lck->mti_pgop_stat.fsync, mo_Relaxed); #else memset(&arg->mi_pgop_stat, 0, sizeof(arg->mi_pgop_stat)); #endif /* MDBX_ENABLE_PGOP_STAT*/ @@ -20288,7 +22372,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, arg->mi_self_latter_reader_txnid = arg->mi_latter_reader_txnid = arg->mi_recent_txnid; if (env->me_lck_mmap.lck) { - for (unsigned i = 0; i < arg->mi_numreaders; ++i) { + for (size_t i = 0; i < arg->mi_numreaders; ++i) { const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid) { @@ -20301,7 +22385,7 @@ __cold static int fetch_envinfo_ex(const MDBX_env *env, const MDBX_txn *txn, } } - mdbx_compiler_barrier(); + osal_compiler_barrier(); return MDBX_SUCCESS; } @@ -20363,8 +22447,8 @@ static __inline MDBX_cmp_func *get_default_datacmp(unsigned flags) { : ((flags & MDBX_REVERSEDUP) ? 
cmp_reverse : cmp_lexical)); } -static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { +static int dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { /* LY: so, accepting only three cases for the table's flags: * 1) user_flags and both comparators are zero * = assume that a by-default mode/flags is requested for reading; @@ -20413,22 +22497,34 @@ static int mdbx_dbi_bind(MDBX_txn *txn, const MDBX_dbi dbi, unsigned user_flags, return MDBX_SUCCESS; } -static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, - MDBX_dbi *dbi, MDBX_cmp_func *keycmp, +static int dbi_open(MDBX_txn *txn, const MDBX_val *const table_name, + unsigned user_flags, MDBX_dbi *dbi, MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { int rc = MDBX_EINVAL; if (unlikely(!dbi)) return rc; + void *clone = nullptr; + bool locked = false; if (unlikely((user_flags & ~DB_USABLE_FLAGS) != 0)) { - early_bailout: + bailout: + tASSERT(txn, MDBX_IS_ERROR(rc)); *dbi = 0; + if (locked) + ENSURE(txn->mt_env, + osal_fastmutex_release(&txn->mt_env->me_dbi_lock) == MDBX_SUCCESS); + osal_free(clone); return rc; } rc = check_txn(txn, MDBX_TXN_BLOCKED); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; + + if ((user_flags & MDBX_CREATE) && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { + rc = MDBX_EACCESS; + goto bailout; + } switch (user_flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED | MDBX_DUPSORT | MDBX_REVERSEDUP | MDBX_ACCEDE)) { @@ -20438,7 +22534,7 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, __fallthrough /* fall through */; default: rc = MDBX_EINVAL; - goto early_bailout; + goto bailout; case MDBX_DUPSORT: case MDBX_DUPSORT | MDBX_REVERSEDUP: @@ -20451,39 +22547,74 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, } /* main table? 
*/ - if (!table_name) { - rc = mdbx_dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); + if (table_name == MDBX_PGWALK_MAIN || + table_name->iov_base == MDBX_PGWALK_MAIN) { + rc = dbi_bind(txn, MAIN_DBI, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = MAIN_DBI; return rc; } + if (table_name == MDBX_PGWALK_GC || table_name->iov_base == MDBX_PGWALK_GC) { + rc = dbi_bind(txn, FREE_DBI, user_flags, keycmp, datacmp); + if (unlikely(rc != MDBX_SUCCESS)) + goto bailout; + *dbi = FREE_DBI; + return rc; + } + if (table_name == MDBX_PGWALK_META || + table_name->iov_base == MDBX_PGWALK_META) { + rc = MDBX_EINVAL; + goto bailout; + } - MDBX_env *env = txn->mt_env; - size_t len = strlen(table_name); - if (len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) + MDBX_val key = *table_name; + MDBX_env *const env = txn->mt_env; + if (key.iov_len > env->me_leaf_nodemax - NODESIZE - sizeof(MDBX_db)) return MDBX_EINVAL; - if (txn->mt_dbxs[MAIN_DBI].md_cmp == NULL) { + /* Cannot mix named table(s) with DUPSORT flags */ + if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & MDBX_DUPSORT)) { + if ((user_flags & MDBX_CREATE) == 0) { + rc = MDBX_NOTFOUND; + goto bailout; + } + if (txn->mt_dbs[MAIN_DBI].md_leaf_pages || txn->mt_dbxs[MAIN_DBI].md_cmp) { + /* В MAIN_DBI есть записи либо она уже использовалась. */ + rc = MDBX_INCOMPATIBLE; + goto bailout; + } + /* Пересоздаём MAIN_DBI если там пусто. 
*/ + atomic_store32(&txn->mt_dbiseqs[MAIN_DBI], dbi_seq(env, MAIN_DBI), + mo_AcquireRelease); + tASSERT(txn, txn->mt_dbs[MAIN_DBI].md_depth == 0 && + txn->mt_dbs[MAIN_DBI].md_entries == 0 && + txn->mt_dbs[MAIN_DBI].md_root == P_INVALID); + txn->mt_dbs[MAIN_DBI].md_flags &= MDBX_REVERSEKEY | MDBX_INTEGERKEY; + txn->mt_dbistate[MAIN_DBI] |= DBI_DIRTY; + txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbxs[MAIN_DBI].md_cmp = get_default_keycmp(txn->mt_dbs[MAIN_DBI].md_flags); txn->mt_dbxs[MAIN_DBI].md_dcmp = get_default_datacmp(txn->mt_dbs[MAIN_DBI].md_flags); } + tASSERT(txn, txn->mt_dbxs[MAIN_DBI].md_cmp); + /* Is the DB already open? */ MDBX_dbi scan, slot; for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_len) { + if (!txn->mt_dbxs[scan].md_name.iov_base) { /* Remember this free slot */ slot = scan; continue; } - if (len == txn->mt_dbxs[scan].md_name.iov_len && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { - rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && + !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, + key.iov_len)) { + rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; + goto bailout; *dbi = scan; return rc; } @@ -20492,150 +22623,170 @@ static int dbi_open(MDBX_txn *txn, const char *table_name, unsigned user_flags, /* Fail, if no free slot and max hit */ if (unlikely(slot >= env->me_maxdbs)) { rc = MDBX_DBS_FULL; - goto early_bailout; - } - - /* Cannot mix named table with some main-table flags */ - if (unlikely(txn->mt_dbs[MAIN_DBI].md_flags & - (MDBX_DUPSORT | MDBX_INTEGERKEY))) { - rc = (user_flags & MDBX_CREATE) ? 
MDBX_INCOMPATIBLE : MDBX_NOTFOUND; - goto early_bailout; + goto bailout; } /* Find the DB info */ - MDBX_val key, data; - key.iov_len = len; - key.iov_base = (void *)table_name; + MDBX_val data; MDBX_cursor_couple couple; - rc = mdbx_cursor_init(&couple.outer, txn, MAIN_DBI); + rc = cursor_init(&couple.outer, txn, MAIN_DBI); if (unlikely(rc != MDBX_SUCCESS)) - goto early_bailout; - rc = mdbx_cursor_set(&couple.outer, &key, &data, MDBX_SET).err; + goto bailout; + rc = cursor_set(&couple.outer, &key, &data, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { if (rc != MDBX_NOTFOUND || !(user_flags & MDBX_CREATE)) - goto early_bailout; + goto bailout; } else { /* make sure this is actually a table */ MDBX_node *node = page_node(couple.outer.mc_pg[couple.outer.mc_top], couple.outer.mc_ki[couple.outer.mc_top]); if (unlikely((node_flags(node) & (F_DUPDATA | F_SUBDATA)) != F_SUBDATA)) { rc = MDBX_INCOMPATIBLE; - goto early_bailout; + goto bailout; } - if (!MDBX_DISABLE_PAGECHECKS && unlikely(data.iov_len != sizeof(MDBX_db))) { + if (!MDBX_DISABLE_VALIDATION && unlikely(data.iov_len != sizeof(MDBX_db))) { rc = MDBX_CORRUPTED; - goto early_bailout; + goto bailout; } } if (rc != MDBX_SUCCESS && unlikely(txn->mt_flags & MDBX_TXN_RDONLY)) { rc = MDBX_EACCESS; - goto early_bailout; + goto bailout; } /* Done here so we cannot fail after creating a new DB */ - char *namedup = mdbx_strdup(table_name); - if (unlikely(!namedup)) { - rc = MDBX_ENOMEM; - goto early_bailout; - } + if (key.iov_len) { + clone = osal_malloc(key.iov_len); + if (unlikely(!clone)) { + rc = MDBX_ENOMEM; + goto bailout; + } + key.iov_base = memcpy(clone, key.iov_base, key.iov_len); + } else + key.iov_base = ""; - int err = mdbx_fastmutex_acquire(&env->me_dbi_lock); + int err = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(err != MDBX_SUCCESS)) { rc = err; - mdbx_free(namedup); - goto early_bailout; + goto bailout; } + locked = true; /* Import handles from env */ dbi_import_locked(txn); /* Rescan after 
mutex acquisition & import handles */ for (slot = scan = txn->mt_numdbs; --scan >= CORE_DBS;) { - if (!txn->mt_dbxs[scan].md_name.iov_len) { + if (!txn->mt_dbxs[scan].md_name.iov_base) { /* Remember this free slot */ slot = scan; continue; } - if (len == txn->mt_dbxs[scan].md_name.iov_len && - !strncmp(table_name, txn->mt_dbxs[scan].md_name.iov_base, len)) { - rc = mdbx_dbi_bind(txn, scan, user_flags, keycmp, datacmp); + if (key.iov_len == txn->mt_dbxs[scan].md_name.iov_len && + !memcmp(key.iov_base, txn->mt_dbxs[scan].md_name.iov_base, + key.iov_len)) { + rc = dbi_bind(txn, scan, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) - goto later_bailout; - *dbi = scan; - goto later_exit; + goto bailout; + slot = scan; + goto done; } } if (unlikely(slot >= env->me_maxdbs)) { rc = MDBX_DBS_FULL; - goto later_bailout; + goto bailout; } unsigned dbiflags = DBI_FRESH | DBI_VALID | DBI_USRVALID; MDBX_db db_dummy; if (unlikely(rc)) { /* MDBX_NOTFOUND and MDBX_CREATE: Create new DB */ - mdbx_tassert(txn, rc == MDBX_NOTFOUND); + tASSERT(txn, rc == MDBX_NOTFOUND); memset(&db_dummy, 0, sizeof(db_dummy)); db_dummy.md_root = P_INVALID; db_dummy.md_mod_txnid = txn->mt_txnid; db_dummy.md_flags = user_flags & DB_PERSISTENT_FLAGS; data.iov_len = sizeof(db_dummy); data.iov_base = &db_dummy; - WITH_CURSOR_TRACKING(couple.outer, - rc = mdbx_cursor_put(&couple.outer, &key, &data, - F_SUBDATA | MDBX_NOOVERWRITE)); - + WITH_CURSOR_TRACKING( + couple.outer, rc = cursor_put_checklen(&couple.outer, &key, &data, + F_SUBDATA | MDBX_NOOVERWRITE)); if (unlikely(rc != MDBX_SUCCESS)) - goto later_bailout; + goto bailout; dbiflags |= DBI_DIRTY | DBI_CREAT; txn->mt_flags |= MDBX_TXN_DIRTY; - mdbx_tassert(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); + tASSERT(txn, (txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY) != 0); } /* Got info, register DBI in this txn */ memset(txn->mt_dbxs + slot, 0, sizeof(MDBX_dbx)); memcpy(&txn->mt_dbs[slot], data.iov_base, sizeof(MDBX_db)); 
env->me_dbflags[slot] = 0; - rc = mdbx_dbi_bind(txn, slot, user_flags, keycmp, datacmp); + rc = dbi_bind(txn, slot, user_flags, keycmp, datacmp); if (unlikely(rc != MDBX_SUCCESS)) { - mdbx_tassert(txn, (dbiflags & DBI_CREAT) == 0); - later_bailout: - *dbi = 0; - later_exit: - mdbx_free(namedup); - } else { - txn->mt_dbistate[slot] = (uint8_t)dbiflags; - txn->mt_dbxs[slot].md_name.iov_base = namedup; - txn->mt_dbxs[slot].md_name.iov_len = len; - txn->mt_dbiseqs[slot] = env->me_dbiseqs[slot] = dbi_seq(env, slot); - if (!(dbiflags & DBI_CREAT)) - env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; - if (txn->mt_numdbs == slot) { - mdbx_compiler_barrier(); - txn->mt_numdbs = slot + 1; - txn->mt_cursors[slot] = NULL; - } - if (env->me_numdbs <= slot) - env->me_numdbs = slot + 1; - *dbi = slot; + tASSERT(txn, (dbiflags & DBI_CREAT) == 0); + goto bailout; } - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); - return rc; + txn->mt_dbistate[slot] = (uint8_t)dbiflags; + txn->mt_dbxs[slot].md_name = key; + txn->mt_dbiseqs[slot].weak = env->me_dbiseqs[slot].weak = dbi_seq(env, slot); + if (!(dbiflags & DBI_CREAT)) + env->me_dbflags[slot] = txn->mt_dbs[slot].md_flags | DB_VALID; + if (txn->mt_numdbs == slot) { + txn->mt_cursors[slot] = NULL; + osal_compiler_barrier(); + txn->mt_numdbs = slot + 1; + } + if (env->me_numdbs <= slot) { + osal_memory_fence(mo_AcquireRelease, true); + env->me_numdbs = slot + 1; + } + +done: + *dbi = slot; + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + return MDBX_SUCCESS; } -int mdbx_dbi_open(MDBX_txn *txn, const char *table_name, - MDBX_db_flags_t table_flags, MDBX_dbi *dbi) { - return dbi_open(txn, table_name, table_flags, dbi, nullptr, nullptr); +static int dbi_open_cstr(MDBX_txn *txn, const char *name_cstr, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + MDBX_val thunk, *name; + if (name_cstr == MDBX_PGWALK_MAIN || name_cstr == 
MDBX_PGWALK_GC || + name_cstr == MDBX_PGWALK_META) + name = (void *)name_cstr; + else { + thunk.iov_len = strlen(name_cstr); + thunk.iov_base = (void *)name_cstr; + name = &thunk; + } + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } -int mdbx_dbi_open_ex(MDBX_txn *txn, const char *table_name, - MDBX_db_flags_t table_flags, MDBX_dbi *dbi, - MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { - return dbi_open(txn, table_name, table_flags, dbi, keycmp, datacmp); +int mdbx_dbi_open(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open_cstr(txn, name, flags, dbi, nullptr, nullptr); +} + +int mdbx_dbi_open2(MDBX_txn *txn, const MDBX_val *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi) { + return dbi_open(txn, name, flags, dbi, nullptr, nullptr); +} + +int mdbx_dbi_open_ex(MDBX_txn *txn, const char *name, MDBX_db_flags_t flags, + MDBX_dbi *dbi, MDBX_cmp_func *keycmp, + MDBX_cmp_func *datacmp) { + return dbi_open_cstr(txn, name, flags, dbi, keycmp, datacmp); +} + +int mdbx_dbi_open_ex2(MDBX_txn *txn, const MDBX_val *name, + MDBX_db_flags_t flags, MDBX_dbi *dbi, + MDBX_cmp_func *keycmp, MDBX_cmp_func *datacmp) { + return dbi_open(txn, name, flags, dbi, keycmp, datacmp); } __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, @@ -20658,7 +22809,7 @@ __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_BAD_TXN; if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { - rc = mdbx_fetch_sdb(txn, dbi); + rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -20668,28 +22819,28 @@ __cold int mdbx_dbi_stat(MDBX_txn *txn, MDBX_dbi dbi, MDBX_stat *dest, return MDBX_SUCCESS; } -static int mdbx_dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { - mdbx_assert(env, dbi >= CORE_DBS); +static int dbi_close_locked(MDBX_env *env, MDBX_dbi dbi) { + eASSERT(env, dbi >= CORE_DBS); if (unlikely(dbi >= env->me_numdbs)) return MDBX_BAD_DBI; - char *ptr = env->me_dbxs[dbi].md_name.iov_base; + 
char *const ptr = env->me_dbxs[dbi].md_name.iov_base; /* If there was no name, this was already closed */ if (unlikely(!ptr)) return MDBX_BAD_DBI; env->me_dbflags[dbi] = 0; env->me_dbxs[dbi].md_name.iov_len = 0; - mdbx_memory_fence(mo_AcquireRelease, true); + osal_memory_fence(mo_AcquireRelease, true); env->me_dbxs[dbi].md_name.iov_base = NULL; - mdbx_free(ptr); + osal_free(ptr); if (env->me_numdbs == dbi + 1) { - unsigned i = env->me_numdbs; + size_t i = env->me_numdbs; do --i; while (i > CORE_DBS && !env->me_dbxs[i - 1].md_name.iov_base); - env->me_numdbs = i; + env->me_numdbs = (MDBX_dbi)i; } return MDBX_SUCCESS; @@ -20700,15 +22851,21 @@ int mdbx_dbi_close(MDBX_env *env, MDBX_dbi dbi) { if (unlikely(rc != MDBX_SUCCESS)) return rc; + if (unlikely(dbi < CORE_DBS)) + return (dbi == MAIN_DBI) ? MDBX_SUCCESS : MDBX_BAD_DBI; + + if (unlikely(dbi >= env->me_maxdbs)) + return MDBX_BAD_DBI; + if (unlikely(dbi < CORE_DBS || dbi >= env->me_maxdbs)) return MDBX_BAD_DBI; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + rc = osal_fastmutex_acquire(&env->me_dbi_lock); if (likely(rc == MDBX_SUCCESS)) { rc = (dbi < env->me_maxdbs && (env->me_dbflags[dbi] & DB_VALID)) - ? mdbx_dbi_close_locked(env, dbi) + ? dbi_close_locked(env, dbi) : MDBX_BAD_DBI; - mdbx_ensure(env, mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } return rc; } @@ -20738,21 +22895,22 @@ int mdbx_dbi_flags(MDBX_txn *txn, MDBX_dbi dbi, unsigned *flags) { } #endif /* LIBMDBX_NO_EXPORTS_LEGACY_API */ -static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { - int rc = mdbx_page_search(mc, NULL, MDBX_PS_FIRST); +static int drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { + int rc = page_search(mc, NULL, MDBX_PS_FIRST); if (likely(rc == MDBX_SUCCESS)) { MDBX_txn *txn = mc->mc_txn; /* DUPSORT sub-DBs have no ovpages/DBs. Omit scanning leaves. * This also avoids any P_LEAF2 pages, which have no nodes. 
- * Also if the DB doesn't have sub-DBs and has no overflow + * Also if the DB doesn't have sub-DBs and has no large/overflow * pages, omit scanning leaves. */ if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) - mdbx_cursor_pop(mc); + cursor_pop(mc); - rc = mdbx_pnl_need(&txn->tw.retired_pages, - mc->mc_db->md_branch_pages + mc->mc_db->md_leaf_pages + - mc->mc_db->md_overflow_pages); + rc = pnl_need(&txn->tw.retired_pages, + (size_t)mc->mc_db->md_branch_pages + + (size_t)mc->mc_db->md_leaf_pages + + (size_t)mc->mc_db->md_overflow_pages); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; @@ -20760,13 +22918,13 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { cursor_copy(mc, &mx); while (mc->mc_snum > 0) { MDBX_page *const mp = mc->mc_pg[mc->mc_top]; - const unsigned nkeys = page_numkeys(mp); + const size_t nkeys = page_numkeys(mp); if (IS_LEAF(mp)) { - mdbx_cassert(mc, mc->mc_snum == mc->mc_db->md_depth); - for (unsigned i = 0; i < nkeys; i++) { + cASSERT(mc, mc->mc_snum == mc->mc_db->md_depth); + for (size_t i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); if (node_flags(node) & F_BIGDATA) { - rc = mdbx_page_retire_ex(mc, node_largedata_pgno(node), NULL, 0); + rc = page_retire_ex(mc, node_largedata_pgno(node), nullptr, 0); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; if (!(may_have_subDBs | mc->mc_db->md_overflow_pages)) @@ -20776,53 +22934,52 @@ static int mdbx_drop_tree(MDBX_cursor *mc, const bool may_have_subDBs) { rc = /* disallowing implicit subDB deletion */ MDBX_INCOMPATIBLE; goto bailout; } - rc = mdbx_xcursor_init1(mc, node, mp); + rc = cursor_xinit1(mc, node, mp); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; - rc = mdbx_drop_tree(&mc->mc_xcursor->mx_cursor, false); + rc = drop_tree(&mc->mc_xcursor->mx_cursor, false); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } } } else { - mdbx_cassert(mc, mc->mc_snum < mc->mc_db->md_depth); - if (mdbx_audit_enabled()) - mc->mc_flags |= C_RETIRING; - const int pagetype 
= - (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + - ((mc->mc_snum + 1 == mc->mc_db->md_depth) ? P_LEAF : P_BRANCH); - for (unsigned i = 0; i < nkeys; i++) { + cASSERT(mc, mc->mc_snum < mc->mc_db->md_depth); + mc->mc_checking |= CC_RETIRING; + const unsigned pagetype = (IS_FROZEN(txn, mp) ? P_FROZEN : 0) + + ((mc->mc_snum + 1 == mc->mc_db->md_depth) + ? (mc->mc_checking & (P_LEAF | P_LEAF2)) + : P_BRANCH); + for (size_t i = 0; i < nkeys; i++) { MDBX_node *node = page_node(mp, i); - mdbx_tassert(txn, (node_flags(node) & - (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); + tASSERT(txn, (node_flags(node) & + (F_BIGDATA | F_SUBDATA | F_DUPDATA)) == 0); const pgno_t pgno = node_pgno(node); - rc = mdbx_page_retire_ex(mc, pgno, NULL, pagetype); + rc = page_retire_ex(mc, pgno, nullptr, pagetype); if (unlikely(rc != MDBX_SUCCESS)) goto bailout; } - if (mdbx_audit_enabled()) - mc->mc_flags -= C_RETIRING; + mc->mc_checking -= CC_RETIRING; } if (!mc->mc_top) break; - mdbx_cassert(mc, nkeys > 0); + cASSERT(mc, nkeys > 0); mc->mc_ki[mc->mc_top] = (indx_t)nkeys; - rc = mdbx_cursor_sibling(mc, SIBLING_RIGHT); + rc = cursor_sibling(mc, SIBLING_RIGHT); if (unlikely(rc != MDBX_SUCCESS)) { if (unlikely(rc != MDBX_NOTFOUND)) goto bailout; /* no more siblings, go back to beginning * of previous level. 
*/ pop: - mdbx_cursor_pop(mc); + cursor_pop(mc); mc->mc_ki[0] = 0; - for (unsigned i = 1; i < mc->mc_snum; i++) { + for (size_t i = 1; i < mc->mc_snum; i++) { mc->mc_ki[i] = 0; mc->mc_pg[i] = mx.mc_pg[i]; } } } - rc = mdbx_page_retire(mc, mc->mc_pg[0]); + rc = page_retire(mc, mc->mc_pg[0]); bailout: if (unlikely(rc != MDBX_SUCCESS)) txn->mt_flags |= MDBX_TXN_ERROR; @@ -20843,8 +23000,8 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { if (unlikely(rc != MDBX_SUCCESS)) return rc; - rc = mdbx_drop_tree(mc, dbi == MAIN_DBI || - (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); + rc = drop_tree(mc, + dbi == MAIN_DBI || (mc->mc_db->md_flags & MDBX_DUPSORT) != 0); /* Invalidate the dropped DB's cursors */ for (MDBX_cursor *m2 = txn->mt_cursors[dbi]; m2; m2 = m2->mc_next) m2->mc_flags &= ~(C_INITIALIZED | C_EOF); @@ -20853,20 +23010,19 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { /* Can't delete the main DB */ if (del && dbi >= CORE_DBS) { - rc = mdbx_del0(txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); + rc = delete (txn, MAIN_DBI, &mc->mc_dbx->md_name, NULL, F_SUBDATA); if (likely(rc == MDBX_SUCCESS)) { - mdbx_tassert(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); - mdbx_tassert(txn, txn->mt_flags & MDBX_TXN_DIRTY); + tASSERT(txn, txn->mt_dbistate[MAIN_DBI] & DBI_DIRTY); + tASSERT(txn, txn->mt_flags & MDBX_TXN_DIRTY); txn->mt_dbistate[dbi] = DBI_STALE; MDBX_env *env = txn->mt_env; - rc = mdbx_fastmutex_acquire(&env->me_dbi_lock); + rc = osal_fastmutex_acquire(&env->me_dbi_lock); if (unlikely(rc != MDBX_SUCCESS)) { txn->mt_flags |= MDBX_TXN_ERROR; goto bailout; } - mdbx_dbi_close_locked(env, dbi); - mdbx_ensure(env, - mdbx_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); + dbi_close_locked(env, dbi); + ENSURE(env, osal_fastmutex_release(&env->me_dbi_lock) == MDBX_SUCCESS); } else { txn->mt_flags |= MDBX_TXN_ERROR; } @@ -20880,7 +23036,6 @@ int mdbx_drop(MDBX_txn *txn, MDBX_dbi dbi, bool del) { txn->mt_dbs[dbi].md_entries = 0; 
txn->mt_dbs[dbi].md_root = P_INVALID; txn->mt_dbs[dbi].md_seq = 0; - /* txn->mt_dbs[dbi].md_mod_txnid = txn->mt_txnid; */ txn->mt_flags |= MDBX_TXN_DIRTY; } @@ -20926,9 +23081,9 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, int serial = 0; MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (likely(lck)) { - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; i++) { + for (size_t i = 0; i < snap_nreaders; i++) { const MDBX_reader *r = lck->mti_readers + i; retry_reader:; const uint32_t pid = atomic_load32(&r->mr_pid, mo_AcquireRelease); @@ -20950,7 +23105,7 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, atomic_load64(&r->mr_snapshot_pages_retired, mo_Relaxed))) goto retry_reader; - mdbx_assert(env, txnid > 0); + eASSERT(env, txnid > 0); if (txnid >= SAFE64_INVALID_THRESHOLD) txnid = 0; @@ -20958,28 +23113,26 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, size_t bytes_retained = 0; uint64_t lag = 0; if (txnid) { + meta_troika_t troika = meta_tap(env); retry_header:; - volatile const MDBX_meta *const recent_meta = meta_prefer_last(env); + const meta_ptr_t head = meta_recent(env, &troika); const uint64_t head_pages_retired = - unaligned_peek_u64_volatile(4, recent_meta->mm_pages_retired); - const txnid_t head_txnid = meta_txnid(env, recent_meta); - mdbx_compiler_barrier(); - if (unlikely(recent_meta != meta_prefer_last(env) || + unaligned_peek_u64_volatile(4, head.ptr_v->mm_pages_retired); + if (unlikely(meta_should_retry(env, &troika) || head_pages_retired != unaligned_peek_u64_volatile( - 4, recent_meta->mm_pages_retired)) || - head_txnid != meta_txnid(env, recent_meta)) + 4, head.ptr_v->mm_pages_retired))) goto retry_header; - lag = (head_txnid - txnid) / xMDBX_TXNID_STEP; + lag = (head.txnid - txnid) / xMDBX_TXNID_STEP; bytes_used = pgno2bytes(env, 
pages_used); bytes_retained = (head_pages_retired > reader_pages_retired) ? pgno2bytes(env, (pgno_t)(head_pages_retired - reader_pages_retired)) : 0; } - rc = func(ctx, ++serial, i, pid, (mdbx_tid_t)((intptr_t)tid), txnid, lag, - bytes_used, bytes_retained); + rc = func(ctx, ++serial, (unsigned)i, pid, (mdbx_tid_t)((intptr_t)tid), + txnid, lag, bytes_used, bytes_retained); if (unlikely(rc != MDBX_SUCCESS)) break; } @@ -20990,15 +23143,15 @@ __cold int mdbx_reader_list(const MDBX_env *env, MDBX_reader_list_func *func, /* Insert pid into list if not already present. * return -1 if already present. */ -__cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) { +__cold static bool pid_insert(uint32_t *ids, uint32_t pid) { /* binary search of pid in list */ - unsigned base = 0; - unsigned cursor = 1; + size_t base = 0; + size_t cursor = 1; int val = 0; - unsigned n = ids[0]; + size_t n = ids[0]; while (n > 0) { - unsigned pivot = n >> 1; + size_t pivot = n >> 1; cursor = base + pivot + 1; val = pid - ids[cursor]; @@ -21026,20 +23179,20 @@ __cold static bool mdbx_pid_insert(uint32_t *ids, uint32_t pid) { __cold int mdbx_reader_check(MDBX_env *env, int *dead) { if (dead) *dead = 0; - return mdbx_cleanup_dead_readers(env, false, dead); + return cleanup_dead_readers(env, false, dead); } /* Return: * MDBX_RESULT_TRUE - done and mutex recovered * MDBX_SUCCESS - done * Otherwise errcode. 
*/ -__cold MDBX_INTERNAL_FUNC int -mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { +__cold MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, + int rdt_locked, int *dead) { int rc = check_env(env, true); if (unlikely(rc != MDBX_SUCCESS)) return rc; - mdbx_assert(env, rdt_locked >= 0); + eASSERT(env, rdt_locked >= 0); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; if (unlikely(lck == NULL)) { /* exclusive mode */ @@ -21048,40 +23201,40 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { return MDBX_SUCCESS; } - const unsigned snap_nreaders = + const size_t snap_nreaders = atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); uint32_t pidsbuf_onstask[142]; uint32_t *const pids = (snap_nreaders < ARRAY_LENGTH(pidsbuf_onstask)) ? pidsbuf_onstask - : mdbx_malloc((snap_nreaders + 1) * sizeof(uint32_t)); + : osal_malloc((snap_nreaders + 1) * sizeof(uint32_t)); if (unlikely(!pids)) return MDBX_ENOMEM; pids[0] = 0; int count = 0; - for (unsigned i = 0; i < snap_nreaders; i++) { + for (size_t i = 0; i < snap_nreaders; i++) { const uint32_t pid = atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease); if (pid == 0) continue /* skip empty */; if (pid == env->me_pid) continue /* skip self */; - if (!mdbx_pid_insert(pids, pid)) + if (!pid_insert(pids, pid)) continue /* such pid already processed */; - int err = mdbx_rpid_check(env, pid); + int err = osal_rpid_check(env, pid); if (err == MDBX_RESULT_TRUE) continue /* reader is live */; if (err != MDBX_SUCCESS) { rc = err; - break /* mdbx_rpid_check() failed */; + break /* osal_rpid_check() failed */; } /* stale reader found */ if (!rdt_locked) { - err = mdbx_rdt_lock(env); + err = osal_rdt_lock(env); if (MDBX_IS_ERROR(err)) { rc = err; break; @@ -21098,7 +23251,7 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { if (lck->mti_readers[i].mr_pid.weak != pid) continue; - err = mdbx_rpid_check(env, pid); + err = osal_rpid_check(env, pid); if 
(MDBX_IS_ERROR(err)) { rc = err; break; @@ -21109,10 +23262,10 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { } /* clean it */ - for (unsigned j = i; j < snap_nreaders; j++) { + for (size_t j = i; j < snap_nreaders; j++) { if (lck->mti_readers[j].mr_pid.weak == pid) { - mdbx_debug("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, - (size_t)pid, lck->mti_readers[j].mr_txnid.weak); + DEBUG("clear stale reader pid %" PRIuPTR " txn %" PRIaTXN, (size_t)pid, + lck->mti_readers[j].mr_txnid.weak); atomic_store32(&lck->mti_readers[j].mr_pid, 0, mo_Relaxed); atomic_store32(&lck->mti_readers_refresh_flag, true, mo_AcquireRelease); count++; @@ -21121,25 +23274,25 @@ mdbx_cleanup_dead_readers(MDBX_env *env, int rdt_locked, int *dead) { } if (likely(!MDBX_IS_ERROR(rc))) - atomic_store64(&lck->mti_reader_check_timestamp, mdbx_osal_monotime(), + atomic_store64(&lck->mti_reader_check_timestamp, osal_monotime(), mo_Relaxed); if (rdt_locked < 0) - mdbx_rdt_unlock(env); + osal_rdt_unlock(env); if (pids != pidsbuf_onstask) - mdbx_free(pids); + osal_free(pids); if (dead) *dead = count; return rc; } -__cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { - const int rc = mdbx_runtime_flags | (mdbx_loglevel << 16); +__cold int mdbx_setup_debug(int level, int flags, MDBX_debug_func *logger) { + const int rc = runtime_flags | (loglevel << 16); - if (loglevel != MDBX_LOG_DONTCHANGE) - mdbx_loglevel = (uint8_t)loglevel; + if (level != MDBX_LOG_DONTCHANGE) + loglevel = (uint8_t)level; if (flags != MDBX_DBG_DONTCHANGE) { flags &= @@ -21148,111 +23301,107 @@ __cold int mdbx_setup_debug(int loglevel, int flags, MDBX_debug_func *logger) { #endif MDBX_DBG_DUMP | MDBX_DBG_LEGACY_MULTIOPEN | MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE; - mdbx_runtime_flags = (uint8_t)flags; + runtime_flags = (uint8_t)flags; } if (logger != MDBX_LOGGER_DONTCHANGE) - mdbx_debug_logger = logger; + debug_logger = logger; return rc; } -__cold static txnid_t 
mdbx_kick_longlived_readers(MDBX_env *env, - const txnid_t laggard) { - mdbx_debug("DB size maxed out by reading #%" PRIaTXN, laggard); +__cold static txnid_t kick_longlived_readers(MDBX_env *env, + const txnid_t laggard) { + DEBUG("DB size maxed out by reading #%" PRIaTXN, laggard); + osal_memory_fence(mo_AcquireRelease, false); + MDBX_hsr_func *const callback = env->me_hsr_callback; + txnid_t oldest = 0; + bool notify_eof_of_loop = false; + int retry = 0; + do { + const txnid_t steady = + env->me_txn->tw.troika.txnid[env->me_txn->tw.troika.prefer_steady]; + env->me_lck->mti_readers_refresh_flag.weak = /* force refresh */ true; + oldest = find_oldest_reader(env, steady); + eASSERT(env, oldest < env->me_txn0->mt_txnid); + eASSERT(env, oldest >= laggard); + eASSERT(env, oldest >= env->me_lck->mti_oldest_reader.weak); - int retry; - for (retry = 0; retry < INT_MAX; ++retry) { - txnid_t oldest = mdbx_recent_steady_txnid(env); - mdbx_assert(env, oldest < env->me_txn0->mt_txnid); - mdbx_assert(env, oldest >= laggard); - mdbx_assert(env, oldest >= env->me_lck->mti_oldest_reader.weak); MDBX_lockinfo *const lck = env->me_lck_mmap.lck; - if (oldest == laggard || unlikely(!lck /* without-LCK mode */)) - return oldest; - - if (MDBX_IS_ERROR(mdbx_cleanup_dead_readers(env, false, NULL))) + if (oldest == steady || oldest > laggard || /* without-LCK mode */ !lck) break; - MDBX_reader *asleep = nullptr; - uint64_t oldest_retired = UINT64_MAX; - const unsigned snap_nreaders = - atomic_load32(&lck->mti_numreaders, mo_AcquireRelease); - for (unsigned i = 0; i < snap_nreaders; ++i) { - retry: - if (atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { - /* mdbx_jitter4testing(true); */ - const uint64_t snap_retired = atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); - const txnid_t snap_txnid = safe64_read(&lck->mti_readers[i].mr_txnid); - if (unlikely(snap_retired != - atomic_load64( - &lck->mti_readers[i].mr_snapshot_pages_retired, - 
mo_AcquireRelease) || - snap_txnid != safe64_read(&lck->mti_readers[i].mr_txnid))) - goto retry; - if (oldest > snap_txnid && - laggard <= /* ignore pending updates */ snap_txnid) { - oldest = snap_txnid; - oldest_retired = snap_retired; - asleep = &lck->mti_readers[i]; - } + if (MDBX_IS_ERROR(cleanup_dead_readers(env, false, NULL))) + break; + + if (!callback) + break; + + MDBX_reader *stucked = nullptr; + uint64_t hold_retired = 0; + for (size_t i = 0; i < lck->mti_numreaders.weak; ++i) { + const uint64_t snap_retired = atomic_load64( + &lck->mti_readers[i].mr_snapshot_pages_retired, mo_Relaxed); + const txnid_t rtxn = safe64_read(&lck->mti_readers[i].mr_txnid); + if (rtxn == laggard && + atomic_load32(&lck->mti_readers[i].mr_pid, mo_AcquireRelease)) { + hold_retired = snap_retired; + stucked = &lck->mti_readers[i]; } } - if (laggard < oldest || !asleep) { - if (retry && env->me_hsr_callback) { - /* LY: notify end of hsr-loop */ - const txnid_t gap = oldest - laggard; - env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, - (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, 0, - -retry); - } - mdbx_notice("hsr-kick: update oldest %" PRIaTXN " -> %" PRIaTXN, - lck->mti_oldest_reader.weak, oldest); - mdbx_assert(env, lck->mti_oldest_reader.weak <= oldest); - return atomic_store64(&lck->mti_oldest_reader, oldest, mo_Relaxed); - } - - if (!env->me_hsr_callback) + if (!stucked) break; - uint32_t pid = atomic_load32(&asleep->mr_pid, mo_AcquireRelease); - uint64_t tid = asleep->mr_tid.weak; - if (safe64_read(&asleep->mr_txnid) != laggard || pid <= 0) + uint32_t pid = atomic_load32(&stucked->mr_pid, mo_AcquireRelease); + uint64_t tid = atomic_load64(&stucked->mr_tid, mo_AcquireRelease); + if (safe64_read(&stucked->mr_txnid) != laggard || !pid || + stucked->mr_snapshot_pages_retired.weak != hold_retired) continue; - const MDBX_meta *head_meta = constmeta_prefer_last(env); - const txnid_t gap = - (constmeta_txnid(env, head_meta) - laggard) / xMDBX_TXNID_STEP; + const meta_ptr_t head = meta_recent(env, &env->me_txn->tw.troika); + const txnid_t gap = (head.txnid - laggard) / xMDBX_TXNID_STEP; const uint64_t head_retired = - unaligned_peek_u64(4, head_meta->mm_pages_retired); + unaligned_peek_u64(4, head.ptr_c->mm_pages_retired); const size_t space = - (head_retired > oldest_retired) - ? pgno2bytes(env, (pgno_t)(head_retired - oldest_retired)) + (head_retired > hold_retired) + ? pgno2bytes(env, (pgno_t)(head_retired - hold_retired)) : 0; - int rc = env->me_hsr_callback( - env, env->me_txn, pid, (mdbx_tid_t)((intptr_t)tid), laggard, - (gap < UINT_MAX) ? (unsigned)gap : UINT_MAX, space, retry); + int rc = + callback(env, env->me_txn, pid, (mdbx_tid_t)((intptr_t)tid), laggard, + (gap < UINT_MAX) ? 
(unsigned)gap : UINT_MAX, space, retry); if (rc < 0) + /* hsr returned error and/or agree MDBX_MAP_FULL error */ break; if (rc > 0) { if (rc == 1) { - safe64_reset_compare(&asleep->mr_txnid, laggard); + /* hsr reported transaction (will be) aborted asynchronous */ + safe64_reset_compare(&stucked->mr_txnid, laggard); } else { - safe64_reset(&asleep->mr_txnid, true); - atomic_store64(&asleep->mr_tid, 0, mo_Relaxed); - atomic_store32(&asleep->mr_pid, 0, mo_Relaxed); + /* hsr reported reader process was killed and slot should be cleared */ + safe64_reset(&stucked->mr_txnid, true); + atomic_store64(&stucked->mr_tid, 0, mo_Relaxed); + atomic_store32(&stucked->mr_pid, 0, mo_AcquireRelease); } - atomic_store32(&lck->mti_readers_refresh_flag, true, mo_Relaxed); + } else if (!notify_eof_of_loop) { +#if MDBX_ENABLE_PROFGC + env->me_lck->mti_pgop_stat.gc_prof.kicks += 1; +#endif /* MDBX_ENABLE_PROFGC */ + notify_eof_of_loop = true; } - } - if (retry && env->me_hsr_callback) { - /* LY: notify end of hsr-loop */ - env->me_hsr_callback(env, env->me_txn, 0, 0, laggard, 0, 0, -retry); + } while (++retry < INT_MAX); + + if (notify_eof_of_loop) { + /* notify end of hsr-loop */ + const txnid_t turn = oldest - laggard; + if (turn) + NOTICE("hsr-kick: done turn %" PRIaTXN " -> %" PRIaTXN " +%" PRIaTXN, + laggard, oldest, turn); + callback(env, env->me_txn, 0, 0, laggard, + (turn < UINT_MAX) ? 
(unsigned)turn : UINT_MAX, 0, -retry); } - return mdbx_find_oldest(env->me_txn); + return oldest; } #ifndef LIBMDBX_NO_EXPORTS_LEGACY_API @@ -21299,18 +23448,18 @@ int mdbx_txn_straggler(const MDBX_txn *txn, int *percent) return 0; } - txnid_t recent = 0; - volatile const MDBX_meta *meta = nullptr; + txnid_t lag; + meta_troika_t troika = meta_tap(env); do { - meta = meta_prefer_last(env); - recent = meta_txnid(env, meta); + const meta_ptr_t head = meta_recent(env, &troika); if (percent) { - const pgno_t maxpg = meta->mm_geo.now; - *percent = (int)((meta->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); + const pgno_t maxpg = head.ptr_v->mm_geo.now; + *percent = + (int)((head.ptr_v->mm_geo.next * UINT64_C(100) + maxpg / 2) / maxpg); } - } while (unlikely(recent != meta_txnid(env, meta))); + lag = (head.txnid - txn->mt_txnid) / xMDBX_TXNID_STEP; + } while (unlikely(meta_should_retry(env, &troika))); - txnid_t lag = (recent - txn->mt_txnid) / xMDBX_TXNID_STEP; return (lag > INT_MAX) ? INT_MAX : (int)lag; } @@ -21322,8 +23471,8 @@ typedef struct mdbx_walk_ctx { bool mw_dont_check_keys_ordering; } mdbx_walk_ctx_t; -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, - const char *name, int deep); +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, + const MDBX_val *name, int deep); static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { if (mp) @@ -21343,47 +23492,26 @@ static MDBX_page_type_t walk_page_type(const MDBX_page *mp) { } /* Depth-first tree traversal. 
*/ -__cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, - const char *name, int deep, - txnid_t parent_txnid) { +__cold static int walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, + const MDBX_val *name, int deep, + txnid_t parent_txnid) { assert(pgno != P_INVALID); MDBX_page *mp = nullptr; - int rc, err = mdbx_page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); - if (err == MDBX_SUCCESS) - err = mdbx_page_check(ctx->mw_cursor, mp, 0); + int err = page_get(ctx->mw_cursor, pgno, &mp, parent_txnid); MDBX_page_type_t type = walk_page_type(mp); - const int nentries = (mp && !IS_OVERFLOW(mp)) ? page_numkeys(mp) : 1; - unsigned npages = (mp && IS_OVERFLOW(mp)) ? mp->mp_pages : 1; + const size_t nentries = mp ? page_numkeys(mp) : 0; + unsigned npages = 1; size_t pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); - size_t header_size = (mp && !IS_LEAF2(mp) && !IS_OVERFLOW(mp)) - ? PAGEHDRSZ + mp->mp_lower - : PAGEHDRSZ; + size_t header_size = + (mp && !IS_LEAF2(mp)) ? PAGEHDRSZ + mp->mp_lower : PAGEHDRSZ; size_t payload_size = 0; size_t unused_size = - (mp && !IS_OVERFLOW(mp) ? page_room(mp) : pagesize - header_size) - - payload_size; + (mp ? page_room(mp) : pagesize - header_size) - payload_size; size_t align_bytes = 0; - if (err == MDBX_SUCCESS) { - /* LY: Don't use mask here, e.g bitwise - * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. 
*/ - switch (mp->mp_flags) { - default: - err = MDBX_CORRUPTED; - break; - case P_BRANCH: - if (unlikely(nentries < 2)) - err = MDBX_CORRUPTED; - case P_LEAF: - case P_LEAF | P_LEAF2: - break; - } - } - - for (int i = 0; err == MDBX_SUCCESS && i < nentries; - align_bytes += ((payload_size + align_bytes) & 1), i++) { + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; + align_bytes += ((payload_size + align_bytes) & 1), ++i) { if (type == MDBX_page_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node headers */ payload_size += mp->mp_leaf2_ksize; @@ -21411,26 +23539,19 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, const size_t over_header = PAGEHDRSZ; npages = 1; - MDBX_page *op; - err = mdbx_page_get(ctx->mw_cursor, large_pgno, &op, - pp_txnid4chk(mp, ctx->mw_txn)); - if (err == MDBX_SUCCESS) - err = mdbx_page_check(ctx->mw_cursor, op, 0); + assert(err == MDBX_SUCCESS); + pgr_t lp = page_get_large(ctx->mw_cursor, large_pgno, mp->mp_txnid); + err = lp.err; if (err == MDBX_SUCCESS) { - /* LY: Don't use mask here, e.g bitwise - * (P_BRANCH|P_LEAF|P_LEAF2|P_META|P_OVERFLOW|P_SUBP). - * Pages should not me marked dirty/loose or otherwise. */ - if (unlikely(P_OVERFLOW != op->mp_flags)) - err = bad_page(mp, "wrong page type %d for large data", op->mp_flags); - else - npages = op->mp_pages; + cASSERT(ctx->mw_cursor, PAGETYPE_WHOLE(lp.page) == P_OVERFLOW); + npages = lp.page->mp_pages; } pagesize = pgno2bytes(ctx->mw_txn->mt_env, npages); const size_t over_unused = pagesize - over_payload - over_header; - rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, name, - pagesize, MDBX_page_large, err, 1, over_payload, - over_header, over_unused); + const int rc = ctx->mw_visitor(large_pgno, npages, ctx->mw_user, deep, + name, pagesize, MDBX_page_large, err, 1, + over_payload, over_header, over_unused); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; } break; @@ -21438,24 +23559,29 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, case F_SUBDATA /* sub-db */: { const size_t namelen = node_ks(node); payload_size += node_ds(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) + if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } } break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: payload_size += sizeof(MDBX_db); - if (unlikely(node_ds(node) != sizeof(MDBX_db))) + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } break; case F_DUPDATA /* short sub-page */: { if (unlikely(node_ds(node) <= PAGEHDRSZ)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; break; } MDBX_page *sp = node_data(node); - const int nsubkeys = page_numkeys(sp); + const size_t nsubkeys = page_numkeys(sp); size_t subheader_size = IS_LEAF2(sp) ? PAGEHDRSZ : PAGEHDRSZ + sp->mp_lower; size_t subunused_size = page_room(sp); @@ -21463,7 +23589,7 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, size_t subalign_bytes = 0; MDBX_page_type_t subtype; - switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~0x10) { + switch (sp->mp_flags & /* ignore legacy P_DIRTY flag */ ~P_LEGACY_DIRTY) { case P_LEAF | P_SUBP: subtype = MDBX_subpage_leaf; break; @@ -21471,12 +23597,13 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, subtype = MDBX_subpage_dupfixed_leaf; break; default: + assert(err == MDBX_CORRUPTED); subtype = MDBX_subpage_broken; err = MDBX_CORRUPTED; } - for (int j = 0; err == MDBX_SUCCESS && j < nsubkeys; - subalign_bytes += ((subpayload_size + subalign_bytes) & 1), j++) { + for (size_t j = 0; err == MDBX_SUCCESS && j < nsubkeys; + subalign_bytes += ((subpayload_size + subalign_bytes) & 1), ++j) { if (subtype == MDBX_subpage_dupfixed_leaf) { /* LEAF2 pages have no mp_ptrs[] or node 
headers */ @@ -21485,14 +23612,17 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, assert(subtype == MDBX_subpage_leaf); MDBX_node *subnode = page_node(sp, j); subpayload_size += NODESIZE + node_ks(subnode) + node_ds(subnode); - if (unlikely(node_flags(subnode) != 0)) + if (unlikely(node_flags(subnode) != 0)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; + } } } - rc = ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), - subtype, err, nsubkeys, subpayload_size, - subheader_size, subunused_size + subalign_bytes); + const int rc = + ctx->mw_visitor(pgno, 0, ctx->mw_user, deep + 1, name, node_ds(node), + subtype, err, nsubkeys, subpayload_size, + subheader_size, subunused_size + subalign_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? MDBX_SUCCESS : rc; header_size += subheader_size; @@ -21502,24 +23632,25 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } break; default: + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; } } - rc = ctx->mw_visitor(pgno, 1, ctx->mw_user, deep, name, - ctx->mw_txn->mt_env->me_psize, type, err, nentries, - payload_size, header_size, unused_size + align_bytes); + const int rc = ctx->mw_visitor( + pgno, 1, ctx->mw_user, deep, name, ctx->mw_txn->mt_env->me_psize, type, + err, nentries, payload_size, header_size, unused_size + align_bytes); if (unlikely(rc != MDBX_SUCCESS)) return (rc == MDBX_RESULT_TRUE) ? 
MDBX_SUCCESS : rc; - for (int i = 0; err == MDBX_SUCCESS && i < nentries; i++) { + for (size_t i = 0; err == MDBX_SUCCESS && i < nentries; ++i) { if (type == MDBX_page_dupfixed_leaf) continue; MDBX_node *node = page_node(mp, i); if (type == MDBX_page_branch) { - err = mdbx_walk_tree(ctx, node_pgno(node), name, deep + 1, - pp_txnid4chk(mp, ctx->mw_txn)); + assert(err == MDBX_SUCCESS); + err = walk_tree(ctx, node_pgno(node), name, deep + 1, mp->mp_txnid); if (unlikely(err != MDBX_SUCCESS)) { if (err == MDBX_RESULT_TRUE) break; @@ -21529,50 +23660,44 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, } assert(type == MDBX_page_leaf); - MDBX_db db; switch (node_flags(node)) { default: continue; - case F_SUBDATA /* sub-db */: { - const size_t namelen = node_ks(node); - if (unlikely(namelen == 0 || node_ds(node) != sizeof(MDBX_db))) { + case F_SUBDATA /* sub-db */: + if (unlikely(node_ds(node) != sizeof(MDBX_db))) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; - break; - } - - char namebuf_onstask[64]; - char *const sub_name = (namelen < sizeof(namebuf_onstask)) - ? 
namebuf_onstask - : mdbx_malloc(namelen + 1); - if (sub_name) { - memcpy(sub_name, node_key(node), namelen); - sub_name[namelen] = 0; - memcpy(&db, node_data(node), sizeof(db)); - err = mdbx_walk_sdb(ctx, &db, sub_name, deep + 1); - if (sub_name != namebuf_onstask) - mdbx_free(sub_name); } else { - err = MDBX_ENOMEM; + MDBX_db db; + memcpy(&db, node_data(node), sizeof(db)); + const MDBX_val subdb_name = {node_key(node), node_ks(node)}; + assert(err == MDBX_SUCCESS); + err = walk_sdb(ctx, &db, &subdb_name, deep + 1); } - } break; + break; case F_SUBDATA | F_DUPDATA /* dupsorted sub-tree */: if (unlikely(node_ds(node) != sizeof(MDBX_db) || - ctx->mw_cursor->mc_xcursor == NULL)) + ctx->mw_cursor->mc_xcursor == NULL)) { + assert(err == MDBX_CORRUPTED); err = MDBX_CORRUPTED; - else { + } else { + MDBX_db db; memcpy(&db, node_data(node), sizeof(db)); assert(ctx->mw_cursor->mc_xcursor == &container_of(ctx->mw_cursor, MDBX_cursor_couple, outer)->inner); - ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; - err = mdbx_walk_tree(ctx, db.md_root, name, deep + 1, - pp_txnid4chk(mp, ctx->mw_txn)); - MDBX_xcursor *inner_xcursor = - container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); - MDBX_cursor_couple *couple = - container_of(inner_xcursor, MDBX_cursor_couple, inner); - ctx->mw_cursor = &couple->outer; + assert(err == MDBX_SUCCESS); + err = cursor_xinit1(ctx->mw_cursor, node, mp); + if (likely(err == MDBX_SUCCESS)) { + ctx->mw_cursor = &ctx->mw_cursor->mc_xcursor->mx_cursor; + err = walk_tree(ctx, db.md_root, name, deep + 1, mp->mp_txnid); + MDBX_xcursor *inner_xcursor = + container_of(ctx->mw_cursor, MDBX_xcursor, mx_cursor); + MDBX_cursor_couple *couple = + container_of(inner_xcursor, MDBX_cursor_couple, inner); + ctx->mw_cursor = &couple->outer; + } } break; } @@ -21581,25 +23706,28 @@ __cold static int mdbx_walk_tree(mdbx_walk_ctx_t *ctx, const pgno_t pgno, return MDBX_SUCCESS; } -__cold static int mdbx_walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const db, - const 
char *name, int deep) { - if (unlikely(db->md_root == P_INVALID)) +__cold static int walk_sdb(mdbx_walk_ctx_t *ctx, MDBX_db *const sdb, + const MDBX_val *name, int deep) { + if (unlikely(sdb->md_root == P_INVALID)) return MDBX_SUCCESS; /* empty db */ MDBX_cursor_couple couple; MDBX_dbx dbx = {.md_klen_min = INT_MAX}; uint8_t dbistate = DBI_VALID | DBI_AUDITED; - int rc = mdbx_couple_init(&couple, ~0u, ctx->mw_txn, db, &dbx, &dbistate); + int rc = couple_init(&couple, ~0u, ctx->mw_txn, sdb, &dbx, &dbistate); if (unlikely(rc != MDBX_SUCCESS)) return rc; - if (ctx->mw_dont_check_keys_ordering) { - couple.outer.mc_flags |= C_SKIPORD; - couple.inner.mx_cursor.mc_flags |= C_SKIPORD; - } + couple.outer.mc_checking |= ctx->mw_dont_check_keys_ordering + ? CC_SKIPORD | CC_PAGECHECK + : CC_PAGECHECK; + couple.inner.mx_cursor.mc_checking |= ctx->mw_dont_check_keys_ordering + ? CC_SKIPORD | CC_PAGECHECK + : CC_PAGECHECK; couple.outer.mc_next = ctx->mw_cursor; ctx->mw_cursor = &couple.outer; - rc = mdbx_walk_tree(ctx, db->md_root, name, deep, ctx->mw_txn->mt_txnid); + rc = walk_tree(ctx, sdb->md_root, name, deep, + sdb->md_mod_txnid ? sdb->md_mod_txnid : ctx->mw_txn->mt_txnid); ctx->mw_cursor = couple.outer.mc_next; return rc; } @@ -21623,9 +23751,9 @@ __cold int mdbx_env_pgwalk(MDBX_txn *txn, MDBX_pgvisitor_func *visitor, (txn->mt_env->me_psize - sizeof(MDBX_meta) - PAGEHDRSZ) * NUM_METAS); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); + rc = walk_sdb(&ctx, &txn->mt_dbs[FREE_DBI], MDBX_PGWALK_GC, 0); if (!MDBX_IS_ERROR(rc)) - rc = mdbx_walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); + rc = walk_sdb(&ctx, &txn->mt_dbs[MAIN_DBI], MDBX_PGWALK_MAIN, 0); return rc; } @@ -21671,7 +23799,7 @@ int mdbx_cursor_on_first(const MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED)) return mc->mc_db->md_entries ? 
MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; - for (unsigned i = 0; i < mc->mc_snum; ++i) { + for (size_t i = 0; i < mc->mc_snum; ++i) { if (mc->mc_ki[i]) return MDBX_RESULT_FALSE; } @@ -21690,8 +23818,8 @@ int mdbx_cursor_on_last(const MDBX_cursor *mc) { if (!(mc->mc_flags & C_INITIALIZED)) return mc->mc_db->md_entries ? MDBX_RESULT_FALSE : MDBX_RESULT_TRUE; - for (unsigned i = 0; i < mc->mc_snum; ++i) { - unsigned nkeys = page_numkeys(mc->mc_pg[i]); + for (size_t i = 0; i < mc->mc_snum; ++i) { + size_t nkeys = page_numkeys(mc->mc_pg[i]); if (mc->mc_ki[i] < nkeys - 1) return MDBX_RESULT_FALSE; } @@ -21718,8 +23846,8 @@ int mdbx_cursor_eof(const MDBX_cursor *mc) { struct diff_result { ptrdiff_t diff; - unsigned level; - int root_nkeys; + size_t level; + ptrdiff_t root_nkeys; }; /* calculates: r = x - y */ @@ -21753,18 +23881,18 @@ __hot static int cursor_diff(const MDBX_cursor *const __restrict x, while (likely(r->level < y->mc_snum && r->level < x->mc_snum)) { if (unlikely(y->mc_pg[r->level] != x->mc_pg[r->level])) { - mdbx_error("Mismatch cursors's pages at %u level", r->level); + ERROR("Mismatch cursors's pages at %zu level", r->level); return MDBX_PROBLEM; } - int nkeys = page_numkeys(y->mc_pg[r->level]); + intptr_t nkeys = page_numkeys(y->mc_pg[r->level]); assert(nkeys > 0); if (r->level == 0) r->root_nkeys = nkeys; - const int limit_ki = nkeys - 1; - const int x_ki = x->mc_ki[r->level]; - const int y_ki = y->mc_ki[r->level]; + const intptr_t limit_ki = nkeys - 1; + const intptr_t x_ki = x->mc_ki[r->level]; + const intptr_t y_ki = y->mc_ki[r->level]; r->diff = ((x_ki < limit_ki) ? x_ki : limit_ki) - ((y_ki < limit_ki) ? 
y_ki : limit_ki); if (r->diff == 0) { @@ -21918,7 +24046,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, cursor_copy(cursor, &next.outer); if (cursor->mc_db->md_flags & MDBX_DUPSORT) { next.outer.mc_xcursor = &next.inner; - rc = mdbx_xcursor_init0(&next.outer); + rc = cursor_xinit0(&next.outer); if (unlikely(rc != MDBX_SUCCESS)) return rc; MDBX_xcursor *mx = &container_of(cursor, MDBX_cursor_couple, outer)->inner; @@ -21944,7 +24072,7 @@ int mdbx_estimate_move(const MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, } next.outer.mc_signature = MDBX_MC_LIVE; - rc = mdbx_cursor_get(&next.outer, key, data, move_op); + rc = cursor_get(&next.outer, key, data, move_op); if (unlikely(rc != MDBX_SUCCESS && (rc != MDBX_NOTFOUND || !(next.outer.mc_flags & C_INITIALIZED)))) return rc; @@ -21976,7 +24104,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, MDBX_cursor_couple begin; /* LY: first, initialize cursor to refresh a DB in case it have DB_STALE */ - rc = mdbx_cursor_init(&begin.outer, txn, dbi); + rc = cursor_init(&begin.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; @@ -21992,7 +24120,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, return MDBX_SUCCESS; } MDBX_val stub = {0, 0}; - rc = mdbx_cursor_first(&begin.outer, &stub, &stub); + rc = cursor_first(&begin.outer, &stub, &stub); if (unlikely(end_key == MDBX_EPSILON)) { /* LY: FIRST..+epsilon case */ return (rc == MDBX_SUCCESS) @@ -22004,7 +24132,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (end_key == NULL) { /* LY: -epsilon..LAST case */ MDBX_val stub = {0, 0}; - rc = mdbx_cursor_last(&begin.outer, &stub, &stub); + rc = cursor_last(&begin.outer, &stub, &stub); return (rc == MDBX_SUCCESS) ? 
mdbx_cursor_count(&begin.outer, (size_t *)size_items) : rc; @@ -22021,7 +24149,7 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, (begin_key == end_key || begin.outer.mc_dbx->md_cmp(begin_key, end_key) == 0)) { /* LY: single key case */ - rc = mdbx_cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err; + rc = cursor_set(&begin.outer, begin_key, NULL, MDBX_SET).err; if (unlikely(rc != MDBX_SUCCESS)) { *size_items = 0; return (rc == MDBX_NOTFOUND) ? MDBX_SUCCESS : rc; @@ -22030,10 +24158,9 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, if (begin.outer.mc_xcursor != NULL) { MDBX_node *node = page_node(begin.outer.mc_pg[begin.outer.mc_top], begin.outer.mc_ki[begin.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { + if (node_flags(node) & F_DUPDATA) { /* LY: return the number of duplicates for given key */ - mdbx_tassert(txn, - begin.outer.mc_xcursor == &begin.inner && + tASSERT(txn, begin.outer.mc_xcursor == &begin.inner && (begin.inner.mx_cursor.mc_flags & C_INITIALIZED)); *size_items = (sizeof(*size_items) >= sizeof(begin.inner.mx_db.md_entries) || @@ -22044,8 +24171,8 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } return MDBX_SUCCESS; } else { - rc = mdbx_cursor_set(&begin.outer, begin_key, begin_data, - begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) + rc = cursor_set(&begin.outer, begin_key, begin_data, + begin_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) .err; } } @@ -22056,15 +24183,15 @@ int mdbx_estimate_range(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *begin_key, } MDBX_cursor_couple end; - rc = mdbx_cursor_init(&end.outer, txn, dbi); + rc = cursor_init(&end.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; if (!end_key) { MDBX_val stub = {0, 0}; - rc = mdbx_cursor_last(&end.outer, &stub, &stub); + rc = cursor_last(&end.outer, &stub, &stub); } else { - rc = mdbx_cursor_set(&end.outer, end_key, end_data, - end_data ? 
MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) + rc = cursor_set(&end.outer, end_key, end_data, + end_data ? MDBX_GET_BOTH_RANGE : MDBX_SET_RANGE) .err; } if (unlikely(rc != MDBX_SUCCESS)) { @@ -22157,7 +24284,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, return MDBX_EINVAL; MDBX_cursor_couple cx; - rc = mdbx_cursor_init(&cx.outer, txn, dbi); + rc = cursor_init(&cx.outer, txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; cx.outer.mc_next = txn->mt_cursors[dbi]; @@ -22174,7 +24301,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, /* убираем лишний бит, он был признаком запрошенного режима */ flags -= MDBX_NOOVERWRITE; - rc = mdbx_cursor_get(&cx.outer, &present_key, old_data, MDBX_GET_BOTH); + rc = cursor_set(&cx.outer, &present_key, old_data, MDBX_GET_BOTH).err; if (rc != MDBX_SUCCESS) goto bailout; } else { @@ -22182,7 +24309,7 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (unlikely(new_data && old_data->iov_base == new_data->iov_base)) return MDBX_EINVAL; MDBX_val present_data; - rc = mdbx_cursor_get(&cx.outer, &present_key, &present_data, MDBX_SET_KEY); + rc = cursor_set(&cx.outer, &present_key, &present_data, MDBX_SET_KEY).err; if (unlikely(rc != MDBX_SUCCESS)) { old_data->iov_base = NULL; old_data->iov_len = 0; @@ -22198,9 +24325,9 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, if (flags & MDBX_CURRENT) { /* disallow update/delete for multi-values */ MDBX_node *node = page_node(page, cx.outer.mc_ki[cx.outer.mc_top]); - if (F_ISSET(node_flags(node), F_DUPDATA)) { - mdbx_tassert(txn, XCURSOR_INITED(&cx.outer) && - cx.outer.mc_xcursor->mx_db.md_entries > 1); + if (node_flags(node) & F_DUPDATA) { + tASSERT(txn, XCURSOR_INITED(&cx.outer) && + cx.outer.mc_xcursor->mx_db.md_entries > 1); if (cx.outer.mc_xcursor->mx_db.md_entries > 1) { rc = MDBX_EMULTIVAL; goto bailout; @@ -22232,9 +24359,9 @@ int mdbx_replace_ex(MDBX_txn *txn, MDBX_dbi dbi, const MDBX_val *key, 
} if (likely(new_data)) - rc = mdbx_cursor_put(&cx.outer, key, new_data, flags); + rc = cursor_put_checklen(&cx.outer, key, new_data, flags); else - rc = mdbx_cursor_del(&cx.outer, flags & MDBX_ALLDUPS); + rc = cursor_del(&cx.outer, flags & MDBX_ALLDUPS); bailout: txn->mt_cursors[dbi] = cx.outer.mc_next; @@ -22288,7 +24415,7 @@ int mdbx_is_dirty(const MDBX_txn *txn, const void *ptr) { return rc; const MDBX_env *env = txn->mt_env; - const ptrdiff_t offset = (uint8_t *)ptr - env->me_map; + const ptrdiff_t offset = ptr_dist(ptr, env->me_map); if (offset >= 0) { const pgno_t pgno = bytes2pgno(env, offset); if (likely(pgno < txn->mt_next_pgno)) { @@ -22332,7 +24459,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, return MDBX_BAD_DBI; if (unlikely(txn->mt_dbistate[dbi] & DBI_STALE)) { - rc = mdbx_fetch_sdb(txn, dbi); + rc = fetch_sdb(txn, dbi); if (unlikely(rc != MDBX_SUCCESS)) return rc; } @@ -22349,7 +24476,7 @@ int mdbx_dbi_sequence(MDBX_txn *txn, MDBX_dbi dbi, uint64_t *result, if (unlikely(new < increment)) return MDBX_RESULT_TRUE; - mdbx_tassert(txn, new > dbs->md_seq); + tASSERT(txn, new > dbs->md_seq); dbs->md_seq = new; txn->mt_flags |= MDBX_TXN_DIRTY; txn->mt_dbistate[dbi] |= DBI_DIRTY; @@ -22404,8 +24531,8 @@ __cold intptr_t mdbx_limits_txnsize_max(intptr_t pagesize) { STATIC_ASSERT(MAX_MAPSIZE < INTPTR_MAX); const uint64_t pgl_limit = - pagesize * (uint64_t)(MDBX_PGL_LIMIT / 1.6180339887498948482); - const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / 1.6180339887498948482); + pagesize * (uint64_t)(MDBX_PGL_LIMIT / MDBX_GOLD_RATIO_DBL); + const uint64_t map_limit = (uint64_t)(MAX_MAPSIZE / MDBX_GOLD_RATIO_DBL); return (pgl_limit < map_limit) ? (intptr_t)pgl_limit : (intptr_t)map_limit; } @@ -22427,7 +24554,7 @@ static __always_inline uint64_t double2key(const double *const ptr) { const int64_t i = *(const int64_t *)ptr; const uint64_t u = (i < 0) ? 
UINT64_C(0xffffFFFFffffFFFF) - i : i + UINT64_C(0x8000000000000000); - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const double f = key2double(u); assert(memcmp(&f, ptr, 8) == 0); } @@ -22450,7 +24577,7 @@ static __always_inline uint32_t float2key(const float *const ptr) { const int32_t i = *(const int32_t *)ptr; const uint32_t u = (i < 0) ? UINT32_C(0xffffFFFF) - i : i + UINT32_C(0x80000000); - if (mdbx_assert_enabled()) { + if (ASSERT_ENABLED()) { const float f = key2float(u); assert(memcmp(&f, ptr, 4) == 0); } @@ -22554,8 +24681,8 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = - IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); const uint64_t key = bias + (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) + (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); @@ -22580,8 +24707,8 @@ uint64_t mdbx_key_from_jsonInteger(const int64_t json_integer) { assert(mantissa >= IEEE754_DOUBLE_IMPLICIT_LEAD && mantissa <= IEEE754_DOUBLE_MANTISSA_AMAX); - const uint64_t exponent = - IEEE754_DOUBLE_EXPONENTA_BIAS + IEEE754_DOUBLE_MANTISSA_SIZE - shift; + const uint64_t exponent = (uint64_t)IEEE754_DOUBLE_EXPONENTA_BIAS + + IEEE754_DOUBLE_MANTISSA_SIZE - shift; assert(exponent > 0 && exponent <= IEEE754_DOUBLE_EXPONENTA_MAX); const uint64_t key = bias - 1 - (exponent << IEEE754_DOUBLE_MANTISSA_SIZE) - (mantissa - IEEE754_DOUBLE_IMPLICIT_LEAD); @@ -22657,18 +24784,18 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, return err; const bool lock_needed = ((env->me_flags & MDBX_ENV_ACTIVE) && env->me_txn0 && - env->me_txn0->mt_owner != mdbx_thread_self()); + env->me_txn0->mt_owner != osal_thread_self()); bool should_unlock = 
false; switch (option) { case MDBX_opt_sync_bytes: - if (value == UINT64_MAX) - value = SIZE_MAX - 65536; + if (value == /* default */ UINT64_MAX) + value = MAX_WRITE; if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; if (unlikely(value > SIZE_MAX - 65536)) - return MDBX_TOO_LARGE; + return MDBX_EINVAL; value = bytes2pgno(env, (size_t)value + env->me_psize - 1); if ((uint32_t)value != atomic_load32(&env->me_lck->mti_autosync_threshold, mo_AcquireRelease) && @@ -22677,37 +24804,37 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, /* Дергаем sync(force=off) только если задано новое не-нулевое значение * и мы вне транзакции */ && lock_needed) { - err = mdbx_env_sync_internal(env, false, false); + err = env_sync(env, false, false); if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) err = MDBX_SUCCESS; } break; case MDBX_opt_sync_period: - if (value == UINT64_MAX) - value = UINT32_MAX; + if (value == /* default */ UINT64_MAX) + value = 2780315 /* 42.42424 секунды */; if (unlikely(env->me_flags & MDBX_RDONLY)) return MDBX_EACCESS; if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; if (unlikely(value > UINT32_MAX)) - return MDBX_TOO_LARGE; - value = mdbx_osal_16dot16_to_monotime((uint32_t)value); + return MDBX_EINVAL; + value = osal_16dot16_to_monotime((uint32_t)value); if (value != atomic_load64(&env->me_lck->mti_autosync_period, mo_AcquireRelease) && atomic_store64(&env->me_lck->mti_autosync_period, value, mo_Relaxed) /* Дергаем sync(force=off) только если задано новое не-нулевое значение * и мы вне транзакции */ && lock_needed) { - err = mdbx_env_sync_internal(env, false, false); + err = env_sync(env, false, false); if (err == /* нечего сбрасывать на диск */ MDBX_RESULT_TRUE) err = MDBX_SUCCESS; } break; case MDBX_opt_max_db: - if (value == UINT64_MAX) - value = MDBX_MAX_DBI; + if (value == /* default */ UINT64_MAX) + value = 42; if 
(unlikely(value > MDBX_MAX_DBI)) return MDBX_EINVAL; if (unlikely(env->me_map)) @@ -22716,7 +24843,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_max_readers: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = MDBX_READERS_LIMIT; if (unlikely(value < 1 || value > MDBX_READERS_LIMIT)) return MDBX_EINVAL; @@ -22726,7 +24853,7 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_dp_reserve_limit: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = INT_MAX; if (unlikely(value > INT_MAX)) return MDBX_EINVAL; @@ -22739,29 +24866,33 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, } env->me_options.dp_reserve_limit = (unsigned)value; while (env->me_dp_reserve_len > env->me_options.dp_reserve_limit) { - mdbx_assert(env, env->me_dp_reserve != NULL); + eASSERT(env, env->me_dp_reserve != NULL); MDBX_page *dp = env->me_dp_reserve; MDBX_ASAN_UNPOISON_MEMORY_REGION(dp, env->me_psize); - VALGRIND_MAKE_MEM_DEFINED(&dp->mp_next, sizeof(dp->mp_next)); - env->me_dp_reserve = dp->mp_next; - VALGRIND_MEMPOOL_FREE(env, dp); - mdbx_free(dp); + VALGRIND_MAKE_MEM_DEFINED(&mp_next(dp), sizeof(MDBX_page *)); + env->me_dp_reserve = mp_next(dp); + void *const ptr = ptr_disp(dp, -(ptrdiff_t)sizeof(size_t)); + osal_free(ptr); env->me_dp_reserve_len -= 1; } } break; case MDBX_opt_rp_augment_limit: - if (value == UINT64_MAX) - value = MDBX_PGL_LIMIT; - if (unlikely(value > MDBX_PGL_LIMIT)) + if (value == /* default */ UINT64_MAX) { + env->me_options.flags.non_auto.rp_augment_limit = 0; + env->me_options.rp_augment_limit = default_rp_augment_limit(env); + } else if (unlikely(value > MDBX_PGL_LIMIT)) return MDBX_EINVAL; - env->me_options.rp_augment_limit = (unsigned)value; + else { + env->me_options.flags.non_auto.rp_augment_limit = 1; + env->me_options.rp_augment_limit = (unsigned)value; + } break; case MDBX_opt_txn_dp_limit: 
case MDBX_opt_txn_dp_initial: - if (value == UINT64_MAX) + if (value == /* default */ UINT64_MAX) value = MDBX_PGL_LIMIT; if (unlikely(value > MDBX_PGL_LIMIT || value < CURSOR_STACK * 4)) return MDBX_EINVAL; @@ -22796,40 +24927,73 @@ __cold int mdbx_env_set_option(MDBX_env *env, const MDBX_option_t option, break; case MDBX_opt_spill_max_denominator: - if (value == UINT64_MAX) - value = 255; + if (value == /* default */ UINT64_MAX) + value = 8; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_max_denominator = (uint8_t)value; break; case MDBX_opt_spill_min_denominator: + if (value == /* default */ UINT64_MAX) + value = 8; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_min_denominator = (uint8_t)value; break; case MDBX_opt_spill_parent4child_denominator: + if (value == /* default */ UINT64_MAX) + value = 0; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.spill_parent4child_denominator = (uint8_t)value; break; case MDBX_opt_loose_limit: - if (value == UINT64_MAX) - value = 255; + if (value == /* default */ UINT64_MAX) + value = 64; if (unlikely(value > 255)) return MDBX_EINVAL; env->me_options.dp_loose_limit = (uint8_t)value; break; case MDBX_opt_merge_threshold_16dot16_percent: - if (value == UINT64_MAX) - value = 32768; + if (value == /* default */ UINT64_MAX) + value = 65536 / 4 /* 25% */; if (unlikely(value < 8192 || value > 32768)) return MDBX_EINVAL; env->me_options.merge_threshold_16dot16_percent = (unsigned)value; recalculate_merge_threshold(env); break; + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + /* позволяем "установить" значение по-умолчанию и совпадающее + * с поведением соответствующим текущей установке MDBX_NOMETASYNC */ + if (value == /* default */ UINT64_MAX && + value != ((env->me_flags & MDBX_NOMETASYNC) ? 
0 : UINT_MAX)) + err = MDBX_EINVAL; +#else + if (value == /* default */ UINT64_MAX) + value = MDBX_WRITETHROUGH_THRESHOLD_DEFAULT; + if (value != (unsigned)value) + err = MDBX_EINVAL; + else + env->me_options.writethrough_threshold = (unsigned)value; +#endif + break; + + case MDBX_opt_prefault_write_enable: + if (value == /* default */ UINT64_MAX) { + env->me_options.prefault_write = default_prefault_write(env); + env->me_options.flags.non_auto.prefault_write = false; + } else if (value > 1) + err = MDBX_EINVAL; + else { + env->me_options.prefault_write = value != 0; + env->me_options.flags.non_auto.prefault_write = true; + } + break; + default: return MDBX_EINVAL; } @@ -22858,7 +25022,7 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, case MDBX_opt_sync_period: if (unlikely(!(env->me_flags & MDBX_ENV_ACTIVE))) return MDBX_EPERM; - *pvalue = mdbx_osal_monotime_to_16dot16( + *pvalue = osal_monotime_to_16dot16( atomic_load64(&env->me_lck->mti_autosync_period, mo_Relaxed)); break; @@ -22903,6 +25067,18 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, *pvalue = env->me_options.merge_threshold_16dot16_percent; break; + case MDBX_opt_writethrough_threshold: +#if defined(_WIN32) || defined(_WIN64) + *pvalue = (env->me_flags & MDBX_NOMETASYNC) ? 
0 : INT_MAX; +#else + *pvalue = env->me_options.writethrough_threshold; +#endif + break; + + case MDBX_opt_prefault_write_enable: + *pvalue = env->me_options.prefault_write; + break; + default: return MDBX_EINVAL; } @@ -22910,131 +25086,337 @@ __cold int mdbx_env_get_option(const MDBX_env *env, const MDBX_option_t option, return MDBX_SUCCESS; } -/*** Attribute support functions for Nexenta **********************************/ -#ifdef MDBX_NEXENTA_ATTRS - -static __inline int mdbx_attr_peek(MDBX_val *data, mdbx_attr_t *attrptr) { - if (unlikely(data->iov_len < sizeof(mdbx_attr_t))) - return MDBX_INCOMPATIBLE; - - if (likely(attrptr != NULL)) - *attrptr = *(mdbx_attr_t *)data->iov_base; - data->iov_len -= sizeof(mdbx_attr_t); - data->iov_base = - likely(data->iov_len > 0) ? ((mdbx_attr_t *)data->iov_base) + 1 : NULL; - - return MDBX_SUCCESS; +static size_t estimate_rss(size_t database_bytes) { + return database_bytes + database_bytes / 64 + + (512 + MDBX_WORDBITS * 16) * MEGABYTE; } -static __inline int mdbx_attr_poke(MDBX_val *reserved, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - mdbx_attr_t *space = reserved->iov_base; - if (flags & MDBX_RESERVE) { - if (likely(data != NULL)) { - data->iov_base = data->iov_len ? 
space + 1 : NULL; - } - } else { - *space = attr; - if (likely(data != NULL)) { - memcpy(space + 1, data->iov_base, data->iov_len); - } - } - - return MDBX_SUCCESS; -} - -int mdbx_cursor_get_attr(MDBX_cursor *mc, MDBX_val *key, MDBX_val *data, - mdbx_attr_t *attrptr, MDBX_cursor_op op) { - int rc = mdbx_cursor_get(mc, key, data, op); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_peek(data, attrptr); -} - -int mdbx_get_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - uint64_t *attrptr) { - int rc = mdbx_get(txn, dbi, key, data); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_peek(data, attrptr); -} - -int mdbx_put_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - MDBX_val reserve; - reserve.iov_base = NULL; - reserve.iov_len = (data ? data->iov_len : 0) + sizeof(mdbx_attr_t); - - int rc = mdbx_put(txn, dbi, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_poke(&reserve, data, attr, flags); -} - -int mdbx_cursor_put_attr(MDBX_cursor *cursor, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr, MDBX_put_flags_t flags) { - MDBX_val reserve; - reserve.iov_base = NULL; - reserve.iov_len = (data ? 
data->iov_len : 0) + sizeof(mdbx_attr_t); - - int rc = mdbx_cursor_put(cursor, key, &reserve, flags | MDBX_RESERVE); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - - return mdbx_attr_poke(&reserve, data, attr, flags); -} - -int mdbx_set_attr(MDBX_txn *txn, MDBX_dbi dbi, MDBX_val *key, MDBX_val *data, - mdbx_attr_t attr) { - if (unlikely(!key || !txn)) +__cold int mdbx_env_warmup(const MDBX_env *env, const MDBX_txn *txn, + MDBX_warmup_flags_t flags, + unsigned timeout_seconds_16dot16) { + if (unlikely(env == NULL && txn == NULL)) + return MDBX_EINVAL; + if (unlikely(flags > + (MDBX_warmup_force | MDBX_warmup_oomsafe | MDBX_warmup_lock | + MDBX_warmup_touchlimit | MDBX_warmup_release))) return MDBX_EINVAL; - if (unlikely(txn->mt_signature != MDBX_MT_SIGNATURE)) - return MDBX_EBADSIGN; - - if (unlikely(!check_dbi(txn, dbi, DB_USRVALID))) - return MDBX_BAD_DBI; - - if (unlikely(txn->mt_flags & (MDBX_TXN_RDONLY | MDBX_TXN_BLOCKED))) - return (txn->mt_flags & MDBX_TXN_RDONLY) ? MDBX_EACCESS : MDBX_BAD_TXN; - - MDBX_cursor_couple cx; - MDBX_val old_data; - int rc = mdbx_cursor_init(&cx.outer, txn, dbi); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; - rc = mdbx_cursor_set(&cx.outer, key, &old_data, MDBX_SET, NULL); - if (unlikely(rc != MDBX_SUCCESS)) { - if (rc == MDBX_NOTFOUND && data) { - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_put_attr(&cx.outer, key, data, attr, 0); - txn->mt_cursors[dbi] = cx.outer.mc_next; - } - return rc; + if (txn) { + int err = check_txn(txn, MDBX_TXN_BLOCKED - MDBX_TXN_ERROR); + if (unlikely(err != MDBX_SUCCESS)) + return err; + } + if (env) { + int err = check_env(env, false); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (txn && unlikely(txn->mt_env != env)) + return MDBX_EINVAL; + } else { + env = txn->mt_env; } - mdbx_attr_t old_attr = 0; - rc = mdbx_attr_peek(&old_data, &old_attr); - if (unlikely(rc != MDBX_SUCCESS)) - return rc; + const uint64_t timeout_monotime = + 
(timeout_seconds_16dot16 && (flags & MDBX_warmup_force)) + ? osal_monotime() + osal_16dot16_to_monotime(timeout_seconds_16dot16) + : 0; - if (old_attr == attr && (!data || (data->iov_len == old_data.iov_len && - memcmp(data->iov_base, old_data.iov_base, - old_data.iov_len) == 0))) - return MDBX_SUCCESS; + if (flags & MDBX_warmup_release) + munlock_all(env); + + pgno_t used_pgno; + if (txn) { + used_pgno = txn->mt_geo.next; + } else { + const meta_troika_t troika = meta_tap(env); + used_pgno = meta_recent(env, &troika).ptr_v->mm_geo.next; + } + const size_t used_range = pgno_align2os_bytes(env, used_pgno); + const pgno_t mlock_pgno = bytes2pgno(env, used_range); + + int rc = MDBX_SUCCESS; + if (flags & MDBX_warmup_touchlimit) { + const size_t estimated_rss = estimate_rss(used_range); +#if defined(_WIN32) || defined(_WIN64) + SIZE_T current_ws_lower, current_ws_upper; + if (GetProcessWorkingSetSize(GetCurrentProcess(), ¤t_ws_lower, + ¤t_ws_upper) && + current_ws_lower < estimated_rss) { + const SIZE_T ws_lower = estimated_rss; + const SIZE_T ws_upper = + (MDBX_WORDBITS == 32 && ws_lower > MEGABYTE * 2048) + ? 
ws_lower + : ws_lower + MDBX_WORDBITS * MEGABYTE * 32; + if (!SetProcessWorkingSetSize(GetCurrentProcess(), ws_lower, ws_upper)) { + rc = (int)GetLastError(); + WARNING("SetProcessWorkingSetSize(%zu, %zu) error %d", ws_lower, + ws_upper, rc); + } + } +#endif /* Windows */ +#ifdef RLIMIT_RSS + struct rlimit rss; + if (getrlimit(RLIMIT_RSS, &rss) == 0 && rss.rlim_cur < estimated_rss) { + rss.rlim_cur = estimated_rss; + if (rss.rlim_max < estimated_rss) + rss.rlim_max = estimated_rss; + if (setrlimit(RLIMIT_RSS, &rss)) { + rc = errno; + WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_RSS", + (size_t)rss.rlim_cur, (size_t)rss.rlim_max, rc); + } + } +#endif /* RLIMIT_RSS */ +#ifdef RLIMIT_MEMLOCK + if (flags & MDBX_warmup_lock) { + struct rlimit memlock; + if (getrlimit(RLIMIT_MEMLOCK, &memlock) == 0 && + memlock.rlim_cur < estimated_rss) { + memlock.rlim_cur = estimated_rss; + if (memlock.rlim_max < estimated_rss) + memlock.rlim_max = estimated_rss; + if (setrlimit(RLIMIT_MEMLOCK, &memlock)) { + rc = errno; + WARNING("setrlimit(%s, {%zu, %zu}) error %d", "RLIMIT_MEMLOCK", + (size_t)memlock.rlim_cur, (size_t)memlock.rlim_max, rc); + } + } + } +#endif /* RLIMIT_MEMLOCK */ + (void)estimated_rss; + } + +#if defined(MLOCK_ONFAULT) && \ + ((defined(_GNU_SOURCE) && __GLIBC_PREREQ(2, 27)) || \ + (defined(__ANDROID_API__) && __ANDROID_API__ >= 30)) && \ + (defined(__linux__) || defined(__gnu_linux__)) + if ((flags & MDBX_warmup_lock) != 0 && linux_kernel_version >= 0x04040000 && + atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { + if (mlock2(env->me_map, used_range, MLOCK_ONFAULT)) { + rc = errno; + WARNING("mlock2(%zu, %s) error %d", used_range, "MLOCK_ONFAULT", rc); + } else { + update_mlcnt(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } + if (rc != EINVAL) + flags -= MDBX_warmup_lock; + } +#endif /* MLOCK_ONFAULT */ + + int err = MDBX_ENOSYS; +#if MDBX_ENABLE_MADVISE + err = set_readahead(env, used_pgno, true, true); +#else +#if defined(_WIN32) 
|| defined(_WIN64) + if (mdbx_PrefetchVirtualMemory) { + WIN32_MEMORY_RANGE_ENTRY hint; + hint.VirtualAddress = env->me_map; + hint.NumberOfBytes = used_range; + if (mdbx_PrefetchVirtualMemory(GetCurrentProcess(), 1, &hint, 0)) + err = MDBX_SUCCESS; + else { + err = (int)GetLastError(); + ERROR("%s(%zu) error %d", "PrefetchVirtualMemory", used_range, err); + } + } +#endif /* Windows */ + +#if defined(POSIX_MADV_WILLNEED) + err = posix_madvise(env->me_map, used_range, POSIX_MADV_WILLNEED) + ? ignore_enosys(errno) + : MDBX_SUCCESS; +#elif defined(MADV_WILLNEED) + err = madvise(env->me_map, used_range, MADV_WILLNEED) ? ignore_enosys(errno) + : MDBX_SUCCESS; +#endif + +#if defined(F_RDADVISE) + if (err) { + fcntl(env->me_lazy_fd, F_RDAHEAD, true); + struct radvisory hint; + hint.ra_offset = 0; + hint.ra_count = unlikely(used_range > INT_MAX && + sizeof(used_range) > sizeof(hint.ra_count)) + ? INT_MAX + : (int)used_range; + err = fcntl(env->me_lazy_fd, F_RDADVISE, &hint) ? ignore_enosys(errno) + : MDBX_SUCCESS; + if (err == ENOTTY) + err = MDBX_SUCCESS /* Ignore ENOTTY for DB on the ram-disk */; + } +#endif /* F_RDADVISE */ +#endif /* MDBX_ENABLE_MADVISE */ + if (err != MDBX_SUCCESS && rc == MDBX_SUCCESS) + rc = err; + + if ((flags & MDBX_warmup_force) != 0 && + (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS)) { + const volatile uint8_t *ptr = env->me_map; + size_t offset = 0, unused = 42; +#if !(defined(_WIN32) || defined(_WIN64)) + if (flags & MDBX_warmup_oomsafe) { + const int null_fd = open("/dev/null", O_WRONLY); + if (unlikely(null_fd < 0)) + rc = errno; + else { + struct iovec iov[MDBX_AUXILARY_IOV_MAX]; + for (;;) { + unsigned i; + for (i = 0; i < MDBX_AUXILARY_IOV_MAX && offset < used_range; ++i) { + iov[i].iov_base = (void *)(ptr + offset); + iov[i].iov_len = 1; + offset += env->me_os_psize; + } + if (unlikely(writev(null_fd, iov, i) < 0)) { + rc = errno; + if (rc == EFAULT) + rc = ENOMEM; + break; + } + if (offset >= used_range) { + rc = MDBX_SUCCESS; + break; + } 
+ if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { + rc = MDBX_RESULT_TRUE; + break; + } + } + close(null_fd); + } + } else +#endif /* Windows */ + for (;;) { + unused += ptr[offset]; + offset += env->me_os_psize; + if (offset >= used_range) { + rc = MDBX_SUCCESS; + break; + } + if (timeout_seconds_16dot16 && osal_monotime() > timeout_monotime) { + rc = MDBX_RESULT_TRUE; + break; + } + } + (void)unused; + } + + if ((flags & MDBX_warmup_lock) != 0 && + (rc == MDBX_SUCCESS || rc == MDBX_ENOSYS) && + atomic_load32(&env->me_mlocked_pgno, mo_AcquireRelease) < mlock_pgno) { +#if defined(_WIN32) || defined(_WIN64) + if (VirtualLock(env->me_map, used_range)) { + update_mlcnt(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } else { + rc = (int)GetLastError(); + WARNING("%s(%zu) error %d", "VirtualLock", used_range, rc); + } +#elif defined(_POSIX_MEMLOCK_RANGE) + if (mlock(env->me_map, used_range) == 0) { + update_mlcnt(env, mlock_pgno, true); + rc = MDBX_SUCCESS; + } else { + rc = errno; + WARNING("%s(%zu) error %d", "mlock", used_range, rc); + } +#else + rc = MDBX_ENOSYS; +#endif + } - cx.outer.mc_next = txn->mt_cursors[dbi]; - txn->mt_cursors[dbi] = &cx.outer; - rc = mdbx_cursor_put_attr(&cx.outer, key, data ? data : &old_data, attr, - MDBX_CURRENT); - txn->mt_cursors[dbi] = cx.outer.mc_next; return rc; } -#endif /* MDBX_NEXENTA_ATTRS */ + +__cold void global_ctor(void) { + osal_ctor(); + rthc_limit = RTHC_INITIAL_LIMIT; + rthc_table = rthc_table_static; +#if defined(_WIN32) || defined(_WIN64) + InitializeCriticalSection(&rthc_critical_section); + InitializeCriticalSection(&lcklist_critical_section); +#else + ENSURE(nullptr, pthread_key_create(&rthc_key, thread_dtor) == 0); + TRACE("pid %d, &mdbx_rthc_key = %p, value 0x%x", osal_getpid(), + __Wpedantic_format_voidptr(&rthc_key), (unsigned)rthc_key); +#endif + /* checking time conversion, this also avoids racing on 32-bit architectures + * during storing calculated 64-bit ratio(s) into memory. 
*/ + uint32_t proba = UINT32_MAX; + while (true) { + unsigned time_conversion_checkup = + osal_monotime_to_16dot16(osal_16dot16_to_monotime(proba)); + unsigned one_more = (proba < UINT32_MAX) ? proba + 1 : proba; + unsigned one_less = (proba > 0) ? proba - 1 : proba; + ENSURE(nullptr, time_conversion_checkup >= one_less && + time_conversion_checkup <= one_more); + if (proba == 0) + break; + proba >>= 1; + } + + bootid = osal_bootid(); + +#if MDBX_DEBUG + for (size_t i = 0; i < 2 * 2 * 2 * 3 * 3 * 3; ++i) { + const bool s0 = (i >> 0) & 1; + const bool s1 = (i >> 1) & 1; + const bool s2 = (i >> 2) & 1; + const uint8_t c01 = (i / (8 * 1)) % 3; + const uint8_t c02 = (i / (8 * 3)) % 3; + const uint8_t c12 = (i / (8 * 9)) % 3; + + const uint8_t packed = meta_cmp2pack(c01, c02, c12, s0, s1, s2); + meta_troika_t troika; + troika.fsm = (uint8_t)i; + meta_troika_unpack(&troika, packed); + + const uint8_t tail = TROIKA_TAIL(&troika); + const bool strict = TROIKA_STRICT_VALID(&troika); + const bool valid = TROIKA_VALID(&troika); + + const uint8_t recent_chk = meta_cmp2recent(c01, s0, s1) + ? (meta_cmp2recent(c02, s0, s2) ? 0 : 2) + : (meta_cmp2recent(c12, s1, s2) ? 1 : 2); + const uint8_t prefer_steady_chk = + meta_cmp2steady(c01, s0, s1) ? (meta_cmp2steady(c02, s0, s2) ? 0 : 2) + : (meta_cmp2steady(c12, s1, s2) ? 1 : 2); + + uint8_t tail_chk; + if (recent_chk == 0) + tail_chk = meta_cmp2steady(c12, s1, s2) ? 2 : 1; + else if (recent_chk == 1) + tail_chk = meta_cmp2steady(c02, s0, s2) ? 2 : 0; + else + tail_chk = meta_cmp2steady(c01, s0, s1) ? 
1 : 0; + + const bool valid_chk = + c01 != 1 || s0 != s1 || c02 != 1 || s0 != s2 || c12 != 1 || s1 != s2; + const bool strict_chk = (c01 != 1 || s0 != s1) && (c02 != 1 || s0 != s2) && + (c12 != 1 || s1 != s2); + assert(troika.recent == recent_chk); + assert(troika.prefer_steady == prefer_steady_chk); + assert(tail == tail_chk); + assert(valid == valid_chk); + assert(strict == strict_chk); + // printf(" %d, ", packed); + assert(troika_fsm_map[troika.fsm] == packed); + } +#endif /* MDBX_DEBUG*/ + +#if 0 /* debug */ + for (size_t i = 0; i < 65536; ++i) { + size_t pages = pv2pages(i); + size_t x = pages2pv(pages); + size_t xp = pv2pages(x); + if (!(x == i || (x % 2 == 0 && x < 65536)) || pages != xp) + printf("%u => %zu => %u => %zu\n", i, pages, x, xp); + assert(pages == xp); + } + fflush(stdout); +#endif /* #if 0 */ +} /******************************************************************************/ /* *INDENT-OFF* */ @@ -23162,6 +25544,9 @@ __dll_export #endif /* MDBX_BUILD_TYPE */ , "MDBX_DEBUG=" MDBX_STRINGIFY(MDBX_DEBUG) +#ifdef ENABLE_GPROF + " ENABLE_GPROF" +#endif /* ENABLE_GPROF */ " MDBX_WORDBITS=" MDBX_STRINGIFY(MDBX_WORDBITS) " BYTE_ORDER=" #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ @@ -23171,16 +25556,21 @@ __dll_export #else #error "FIXME: Unsupported byte order" #endif /* __BYTE_ORDER__ */ + " MDBX_ENABLE_BIGFOOT=" MDBX_STRINGIFY(MDBX_ENABLE_BIGFOOT) " MDBX_ENV_CHECKPID=" MDBX_ENV_CHECKPID_CONFIG " MDBX_TXN_CHECKOWNER=" MDBX_TXN_CHECKOWNER_CONFIG " MDBX_64BIT_ATOMIC=" MDBX_64BIT_ATOMIC_CONFIG " MDBX_64BIT_CAS=" MDBX_64BIT_CAS_CONFIG " MDBX_TRUST_RTC=" MDBX_TRUST_RTC_CONFIG + " MDBX_AVOID_MSYNC=" MDBX_STRINGIFY(MDBX_AVOID_MSYNC) " MDBX_ENABLE_REFUND=" MDBX_STRINGIFY(MDBX_ENABLE_REFUND) " MDBX_ENABLE_MADVISE=" MDBX_STRINGIFY(MDBX_ENABLE_MADVISE) -#if MDBX_DISABLE_PAGECHECKS - " MDBX_DISABLE_PAGECHECKS=YES" -#endif /* MDBX_DISABLE_PAGECHECKS */ + " MDBX_ENABLE_MINCORE=" MDBX_STRINGIFY(MDBX_ENABLE_MINCORE) + " MDBX_ENABLE_PGOP_STAT=" 
MDBX_STRINGIFY(MDBX_ENABLE_PGOP_STAT) + " MDBX_ENABLE_PROFGC=" MDBX_STRINGIFY(MDBX_ENABLE_PROFGC) +#if MDBX_DISABLE_VALIDATION + " MDBX_DISABLE_VALIDATION=YES" +#endif /* MDBX_DISABLE_VALIDATION */ #ifdef __SANITIZE_ADDRESS__ " SANITIZE_ADDRESS=YES" #endif /* __SANITIZE_ADDRESS__ */ @@ -23267,7 +25657,10 @@ __dll_export }; #ifdef __SANITIZE_ADDRESS__ -LIBMDBX_API __attribute__((__weak__)) const char *__asan_default_options(void) { +#if !defined(_MSC_VER) || __has_attribute(weak) +LIBMDBX_API __attribute__((__weak__)) +#endif +const char *__asan_default_options(void) { return "symbolize=1:allow_addr2line=1:" #if MDBX_DEBUG "debug=1:" diff --git a/src/debug_begin.h b/src/debug_begin.h index 9a904095..521e99cf 100644 --- a/src/debug_begin.h +++ b/src/debug_begin.h @@ -1,42 +1,42 @@ #if defined(__GNUC__) && !defined(__LCC__) -#pragma push_macro("mdbx_trace") -#pragma push_macro("mdbx_debug") -#pragma push_macro("mdbx_verbose") -#pragma push_macro("mdbx_notice") -#pragma push_macro("mdbx_warning") -#pragma push_macro("mdbx_error") -#pragma push_macro("mdbx_assert") +#pragma push_macro("TRACE") +#pragma push_macro("DEBUG") +#pragma push_macro("VERBOSE") +#pragma push_macro("NOTICE") +#pragma push_macro("WARNING") +#pragma push_macro("ERROR") +#pragma push_macro("eASSERT") -#undef mdbx_trace -#define mdbx_trace(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef TRACE +#define TRACE(fmt, ...) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_debug -#define mdbx_debug(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef DEBUG +#define DEBUG(fmt, ...) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_verbose -#define mdbx_verbose(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef VERBOSE +#define VERBOSE(fmt, ...) 
\ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_notice -#define mdbx_notice(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef NOTICE +#define NOTICE(fmt, ...) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_warning -#define mdbx_warning(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef WARNING +#define WARNING(fmt, ...) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_error -#define mdbx_error(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__) +#undef ERROR +#define ERROR(fmt, ...) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__) -#undef mdbx_assert -#define mdbx_assert(env, expr) mdbx_ensure(env, expr) +#undef eASSERT +#define eASSERT(env, expr) ENSURE(env, expr) #if !defined(__clang__) -#pragma GCC optimize("-O0") +#pragma GCC optimize("-Og") #endif #endif /* GCC only */ diff --git a/src/debug_end.h b/src/debug_end.h index a854f715..bbf66526 100644 --- a/src/debug_end.h +++ b/src/debug_end.h @@ -1,12 +1,12 @@ #if defined(__GNUC__) && !defined(__LCC__) -#pragma pop_macro("mdbx_trace") -#pragma pop_macro("mdbx_debug") -#pragma pop_macro("mdbx_verbose") -#pragma pop_macro("mdbx_notice") -#pragma pop_macro("mdbx_warning") -#pragma pop_macro("mdbx_error") -#pragma pop_macro("mdbx_assert") +#pragma pop_macro("TRACE") +#pragma pop_macro("DEBUG") +#pragma pop_macro("VERBOSE") +#pragma pop_macro("NOTICE") +#pragma pop_macro("WARNING") +#pragma pop_macro("ERROR") +#pragma pop_macro("eASSERT") #if !defined(__clang__) #pragma GCC reset_options diff --git a/src/internals.h b/src/internals.h index 2d314996..7bd0f96d 100644 --- a/src/internals.h +++ b/src/internals.h @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * @@ -86,14 +86,18 @@ #pragma warning(disable : 4464) /* relative include path contains '..' */ #endif #if _MSC_VER > 1913 -#pragma warning(disable : 5045) /* Compiler will insert Spectre mitigation... \ - */ +#pragma warning(disable : 5045) /* will insert Spectre mitigation... */ #endif #if _MSC_VER > 1914 #pragma warning( \ disable : 5105) /* winbase.h(9531): warning C5105: macro expansion \ producing 'defined' has undefined behavior */ #endif +#if _MSC_VER > 1930 +#pragma warning(disable : 6235) /* is always a constant */ +#pragma warning(disable : 6237) /* is never evaluated and might \ + have side effects */ +#endif #pragma warning(disable : 4710) /* 'xyz': function not inlined */ #pragma warning(disable : 4711) /* function 'xyz' selected for automatic \ inline expansion */ @@ -128,6 +132,10 @@ #define __USE_MINGW_ANSI_STDIO 1 #endif /* MinGW */ +#if (defined(_WIN32) || defined(_WIN64)) && !defined(UNICODE) +#define UNICODE +#endif /* UNICODE */ + #include "../mdbx.h" #include "base.h" @@ -195,6 +203,16 @@ #endif #endif /* -Walignment-reduction-ignored */ +#ifndef MDBX_EXCLUDE_FOR_GPROF +#ifdef ENABLE_GPROF +#define MDBX_EXCLUDE_FOR_GPROF \ + __attribute__((__no_instrument_function__, \ + __no_profile_instrument_function__)) +#else +#define MDBX_EXCLUDE_FOR_GPROF +#endif /* ENABLE_GPROF */ +#endif /* MDBX_EXCLUDE_FOR_GPROF */ + #ifdef __cplusplus extern "C" { #endif @@ -213,13 +231,149 @@ extern LIBMDBX_API const char *const mdbx_sourcery_anchor; #undef NDEBUG #endif +#ifndef __cplusplus +/*----------------------------------------------------------------------------*/ +/* Debug and Logging stuff */ + +#define MDBX_RUNTIME_FLAGS_INIT \ + ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT + +extern uint8_t runtime_flags; +extern uint8_t loglevel; +extern MDBX_debug_func *debug_logger; + +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny) { +#if MDBX_DEBUG + if (MDBX_DBG_JITTER & runtime_flags) + 
osal_jitter(tiny); +#else + (void)tiny; +#endif +} + +MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) + debug_log(int level, const char *function, int line, const char *fmt, ...) + MDBX_PRINTF_ARGS(4, 5); +MDBX_INTERNAL_FUNC void debug_log_va(int level, const char *function, int line, + const char *fmt, va_list args); + +#if MDBX_DEBUG +#define LOG_ENABLED(msg) unlikely(msg <= loglevel) +#define AUDIT_ENABLED() unlikely((runtime_flags & MDBX_DBG_AUDIT)) +#else /* MDBX_DEBUG */ +#define LOG_ENABLED(msg) (msg < MDBX_LOG_VERBOSE && msg <= loglevel) +#define AUDIT_ENABLED() (0) +#endif /* MDBX_DEBUG */ + +#if MDBX_FORCE_ASSERTIONS +#define ASSERT_ENABLED() (1) +#elif MDBX_DEBUG +#define ASSERT_ENABLED() likely((runtime_flags & MDBX_DBG_ASSERT)) +#else +#define ASSERT_ENABLED() (0) +#endif /* assertions */ + +#define DEBUG_EXTRA(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ + } while (0) + +#define DEBUG_EXTRA_PRINT(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_EXTRA)) \ + debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ + } while (0) + +#define TRACE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_TRACE)) \ + debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define DEBUG(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_DEBUG)) \ + debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define VERBOSE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_VERBOSE)) \ + debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define NOTICE(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_NOTICE)) \ + debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define WARNING(fmt, ...) 
\ + do { \ + if (LOG_ENABLED(MDBX_LOG_WARN)) \ + debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#undef ERROR /* wingdi.h \ + Yeah, morons from M$ put such definition to the public header. */ + +#define ERROR(fmt, ...) \ + do { \ + if (LOG_ENABLED(MDBX_LOG_ERROR)) \ + debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", __VA_ARGS__); \ + } while (0) + +#define FATAL(fmt, ...) \ + debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); + +#if MDBX_DEBUG +#define ASSERT_FAIL(env, msg, func, line) mdbx_assert_fail(env, msg, func, line) +#else /* MDBX_DEBUG */ +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line); +#define ASSERT_FAIL(env, msg, func, line) \ + do { \ + (void)(env); \ + assert_fail(msg, func, line); \ + } while (0) +#endif /* MDBX_DEBUG */ + +#define ENSURE_MSG(env, expr, msg) \ + do { \ + if (unlikely(!(expr))) \ + ASSERT_FAIL(env, msg, __func__, __LINE__); \ + } while (0) + +#define ENSURE(env, expr) ENSURE_MSG(env, expr, #expr) + +/* assert(3) variant in environment context */ +#define eASSERT(env, expr) \ + do { \ + if (ASSERT_ENABLED()) \ + ENSURE(env, expr); \ + } while (0) + +/* assert(3) variant in cursor context */ +#define cASSERT(mc, expr) eASSERT((mc)->mc_txn->mt_env, expr) + +/* assert(3) variant in transaction context */ +#define tASSERT(txn, expr) eASSERT((txn)->mt_env, expr) + +#ifndef xMDBX_TOOLS /* Avoid using internal eASSERT() */ +#undef assert +#define assert(expr) eASSERT(NULL, expr) +#endif + +#endif /* __cplusplus */ + /*----------------------------------------------------------------------------*/ /* Atomics */ enum MDBX_memory_order { mo_Relaxed, - mo_AcquireRelease, - mo_SequentialConsistency + mo_AcquireRelease + /* , mo_SequentialConsistency */ }; typedef union { @@ -275,15 +429,15 @@ typedef union { #ifndef __cplusplus #ifdef MDBX_HAVE_C11ATOMICS -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ 
atomic_thread_fence((write) ? mo_c11_store(order) : mo_c11_load(order)) #else /* MDBX_HAVE_C11ATOMICS */ -#define mdbx_memory_fence(order, write) \ +#define osal_memory_fence(order, write) \ do { \ - mdbx_compiler_barrier(); \ + osal_compiler_barrier(); \ if (write && order > (MDBX_CPU_WRITEBACK_INCOHERENT ? mo_Relaxed \ : mo_AcquireRelease)) \ - mdbx_memory_barrier(); \ + osal_memory_barrier(); \ } while (0) #endif /* MDBX_HAVE_C11ATOMICS */ @@ -318,26 +472,26 @@ atomic_store32(MDBX_atomic_uint32_t *p, const uint32_t value, atomic_store_explicit(MDBX_c11a_rw(uint32_t, p), value, mo_c11_store(order)); #else /* MDBX_HAVE_C11ATOMICS */ if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); p->weak = value; - mdbx_memory_fence(order, true); + osal_memory_fence(order, true); #endif /* MDBX_HAVE_C11ATOMICS */ return value; } #endif /* atomic_store32 */ #ifndef atomic_load32 -MDBX_MAYBE_UNUSED static __always_inline uint32_t -atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { +MDBX_MAYBE_UNUSED static __always_inline uint32_t atomic_load32( + const volatile MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { STATIC_ASSERT(sizeof(MDBX_atomic_uint32_t) == 4); #ifdef MDBX_HAVE_C11ATOMICS assert(atomic_is_lock_free(MDBX_c11a_ro(uint32_t, p))); return atomic_load_explicit(MDBX_c11a_ro(uint32_t, p), mo_c11_load(order)); #else /* MDBX_HAVE_C11ATOMICS */ - mdbx_memory_fence(order, false); + osal_memory_fence(order, false); const uint32_t value = p->weak; if (order != mo_Relaxed) - mdbx_compiler_barrier(); + osal_compiler_barrier(); return value; #endif /* MDBX_HAVE_C11ATOMICS */ } @@ -356,7 +510,7 @@ atomic_load32(const MDBX_atomic_uint32_t *p, enum MDBX_memory_order order) { /* FROZEN: The version number for a database's datafile format. */ #define MDBX_DATA_VERSION 3 /* The version number for a database's lockfile format. 
*/ -#define MDBX_LOCK_VERSION 4 +#define MDBX_LOCK_VERSION 5 /* handle for the DB used to track free pages. */ #define FREE_DBI 0 @@ -445,7 +599,10 @@ typedef struct MDBX_meta { uint32_t mm_magic_and_version[2]; /* txnid that committed this page, the first of a two-phase-update pair */ - uint32_t mm_txnid_a[2]; + union { + MDBX_atomic_uint32_t mm_txnid_a[2]; + uint64_t unsafe_txnid; + }; uint16_t mm_extra_flags; /* extra DB flags, zero (nothing) for now */ uint8_t mm_validator_id; /* ID of checksum and page validation method, @@ -464,11 +621,14 @@ typedef struct MDBX_meta { #define MDBX_DATASIGN_WEAK 1u #define SIGN_IS_STEADY(sign) ((sign) > MDBX_DATASIGN_WEAK) #define META_IS_STEADY(meta) \ - SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_datasync_sign)) - uint32_t mm_datasync_sign[2]; + SIGN_IS_STEADY(unaligned_peek_u64_volatile(4, (meta)->mm_sign)) + union { + uint32_t mm_sign[2]; + uint64_t unsafe_sign; + }; /* txnid that committed this page, the second of a two-phase-update pair */ - uint32_t mm_txnid_b[2]; + MDBX_atomic_uint32_t mm_txnid_b[2]; /* Number of non-meta pages which were put in GC after COW. May be 0 in case * DB was previously handled by libmdbx without corresponding feature. @@ -505,27 +665,26 @@ typedef struct MDBX_meta { * Each non-metapage up to MDBX_meta.mm_last_pg is reachable exactly once * in the snapshot: Either used by a database or listed in a GC record. 
*/ typedef struct MDBX_page { - union { #define IS_FROZEN(txn, p) ((p)->mp_txnid < (txn)->mt_txnid) #define IS_SPILLED(txn, p) ((p)->mp_txnid == (txn)->mt_txnid) #define IS_SHADOWED(txn, p) ((p)->mp_txnid > (txn)->mt_txnid) #define IS_VALID(txn, p) ((p)->mp_txnid <= (txn)->mt_front) #define IS_MODIFIABLE(txn, p) ((p)->mp_txnid == (txn)->mt_front) - uint64_t mp_txnid; - struct MDBX_page *mp_next; /* for in-memory list of freed pages */ - }; - uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ -#define P_BRANCH 0x01 /* branch page */ -#define P_LEAF 0x02 /* leaf page */ -#define P_OVERFLOW 0x04 /* overflow page */ -#define P_META 0x08 /* meta page */ -#define P_BAD 0x10 /* explicit flag for invalid/bad page */ -#define P_LEAF2 0x20 /* for MDBX_DUPFIXED records */ -#define P_SUBP 0x40 /* for MDBX_DUPSORT sub-pages */ -#define P_SPILLED 0x2000 /* spilled in parent txn */ -#define P_LOOSE 0x4000 /* page was dirtied then freed, can be reused */ -#define P_FROZEN 0x8000 /* used for retire page with known status */ -#define P_ILL_BITS (~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) + uint64_t mp_txnid; /* txnid which created page, maybe zero in legacy DB */ + uint16_t mp_leaf2_ksize; /* key size if this is a LEAF2 page */ +#define P_BRANCH 0x01u /* branch page */ +#define P_LEAF 0x02u /* leaf page */ +#define P_OVERFLOW 0x04u /* overflow page */ +#define P_META 0x08u /* meta page */ +#define P_LEGACY_DIRTY 0x10u /* legacy P_DIRTY flag prior to v0.10 958fd5b9 */ +#define P_BAD P_LEGACY_DIRTY /* explicit flag for invalid/bad page */ +#define P_LEAF2 0x20u /* for MDBX_DUPFIXED records */ +#define P_SUBP 0x40u /* for MDBX_DUPSORT sub-pages */ +#define P_SPILLED 0x2000u /* spilled in parent txn */ +#define P_LOOSE 0x4000u /* page was dirtied then freed, can be reused */ +#define P_FROZEN 0x8000u /* used for retire page with known status */ +#define P_ILL_BITS \ + ((uint16_t) ~(P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW | P_SPILLED)) uint16_t mp_flags; union 
{ uint32_t mp_pages; /* number of overflow pages */ @@ -542,15 +701,49 @@ typedef struct MDBX_page { #endif /* C99 */ } MDBX_page; +#define PAGETYPE_WHOLE(p) ((uint8_t)(p)->mp_flags) + +/* Drop legacy P_DIRTY flag for sub-pages for compatilibity */ +#define PAGETYPE_COMPAT(p) \ + (unlikely(PAGETYPE_WHOLE(p) & P_SUBP) \ + ? PAGETYPE_WHOLE(p) & ~(P_SUBP | P_LEGACY_DIRTY) \ + : PAGETYPE_WHOLE(p)) + /* Size of the page header, excluding dynamic data at the end */ -#define PAGEHDRSZ ((unsigned)offsetof(MDBX_page, mp_ptrs)) +#define PAGEHDRSZ offsetof(MDBX_page, mp_ptrs) + +/* Pointer displacement without casting to char* to avoid pointer-aliasing */ +#define ptr_disp(ptr, disp) ((void *)(((intptr_t)(ptr)) + ((intptr_t)(disp)))) + +/* Pointer distance as signed number of bytes */ +#define ptr_dist(more, less) (((intptr_t)(more)) - ((intptr_t)(less))) + +#define mp_next(mp) \ + (*(MDBX_page **)ptr_disp((mp)->mp_ptrs, sizeof(void *) - sizeof(uint32_t))) #pragma pack(pop) -#if MDBX_ENABLE_PGOP_STAT +typedef struct profgc_stat { + /* Монотонное время по "настенным часам" + * затраченное на чтение и поиск внутри GC */ + uint64_t rtime_monotonic; + /* Процессорное время в режим пользователя + * на подготовку страниц извлекаемых из GC, включая подкачку с диска. */ + uint64_t xtime_cpu; + /* Количество итераций чтения-поиска внутри GC при выделении страниц */ + uint32_t rsteps; + /* Количество запросов на выделение последовательностей страниц, + * т.е. 
когда запрашивает выделение больше одной страницы */ + uint32_t xpages; + /* Счетчик выполнения по медленному пути (slow path execution count) */ + uint32_t spe_counter; + /* page faults (hard page faults) */ + uint32_t majflt; +} profgc_stat_t; + /* Statistics of page operations overall of all (running, completed and aborted) * transactions */ -typedef struct { +typedef struct pgop_stat { MDBX_atomic_uint64_t newly; /* Quantity of a new pages added */ MDBX_atomic_uint64_t cow; /* Quantity of pages copied for update */ MDBX_atomic_uint64_t clone; /* Quantity of parent's dirty pages clones @@ -561,16 +754,48 @@ typedef struct { MDBX_atomic_uint64_t unspill; /* Quantity of unspilled/reloaded pages */ MDBX_atomic_uint64_t wops; /* Number of explicit write operations (not a pages) to a disk */ -} MDBX_pgop_stat_t; -#endif /* MDBX_ENABLE_PGOP_STAT */ + MDBX_atomic_uint64_t + msync; /* Number of explicit msync/flush-to-disk operations */ + MDBX_atomic_uint64_t + fsync; /* Number of explicit fsync/flush-to-disk operations */ + + MDBX_atomic_uint64_t prefault; /* Number of prefault write operations */ + MDBX_atomic_uint64_t mincore; /* Number of mincore() calls */ + + MDBX_atomic_uint32_t + incoherence; /* number of https://libmdbx.dqdkfa.ru/dead-github/issues/269 + caught */ + MDBX_atomic_uint32_t reserved; + + /* Статистика для профилирования GC. + * Логически эти данные может быть стоит вынести в другую структуру, + * но разница будет сугубо косметическая. 
*/ + struct { + /* Затраты на поддержку данных пользователя */ + profgc_stat_t work; + /* Затраты на поддержку и обновления самой GC */ + profgc_stat_t self; + /* Итераций обновления GC, + * больше 1 если были повторы/перезапуски */ + uint32_t wloops; + /* Итерации слияния записей GC */ + uint32_t coalescences; + /* Уничтожения steady-точек фиксации в MDBX_UTTERLY_NOSYNC */ + uint32_t wipes; + /* Сбросы данные на диск вне MDBX_UTTERLY_NOSYNC */ + uint32_t flushes; + /* Попытки пнуть тормозящих читателей */ + uint32_t kicks; + } gc_prof; +} pgop_stat_t; #if MDBX_LOCKING == MDBX_LOCKING_WIN32FILES #define MDBX_CLOCK_SIGN UINT32_C(0xF10C) -typedef void mdbx_ipclock_t; +typedef void osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_SYSV #define MDBX_CLOCK_SIGN UINT32_C(0xF18D) -typedef mdbx_pid_t mdbx_ipclock_t; +typedef mdbx_pid_t osal_ipclock_t; #ifndef EOWNERDEAD #define EOWNERDEAD MDBX_RESULT_TRUE #endif @@ -578,17 +803,17 @@ typedef mdbx_pid_t mdbx_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 #define MDBX_CLOCK_SIGN UINT32_C(0x8017) -typedef pthread_mutex_t mdbx_ipclock_t; +typedef pthread_mutex_t osal_ipclock_t; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX1988 #define MDBX_CLOCK_SIGN UINT32_C(0xFC29) -typedef sem_t mdbx_ipclock_t; +typedef sem_t osal_ipclock_t; #else #error "FIXME" #endif /* MDBX_LOCKING */ #if MDBX_LOCKING > MDBX_LOCKING_SYSV && !defined(__cplusplus) -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc); -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc); +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc); #endif /* MDBX_LOCKING */ /* Reader Lock Table @@ -681,6 +906,10 @@ typedef struct MDBX_lockinfo { /* Low 32-bit of txnid with which meta-pages was synced, * i.e. for sync-polling in the MDBX_NOMETASYNC mode. 
*/ +#define MDBX_NOMETASYNC_LAZY_UNK (UINT32_MAX / 3) +#define MDBX_NOMETASYNC_LAZY_FD (MDBX_NOMETASYNC_LAZY_UNK + UINT32_MAX / 8) +#define MDBX_NOMETASYNC_LAZY_WRITEMAP \ + (MDBX_NOMETASYNC_LAZY_UNK - UINT32_MAX / 8) MDBX_atomic_uint32_t mti_meta_sync_txnid; /* Period for timed auto-sync feature, i.e. at the every steady checkpoint @@ -693,45 +922,54 @@ typedef struct MDBX_lockinfo { /* Marker to distinguish uniqueness of DB/CLK. */ MDBX_atomic_uint64_t mti_bait_uniqueness; + /* Paired counter of processes that have mlock()ed part of mmapped DB. + * The (mti_mlcnt[0] - mti_mlcnt[1]) > 0 means at least one process + * lock at least one page, so therefore madvise() could return EINVAL. */ + MDBX_atomic_uint32_t mti_mlcnt[2]; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ -#if MDBX_ENABLE_PGOP_STAT /* Statistics of costly ops of all (running, completed and aborted) * transactions */ - MDBX_pgop_stat_t mti_pgop_stat; -#endif /* MDBX_ENABLE_PGOP_STAT*/ + pgop_stat_t mti_pgop_stat; MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Write transaction lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_wlock; + osal_ipclock_t mti_wlock; #endif /* MDBX_LOCKING > 0 */ atomic_txnid_t mti_oldest_reader; - /* Timestamp of the last steady sync. Value is represented in a suitable - * system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) or - * clock_gettime(CLOCK_MONOTONIC). */ - MDBX_atomic_uint64_t mti_sync_timestamp; + /* Timestamp of entering an out-of-sync state. Value is represented in a + * suitable system-dependent form, for example clock_gettime(CLOCK_BOOTTIME) + * or clock_gettime(CLOCK_MONOTONIC). */ + MDBX_atomic_uint64_t mti_eoos_timestamp; /* Number un-synced-with-disk pages for auto-sync feature. */ - atomic_pgno_t mti_unsynced_pages; - - /* Number of page which was discarded last time by madvise(MADV_FREE). 
*/ - atomic_pgno_t mti_discarded_tail; + MDBX_atomic_uint64_t mti_unsynced_pages; /* Timestamp of the last readers check. */ MDBX_atomic_uint64_t mti_reader_check_timestamp; + /* Number of page which was discarded last time by madvise(DONTNEED). */ + atomic_pgno_t mti_discarded_tail; + /* Shared anchor for tracking readahead edge and enabled/disabled status. */ pgno_t mti_readahead_anchor; + /* Shared cache for mincore() results */ + struct { + pgno_t begin[4]; + uint64_t mask[4]; + } mti_mincore_cache; + MDBX_ALIGNAS(MDBX_CACHELINE_SIZE) /* cacheline ----------------------------*/ /* Readeaders registration lock. */ #if MDBX_LOCKING > 0 - mdbx_ipclock_t mti_rlock; + osal_ipclock_t mti_rlock; #endif /* MDBX_LOCKING > 0 */ /* The number of slots that have been used in the reader table. @@ -799,7 +1037,8 @@ typedef struct MDBX_lockinfo { #endif /* MDBX_WORDBITS */ #define MDBX_READERS_LIMIT 32767 -#define MDBX_RADIXSORT_THRESHOLD 333 +#define MDBX_RADIXSORT_THRESHOLD 142 +#define MDBX_GOLD_RATIO_DBL 1.6180339887498948482 /*----------------------------------------------------------------------------*/ @@ -824,21 +1063,15 @@ typedef txnid_t *MDBX_TXL; /* An Dirty-Page list item is an pgno/pointer pair. */ typedef struct MDBX_dp { MDBX_page *ptr; - pgno_t pgno; - union { - unsigned extra; - __anonymous_struct_extension__ struct { - unsigned multi : 1; - unsigned lru : 31; - }; - }; + pgno_t pgno, npages; } MDBX_dp; /* An DPL (dirty-page list) is a sorted array of MDBX_DPs. */ typedef struct MDBX_dpl { - unsigned sorted; - unsigned length; - unsigned detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ + size_t sorted; + size_t length; + size_t pages_including_loose; /* number of pages, but not an entries. 
*/ + size_t detent; /* allocated size excluding the MDBX_DPL_RESERVE_GAP */ #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ (!defined(__cplusplus) && defined(_MSC_VER)) MDBX_dp items[] /* dynamic size with holes at zero and after the last */; @@ -846,7 +1079,8 @@ typedef struct MDBX_dpl { } MDBX_dpl; /* PNL sizes */ -#define MDBX_PNL_GRANULATE 1024 +#define MDBX_PNL_GRANULATE_LOG2 10 +#define MDBX_PNL_GRANULATE (1 << MDBX_PNL_GRANULATE_LOG2) #define MDBX_PNL_INITIAL \ (MDBX_PNL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(pgno_t)) @@ -854,25 +1088,33 @@ typedef struct MDBX_dpl { #define MDBX_TXL_INITIAL \ (MDBX_TXL_GRANULATE - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_TXL_MAX \ - ((1u << 17) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) + ((1u << 26) - 2 - MDBX_ASSUME_MALLOC_OVERHEAD / sizeof(txnid_t)) #define MDBX_PNL_ALLOCLEN(pl) ((pl)[-1]) -#define MDBX_PNL_SIZE(pl) ((pl)[0]) +#define MDBX_PNL_GETSIZE(pl) ((size_t)((pl)[0])) +#define MDBX_PNL_SETSIZE(pl, size) \ + do { \ + const size_t __size = size; \ + assert(__size < INT_MAX); \ + (pl)[0] = (pgno_t)__size; \ + } while (0) #define MDBX_PNL_FIRST(pl) ((pl)[1]) -#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_SIZE(pl)]) +#define MDBX_PNL_LAST(pl) ((pl)[MDBX_PNL_GETSIZE(pl)]) #define MDBX_PNL_BEGIN(pl) (&(pl)[1]) -#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_SIZE(pl) + 1]) +#define MDBX_PNL_END(pl) (&(pl)[MDBX_PNL_GETSIZE(pl) + 1]) #if MDBX_PNL_ASCENDING +#define MDBX_PNL_EDGE(pl) ((pl) + 1) #define MDBX_PNL_LEAST(pl) MDBX_PNL_FIRST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_LAST(pl) #else +#define MDBX_PNL_EDGE(pl) ((pl) + MDBX_PNL_GETSIZE(pl)) #define MDBX_PNL_LEAST(pl) MDBX_PNL_LAST(pl) #define MDBX_PNL_MOST(pl) MDBX_PNL_FIRST(pl) #endif -#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_SIZE(pl) + 1) * sizeof(pgno_t)) -#define MDBX_PNL_IS_EMPTY(pl) (MDBX_PNL_SIZE(pl) == 0) +#define MDBX_PNL_SIZEOF(pl) ((MDBX_PNL_GETSIZE(pl) + 1) * sizeof(pgno_t)) +#define MDBX_PNL_IS_EMPTY(pl) 
(MDBX_PNL_GETSIZE(pl) == 0) /*----------------------------------------------------------------------------*/ /* Internal structures */ @@ -889,6 +1131,18 @@ typedef struct MDBX_dbx { md_vlen_max; /* min/max value/data length for the database */ } MDBX_dbx; +typedef struct troika { + uint8_t fsm, recent, prefer_steady, tail_and_flags; +#if MDBX_WORDBITS > 32 /* Workaround for false-positives from Valgrind */ + uint32_t unused_pad; +#endif +#define TROIKA_HAVE_STEADY(troika) ((troika)->fsm & 7) +#define TROIKA_STRICT_VALID(troika) ((troika)->tail_and_flags & 64) +#define TROIKA_VALID(troika) ((troika)->tail_and_flags & 128) +#define TROIKA_TAIL(troika) ((troika)->tail_and_flags & 3) + txnid_t txnid[NUM_METAS]; +} meta_troika_t; + /* A database transaction. * Every operation requires a transaction handle. */ struct MDBX_txn { @@ -900,12 +1154,14 @@ struct MDBX_txn { #define MDBX_TXN_RO_BEGIN_FLAGS (MDBX_TXN_RDONLY | MDBX_TXN_RDONLY_PREPARE) #define MDBX_TXN_RW_BEGIN_FLAGS \ (MDBX_TXN_NOMETASYNC | MDBX_TXN_NOSYNC | MDBX_TXN_TRY) - /* Additional flag for mdbx_sync_locked() */ + /* Additional flag for sync_locked() */ #define MDBX_SHRINK_ALLOWED UINT32_C(0x40000000) +#define MDBX_TXN_DRAINED_GC 0x20 /* GC was depleted up to oldest reader */ + #define TXN_FLAGS \ (MDBX_TXN_FINISHED | MDBX_TXN_ERROR | MDBX_TXN_DIRTY | MDBX_TXN_SPILLS | \ - MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID) + MDBX_TXN_HAS_CHILD | MDBX_TXN_INVALID | MDBX_TXN_DRAINED_GC) #if (TXN_FLAGS & (MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS)) || \ ((MDBX_TXN_RW_BEGIN_FLAGS | MDBX_TXN_RO_BEGIN_FLAGS | TXN_FLAGS) & \ @@ -923,9 +1179,9 @@ struct MDBX_txn { /* corresponding to the current size of datafile */ #define mt_end_pgno mt_geo.now - /* The ID of this transaction. IDs are integers incrementing from 1. - * Only committed write transactions increment the ID. If a transaction - * aborts, the ID may be re-used by the next writer. */ + /* The ID of this transaction. 
IDs are integers incrementing from + * INITIAL_TXNID. Only committed write transactions increment the ID. If a + * transaction aborts, the ID may be re-used by the next writer. */ txnid_t mt_txnid; txnid_t mt_front; @@ -935,7 +1191,7 @@ struct MDBX_txn { /* Array of MDBX_db records for each known DB */ MDBX_db *mt_dbs; /* Array of sequence numbers for each DB handle */ - unsigned *mt_dbiseqs; + MDBX_atomic_uint32_t *mt_dbiseqs; /* Transaction DBI Flags */ #define DBI_DIRTY MDBX_DBI_DIRTY /* DB was written in this txn */ @@ -962,19 +1218,20 @@ struct MDBX_txn { MDBX_reader *reader; } to; struct { + meta_troika_t troika; /* In write txns, array of cursors for each DB */ - pgno_t *reclaimed_pglist; /* Reclaimed GC pages */ - txnid_t last_reclaimed; /* ID of last used record */ + MDBX_PNL relist; /* Reclaimed GC pages */ + txnid_t last_reclaimed; /* ID of last used record */ #if MDBX_ENABLE_REFUND pgno_t loose_refund_wl /* FIXME: describe */; #endif /* MDBX_ENABLE_REFUND */ + /* a sequence to spilling dirty page with LRU policy */ + unsigned dirtylru; /* dirtylist room: Dirty array size - dirty pages visible to this txn. * Includes ancestor txns' dirty pages not hidden by other txns' * dirty/spilled pages. Thus commit(nested txn) has room to merge * dirtylist into mt_parent after freeing hidden mt_parent pages. */ - unsigned dirtyroom; - /* a sequence to spilling dirty page with LRU policy */ - unsigned dirtylru; + size_t dirtyroom; /* For write txns: Modified pages. Sorted when not MDBX_WRITEMAP. */ MDBX_dpl *dirtylist; /* The list of reclaimed txns from GC */ @@ -985,12 +1242,18 @@ struct MDBX_txn { * in this transaction, linked through `mp_next`. */ MDBX_page *loose_pages; /* Number of loose pages (tw.loose_pages) */ - unsigned loose_count; - /* The sorted list of dirty pages we temporarily wrote to disk - * because the dirty list was full. page numbers in here are - * shifted left by 1, deleted slots have the LSB set. 
*/ - MDBX_PNL spill_pages; - unsigned spill_least_removed; + size_t loose_count; + union { + struct { + size_t least_removed; + /* The sorted list of dirty pages we temporarily wrote to disk + * because the dirty list was full. page numbers in here are + * shifted left by 1, deleted slots have the LSB set. */ + MDBX_PNL list; + } spilled; + size_t writemap_dirty_npages; + size_t writemap_spilled_npages; + }; } tw; }; }; @@ -1031,8 +1294,8 @@ struct MDBX_cursor { MDBX_dbx *mc_dbx; /* The mt_dbistate for this database */ uint8_t *mc_dbistate; - unsigned mc_snum; /* number of pushed pages */ - unsigned mc_top; /* index of top page, normally mc_snum-1 */ + uint8_t mc_snum; /* number of pushed pages */ + uint8_t mc_top; /* index of top page, normally mc_snum-1 */ /* Cursor state flags. */ #define C_INITIALIZED 0x01 /* cursor has been initialized and is valid */ @@ -1040,20 +1303,30 @@ struct MDBX_cursor { #define C_SUB 0x04 /* Cursor is a sub-cursor */ #define C_DEL 0x08 /* last op was a cursor_del */ #define C_UNTRACK 0x10 /* Un-track cursor when closing */ -#define C_RECLAIMING 0x20 /* GC lookup is prohibited */ -#define C_GCFREEZE 0x40 /* reclaimed_pglist must not be updated */ +#define C_GCU \ + 0x20 /* Происходит подготовка к обновлению GC, поэтому \ + * можно брать страницы из GC даже для FREE_DBI */ + uint8_t mc_flags; /* Cursor checking flags. 
*/ -#define C_COPYING 0x100 /* skip key-value length check (copying simplify) */ -#define C_UPDATING 0x200 /* update/rebalance pending */ -#define C_RETIRING 0x400 /* refs to child pages may be invalid */ -#define C_SKIPORD 0x800 /* don't check keys ordering */ +#define CC_BRANCH 0x01 /* same as P_BRANCH for CHECK_LEAF_TYPE() */ +#define CC_LEAF 0x02 /* same as P_LEAF for CHECK_LEAF_TYPE() */ +#define CC_OVERFLOW 0x04 /* same as P_OVERFLOW for CHECK_LEAF_TYPE() */ +#define CC_UPDATING 0x08 /* update/rebalance pending */ +#define CC_SKIPORD 0x10 /* don't check keys ordering */ +#define CC_LEAF2 0x20 /* same as P_LEAF2 for CHECK_LEAF_TYPE() */ +#define CC_RETIRING 0x40 /* refs to child pages may be invalid */ +#define CC_PAGECHECK 0x80 /* perform page checking, see MDBX_VALIDATION */ + uint8_t mc_checking; - unsigned mc_flags; /* see mdbx_cursor */ MDBX_page *mc_pg[CURSOR_STACK]; /* stack of pushed pages */ indx_t mc_ki[CURSOR_STACK]; /* stack of page indices */ }; +#define CHECK_LEAF_TYPE(mc, mp) \ + (((PAGETYPE_WHOLE(mp) ^ (mc)->mc_checking) & \ + (CC_BRANCH | CC_LEAF | CC_OVERFLOW | CC_LEAF2)) == 0) + /* Context for sorted-dup records. * We could have gone to a fully recursive design, with arbitrarily * deep nesting of sub-databases. 
But for now we only handle these @@ -1086,40 +1359,50 @@ struct MDBX_env { #define MDBX_ENV_TXKEY UINT32_C(0x10000000) /* Legacy MDBX_MAPASYNC (prior v0.9) */ #define MDBX_DEPRECATED_MAPASYNC UINT32_C(0x100000) + /* Legacy MDBX_COALESCE (prior v0.12) */ +#define MDBX_DEPRECATED_COALESCE UINT32_C(0x2000000) #define ENV_INTERNAL_FLAGS (MDBX_FATAL_ERROR | MDBX_ENV_ACTIVE | MDBX_ENV_TXKEY) uint32_t me_flags; - mdbx_mmap_t me_dxb_mmap; /* The main data file */ -#define me_map me_dxb_mmap.dxb + osal_mmap_t me_dxb_mmap; /* The main data file */ +#define me_map me_dxb_mmap.base #define me_lazy_fd me_dxb_mmap.fd - mdbx_filehandle_t me_dsync_fd; - mdbx_mmap_t me_lck_mmap; /* The lock file */ + mdbx_filehandle_t me_dsync_fd, me_fd4meta; +#if defined(_WIN32) || defined(_WIN64) +#define me_overlapped_fd me_ioring.overlapped_fd + HANDLE me_data_lock_event; +#endif /* Windows */ + osal_mmap_t me_lck_mmap; /* The lock file */ #define me_lfd me_lck_mmap.fd struct MDBX_lockinfo *me_lck; - unsigned me_psize; /* DB page size, initialized from me_os_psize */ - unsigned me_leaf_nodemax; /* max size of a leaf-node */ - uint8_t me_psize2log; /* log2 of DB page size */ + unsigned me_psize; /* DB page size, initialized from me_os_psize */ + unsigned me_leaf_nodemax; /* max size of a leaf-node */ + unsigned me_branch_nodemax; /* max size of a branch-node */ + atomic_pgno_t me_mlocked_pgno; + uint8_t me_psize2log; /* log2 of DB page size */ int8_t me_stuck_meta; /* recovery-only: target meta page or less that zero */ uint16_t me_merge_threshold, me_merge_threshold_gc; /* pages emptier than this are candidates for merging */ - unsigned me_os_psize; /* OS page size, from mdbx_syspagesize() */ + unsigned me_os_psize; /* OS page size, from osal_syspagesize() */ unsigned me_maxreaders; /* size of the reader table */ MDBX_dbi me_maxdbs; /* size of the DB table */ uint32_t me_pid; /* process ID of this env */ - mdbx_thread_key_t me_txkey; /* thread-key for readers */ - char *me_pathname; /* path to 
the DB files */ + osal_thread_key_t me_txkey; /* thread-key for readers */ + pathchar_t *me_pathname; /* path to the DB files */ void *me_pbuf; /* scratch area for DUPSORT put() */ MDBX_txn *me_txn0; /* preallocated write transaction */ - MDBX_dbx *me_dbxs; /* array of static DB info */ - uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ - unsigned *me_dbiseqs; /* array of dbi sequence numbers */ + MDBX_dbx *me_dbxs; /* array of static DB info */ + uint16_t *me_dbflags; /* array of flags from MDBX_db.md_flags */ + MDBX_atomic_uint32_t *me_dbiseqs; /* array of dbi sequence numbers */ unsigned - me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ - uint32_t me_live_reader; /* have liveness lock in reader table */ - void *me_userctx; /* User-settable context */ + me_maxgc_ov1page; /* Number of pgno_t fit in a single overflow page */ + unsigned me_maxgc_per_branch; + uint32_t me_live_reader; /* have liveness lock in reader table */ + void *me_userctx; /* User-settable context */ MDBX_hsr_func *me_hsr_callback; /* Callback for kicking laggard readers */ + size_t me_madv_threshold; struct { unsigned dp_reserve_limit; @@ -1131,11 +1414,17 @@ struct MDBX_env { uint8_t spill_min_denominator; uint8_t spill_parent4child_denominator; unsigned merge_threshold_16dot16_percent; +#if !(defined(_WIN32) || defined(_WIN64)) + unsigned writethrough_threshold; +#endif /* Windows */ + bool prefault_write; union { unsigned all; /* tracks options with non-auto values but tuned by user */ struct { unsigned dp_limit : 1; + unsigned rp_augment_limit : 1; + unsigned prefault_write : 1; } non_auto; } flags; } me_options; @@ -1157,26 +1446,31 @@ struct MDBX_env { int semid; } me_sysv_ipc; #endif /* MDBX_LOCKING == MDBX_LOCKING_SYSV */ + bool me_incore; MDBX_env *me_lcklist_next; /* --------------------------------------------------- mostly volatile part */ MDBX_txn *me_txn; /* current write transaction */ - mdbx_fastmutex_t me_dbi_lock; + osal_fastmutex_t 
me_dbi_lock; MDBX_dbi me_numdbs; /* number of DBs opened */ + bool me_prefault_write; MDBX_page *me_dp_reserve; /* list of malloc'ed blocks for re-use */ unsigned me_dp_reserve_len; /* PNL of pages that became unused in a write txn */ MDBX_PNL me_retired_pages; + osal_ioring_t me_ioring; #if defined(_WIN32) || defined(_WIN64) - MDBX_srwlock me_remap_guard; + osal_srwlock_t me_remap_guard; /* Workaround for LockFileEx and WriteFile multithread bug */ CRITICAL_SECTION me_windowsbug_lock; + char *me_pathname_char; /* cache of multi-byte representation of pathname + to the DB files */ #else - mdbx_fastmutex_t me_remap_guard; + osal_fastmutex_t me_remap_guard; #endif /* -------------------------------------------------------------- debugging */ @@ -1195,7 +1489,7 @@ struct MDBX_env { #define xMDBX_DEBUG_SPILLING 0 #endif #if xMDBX_DEBUG_SPILLING == 2 - unsigned debug_dirtied_est, debug_dirtied_act; + size_t debug_dirtied_est, debug_dirtied_act; #endif /* xMDBX_DEBUG_SPILLING */ /* ------------------------------------------------- stub for lck-less mode */ @@ -1205,148 +1499,24 @@ struct MDBX_env { }; #ifndef __cplusplus -/*----------------------------------------------------------------------------*/ -/* Debug and Logging stuff */ - -#define MDBX_RUNTIME_FLAGS_INIT \ - ((MDBX_DEBUG) > 0) * MDBX_DBG_ASSERT + ((MDBX_DEBUG) > 1) * MDBX_DBG_AUDIT - -extern uint8_t mdbx_runtime_flags; -extern uint8_t mdbx_loglevel; -extern MDBX_debug_func *mdbx_debug_logger; - -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny) { -#if MDBX_DEBUG - if (MDBX_DBG_JITTER & mdbx_runtime_flags) - mdbx_osal_jitter(tiny); -#else - (void)tiny; -#endif -} - -MDBX_INTERNAL_FUNC void MDBX_PRINTF_ARGS(4, 5) - mdbx_debug_log(int level, const char *function, int line, const char *fmt, - ...) 
MDBX_PRINTF_ARGS(4, 5); -MDBX_INTERNAL_FUNC void mdbx_debug_log_va(int level, const char *function, - int line, const char *fmt, - va_list args); - -#if MDBX_DEBUG -#define mdbx_log_enabled(msg) unlikely(msg <= mdbx_loglevel) -#define mdbx_audit_enabled() unlikely((mdbx_runtime_flags & MDBX_DBG_AUDIT)) -#else /* MDBX_DEBUG */ -#define mdbx_log_enabled(msg) (msg < MDBX_LOG_VERBOSE && msg <= mdbx_loglevel) -#define mdbx_audit_enabled() (0) -#endif /* MDBX_DEBUG */ - -#if MDBX_FORCE_ASSERTIONS -#define mdbx_assert_enabled() (1) -#elif MDBX_DEBUG -#define mdbx_assert_enabled() likely((mdbx_runtime_flags & MDBX_DBG_ASSERT)) -#else -#define mdbx_assert_enabled() (0) -#endif /* assertions */ - -#define mdbx_debug_extra(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, __func__, __LINE__, fmt, __VA_ARGS__); \ - } while (0) - -#define mdbx_debug_extra_print(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_EXTRA)) \ - mdbx_debug_log(MDBX_LOG_EXTRA, NULL, 0, fmt, __VA_ARGS__); \ - } while (0) - -#define mdbx_trace(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_TRACE)) \ - mdbx_debug_log(MDBX_LOG_TRACE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ - } while (0) - -#define mdbx_debug(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_DEBUG)) \ - mdbx_debug_log(MDBX_LOG_DEBUG, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ - } while (0) - -#define mdbx_verbose(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_VERBOSE)) \ - mdbx_debug_log(MDBX_LOG_VERBOSE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ - } while (0) - -#define mdbx_notice(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_NOTICE)) \ - mdbx_debug_log(MDBX_LOG_NOTICE, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ - } while (0) - -#define mdbx_warning(fmt, ...) \ - do { \ - if (mdbx_log_enabled(MDBX_LOG_WARN)) \ - mdbx_debug_log(MDBX_LOG_WARN, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ - } while (0) - -#define mdbx_error(fmt, ...) 
\ - do { \ - if (mdbx_log_enabled(MDBX_LOG_ERROR)) \ - mdbx_debug_log(MDBX_LOG_ERROR, __func__, __LINE__, fmt "\n", \ - __VA_ARGS__); \ - } while (0) - -#define mdbx_fatal(fmt, ...) \ - mdbx_debug_log(MDBX_LOG_FATAL, __func__, __LINE__, fmt "\n", __VA_ARGS__); - -#define mdbx_ensure_msg(env, expr, msg) \ - do { \ - if (unlikely(!(expr))) \ - mdbx_assert_fail(env, msg, __func__, __LINE__); \ - } while (0) - -#define mdbx_ensure(env, expr) mdbx_ensure_msg(env, expr, #expr) - -/* assert(3) variant in environment context */ -#define mdbx_assert(env, expr) \ - do { \ - if (mdbx_assert_enabled()) \ - mdbx_ensure(env, expr); \ - } while (0) - -/* assert(3) variant in cursor context */ -#define mdbx_cassert(mc, expr) mdbx_assert((mc)->mc_txn->mt_env, expr) - -/* assert(3) variant in transaction context */ -#define mdbx_tassert(txn, expr) mdbx_assert((txn)->mt_env, expr) - -#ifndef xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ -#undef assert -#define assert(expr) mdbx_assert(NULL, expr) -#endif - /*----------------------------------------------------------------------------*/ /* Cache coherence and mmap invalidation */ #if MDBX_CPU_WRITEBACK_INCOHERENT -#define mdbx_flush_incoherent_cpu_writeback() mdbx_memory_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_memory_barrier() #else -#define mdbx_flush_incoherent_cpu_writeback() mdbx_compiler_barrier() +#define osal_flush_incoherent_cpu_writeback() osal_compiler_barrier() #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ MDBX_MAYBE_UNUSED static __inline void -mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { +osal_flush_incoherent_mmap(const void *addr, size_t nbytes, + const intptr_t pagesize) { #if MDBX_MMAP_INCOHERENT_FILE_WRITE char *const begin = (char *)(-pagesize & (intptr_t)addr); char *const end = (char *)(-pagesize & (intptr_t)((char *)addr + nbytes + pagesize - 1)); int err = msync(begin, end - begin, MS_SYNC | MS_INVALIDATE) ? 
errno : 0; - mdbx_assert(nullptr, err == 0); + eASSERT(nullptr, err == 0); (void)err; #else (void)pagesize; @@ -1356,7 +1526,7 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { #ifdef DCACHE /* MIPS has cache coherency issues. * Note: for any nbytes >= on-chip cache size, entire is flushed. */ - cacheflush(addr, nbytes, DCACHE); + cacheflush((void *)addr, nbytes, DCACHE); #else #error "Oops, cacheflush() not available" #endif /* DCACHE */ @@ -1371,15 +1541,17 @@ mdbx_flush_incoherent_mmap(void *addr, size_t nbytes, const intptr_t pagesize) { /*----------------------------------------------------------------------------*/ /* Internal prototypes */ -MDBX_INTERNAL_FUNC int mdbx_cleanup_dead_readers(MDBX_env *env, int rlocked, - int *dead); -MDBX_INTERNAL_FUNC int mdbx_rthc_alloc(mdbx_thread_key_t *key, - MDBX_reader *begin, MDBX_reader *end); -MDBX_INTERNAL_FUNC void mdbx_rthc_remove(const mdbx_thread_key_t key); +MDBX_INTERNAL_FUNC int cleanup_dead_readers(MDBX_env *env, int rlocked, + int *dead); +MDBX_INTERNAL_FUNC int rthc_alloc(osal_thread_key_t *key, MDBX_reader *begin, + MDBX_reader *end); +MDBX_INTERNAL_FUNC void rthc_remove(const osal_thread_key_t key); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_init(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_global_dtor(void); -MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); +MDBX_INTERNAL_FUNC void global_ctor(void); +MDBX_INTERNAL_FUNC void osal_ctor(void); +MDBX_INTERNAL_FUNC void global_dtor(void); +MDBX_INTERNAL_FUNC void osal_dtor(void); +MDBX_INTERNAL_FUNC void thread_dtor(void *ptr); #endif /* !__cplusplus */ @@ -1441,8 +1613,6 @@ MDBX_INTERNAL_FUNC void mdbx_rthc_thread_dtor(void *ptr); /* Test if a page is a sub page */ #define IS_SUBP(p) (((p)->mp_flags & P_SUBP) != 0) -#define PAGETYPE(p) ((p)->mp_flags & (P_BRANCH | P_LEAF | P_LEAF2 | P_OVERFLOW)) - /* Header for a single key/data pair within a page. * Used in pages of type P_BRANCH and P_LEAF without P_LEAF2. 
* We guarantee 2-byte alignment for 'MDBX_node's. @@ -1501,12 +1671,12 @@ typedef struct MDBX_node { #error "Oops, some flags overlapped or wrong" #endif -/* max number of pages to commit in one writev() call */ -#define MDBX_COMMIT_PAGES 64 -#if defined(IOV_MAX) && IOV_MAX < MDBX_COMMIT_PAGES /* sysconf(_SC_IOV_MAX) */ -#undef MDBX_COMMIT_PAGES -#define MDBX_COMMIT_PAGES IOV_MAX -#endif +/* Max length of iov-vector passed to writev() call, used for auxilary writes */ +#define MDBX_AUXILARY_IOV_MAX 64 +#if defined(IOV_MAX) && IOV_MAX < MDBX_AUXILARY_IOV_MAX +#undef MDBX_AUXILARY_IOV_MAX +#define MDBX_AUXILARY_IOV_MAX IOV_MAX +#endif /* MDBX_AUXILARY_IOV_MAX */ /* * / @@ -1515,16 +1685,7 @@ typedef struct MDBX_node { * | 1, a > b * \ */ -#ifndef __e2k__ -/* LY: fast enough on most systems */ -#define CMP2INT(a, b) (((b) > (a)) ? -1 : (a) > (b)) -#else -/* LY: more parallelable on VLIW Elbrus */ -#define CMP2INT(a, b) (((a) > (b)) - ((b) > (a))) -#endif - -/* Do not spill pages to disk if txn is getting full, may fail instead */ -#define MDBX_NOSPILL 0x8000 +#define CMP2INT(a, b) (((a) != (b)) ? (((a) < (b)) ? 
-1 : 1) : 0) MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t int64pgno(int64_t i64) { @@ -1536,14 +1697,14 @@ int64pgno(int64_t i64) { MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_add(size_t base, size_t augend) { assert(base <= MAX_PAGENO + 1 && augend < MAX_PAGENO); - return int64pgno(base + augend); + return int64pgno((int64_t)base + (int64_t)augend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline pgno_t pgno_sub(size_t base, size_t subtrahend) { assert(base >= MIN_PAGENO && base <= MAX_PAGENO + 1 && subtrahend < MAX_PAGENO); - return int64pgno(base - subtrahend); + return int64pgno((int64_t)base - (int64_t)subtrahend); } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __always_inline bool @@ -1563,20 +1724,24 @@ ceil_powerof2(size_t value, size_t granularity) { } MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static unsigned -log2n_powerof2(size_t value) { - assert(value > 0 && value < INT32_MAX && is_powerof2(value)); - assert((value & -(int32_t)value) == value); -#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctzl) - return __builtin_ctzl(value); +log2n_powerof2(size_t value_uintptr) { + assert(value_uintptr > 0 && value_uintptr < INT32_MAX && + is_powerof2(value_uintptr)); + assert((value_uintptr & -(intptr_t)value_uintptr) == value_uintptr); + const uint32_t value_uint32 = (uint32_t)value_uintptr; +#if __GNUC_PREREQ(4, 1) || __has_builtin(__builtin_ctz) + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(unsigned)); + return __builtin_ctz(value_uint32); #elif defined(_MSC_VER) unsigned long index; - _BitScanForward(&index, (unsigned long)value); + STATIC_ASSERT(sizeof(value_uint32) <= sizeof(long)); + _BitScanForward(&index, value_uint32); return index; #else static const uint8_t debruijn_ctz32[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9}; - return debruijn_ctz32[(uint32_t)(value * 0x077CB531u) >> 27]; + 
return debruijn_ctz32[(uint32_t)(value_uint32 * 0x077CB531ul) >> 27]; #endif } @@ -1585,7 +1750,8 @@ log2n_powerof2(size_t value) { * environment and re-opening it with the new flags. */ #define ENV_CHANGEABLE_FLAGS \ (MDBX_SAFE_NOSYNC | MDBX_NOMETASYNC | MDBX_DEPRECATED_MAPASYNC | \ - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE) + MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_PAGEPERTURB | MDBX_ACCEDE | \ + MDBX_VALIDATION) #define ENV_CHANGELESS_FLAGS \ (MDBX_NOSUBDIR | MDBX_RDONLY | MDBX_WRITEMAP | MDBX_NOTLS | MDBX_NORDAHEAD | \ MDBX_LIFORECLAIM | MDBX_EXCLUSIVE) @@ -1610,14 +1776,14 @@ MDBX_MAYBE_UNUSED static void static_checks(void) { #define MDBX_ASAN_POISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("POISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_POISON_MEMORY_REGION(addr, size); \ } while (0) #define MDBX_ASAN_UNPOISON_MEMORY_REGION(addr, size) \ do { \ - mdbx_trace("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ - (size_t)(size), __LINE__); \ + TRACE("UNPOISON_MEMORY_REGION(%p, %zu) at %u", (void *)(addr), \ + (size_t)(size), __LINE__); \ ASAN_UNPOISON_MEMORY_REGION(addr, size); \ } while (0) diff --git a/src/lck-posix.c b/src/lck-posix.c index ad73e586..cb55727e 100644 --- a/src/lck-posix.c +++ b/src/lck-posix.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -28,10 +28,11 @@ #include #ifndef xMDBX_ALLOY -uint32_t mdbx_linux_kernel_version; +uint32_t linux_kernel_version; bool mdbx_RunningOnWSL1; #endif /* xMDBX_ALLOY */ +MDBX_EXCLUDE_FOR_GPROF __cold static uint8_t probe_for_WSL(const char *tag) { const char *const WSL = strstr(tag, "WSL"); if (WSL && WSL[3] >= '2' && WSL[3] <= '9') @@ -42,14 +43,28 @@ __cold static uint8_t probe_for_WSL(const char *tag) { if (WSL || wsl || strcasestr(tag, "Microsoft")) /* Expecting no new kernel within WSL1, either it will explicitly * marked by an appropriate WSL-version hint. */ - return (mdbx_linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; + return (linux_kernel_version < /* 4.19.x */ 0x04130000) ? 1 : 2; return 0; } #endif /* Linux */ +#ifdef ENABLE_GPROF +extern void _mcleanup(void); +extern void monstartup(unsigned long, unsigned long); +extern void _init(void); +extern void _fini(void); +extern void __gmon_start__(void) __attribute__((__weak__)); +#endif /* ENABLE_GPROF */ + +MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__constructor__)) void mdbx_global_constructor(void) { +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + monstartup((uintptr_t)&_init, (uintptr_t)&_fini); +#endif /* ENABLE_GPROF */ + #if defined(__linux__) || defined(__gnu_linux__) struct utsname buffer; if (uname(&buffer) == 0) { @@ -61,7 +76,7 @@ mdbx_global_constructor(void) { if (number > 0) { if (number > 255) number = 255; - mdbx_linux_kernel_version += number << (24 - i * 8); + linux_kernel_version += number << (24 - i * 8); } ++i; } else { @@ -81,12 +96,17 @@ mdbx_global_constructor(void) { } #endif /* Linux */ - mdbx_rthc_global_init(); + global_ctor(); } +MDBX_EXCLUDE_FOR_GPROF __cold static __attribute__((__destructor__)) void mdbx_global_destructor(void) { - mdbx_rthc_global_dtor(); + global_dtor(); +#ifdef ENABLE_GPROF + if (!&__gmon_start__) + _mcleanup(); +#endif /* ENABLE_GPROF */ } /*----------------------------------------------------------------------------*/ @@ -98,15 
+118,15 @@ mdbx_global_destructor(void) { * размещаются совместно используемые posix-мьютексы (futex). Посредством * этих мьютексов (см struct MDBX_lockinfo) реализуются: * - Блокировка таблицы читателей для регистрации, - * т.е. функции mdbx_rdt_lock() и mdbx_rdt_unlock(). + * т.е. функции osal_rdt_lock() и osal_rdt_unlock(). * - Блокировка БД для пишущих транзакций, * т.е. функции mdbx_txn_lock() и mdbx_txn_unlock(). * * Остальной функционал реализуется отдельно посредством файловых блокировок: * - Первоначальный захват БД в режиме exclusive/shared и последующий перевод - * в операционный режим, функции mdbx_lck_seize() и mdbx_lck_downgrade(). + * в операционный режим, функции osal_lck_seize() и osal_lck_downgrade(). * - Проверка присутствие процессов-читателей, - * т.е. функции mdbx_rpid_set(), mdbx_rpid_clear() и mdbx_rpid_check(). + * т.е. функции osal_rpid_set(), osal_rpid_clear() и osal_rpid_check(). * * Для блокировки файлов используется fcntl(F_SETLK), так как: * - lockf() оперирует только эксклюзивной блокировкой и требует @@ -150,9 +170,9 @@ mdbx_global_destructor(void) { static int op_setlk, op_setlkw, op_getlk; __cold static void choice_fcntl(void) { assert(!op_setlk && !op_setlkw && !op_getlk); - if ((mdbx_runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 + if ((runtime_flags & MDBX_DBG_LEGACY_MULTIOPEN) == 0 #if defined(__linux__) || defined(__gnu_linux__) - && mdbx_linux_kernel_version > + && linux_kernel_version > 0x030f0000 /* OFD locks are available since 3.15, but engages here only for 3.16 and later kernels (i.e. LTS) because of reliability reasons */ @@ -182,7 +202,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, "The bitness of system `off_t` type is mismatch. 
Please " "fix build and/or NDK configuration."); #endif /* Android */ - mdbx_jitter4testing(true); + jitter4testing(true); assert(offset >= 0 && len > 0); assert((uint64_t)offset < (uint64_t)INT64_MAX && (uint64_t)len < (uint64_t)INT64_MAX && @@ -208,7 +228,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, lock_op.l_start = offset; lock_op.l_len = len; int rc = MDBX_FCNTL(fd, cmd, &lock_op); - mdbx_jitter4testing(true); + jitter4testing(true); if (rc != -1) { if (cmd == op_getlk) { /* Checks reader by pid. Returns: @@ -243,7 +263,7 @@ static int lck_op(const mdbx_filehandle_t fd, int cmd, const int lck, } } -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { #if MDBX_USE_OFDLOCKS if (unlikely(op_setlk == 0)) choice_fcntl(); @@ -251,21 +271,21 @@ MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { return lck_op(fd, wait ? op_setlkw : op_setlk, F_WRLCK, 0, OFF_T_MAX); } -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; return lck_op(env->me_lfd, op_setlk, F_WRLCK, env->me_pid, 1); } -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(env->me_pid > 0); return lck_op(env->me_lfd, op_setlk, F_UNLCK, env->me_pid, 1); } -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { assert(env->me_lfd != INVALID_HANDLE_VALUE); assert(pid > 0); return lck_op(env->me_lfd, op_getlk, F_WRLCK, pid, 1); @@ -274,7 +294,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { 
/*---------------------------------------------------------------------------*/ #if MDBX_LOCKING > MDBX_LOCKING_SYSV -MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_stub(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_init(ipc, false, 1) ? errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -285,7 +305,7 @@ MDBX_INTERNAL_FUNC int mdbx_ipclock_stub(mdbx_ipclock_t *ipc) { #endif } -MDBX_INTERNAL_FUNC int mdbx_ipclock_destroy(mdbx_ipclock_t *ipc) { +MDBX_INTERNAL_FUNC int osal_ipclock_destroy(osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX1988 return sem_destroy(ipc) ? errno : 0; #elif MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ @@ -303,7 +323,7 @@ static int check_fstat(MDBX_env *env) { int rc = MDBX_SUCCESS; if (fstat(env->me_lazy_fd, &st)) { rc = errno; - mdbx_error("fstat(%s), err %d", "DXB", rc); + ERROR("fstat(%s), err %d", "DXB", rc); return rc; } @@ -313,15 +333,14 @@ static int check_fstat(MDBX_env *env) { #else rc = EPERM; #endif - mdbx_error("%s %s, err %d", "DXB", - (st.st_nlink < 1) ? "file was removed" : "not a regular file", - rc); + ERROR("%s %s, err %d", "DXB", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); return rc; } if (st.st_size < (off_t)(MDBX_MIN_PAGESIZE * NUM_METAS)) { - mdbx_verbose("dxb-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); + VERBOSE("dxb-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); rc = MDBX_RESULT_TRUE; } @@ -329,7 +348,7 @@ static int check_fstat(MDBX_env *env) { if (fstat(env->me_lfd, &st)) { rc = errno; - mdbx_error("fstat(%s), err %d", "LCK", rc); + ERROR("fstat(%s), err %d", "LCK", rc); return rc; } @@ -339,26 +358,25 @@ static int check_fstat(MDBX_env *env) { #else rc = EPERM; #endif - mdbx_error("%s %s, err %d", "LCK", - (st.st_nlink < 1) ? 
"file was removed" : "not a regular file", - rc); + ERROR("%s %s, err %d", "LCK", + (st.st_nlink < 1) ? "file was removed" : "not a regular file", rc); return rc; } /* Checking file size for detect the situation when we got the shared lock - * immediately after mdbx_lck_destroy(). */ + * immediately after osal_lck_destroy(). */ if (st.st_size < (off_t)(sizeof(MDBX_lockinfo) + sizeof(MDBX_reader))) { - mdbx_verbose("lck-file is too short (%u), exclusive-lock needed", - (unsigned)st.st_size); + VERBOSE("lck-file is too short (%u), exclusive-lock needed", + (unsigned)st.st_size); rc = MDBX_RESULT_TRUE; } return rc; } -__cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { +__cold MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; #if MDBX_USE_OFDLOCKS if (unlikely(op_setlk == 0)) @@ -369,10 +387,10 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { #if defined(__linux__) || defined(__gnu_linux__) if (unlikely(mdbx_RunningOnWSL1)) { rc = ENOLCK /* No record locks available */; - mdbx_error("%s, err %u", - "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " - "injecting failure to avoid data loss", - rc); + ERROR("%s, err %u", + "WSL1 (Windows Subsystem for Linux) is mad and trouble-full, " + "injecting failure to avoid data loss", + rc); return rc; } #endif /* Linux */ @@ -383,8 +401,8 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "without-lck", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } return MDBX_RESULT_TRUE /* Done: return with exclusive locking. 
*/; @@ -397,8 +415,8 @@ retry: if (rc == MDBX_RESULT_TRUE) { rc = lck_op(env->me_lfd, op_setlk, F_UNLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "unlock-before-retry", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "unlock-before-retry", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } } @@ -424,23 +442,23 @@ retry: /* the cause may be a collision with POSIX's file-lock recovery. */ if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "dxb-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "dxb-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } /* Fallback to lck-shared */ } else if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "try-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } /* Here could be one of two: - * - mdbx_lck_destroy() from the another process was hold the lock + * - osal_lck_destroy() from the another process was hold the lock * during a destruction. - * - either mdbx_lck_seize() from the another process was got the exclusive + * - either osal_lck_seize() from the another process was got the exclusive * lock and doing initialization. * For distinguish these cases will use size of the lck-file later. */ @@ -449,8 +467,8 @@ retry: * competing process doesn't call lck_downgrade(). 
*/ rc = lck_op(env->me_lfd, op_setlkw, F_RDLCK, 0, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "try-shared", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-shared", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -458,7 +476,7 @@ retry: if (rc == MDBX_RESULT_TRUE) goto retry; if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "lck_fstat", rc); + ERROR("%s, err %u", "lck_fstat", rc); return rc; } @@ -469,8 +487,8 @@ retry: if (!(rc == EAGAIN || rc == EACCES || rc == EBUSY || rc == EWOULDBLOCK || rc == EDEADLK)) { - mdbx_error("%s, err %u", "try-exclusive", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "try-exclusive", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -479,8 +497,8 @@ retry: lck_op(env->me_lazy_fd, op_setlk, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, env->me_pid, 1); if (rc != MDBX_SUCCESS) { - mdbx_error("%s, err %u", "lock-against-without-lck", rc); - mdbx_assert(env, MDBX_IS_ERROR(rc)); + ERROR("%s, err %u", "lock-against-without-lck", rc); + eASSERT(env, MDBX_IS_ERROR(rc)); return rc; } @@ -488,9 +506,9 @@ retry: return MDBX_RESULT_FALSE; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { assert(env->me_lfd != INVALID_HANDLE_VALUE); - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != env->me_pid)) return MDBX_PANIC; int rc = MDBX_SUCCESS; @@ -503,15 +521,15 @@ MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { if (rc == MDBX_SUCCESS) rc = lck_op(env->me_lfd, op_setlk, F_RDLCK, 0, 1); if (unlikely(rc != 0)) { - mdbx_error("%s, err %u", "lck", rc); + ERROR("%s, err %u", "lck", rc); assert(MDBX_IS_ERROR(rc)); } return rc; } -__cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +__cold MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor) { - if (unlikely(mdbx_getpid() != env->me_pid)) + if (unlikely(osal_getpid() != 
env->me_pid)) return MDBX_PANIC; int rc = MDBX_SUCCESS; @@ -526,25 +544,25 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, (env->me_flags & MDBX_RDONLY) ? F_RDLCK : F_WRLCK, 0, OFF_T_MAX) == 0) { - mdbx_verbose("%p got exclusive, drown locks", (void *)env); + VERBOSE("%p got exclusive, drown locks", (void *)env); #if MDBX_LOCKING == MDBX_LOCKING_SYSV if (env->me_sysv_ipc.semid != -1) rc = semctl(env->me_sysv_ipc.semid, 2, IPC_RMID) ? errno : 0; #else - rc = mdbx_ipclock_destroy(&lck->mti_rlock); + rc = osal_ipclock_destroy(&lck->mti_rlock); if (rc == 0) - rc = mdbx_ipclock_destroy(&lck->mti_wlock); + rc = osal_ipclock_destroy(&lck->mti_wlock); #endif /* MDBX_LOCKING */ - mdbx_assert(env, rc == 0); + eASSERT(env, rc == 0); if (rc == 0) { const bool synced = lck->mti_unsynced_pages.weak == 0; - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (synced) rc = ftruncate(env->me_lfd, 0) ? errno : 0; } - mdbx_jitter4testing(false); + jitter4testing(false); } /* 1) POSIX's fcntl() locks (i.e. 
when op_setlk == F_SETLK) should be restored @@ -585,7 +603,7 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /* restore file-locks */ rc = lck_op(inprocess_neighbor->me_lfd, F_SETLKW, F_RDLCK, 0, 1); if (rc == MDBX_SUCCESS && inprocess_neighbor->me_live_reader) - rc = mdbx_rpid_set(inprocess_neighbor); + rc = osal_rpid_set(inprocess_neighbor); } } @@ -596,7 +614,7 @@ __cold MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /*---------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +__cold MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag) { #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -743,7 +761,7 @@ bailout: #endif /* MDBX_LOCKING > 0 */ } -__cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, +__cold static int mdbx_ipclock_failed(MDBX_env *env, osal_ipclock_t *ipc, const int err) { int rc = err; #if MDBX_LOCKING == MDBX_LOCKING_POSIX2008 || MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -760,10 +778,10 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, rc = MDBX_PANIC; } } - mdbx_warning("%clock owner died, %s", (rlocked ? 'r' : 'w'), - (rc ? "this process' env is hosed" : "recovering")); + WARNING("%clock owner died, %s", (rlocked ? 'r' : 'w'), + (rc ? "this process' env is hosed" : "recovering")); - int check_rc = mdbx_cleanup_dead_readers(env, rlocked, NULL); + int check_rc = cleanup_dead_readers(env, rlocked, NULL); check_rc = (check_rc == MDBX_SUCCESS) ? MDBX_RESULT_TRUE : check_rc; #if MDBX_LOCKING == MDBX_LOCKING_SYSV @@ -781,7 +799,7 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, check_rc = (mreco_rc == 0) ? check_rc : mreco_rc; if (unlikely(mreco_rc)) - mdbx_error("lock recovery failed, %s", mdbx_strerror(mreco_rc)); + ERROR("lock recovery failed, %s", mdbx_strerror(mreco_rc)); rc = (rc == MDBX_SUCCESS) ? 
check_rc : rc; if (MDBX_IS_ERROR(rc)) @@ -804,24 +822,24 @@ __cold static int mdbx_ipclock_failed(MDBX_env *env, mdbx_ipclock_t *ipc, #error "FIXME" #endif /* MDBX_LOCKING */ - mdbx_error("mutex (un)lock failed, %s", mdbx_strerror(err)); + ERROR("mutex (un)lock failed, %s", mdbx_strerror(err)); if (rc != EDEADLK) env->me_flags |= MDBX_FATAL_ERROR; return rc; } #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void) { +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void) { /* avoid 32-bit Bionic bug/hang with 32-pit TID */ if (sizeof(pthread_mutex_t) < sizeof(pid_t) + sizeof(unsigned)) { pid_t tid = gettid(); if (unlikely(tid > 0xffff)) { - mdbx_fatal("Raise the ENOSYS(%d) error to avoid hang due " - "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " - "that don’t fit in 16 bits, see " - "https://android.googlesource.com/platform/bionic/+/master/" - "docs/32-bit-abi.md#is-too-small-for-large-pids", - ENOSYS, tid, tid); + FATAL("Raise the ENOSYS(%d) error to avoid hang due " + "the 32-bit Bionic/Android bug with tid/thread_id 0x%08x(%i) " + "that don’t fit in 16 bits, see " + "https://android.googlesource.com/platform/bionic/+/master/" + "docs/32-bit-abi.md#is-too-small-for-large-pids", + ENOSYS, tid, tid); return ENOSYS; } } @@ -829,11 +847,11 @@ MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void) { } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ -static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, +static int mdbx_ipclock_lock(MDBX_env *env, osal_ipclock_t *ipc, const bool dont_wait) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 - int rc = mdbx_check_tid4bionic(); + int rc = osal_check_tid4bionic(); if (likely(rc == 0)) rc = dont_wait ? pthread_mutex_trylock(ipc) : pthread_mutex_lock(ipc); rc = (rc == EBUSY && dont_wait) ? 
MDBX_BUSY : rc; @@ -869,7 +887,7 @@ static int mdbx_ipclock_lock(MDBX_env *env, mdbx_ipclock_t *ipc, return rc; } -static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { +static int mdbx_ipclock_unlock(MDBX_env *env, osal_ipclock_t *ipc) { #if MDBX_LOCKING == MDBX_LOCKING_POSIX2001 || \ MDBX_LOCKING == MDBX_LOCKING_POSIX2008 int rc = pthread_mutex_unlock(ipc); @@ -891,38 +909,38 @@ static int mdbx_ipclock_unlock(MDBX_env *env, mdbx_ipclock_t *ipc) { return rc; } -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_trace("%s", ">>"); - mdbx_jitter4testing(true); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { + TRACE("%s", ">>"); + jitter4testing(true); int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_rlock, false); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); return rc; } -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { - mdbx_trace("%s", ">>"); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { + TRACE("%s", ">>"); int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_rlock); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: err %d\n", __func__, rc); - mdbx_jitter4testing(true); + jitter4testing(true); } int mdbx_txn_lock(MDBX_env *env, bool dont_wait) { - mdbx_trace("%swait %s", dont_wait ? "dont-" : "", ">>"); - mdbx_jitter4testing(true); + TRACE("%swait %s", dont_wait ? "dont-" : "", ">>"); + jitter4testing(true); int rc = mdbx_ipclock_lock(env, &env->me_lck->mti_wlock, dont_wait); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); return MDBX_IS_ERROR(rc) ? 
rc : MDBX_SUCCESS; } void mdbx_txn_unlock(MDBX_env *env) { - mdbx_trace("%s", ">>"); + TRACE("%s", ">>"); int rc = mdbx_ipclock_unlock(env, &env->me_lck->mti_wlock); - mdbx_trace("<< rc %d", rc); + TRACE("<< rc %d", rc); if (unlikely(rc != MDBX_SUCCESS)) mdbx_panic("%s() failed: err %d\n", __func__, rc); - mdbx_jitter4testing(true); + jitter4testing(true); } #else diff --git a/src/lck-windows.c b/src/lck-windows.c index 0baac86f..8ffccb1b 100644 --- a/src/lck-windows.c +++ b/src/lck-windows.c @@ -1,5 +1,5 @@ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -48,16 +48,16 @@ static switch (reason) { case DLL_PROCESS_ATTACH: mdbx_winnt_import(); - mdbx_rthc_global_init(); + global_ctor(); break; case DLL_PROCESS_DETACH: - mdbx_rthc_global_dtor(); + global_dtor(); break; case DLL_THREAD_ATTACH: break; case DLL_THREAD_DETACH: - mdbx_rthc_thread_dtor(module); + thread_dtor(module); break; } #if MDBX_BUILD_SHARED_LIBRARY @@ -112,32 +112,71 @@ static #define LCK_WAITFOR 0 #define LCK_DONTWAIT LOCKFILE_FAIL_IMMEDIATELY -static __inline BOOL flock(mdbx_filehandle_t fd, DWORD flags, uint64_t offset, - size_t bytes) { +static int flock_with_event(HANDLE fd, HANDLE event, unsigned flags, + size_t offset, size_t bytes) { + TRACE("lock>>: fd %p, event %p, flags 0x%x offset %zu, bytes %zu >>", fd, + event, flags, offset, bytes); OVERLAPPED ov; - ov.hEvent = 0; + ov.Internal = 0; + ov.InternalHigh = 0; + ov.hEvent = event; ov.Offset = (DWORD)offset; ov.OffsetHigh = HIGH_DWORD(offset); - return LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov); + if (LockFileEx(fd, flags, 0, (DWORD)bytes, HIGH_DWORD(bytes), &ov)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", fd, + event, flags, offset, bytes, "done"); + return MDBX_SUCCESS; + } + + DWORD rc = GetLastError(); + if (rc == ERROR_IO_PENDING) { + if (event) { + if 
(GetOverlappedResult(fd, &ov, &rc, true)) { + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << %s", + fd, event, flags, offset, bytes, "overlapped-done"); + return MDBX_SUCCESS; + } + rc = GetLastError(); + } else + CancelIo(fd); + } + TRACE("lock<<: fd %p, event %p, flags 0x%x offset %zu, bytes %zu << err %d", + fd, event, flags, offset, bytes, (int)rc); + return (int)rc; } -static __inline BOOL funlock(mdbx_filehandle_t fd, uint64_t offset, - size_t bytes) { +static __inline int flock(HANDLE fd, unsigned flags, size_t offset, + size_t bytes) { + return flock_with_event(fd, 0, flags, offset, bytes); +} + +static __inline int flock_data(const MDBX_env *env, unsigned flags, + size_t offset, size_t bytes) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + return flock_with_event(fd4data, env->me_data_lock_event, flags, offset, + bytes); +} + +static int funlock(mdbx_filehandle_t fd, size_t offset, size_t bytes) { + TRACE("unlock: fd %p, offset %zu, bytes %zu", fd, offset, bytes); return UnlockFile(fd, (DWORD)offset, HIGH_DWORD(offset), (DWORD)bytes, - HIGH_DWORD(bytes)); + HIGH_DWORD(bytes)) + ? 
MDBX_SUCCESS + : (int)GetLastError(); } /*----------------------------------------------------------------------------*/ /* global `write` lock for write-txt processing, * exclusive locking both meta-pages) */ -#define LCK_MAXLEN (1u + ((~(size_t)0) >> 1)) -#define LCK_META_OFFSET 0 -#define LCK_META_LEN (MAX_PAGESIZE * NUM_METAS) -#define LCK_BODY_OFFSET LCK_META_LEN -#define LCK_BODY_LEN (LCK_MAXLEN - LCK_BODY_OFFSET) -#define LCK_BODY LCK_BODY_OFFSET, LCK_BODY_LEN -#define LCK_WHOLE 0, LCK_MAXLEN +#ifdef _WIN64 +#define DXB_MAXLEN UINT64_C(0x7fffFFFFfff00000) +#else +#define DXB_MAXLEN UINT32_C(0x7ff00000) +#endif +#define DXB_BODY (env->me_psize * (size_t)NUM_METAS), DXB_MAXLEN +#define DXB_WHOLE 0, DXB_MAXLEN int mdbx_txn_lock(MDBX_env *env, bool dontwait) { if (dontwait) { @@ -155,38 +194,49 @@ int mdbx_txn_lock(MDBX_env *env, bool dontwait) { } } - if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_lazy_fd, - dontwait ? (LCK_EXCLUSIVE | LCK_DONTWAIT) - : (LCK_EXCLUSIVE | LCK_WAITFOR), - LCK_BODY)) + if (env->me_flags & MDBX_EXCLUSIVE) { + /* Zap: Failing to release lock 'env->me_windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); return MDBX_SUCCESS; + } - int rc = (int)GetLastError(); + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + int rc = flock_with_event(fd4data, env->me_data_lock_event, + dontwait ? 
(LCK_EXCLUSIVE | LCK_DONTWAIT) + : (LCK_EXCLUSIVE | LCK_WAITFOR), + DXB_BODY); if (rc == ERROR_LOCK_VIOLATION && dontwait) { SleepEx(0, true); - if (flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) - return MDBX_SUCCESS; - rc = (int)GetLastError(); + rc = flock_with_event(fd4data, env->me_data_lock_event, + LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); if (rc == ERROR_LOCK_VIOLATION) { SleepEx(0, true); - if (flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) - return MDBX_SUCCESS; - rc = (int)GetLastError(); + rc = flock_with_event(fd4data, env->me_data_lock_event, + LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_BODY); } } + if (rc == MDBX_SUCCESS) { + /* Zap: Failing to release lock 'env->me_windowsbug_lock' + * in function 'mdbx_txn_lock' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(26115); + return rc; + } LeaveCriticalSection(&env->me_windowsbug_lock); return (!dontwait || rc != ERROR_LOCK_VIOLATION) ? rc : MDBX_BUSY; } void mdbx_txn_unlock(MDBX_env *env) { - int rc = (env->me_flags & MDBX_EXCLUSIVE) - ? TRUE - : funlock(env->me_lazy_fd, LCK_BODY); + if ((env->me_flags & MDBX_EXCLUSIVE) == 0) { + const HANDLE fd4data = + env->me_overlapped_fd ? 
env->me_overlapped_fd : env->me_lazy_fd; + int err = funlock(fd4data, DXB_BODY); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); + } LeaveCriticalSection(&env->me_windowsbug_lock); - if (!rc) - mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); } /*----------------------------------------------------------------------------*/ @@ -200,56 +250,57 @@ void mdbx_txn_unlock(MDBX_env *env) { #define LCK_LOWER LCK_LO_OFFSET, LCK_LO_LEN #define LCK_UPPER LCK_UP_OFFSET, LCK_UP_LEN -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env) { - mdbx_srwlock_AcquireShared(&env->me_remap_guard); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env) { + osal_srwlock_AcquireShared(&env->me_remap_guard); if (env->me_lfd == INVALID_HANDLE_VALUE) return MDBX_SUCCESS; /* readonly database in readonly filesystem */ /* transition from S-? (used) to S-E (locked), * e.g. exclusive lock upper-part */ - if ((env->me_flags & MDBX_EXCLUSIVE) || - flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) + if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS; - int rc = (int)GetLastError(); - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc == MDBX_SUCCESS) + return MDBX_SUCCESS; + + osal_srwlock_ReleaseShared(&env->me_remap_guard); return rc; } -MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env) { - if (env->me_lfd != INVALID_HANDLE_VALUE) { +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env) { + if (env->me_lfd != INVALID_HANDLE_VALUE && + (env->me_flags & MDBX_EXCLUSIVE) == 0) { /* transition from S-E (locked) to S-? (used), e.g. 
unlock upper-part */ - if ((env->me_flags & MDBX_EXCLUSIVE) == 0 && - !funlock(env->me_lfd, LCK_UPPER)) - mdbx_panic("%s failed: err %u", __func__, (int)GetLastError()); + int err = funlock(env->me_lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) + mdbx_panic("%s failed: err %u", __func__, err); } - mdbx_srwlock_ReleaseShared(&env->me_remap_guard); + osal_srwlock_ReleaseShared(&env->me_remap_guard); } -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait) { - return flock(fd, - wait ? LCK_EXCLUSIVE | LCK_WAITFOR - : LCK_EXCLUSIVE | LCK_DONTWAIT, - 0, LCK_MAXLEN) - ? MDBX_SUCCESS - : (int)GetLastError(); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait) { + return flock( + fd, wait ? LCK_EXCLUSIVE | LCK_WAITFOR : LCK_EXCLUSIVE | LCK_DONTWAIT, 0, + DXB_MAXLEN); } static int suspend_and_append(mdbx_handle_array_t **array, const DWORD ThreadId) { const unsigned limit = (*array)->limit; if ((*array)->count == limit) { - void *ptr = mdbx_realloc( - (limit > ARRAY_LENGTH((*array)->handles)) - ? *array - : /* don't free initial array on the stack */ NULL, - sizeof(mdbx_handle_array_t) + - sizeof(HANDLE) * (limit * 2 - ARRAY_LENGTH((*array)->handles))); + mdbx_handle_array_t *const ptr = + osal_realloc((limit > ARRAY_LENGTH((*array)->handles)) + ? 
*array + : /* don't free initial array on the stack */ NULL, + sizeof(mdbx_handle_array_t) + + sizeof(HANDLE) * (limit * (size_t)2 - + ARRAY_LENGTH((*array)->handles))); if (!ptr) return MDBX_ENOMEM; if (limit == ARRAY_LENGTH((*array)->handles)) - memcpy(ptr, *array, sizeof(mdbx_handle_array_t)); - *array = (mdbx_handle_array_t *)ptr; + *ptr = **array; + *array = ptr; (*array)->limit = limit * 2; } @@ -273,8 +324,8 @@ static int suspend_and_append(mdbx_handle_array_t **array, } MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { - mdbx_assert(env, (env->me_flags & MDBX_NOTLS) == 0); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { + eASSERT(env, (env->me_flags & MDBX_NOTLS) == 0); const uintptr_t CurrentTid = GetCurrentThreadId(); int rc; if (env->me_lck_mmap.lck) { @@ -296,7 +347,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { rc = suspend_and_append(array, (mdbx_tid_t)reader->mr_tid.weak); if (rc != MDBX_SUCCESS) { bailout_lck: - (void)mdbx_resume_threads_after_remap(*array); + (void)osal_resume_threads_after_remap(*array); return rc; } } @@ -308,7 +359,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { } else { /* Without LCK (i.e. read-only mode). 
* Walk through a snapshot of all running threads */ - mdbx_assert(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); + eASSERT(env, env->me_flags & (MDBX_EXCLUSIVE | MDBX_RDONLY)); const HANDLE hSnapshot = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0); if (hSnapshot == INVALID_HANDLE_VALUE) return (int)GetLastError(); @@ -320,7 +371,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { rc = (int)GetLastError(); bailout_toolhelp: CloseHandle(hSnapshot); - (void)mdbx_resume_threads_after_remap(*array); + (void)osal_resume_threads_after_remap(*array); return rc; } @@ -345,7 +396,7 @@ mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array) { } MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { +osal_resume_threads_after_remap(mdbx_handle_array_t *array) { int rc = MDBX_SUCCESS; for (unsigned i = 0; i < array->count; ++i) { const HANDLE hThread = array->handles[i]; @@ -384,11 +435,11 @@ mdbx_resume_threads_after_remap(mdbx_handle_array_t *array) { * E-S * E-E = exclusive-write, i.e. exclusive due (re)initialization * - * The mdbx_lck_seize() moves the locking-FSM from the initial free/unlocked + * The osal_lck_seize() moves the locking-FSM from the initial free/unlocked * state to the "exclusive write" (and returns MDBX_RESULT_TRUE) if possible, * or to the "used" (and returns MDBX_RESULT_FALSE). * - * The mdbx_lck_downgrade() moves the locking-FSM from "exclusive write" + * The osal_lck_downgrade() moves the locking-FSM from "exclusive write" * state to the "used" (i.e. shared) state. * * The mdbx_lck_upgrade() moves the locking-FSM from "used" (i.e. 
shared) @@ -400,40 +451,38 @@ static void lck_unlock(MDBX_env *env) { if (env->me_lfd != INVALID_HANDLE_VALUE) { /* double `unlock` for robustly remove overlapped shared/exclusive locks */ - while (funlock(env->me_lfd, LCK_LOWER)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_lfd, LCK_LOWER); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_lfd, LCK_UPPER)) - ; - err = (int)GetLastError(); + do + err = funlock(env->me_lfd, LCK_UPPER); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); } - if (env->me_lazy_fd != INVALID_HANDLE_VALUE) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + if (fd4data != INVALID_HANDLE_VALUE) { /* explicitly unlock to avoid latency for other processes (windows kernel * releases such locks via deferred queues) */ - while (funlock(env->me_lazy_fd, LCK_BODY)) - ; - err = (int)GetLastError(); + do + err = funlock(fd4data, DXB_BODY); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); - while (funlock(env->me_lazy_fd, LCK_WHOLE)) - ; - err = (int)GetLastError(); + do + err = funlock(fd4data, DXB_WHOLE); + while (err == MDBX_SUCCESS); assert(err == ERROR_NOT_LOCKED || (mdbx_RunningUnderWine() && err == ERROR_LOCK_VIOLATION)); - (void)err; SetLastError(ERROR_SUCCESS); } } @@ -442,56 +491,57 @@ static void lck_unlock(MDBX_env *env) { * or as 'used' (S-? and returns MDBX_RESULT_FALSE). * Otherwise returns an error. */ static int internal_seize_lck(HANDLE lfd) { - int rc; assert(lfd != INVALID_HANDLE_VALUE); /* 1) now on ?-? 
(free), get ?-E (middle) */ - mdbx_jitter4testing(false); - if (!flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER)) { - rc = (int)GetLastError() /* 2) something went wrong, give up */; - mdbx_error("%s, err %u", "?-?(free) >> ?-E(middle)", rc); + jitter4testing(false); + int rc = flock(lfd, LCK_EXCLUSIVE | LCK_WAITFOR, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; + ERROR("%s, err %u", "?-?(free) >> ?-E(middle)", rc); return rc; } /* 3) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); - if (flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) + jitter4testing(false); + rc = flock(lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc == MDBX_SUCCESS) return MDBX_RESULT_TRUE /* 4) got E-E (exclusive-write), done */; /* 5) still on ?-E (middle) */ - rc = (int)GetLastError(); - mdbx_jitter4testing(false); + jitter4testing(false); if (rc != ERROR_SHARING_VIOLATION && rc != ERROR_LOCK_VIOLATION) { /* 6) something went wrong, give up */ - if (!funlock(lfd, LCK_UPPER)) + rc = funlock(lfd, LCK_UPPER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "?-E(middle) >> ?-?(free)", - (int)GetLastError()); + rc); return rc; } /* 7) still on ?-E (middle), try S-E (locked) */ - mdbx_jitter4testing(false); - rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER) ? MDBX_RESULT_FALSE - : (int)GetLastError(); + jitter4testing(false); + rc = flock(lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); - mdbx_jitter4testing(false); - if (rc != MDBX_RESULT_FALSE) - mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + jitter4testing(false); + if (rc != MDBX_SUCCESS) + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); /* 8) now on S-E (locked) or still on ?-E (middle), * transition to S-? (used) or ?-? 
(free) */ - if (!funlock(lfd, LCK_UPPER)) + int err = funlock(lfd, LCK_UPPER); + if (err != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, - "X-E(locked/middle) >> X-?(used/free)", (int)GetLastError()); + "X-E(locked/middle) >> X-?(used/free)", err); /* 9) now on S-? (used, DONE) or ?-? (free, FAILURE) */ return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { - int rc; - - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; + assert(fd4data != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_RESULT_TRUE /* nope since files were must be opened non-shareable */ @@ -499,17 +549,15 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { if (env->me_lfd == INVALID_HANDLE_VALUE) { /* LY: without-lck mode (e.g. on read-only filesystem) */ - mdbx_jitter4testing(false); - if (!flock(env->me_lazy_fd, LCK_SHARED | LCK_DONTWAIT, LCK_WHOLE)) { - rc = (int)GetLastError(); - mdbx_error("%s, err %u", "without-lck", rc); - return rc; - } - return MDBX_RESULT_FALSE; + jitter4testing(false); + int rc = flock_data(env, LCK_SHARED | LCK_DONTWAIT, DXB_WHOLE); + if (rc != MDBX_SUCCESS) + ERROR("%s, err %u", "without-lck", rc); + return rc; } - rc = internal_seize_lck(env->me_lfd); - mdbx_jitter4testing(false); + int rc = internal_seize_lck(env->me_lfd); + jitter4testing(false); if (rc == MDBX_RESULT_TRUE && (env->me_flags & MDBX_RDONLY) == 0) { /* Check that another process don't operates in without-lck mode. * Doing such check by exclusive locking the body-part of db. Should be @@ -517,46 +565,52 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env) { * - we need an exclusive lock for do so; * - we can't lock meta-pages, otherwise other process could get an error * while opening db in valid (non-conflict) mode. 
*/ - if (!flock(env->me_lazy_fd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_BODY)) { - rc = (int)GetLastError(); - mdbx_error("%s, err %u", "lock-against-without-lck", rc); - mdbx_jitter4testing(false); + int err = flock_data(env, LCK_EXCLUSIVE | LCK_DONTWAIT, DXB_WHOLE); + if (err != MDBX_SUCCESS) { + ERROR("%s, err %u", "lock-against-without-lck", err); + jitter4testing(false); lck_unlock(env); - } else { - mdbx_jitter4testing(false); - if (!funlock(env->me_lazy_fd, LCK_BODY)) - mdbx_panic("%s(%s) failed: err %u", __func__, - "unlock-against-without-lck", (int)GetLastError()); + return err; } + jitter4testing(false); + err = funlock(fd4data, DXB_WHOLE); + if (err != MDBX_SUCCESS) + mdbx_panic("%s(%s) failed: err %u", __func__, + "unlock-against-without-lck", err); } return rc; } -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env) { + const HANDLE fd4data = + env->me_overlapped_fd ? env->me_overlapped_fd : env->me_lazy_fd; /* Transite from exclusive-write state (E-E) to used (S-?) 
*/ - assert(env->me_lazy_fd != INVALID_HANDLE_VALUE); + assert(fd4data != INVALID_HANDLE_VALUE); assert(env->me_lfd != INVALID_HANDLE_VALUE); if (env->me_flags & MDBX_EXCLUSIVE) return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; /* 1) now at E-E (exclusive-write), transition to ?_E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) + int rc = funlock(env->me_lfd, LCK_LOWER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, - "E-E(exclusive-write) >> ?-E(middle)", (int)GetLastError()); + "E-E(exclusive-write) >> ?-E(middle)", rc); /* 2) now at ?-E (middle), transition to S-E (locked) */ - if (!flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER)) { - int rc = (int)GetLastError() /* 3) something went wrong, give up */; - mdbx_error("%s, err %u", "?-E(middle) >> S-E(locked)", rc); + rc = flock(env->me_lfd, LCK_SHARED | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 3) something went wrong, give up */; + ERROR("%s, err %u", "?-E(middle) >> S-E(locked)", rc); return rc; } /* 4) got S-E (locked), continue transition to S-? (used) */ - if (!funlock(env->me_lfd, LCK_UPPER)) + rc = funlock(env->me_lfd, LCK_UPPER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> S-?(used)", - (int)GetLastError()); + rc); return MDBX_SUCCESS /* 5) now at S-? (used), done */; } @@ -569,53 +623,72 @@ MDBX_INTERNAL_FUNC int mdbx_lck_upgrade(MDBX_env *env) { return MDBX_SUCCESS /* nope since files were must be opened non-shareable */ ; - int rc; /* 1) now on S-? 
(used), try S-E (locked) */ - mdbx_jitter4testing(false); - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER)) { - rc = (int)GetLastError() /* 2) something went wrong, give up */; - mdbx_verbose("%s, err %u", "S-?(used) >> S-E(locked)", rc); + jitter4testing(false); + int rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_UPPER); + if (rc != MDBX_SUCCESS) { + /* 2) something went wrong, give up */; + VERBOSE("%s, err %u", "S-?(used) >> S-E(locked)", rc); return rc; } /* 3) now on S-E (locked), transition to ?-E (middle) */ - if (!funlock(env->me_lfd, LCK_LOWER)) + rc = funlock(env->me_lfd, LCK_LOWER); + if (rc != MDBX_SUCCESS) mdbx_panic("%s(%s) failed: err %u", __func__, "S-E(locked) >> ?-E(middle)", - (int)GetLastError()); + rc); /* 4) now on ?-E (middle), try E-E (exclusive-write) */ - mdbx_jitter4testing(false); - if (!flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER)) { - rc = (int)GetLastError() /* 5) something went wrong, give up */; - mdbx_verbose("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); + jitter4testing(false); + rc = flock(env->me_lfd, LCK_EXCLUSIVE | LCK_DONTWAIT, LCK_LOWER); + if (rc != MDBX_SUCCESS) { + /* 5) something went wrong, give up */; + VERBOSE("%s, err %u", "?-E(middle) >> E-E(exclusive-write)", rc); return rc; } return MDBX_SUCCESS /* 6) now at E-E (exclusive-write), done */; } -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag) { (void)env; (void)inprocess_neighbor; (void)global_uniqueness_flag; + if (mdbx_SetFileIoOverlappedRange && !(env->me_flags & MDBX_RDONLY)) { + HANDLE token = INVALID_HANDLE_VALUE; + TOKEN_PRIVILEGES privileges; + privileges.PrivilegeCount = 1; + privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (!OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, + &token) || + !LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, + 
&privileges.Privileges[0].Luid) || + !AdjustTokenPrivileges(token, FALSE, &privileges, sizeof(privileges), + nullptr, nullptr) || + GetLastError() != ERROR_SUCCESS) + mdbx_SetFileIoOverlappedRange = NULL; + + if (token != INVALID_HANDLE_VALUE) + CloseHandle(token); + } return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor) { /* LY: should unmap before releasing the locks to avoid race condition and * STATUS_USER_MAPPED_FILE/ERROR_USER_MAPPED_FILE */ if (env->me_map) - mdbx_munmap(&env->me_dxb_mmap); + osal_munmap(&env->me_dxb_mmap); if (env->me_lck_mmap.lck) { const bool synced = env->me_lck_mmap.lck->mti_unsynced_pages.weak == 0; - mdbx_munmap(&env->me_lck_mmap); + osal_munmap(&env->me_lck_mmap); if (synced && !inprocess_neighbor && env->me_lfd != INVALID_HANDLE_VALUE && mdbx_lck_upgrade(env) == MDBX_SUCCESS) /* this will fail if LCK is used/mmapped by other process(es) */ - mdbx_ftruncate(env->me_lfd, 0); + osal_ftruncate(env->me_lfd, 0); } lck_unlock(env); return MDBX_SUCCESS; @@ -624,12 +697,12 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /*----------------------------------------------------------------------------*/ /* reader checking (by pid) */ -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env) { (void)env; return MDBX_SUCCESS; } @@ -640,7 +713,7 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env) { * MDBX_RESULT_TRUE, if pid is live (unable to acquire lock) * MDBX_RESULT_FALSE, if pid is dead (lock acquired) * or otherwise the errcode. 
*/ -MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid) { (void)env; HANDLE hProcess = OpenProcess(SYNCHRONIZE, FALSE, pid); int rc; @@ -677,18 +750,18 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid) { // Stub for slim read-write lock // Copyright (C) 1995-2002 Brad Wilson -static void WINAPI stub_srwlock_Init(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_Init(osal_srwlock_t *srwl) { srwl->readerCount = srwl->writerCount = 0; } -static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_AcquireShared(osal_srwlock_t *srwl) { while (true) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); // If there's a writer already, spin without unnecessarily // interlocking the CPUs if (srwl->writerCount != 0) { - YieldProcessor(); + SwitchToThread(); continue; } @@ -702,23 +775,23 @@ static void WINAPI stub_srwlock_AcquireShared(MDBX_srwlock *srwl) { // Remove from the readers list, spin, try again _InterlockedDecrement(&srwl->readerCount); - YieldProcessor(); + SwitchToThread(); } } -static void WINAPI stub_srwlock_ReleaseShared(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_ReleaseShared(osal_srwlock_t *srwl) { assert(srwl->readerCount > 0); _InterlockedDecrement(&srwl->readerCount); } -static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_AcquireExclusive(osal_srwlock_t *srwl) { while (true) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); // If there's a writer already, spin without unnecessarily // interlocking the CPUs if (srwl->writerCount != 0) { - YieldProcessor(); + SwitchToThread(); continue; } @@ -733,11 +806,11 @@ static void WINAPI stub_srwlock_AcquireExclusive(MDBX_srwlock *srwl) { // that we're the writer. 
while (srwl->readerCount != 0) { assert(srwl->writerCount >= 0 && srwl->readerCount >= 0); - YieldProcessor(); + SwitchToThread(); } } -static void WINAPI stub_srwlock_ReleaseExclusive(MDBX_srwlock *srwl) { +static void WINAPI stub_srwlock_ReleaseExclusive(osal_srwlock_t *srwl) { assert(srwl->writerCount == 1 && srwl->readerCount >= 0); srwl->writerCount = 0; } @@ -753,9 +826,9 @@ static uint64_t WINAPI stub_GetTickCount64(void) { /*----------------------------------------------------------------------------*/ #ifndef xMDBX_ALLOY -MDBX_srwlock_function mdbx_srwlock_Init, mdbx_srwlock_AcquireShared, - mdbx_srwlock_ReleaseShared, mdbx_srwlock_AcquireExclusive, - mdbx_srwlock_ReleaseExclusive; +osal_srwlock_t_function osal_srwlock_Init, osal_srwlock_AcquireShared, + osal_srwlock_ReleaseShared, osal_srwlock_AcquireExclusive, + osal_srwlock_ReleaseExclusive; MDBX_NtExtendSection mdbx_NtExtendSection; MDBX_GetFileInformationByHandleEx mdbx_GetFileInformationByHandleEx; @@ -766,6 +839,7 @@ MDBX_NtFsControlFile mdbx_NtFsControlFile; MDBX_PrefetchVirtualMemory mdbx_PrefetchVirtualMemory; MDBX_GetTickCount64 mdbx_GetTickCount64; MDBX_RegGetValueA mdbx_RegGetValueA; +MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; #endif /* xMDBX_ALLOY */ #if __GNUC_PREREQ(8, 0) @@ -774,54 +848,63 @@ MDBX_RegGetValueA mdbx_RegGetValueA; #endif /* GCC/MINGW */ static void mdbx_winnt_import(void) { - const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); - #define GET_PROC_ADDR(dll, ENTRY) \ mdbx_##ENTRY = (MDBX_##ENTRY)GetProcAddress(dll, #ENTRY) - if (GetProcAddress(hNtdll, "wine_get_version")) { - assert(mdbx_RunningUnderWine()); - } else { - GET_PROC_ADDR(hNtdll, NtFsControlFile); - GET_PROC_ADDR(hNtdll, NtExtendSection); - assert(!mdbx_RunningUnderWine()); + const HINSTANCE hNtdll = GetModuleHandleA("ntdll.dll"); + if (hNtdll) { + if (GetProcAddress(hNtdll, "wine_get_version")) { + assert(mdbx_RunningUnderWine()); + } else { + GET_PROC_ADDR(hNtdll, NtFsControlFile); + 
GET_PROC_ADDR(hNtdll, NtExtendSection); + assert(!mdbx_RunningUnderWine()); + } } const HINSTANCE hKernel32dll = GetModuleHandleA("kernel32.dll"); - GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); - GET_PROC_ADDR(hKernel32dll, GetTickCount64); - if (!mdbx_GetTickCount64) - mdbx_GetTickCount64 = stub_GetTickCount64; - if (!mdbx_RunningUnderWine()) { - GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); - GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); - GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); - GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); + if (hKernel32dll) { + GET_PROC_ADDR(hKernel32dll, GetFileInformationByHandleEx); + GET_PROC_ADDR(hKernel32dll, GetTickCount64); + if (!mdbx_GetTickCount64) + mdbx_GetTickCount64 = stub_GetTickCount64; + if (!mdbx_RunningUnderWine()) { + GET_PROC_ADDR(hKernel32dll, SetFileInformationByHandle); + GET_PROC_ADDR(hKernel32dll, GetVolumeInformationByHandleW); + GET_PROC_ADDR(hKernel32dll, GetFinalPathNameByHandleW); + GET_PROC_ADDR(hKernel32dll, PrefetchVirtualMemory); + GET_PROC_ADDR(hKernel32dll, SetFileIoOverlappedRange); + } + } + + const osal_srwlock_t_function init = + (osal_srwlock_t_function)(hKernel32dll + ? 
GetProcAddress(hKernel32dll, + "InitializeSRWLock") + : nullptr); + if (init != NULL) { + osal_srwlock_Init = init; + osal_srwlock_AcquireShared = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "AcquireSRWLockShared"); + osal_srwlock_ReleaseShared = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "ReleaseSRWLockShared"); + osal_srwlock_AcquireExclusive = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "AcquireSRWLockExclusive"); + osal_srwlock_ReleaseExclusive = (osal_srwlock_t_function)GetProcAddress( + hKernel32dll, "ReleaseSRWLockExclusive"); + } else { + osal_srwlock_Init = stub_srwlock_Init; + osal_srwlock_AcquireShared = stub_srwlock_AcquireShared; + osal_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; + osal_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; + osal_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; } const HINSTANCE hAdvapi32dll = GetModuleHandleA("advapi32.dll"); - GET_PROC_ADDR(hAdvapi32dll, RegGetValueA); -#undef GET_PROC_ADDR - - const MDBX_srwlock_function init = - (MDBX_srwlock_function)GetProcAddress(hKernel32dll, "InitializeSRWLock"); - if (init != NULL) { - mdbx_srwlock_Init = init; - mdbx_srwlock_AcquireShared = (MDBX_srwlock_function)GetProcAddress( - hKernel32dll, "AcquireSRWLockShared"); - mdbx_srwlock_ReleaseShared = (MDBX_srwlock_function)GetProcAddress( - hKernel32dll, "ReleaseSRWLockShared"); - mdbx_srwlock_AcquireExclusive = (MDBX_srwlock_function)GetProcAddress( - hKernel32dll, "AcquireSRWLockExclusive"); - mdbx_srwlock_ReleaseExclusive = (MDBX_srwlock_function)GetProcAddress( - hKernel32dll, "ReleaseSRWLockExclusive"); - } else { - mdbx_srwlock_Init = stub_srwlock_Init; - mdbx_srwlock_AcquireShared = stub_srwlock_AcquireShared; - mdbx_srwlock_ReleaseShared = stub_srwlock_ReleaseShared; - mdbx_srwlock_AcquireExclusive = stub_srwlock_AcquireExclusive; - mdbx_srwlock_ReleaseExclusive = stub_srwlock_ReleaseExclusive; + if (hAdvapi32dll) { + GET_PROC_ADDR(hAdvapi32dll, 
RegGetValueA); } +#undef GET_PROC_ADDR } #if __GNUC_PREREQ(8, 0) diff --git a/src/man1/mdbx_chk.1 b/src/man1/mdbx_chk.1 index 1b26ed0e..0f5810d4 100644 --- a/src/man1/mdbx_chk.1 +++ b/src/man1/mdbx_chk.1 @@ -1,6 +1,6 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_CHK 1 "2023-02-14" "MDBX 0.11.14" +.TH MDBX_CHK 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_chk \- MDBX checking tool .SH SYNOPSIS @@ -81,6 +81,13 @@ Turn to a specified meta-page on successful check. .BR \-T Turn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK! .TP +.BR \-u +Warms up the DB before checking via notifying OS kernel of subsequent access to the database pages. +.TP +.BR \-U +Warms up the DB before checking, notifying the OS kernel of subsequent access to the database pages, +then forcibly loads ones by sequential access and tries to lock database pages in memory. +.TP .BR \-n Open MDBX environment(s) which do not use subdirectories. This is legacy option. For now MDBX handles this automatically. diff --git a/src/man1/mdbx_copy.1 b/src/man1/mdbx_copy.1 index 7b47805c..729919b6 100644 --- a/src/man1/mdbx_copy.1 +++ b/src/man1/mdbx_copy.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_COPY 1 "2023-02-14" "MDBX 0.11.14" +.TH MDBX_COPY 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_copy \- MDBX environment copy tool .SH SYNOPSIS @@ -45,6 +45,13 @@ or unused pages will be omitted from the copy. This option will slow down the backup process as it is more CPU-intensive. Currently it fails if the environment has suffered a page leak. .TP +.BR \-u +Warms up the DB before copying via notifying OS kernel of subsequent access to the database pages. 
+.TP +.BR \-U +Warms up the DB before copying, notifying the OS kernel of subsequent access to the database pages, +then forcibly loads ones by sequential access and tries to lock database pages in memory. +.TP .BR \-n Open MDBX environment(s) which do not use subdirectories. This is legacy option. For now MDBX handles this automatically. diff --git a/src/man1/mdbx_drop.1 b/src/man1/mdbx_drop.1 index 172e84ed..86dd8666 100644 --- a/src/man1/mdbx_drop.1 +++ b/src/man1/mdbx_drop.1 @@ -1,7 +1,7 @@ -.\" Copyright 2021-2022 Leonid Yuriev . +.\" Copyright 2021-2023 Leonid Yuriev . .\" Copyright 2014-2021 Howard Chu, Symas Corp. All Rights Reserved. .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DROP 1 "2023-02-14" "MDBX 0.11.14" +.TH MDBX_DROP 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_drop \- MDBX database delete tool .SH SYNOPSIS diff --git a/src/man1/mdbx_dump.1 b/src/man1/mdbx_dump.1 index a5a2c060..d6eb9577 100644 --- a/src/man1/mdbx_dump.1 +++ b/src/man1/mdbx_dump.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_DUMP 1 "2023-02-14" "MDBX 0.11.14" +.TH MDBX_DUMP 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_dump \- MDBX environment export tool .SH SYNOPSIS @@ -66,6 +66,13 @@ Dump a specific subdatabase. If no database is specified, only the main database .BR \-r Rescure mode. Ignore some errors to dump corrupted DB. .TP +.BR \-u +Warms up the DB before dumping via notifying OS kernel of subsequent access to the database pages. +.TP +.BR \-U +Warms up the DB before dumping, notifying the OS kernel of subsequent access to the database pages, +then forcibly loads ones by sequential access and tries to lock database pages in memory. +.TP .BR \-n Dump an MDBX database which does not use subdirectories. This is legacy option. 
For now MDBX handles this automatically. diff --git a/src/man1/mdbx_load.1 b/src/man1/mdbx_load.1 index 6a213ee3..798814d9 100644 --- a/src/man1/mdbx_load.1 +++ b/src/man1/mdbx_load.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2014-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_LOAD 1 "2023-02-14" "MDBX 0.11.14" +.TH MDBX_LOAD 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_load \- MDBX environment import tool .SH SYNOPSIS diff --git a/src/man1/mdbx_stat.1 b/src/man1/mdbx_stat.1 index 5c388085..72c15088 100644 --- a/src/man1/mdbx_stat.1 +++ b/src/man1/mdbx_stat.1 @@ -1,8 +1,8 @@ -.\" Copyright 2015-2022 Leonid Yuriev . +.\" Copyright 2015-2023 Leonid Yuriev . .\" Copyright 2012-2015 Howard Chu, Symas Corp. All Rights Reserved. .\" Copyright 2015,2016 Peter-Service R&D LLC . .\" Copying restrictions apply. See COPYRIGHT/LICENSE. -.TH MDBX_STAT 1 "2023-02-14" "MDBX 0.11.14" +.TH MDBX_STAT 1 "2023-03-03" "MDBX 0.12.4" .SH NAME mdbx_stat \- MDBX environment status tool .SH SYNOPSIS diff --git a/src/mdbx.c++ b/src/mdbx.c++ index 76d831b6..590cc07d 100644 --- a/src/mdbx.c++ +++ b/src/mdbx.c++ @@ -1,5 +1,5 @@ // -// Copyright (c) 2020-2022, Leonid Yuriev . +// Copyright (c) 2020-2023, Leonid Yuriev . 
// SPDX-License-Identifier: Apache-2.0 // // Non-inline part of the libmdbx C++ API @@ -14,6 +14,12 @@ #define __USE_MINGW_ANSI_STDIO 1 #endif /* MinGW */ +/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */ +#if defined(_MSC_VER) && defined(__SANITIZE_ADDRESS__) && \ + !defined(_DISABLE_VECTOR_ANNOTATION) +#define _DISABLE_VECTOR_ANNOTATION +#endif /* _DISABLE_VECTOR_ANNOTATION */ + #include "../mdbx.h++" #include "internals.h" @@ -201,41 +207,6 @@ __cold bug::~bug() noexcept {} #endif /* Unused*/ -//------------------------------------------------------------------------------ - -#if defined(_WIN32) || defined(_WIN64) - -std::string w2mb(const std::wstring &in) { - std::string out; - if (!in.empty()) { - const auto out_len = mdbx_w2mb(nullptr, 0, in.data(), in.size()); - if (out_len < 1) - mdbx::error::throw_exception(GetLastError()); - out.append(out_len, '\0'); - if (out_len != mdbx_w2mb(const_cast(out.data()), out_len, in.data(), - in.size())) - mdbx::error::throw_exception(GetLastError()); - } - return out; -} - -std::wstring mb2w(const char *in) { - std::wstring out; - if (in && *in) { - const auto in_len = strlen(in); - const auto out_len = mdbx_mb2w(nullptr, 0, in, in_len); - if (out_len < 1) - mdbx::error::throw_exception(GetLastError()); - out.append(out_len, '\0'); - if (out_len != - mdbx_mb2w(const_cast(out.data()), out_len, in, in_len)) - mdbx::error::throw_exception(GetLastError()); - } - return out; -} - -#endif /* Windows */ - } // namespace //------------------------------------------------------------------------------ @@ -1240,6 +1211,23 @@ env &env::copy(const ::std::string &destination, bool compactify, return copy(destination.c_str(), compactify, force_dynamic_size); } +#if defined(_WIN32) || defined(_WIN64) +env &env::copy(const wchar_t *destination, bool compactify, + bool force_dynamic_size) { + error::success_or_throw( + ::mdbx_env_copyW(handle_, destination, + (compactify ? 
MDBX_CP_COMPACT : MDBX_CP_DEFAULTS) | + (force_dynamic_size ? MDBX_CP_FORCE_DYNAMIC_SIZE + : MDBX_CP_DEFAULTS))); + return *this; +} + +env &env::copy(const ::std::wstring &destination, bool compactify, + bool force_dynamic_size) { + return copy(destination.c_str(), compactify, force_dynamic_size); +} +#endif /* Windows */ + #ifdef MDBX_STD_FILESYSTEM_PATH env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, bool force_dynamic_size) { @@ -1247,20 +1235,15 @@ env &env::copy(const MDBX_STD_FILESYSTEM_PATH &destination, bool compactify, } #endif /* MDBX_STD_FILESYSTEM_PATH */ -#if defined(_WIN32) || defined(_WIN64) -env &env::copy(const ::std::wstring &destination, bool compactify, - bool force_dynamic_size) { - return copy(w2mb(destination), compactify, force_dynamic_size); -} -#endif /* Windows */ - path env::get_path() const { +#if defined(_WIN32) || defined(_WIN64) + const wchar_t *c_wstr; + error::success_or_throw(::mdbx_env_get_pathW(handle_, &c_wstr)); + static_assert(sizeof(path::value_type) == sizeof(wchar_t), "Oops"); + return path(c_wstr); +#else const char *c_str; error::success_or_throw(::mdbx_env_get_path(handle_, &c_str)); -#if defined(_WIN32) || defined(_WIN64) - static_assert(sizeof(path::value_type) == sizeof(wchar_t), "Oops"); - return path(mb2w(c_str)); -#else static_assert(sizeof(path::value_type) == sizeof(char), "Oops"); return path(c_str); #endif @@ -1275,6 +1258,17 @@ bool env::remove(const ::std::string &pathname, const remove_mode mode) { return remove(pathname.c_str(), mode); } +#if defined(_WIN32) || defined(_WIN64) +bool env::remove(const wchar_t *pathname, const remove_mode mode) { + return error::boolean_or_throw( + ::mdbx_env_deleteW(pathname, MDBX_env_delete_mode_t(mode))); +} + +bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { + return remove(pathname.c_str(), mode); +} +#endif /* Windows */ + #ifdef MDBX_STD_FILESYSTEM_PATH bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, 
const remove_mode mode) { @@ -1282,12 +1276,6 @@ bool env::remove(const MDBX_STD_FILESYSTEM_PATH &pathname, } #endif /* MDBX_STD_FILESYSTEM_PATH */ -#if defined(_WIN32) || defined(_WIN64) -bool env::remove(const ::std::wstring &pathname, const remove_mode mode) { - return remove(w2mb(pathname), mode); -} -#endif /* Windows */ - //------------------------------------------------------------------------------ static inline MDBX_env *create_env() { @@ -1360,6 +1348,44 @@ __cold env_managed::env_managed(const ::std::string &pathname, const env::operate_parameters &op, bool accede) : env_managed(pathname.c_str(), cp, op, accede) {} +#if defined(_WIN32) || defined(_WIN64) +__cold env_managed::env_managed(const wchar_t *pathname, + const operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + error::success_or_throw( + ::mdbx_env_openW(handle_, pathname, op.make_flags(accede), 0)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const wchar_t *pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : env_managed(create_env()) { + setup(op.max_maps, op.max_readers); + set_geometry(cp.geometry); + error::success_or_throw(::mdbx_env_openW( + handle_, pathname, op.make_flags(accede, cp.use_subdirectory), + cp.file_mode_bits)); + + if (op.options.nested_write_transactions && + !get_options().nested_write_transactions) + MDBX_CXX20_UNLIKELY error::throw_exception(MDBX_INCOMPATIBLE); +} + +__cold env_managed::env_managed(const ::std::wstring &pathname, + const operate_parameters &op, bool accede) + : env_managed(pathname.c_str(), op, accede) {} + +__cold env_managed::env_managed(const ::std::wstring &pathname, + const env_managed::create_parameters &cp, + const env::operate_parameters &op, bool accede) + : 
env_managed(pathname.c_str(), cp, op, accede) {} +#endif /* Windows */ + #ifdef MDBX_STD_FILESYSTEM_PATH __cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, const operate_parameters &op, bool accede) @@ -1371,17 +1397,6 @@ __cold env_managed::env_managed(const MDBX_STD_FILESYSTEM_PATH &pathname, : env_managed(pathname.native(), cp, op, accede) {} #endif /* MDBX_STD_FILESYSTEM_PATH */ -#if defined(_WIN32) || defined(_WIN64) -__cold env_managed::env_managed(const ::std::wstring &pathname, - const operate_parameters &op, bool accede) - : env_managed(w2mb(pathname), op, accede) {} - -__cold env_managed::env_managed(const ::std::wstring &pathname, - const env_managed::create_parameters &cp, - const env::operate_parameters &op, bool accede) - : env_managed(w2mb(pathname), cp, op, accede) {} -#endif /* Windows */ - //------------------------------------------------------------------------------ txn_managed txn::start_nested() { @@ -1415,6 +1430,15 @@ void txn_managed::commit() { MDBX_CXX20_UNLIKELY err.throw_exception(); } +void txn_managed::commit(commit_latency *latency) { + const error err = + static_cast(::mdbx_txn_commit_ex(handle_, latency)); + if (MDBX_LIKELY(err.code() != MDBX_THREAD_MISMATCH)) + MDBX_CXX20_LIKELY handle_ = nullptr; + if (MDBX_UNLIKELY(err.code() != MDBX_SUCCESS)) + MDBX_CXX20_UNLIKELY err.throw_exception(); +} + //------------------------------------------------------------------------------ bool txn::drop_map(const char *name, bool throw_if_absent) { diff --git a/src/mdbx_chk.c b/src/mdbx_chk.c index 535cb585..a8c97372 100644 --- a/src/mdbx_chk.c +++ b/src/mdbx_chk.c @@ -1,7 +1,7 @@ /* mdbx_chk.c - memory-mapped database check tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -20,9 +20,11 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" +#include + typedef struct flagbit { int bit; const char *name; @@ -71,7 +73,7 @@ static void signal_handler(int sig) { #define EXIT_FAILURE_CHECK_MINOR EXIT_FAILURE typedef struct { - const char *name; + MDBX_val name; struct { uint64_t branch, large_count, large_volume, leaf; uint64_t subleaf_dupsort, leaf_dupfixed, subleaf_dupfixed; @@ -93,7 +95,7 @@ struct { #define dbi_main walk.dbi[MAIN_DBI] #define dbi_meta walk.dbi[CORE_DBS] -int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE; +int envflags = MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION; MDBX_env *env; MDBX_txn *txn; MDBX_envinfo envinfo; @@ -102,7 +104,7 @@ uint64_t total_unused_bytes, reclaimable_pages, gc_pages, alloc_pages, unused_pages, backed_pages; unsigned verbose; bool ignore_wrong_order, quiet, dont_traversal; -const char *only_subdb; +MDBX_val only_subdb; int stuck_meta = -1; struct problem { @@ -125,7 +127,99 @@ static void MDBX_PRINTF_ARGS(1, 2) print(const char *msg, ...) 
{ } } -static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { +static MDBX_val printable_buf; +static void free_printable_buf(void) { osal_free(printable_buf.iov_base); } + +static const char *sdb_name(const MDBX_val *val) { + if (val == MDBX_PGWALK_MAIN) + return "@MAIN"; + if (val == MDBX_PGWALK_GC) + return "@GC"; + if (val == MDBX_PGWALK_META) + return "@META"; + + const unsigned char *const data = val->iov_base; + const size_t len = val->iov_len; + if (data == MDBX_PGWALK_MAIN) + return "@MAIN"; + if (data == MDBX_PGWALK_GC) + return "@GC"; + if (data == MDBX_PGWALK_META) + return "@META"; + + if (!len) + return ""; + if (!data) + return ""; + if (len > 65536) { + static char buf[64]; + /* NOTE: There is MSYS2 MinGW bug if you here got + * the "unknown conversion type character ‘z’ in format [-Werror=format=]" + * https://stackoverflow.com/questions/74504432/whats-the-proper-way-to-tell-mingw-based-gcc-to-use-ansi-stdio-output-on-windo + */ + snprintf(buf, sizeof(buf), "", len); + return buf; + } + + bool printable = true; + bool quoting = false; + size_t xchars = 0; + for (size_t i = 0; i < val->iov_len && printable; ++i) { + quoting |= data[i] != '_' && isalnum(data[i]) == 0; + printable = isprint(data[i]) != 0 || + (data[i] < ' ' && ++xchars < 4 && len > xchars * 4); + } + + size_t need = len + 1; + if (quoting || !printable) + need += len + /* quotes */ 2 + 2 * /* max xchars */ 4; + if (need > printable_buf.iov_len) { + void *ptr = osal_realloc(printable_buf.iov_base, need); + if (!ptr) + return ""; + if (!printable_buf.iov_base) + atexit(free_printable_buf); + printable_buf.iov_base = ptr; + printable_buf.iov_len = need; + } + + char *out = printable_buf.iov_base; + if (!quoting) { + memcpy(out, data, len); + out += len; + } else if (printable) { + *out++ = '\''; + for (size_t i = 0; i < len; ++i) { + if (data[i] < ' ') { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 4); + static const char hex[] = 
"0123456789abcdef"; + out[0] = '\\'; + out[1] = 'x'; + out[2] = hex[data[i] >> 4]; + out[3] = hex[data[i] & 15]; + out += 4; + } else if (strchr("\"'`\\", data[i])) { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 2); + out[0] = '\\'; + out[1] = data[i]; + out += 2; + } else { + assert((char *)printable_buf.iov_base + printable_buf.iov_len > + out + 1); + *out++ = data[i]; + } + } + *out++ = '\''; + } + assert((char *)printable_buf.iov_base + printable_buf.iov_len > out); + *out = 0; + return printable_buf.iov_base; +} + +static void va_log(MDBX_log_level_t level, const char *function, int line, + const char *msg, va_list args) { static const char *const prefixes[] = { "!!!fatal: ", " ! " /* error */, " ~ " /* warning */, " " /* notice */, " // " /* verbose */, " //// " /* debug */, @@ -143,13 +237,20 @@ static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { fflush(nullptr); fputs(prefixes[level], out); vfprintf(out, msg, args); - if (msg[strlen(msg) - 1] != '\n') + + const bool have_lf = msg[strlen(msg) - 1] == '\n'; + if (level == MDBX_LOG_FATAL && function && line) + fprintf(out, have_lf ? " %s(), %u\n" : " (%s:%u)\n", + function + (strncmp(function, "mdbx_", 5) ? 5 : 0), line); + else if (!have_lf) fputc('\n', out); fflush(nullptr); } if (level == MDBX_LOG_FATAL) { +#if !MDBX_DEBUG && !MDBX_FORCE_ASSERTIONS exit(EXIT_FAILURE_MDBX); +#endif abort(); } } @@ -157,7 +258,7 @@ static void va_log(MDBX_log_level_t level, const char *msg, va_list args) { static void MDBX_PRINTF_ARGS(1, 2) error(const char *msg, ...) 
{ va_list args; va_start(args, msg); - va_log(MDBX_LOG_ERROR, msg, args); + va_log(MDBX_LOG_ERROR, nullptr, 0, msg, args); va_end(args); } @@ -166,7 +267,7 @@ static void logger(MDBX_log_level_t level, const char *function, int line, (void)line; (void)function; if (level < MDBX_LOG_EXTRA) - va_log(level, msg, args); + va_log(level, function, line, msg, args); } static int check_user_break(void) { @@ -182,19 +283,17 @@ static int check_user_break(void) { } static void pagemap_cleanup(void) { - for (size_t i = CORE_DBS + /* account pseudo-entry for meta */ 1; - i < ARRAY_LENGTH(walk.dbi); ++i) { - if (walk.dbi[i].name) { - mdbx_free((void *)walk.dbi[i].name); - walk.dbi[i].name = nullptr; - } - } - - mdbx_free(walk.pagemap); + osal_free(walk.pagemap); walk.pagemap = nullptr; } -static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { +static bool eq(const MDBX_val a, const MDBX_val b) { + return a.iov_len == b.iov_len && + (a.iov_base == b.iov_base || a.iov_len == 0 || + !memcmp(a.iov_base, b.iov_base, a.iov_len)); +} + +static walk_dbi_t *pagemap_lookup_dbi(const MDBX_val *dbi_name, bool silent) { static walk_dbi_t *last; if (dbi_name == MDBX_PGWALK_MAIN) @@ -204,24 +303,24 @@ static walk_dbi_t *pagemap_lookup_dbi(const char *dbi_name, bool silent) { if (dbi_name == MDBX_PGWALK_META) return &dbi_meta; - if (last && strcmp(last->name, dbi_name) == 0) + if (last && eq(last->name, *dbi_name)) return last; walk_dbi_t *dbi = walk.dbi + CORE_DBS + /* account pseudo-entry for meta */ 1; - for (; dbi < ARRAY_END(walk.dbi) && dbi->name; ++dbi) { - if (strcmp(dbi->name, dbi_name) == 0) + for (; dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { + if (eq(dbi->name, *dbi_name)) return last = dbi; } if (verbose > 0 && !silent) { - print(" - found '%s' area\n", dbi_name); + print(" - found %s area\n", sdb_name(dbi_name)); fflush(nullptr); } if (dbi == ARRAY_END(walk.dbi)) return nullptr; - dbi->name = mdbx_strdup(dbi_name); + dbi->name = *dbi_name; return 
last = dbi; } @@ -239,7 +338,7 @@ static void MDBX_PRINTF_ARGS(4, 5) break; if (!p) { - p = mdbx_calloc(1, sizeof(*p)); + p = osal_calloc(1, sizeof(*p)); if (unlikely(!p)) return; p->caption = msg; @@ -284,7 +383,7 @@ static size_t problems_pop(struct problem *list) { count += problems_list->count; print("%s%s (%" PRIuPTR ")", i ? ", " : "", problems_list->caption, problems_list->count); - mdbx_free(problems_list); + osal_free(problems_list); problems_list = p; } print("\n"); @@ -296,13 +395,13 @@ static size_t problems_pop(struct problem *list) { } static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, - void *const ctx, const int deep, - const char *const dbi_name_or_tag, const size_t page_size, - const MDBX_page_type_t pagetype, const MDBX_error_t err, - const size_t nentries, const size_t payload_bytes, - const size_t header_bytes, const size_t unused_bytes) { + void *const ctx, const int deep, const MDBX_val *dbi_name, + const size_t page_size, const MDBX_page_type_t pagetype, + const MDBX_error_t err, const size_t nentries, + const size_t payload_bytes, const size_t header_bytes, + const size_t unused_bytes) { (void)ctx; - const bool is_gc_tree = dbi_name_or_tag == MDBX_PGWALK_GC; + const bool is_gc_tree = dbi_name == MDBX_PGWALK_GC; if (deep > 42) { problem_add("deep", deep, "too large", nullptr); data_tree_problems += !is_gc_tree; @@ -310,7 +409,7 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, return MDBX_CORRUPTED /* avoid infinite loop/recursion */; } - walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name_or_tag, false); + walk_dbi_t *dbi = pagemap_lookup_dbi(dbi_name, false); if (!dbi) { data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; @@ -375,14 +474,14 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, } if (pgnumber) { - if (verbose > 3 && (!only_subdb || strcmp(only_subdb, dbi->name) == 0)) { + if (verbose > 3 && (!only_subdb.iov_base || eq(only_subdb, dbi->name))) { if (pgnumber == 1) 
print(" %s-page %" PRIu64, pagetype_caption, pgno); else print(" %s-span %" PRIu64 "[%u]", pagetype_caption, pgno, pgnumber); print(" of %s: header %" PRIiPTR ", %s %" PRIiPTR ", payload %" PRIiPTR ", unused %" PRIiPTR ", deep %i\n", - dbi->name, header_bytes, + sdb_name(&dbi->name), header_bytes, (pagetype == MDBX_page_branch) ? "keys" : "entries", nentries, payload_bytes, unused_bytes, deep); } @@ -400,8 +499,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, walk_dbi_t *coll_dbi = &walk.dbi[walk.pagemap[spanpgno] - 1]; problem_add("page", spanpgno, (branch && coll_dbi == dbi) ? "loop" : "already used", - "%s-page: by %s, deep %i", pagetype_caption, coll_dbi->name, - deep); + "%s-page: by %s, deep %i", pagetype_caption, + sdb_name(&coll_dbi->name), deep); already_used = true; data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; @@ -472,8 +571,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, data_tree_problems += !is_gc_tree; gc_tree_problems += is_gc_tree; } else { - dbi->payload_bytes += payload_bytes + header_bytes; - walk.total_payload_bytes += payload_bytes + header_bytes; + dbi->payload_bytes += (uint64_t)payload_bytes + header_bytes; + walk.total_payload_bytes += (uint64_t)payload_bytes + header_bytes; } } } @@ -483,8 +582,8 @@ static int pgvisitor(const uint64_t pgno, const unsigned pgnumber, typedef int(visitor)(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data); -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, - bool silent); +static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, + visitor *handler); static int handle_userdb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { @@ -521,7 +620,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, number = data->iov_len / sizeof(pgno_t) - 1; } else if (data->iov_len - (number + 1) * sizeof(pgno_t) >= /* LY: allow gap up to one page. 
it is ok - * and better than shink-and-retry inside mdbx_update_gc() */ + * and better than shink-and-retry inside update_gc() */ envinfo.mi_dxb_pagesize) problem_add("entry", txnid, "extra idl space", "%" PRIuSIZE " < %" PRIuSIZE " (minor, not a trouble)", @@ -533,7 +632,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, pgno_t prev = MDBX_PNL_ASCENDING ? NUM_METAS - 1 : txn->mt_next_pgno; pgno_t span = 1; - for (unsigned i = 0; i < number; ++i) { + for (size_t i = 0; i < number; ++i) { if (check_user_break()) return MDBX_EINTR; const pgno_t pgno = iptr[i]; @@ -552,7 +651,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, if (MDBX_PNL_DISORDERED(prev, pgno)) { bad = " [bad sequence]"; problem_add("entry", txnid, "bad sequence", - "%" PRIaPGNO " %c [%u].%" PRIaPGNO, prev, + "%" PRIaPGNO " %c [%zu].%" PRIaPGNO, prev, (prev == pgno) ? '=' : (MDBX_PNL_ASCENDING ? '>' : '<'), i, pgno); } @@ -562,7 +661,7 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, walk.pagemap[pgno] = -1; else if (idx > 0) problem_add("page", pgno, "already used", "by %s", - walk.dbi[idx - 1].name); + sdb_name(&walk.dbi[idx - 1].name)); else problem_add("page", pgno, "already listed in GC", nullptr); } @@ -573,12 +672,12 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, : pgno_sub(pgno, span))) ++span; } - if (verbose > 3 && !only_subdb) { + if (verbose > 3 && !only_subdb.iov_base) { print(" transaction %" PRIaTXN ", %" PRIuPTR " pages, maxspan %" PRIaPGNO "%s\n", txnid, number, span, bad); if (verbose > 4) { - for (unsigned i = 0; i < number; i += span) { + for (size_t i = 0; i < number; i += span) { const pgno_t pgno = iptr[i]; for (span = 1; i + span < number && @@ -600,36 +699,18 @@ static int handle_freedb(const uint64_t record_number, const MDBX_val *key, } static int equal_or_greater(const MDBX_val *a, const MDBX_val *b) { - return (a->iov_len == b->iov_len && - 
memcmp(a->iov_base, b->iov_base, a->iov_len) == 0) - ? 0 - : 1; + return eq(*a, *b) ? 0 : 1; } static int handle_maindb(const uint64_t record_number, const MDBX_val *key, const MDBX_val *data) { - char *name; - int rc; - size_t i; - - name = key->iov_base; - for (i = 0; i < key->iov_len; ++i) { - if (name[i] < ' ') - return handle_userdb(record_number, key, data); + if (data->iov_len == sizeof(MDBX_db)) { + int rc = process_db(~0u, key, handle_userdb); + if (rc != MDBX_INCOMPATIBLE) { + userdb_count++; + return rc; + } } - - name = mdbx_malloc(key->iov_len + 1); - if (unlikely(!name)) - return MDBX_ENOMEM; - memcpy(name, key->iov_base, key->iov_len); - name[key->iov_len] = '\0'; - userdb_count++; - - rc = process_db(~0u, name, handle_userdb, false); - mdbx_free(name); - if (rc != MDBX_INCOMPATIBLE) - return rc; - return handle_userdb(record_number, key, data); } @@ -683,8 +764,8 @@ static const char *db_flags2valuemode(unsigned flags) { } } -static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, - bool silent) { +static int process_db(MDBX_dbi dbi_handle, const MDBX_val *dbi_name, + visitor *handler) { MDBX_cursor *mc; MDBX_stat ms; MDBX_val key, data; @@ -693,18 +774,19 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, int rc, i; struct problem *saved_list; uint64_t problems_count; + const bool second_pass = dbi_handle == MAIN_DBI; uint64_t record_count = 0, dups = 0; uint64_t key_bytes = 0, data_bytes = 0; if ((MDBX_TXN_FINISHED | MDBX_TXN_ERROR) & mdbx_txn_flags(txn)) { - print(" ! abort processing '%s' due to a previous error\n", - dbi_name ? dbi_name : "@MAIN"); + print(" ! abort processing %s due to a previous error\n", + sdb_name(dbi_name)); return MDBX_BAD_TXN; } if (dbi_handle == ~0u) { - rc = mdbx_dbi_open_ex( + rc = mdbx_dbi_open_ex2( txn, dbi_name, MDBX_DB_ACCEDE, &dbi_handle, (dbi_name && ignore_wrong_order) ? equal_or_greater : nullptr, (dbi_name && ignore_wrong_order) ? 
equal_or_greater : nullptr); @@ -712,27 +794,26 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (!dbi_name || rc != MDBX_INCOMPATIBLE) /* LY: mainDB's record is not a user's DB. */ { - error("mdbx_dbi_open('%s') failed, error %d %s\n", - dbi_name ? dbi_name : "main", rc, mdbx_strerror(rc)); + error("mdbx_dbi_open(%s) failed, error %d %s\n", sdb_name(dbi_name), rc, + mdbx_strerror(rc)); } return rc; } } - if (dbi_handle >= CORE_DBS && dbi_name && only_subdb && - strcmp(only_subdb, dbi_name) != 0) { + if (dbi_handle >= CORE_DBS && dbi_name && only_subdb.iov_base && + !eq(only_subdb, *dbi_name)) { if (verbose) { - print("Skip processing '%s'...\n", dbi_name); + print("Skip processing %s...\n", sdb_name(dbi_name)); fflush(nullptr); } skipped_subdb++; return MDBX_SUCCESS; } - if (!silent && verbose) { - print("Processing '%s'...\n", dbi_name ? dbi_name : "@MAIN"); - fflush(nullptr); - } + if (!second_pass && verbose) + print("Processing %s...\n", sdb_name(dbi_name)); + fflush(nullptr); rc = mdbx_dbi_flags(txn, dbi_handle, &flags); if (rc) { @@ -746,7 +827,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, return rc; } - if (!silent && verbose) { + if (!second_pass && verbose) { print(" - key-value kind: %s-key => %s-value", db_flags2keymode(flags), db_flags2valuemode(flags)); if (verbose > 1) { @@ -805,9 +886,9 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, } if (ignore_wrong_order) { /* for debugging with enabled assertions */ - mc->mc_flags |= C_SKIPORD; + mc->mc_checking |= CC_SKIPORD; if (mc->mc_xcursor) - mc->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + mc->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } const size_t maxkeysize = mdbx_env_get_maxkeysize_ex(env, flags); @@ -822,57 +903,75 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, if (rc) goto bailout; - bool bad_key = false; - if (key.iov_len > maxkeysize) { - 
problem_add("entry", record_count, "key length exceeds max-key-size", - "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); - bad_key = true; - } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && - key.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong key length", - "%" PRIuPTR " != 4or8", key.iov_len); - bad_key = true; - } + if (!second_pass) { + bool bad_key = false; + if (key.iov_len > maxkeysize) { + problem_add("entry", record_count, "key length exceeds max-key-size", + "%" PRIuPTR " > %" PRIuPTR, key.iov_len, maxkeysize); + bad_key = true; + } else if ((flags & MDBX_INTEGERKEY) && key.iov_len != sizeof(uint64_t) && + key.iov_len != sizeof(uint32_t)) { + problem_add("entry", record_count, "wrong key length", + "%" PRIuPTR " != 4or8", key.iov_len); + bad_key = true; + } - bool bad_data = false; - if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && - data.iov_len != sizeof(uint32_t)) { - problem_add("entry", record_count, "wrong data length", - "%" PRIuPTR " != 4or8", data.iov_len); - bad_data = true; - } - - if (prev_key.iov_base) { - if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && - prev_data.iov_len != data.iov_len) { - problem_add("entry", record_count, "different data length", - "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, - data.iov_len); + bool bad_data = false; + if ((flags & MDBX_INTEGERDUP) && data.iov_len != sizeof(uint64_t) && + data.iov_len != sizeof(uint32_t)) { + problem_add("entry", record_count, "wrong data length", + "%" PRIuPTR " != 4or8", data.iov_len); bad_data = true; } - if (!bad_key) { - int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); - if (cmp == 0) { - ++dups; - if ((flags & MDBX_DUPSORT) == 0) { - problem_add("entry", record_count, "duplicated entries", nullptr); - if (prev_data.iov_base && data.iov_len == prev_data.iov_len && - memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == 0) { - problem_add("entry", record_count, "complete 
duplicate", nullptr); - } - } else if (!bad_data && prev_data.iov_base) { - cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); - if (cmp == 0) { - problem_add("entry", record_count, "complete duplicate", nullptr); - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, "wrong order of multi-values", - nullptr); - } - } - } else if (cmp < 0 && !ignore_wrong_order) { - problem_add("entry", record_count, "wrong order of entries", nullptr); + if (prev_key.iov_base) { + if (prev_data.iov_base && !bad_data && (flags & MDBX_DUPFIXED) && + prev_data.iov_len != data.iov_len) { + problem_add("entry", record_count, "different data length", + "%" PRIuPTR " != %" PRIuPTR, prev_data.iov_len, + data.iov_len); + bad_data = true; } + + if (!bad_key) { + int cmp = mdbx_cmp(txn, dbi_handle, &key, &prev_key); + if (cmp == 0) { + ++dups; + if ((flags & MDBX_DUPSORT) == 0) { + problem_add("entry", record_count, "duplicated entries", nullptr); + if (prev_data.iov_base && data.iov_len == prev_data.iov_len && + memcmp(data.iov_base, prev_data.iov_base, data.iov_len) == + 0) { + problem_add("entry", record_count, "complete duplicate", + nullptr); + } + } else if (!bad_data && prev_data.iov_base) { + cmp = mdbx_dcmp(txn, dbi_handle, &data, &prev_data); + if (cmp == 0) { + problem_add("entry", record_count, "complete duplicate", + nullptr); + } else if (cmp < 0 && !ignore_wrong_order) { + problem_add("entry", record_count, + "wrong order of multi-values", nullptr); + } + } + } else if (cmp < 0 && !ignore_wrong_order) { + problem_add("entry", record_count, "wrong order of entries", + nullptr); + } + } + } + + if (!bad_key) { + if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) + print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); + prev_key = key; + } + if (!bad_data) { + if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && + !prev_data.iov_base) + print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); + prev_data = data; } } @@ 
-886,17 +985,6 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, key_bytes += key.iov_len; data_bytes += data.iov_len; - if (!bad_key) { - if (verbose && (flags & MDBX_INTEGERKEY) && !prev_key.iov_base) - print(" - fixed key-size %" PRIuPTR "\n", key.iov_len); - prev_key = key; - } - if (!bad_data) { - if (verbose && (flags & (MDBX_INTEGERDUP | MDBX_DUPFIXED)) && - !prev_data.iov_base) - print(" - fixed data-size %" PRIuPTR "\n", data.iov_len); - prev_data = data; - } rc = mdbx_cursor_get(mc, &key, &data, MDBX_NEXT); } if (rc != MDBX_NOTFOUND) @@ -909,7 +997,7 @@ static int process_db(MDBX_dbi dbi_handle, char *dbi_name, visitor *handler, "%" PRIu64 " != %" PRIu64, record_count, ms.ms_entries); bailout: problems_count = problems_pop(saved_list); - if (!silent && verbose) { + if (!second_pass && verbose) { print(" - summary: %" PRIu64 " records, %" PRIu64 " dups, %" PRIu64 " key's bytes, %" PRIu64 " data's " "bytes, %" PRIu64 " problems\n", @@ -922,21 +1010,24 @@ bailout: } static void usage(char *prog) { - fprintf(stderr, - "usage: %s [-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] " - "dbpath\n" - " -V\t\tprint version and exit\n" - " -v\t\tmore verbose, could be used multiple times\n" - " -q\t\tbe quiet\n" - " -c\t\tforce cooperative mode (don't try exclusive)\n" - " -w\t\twrite-mode checking\n" - " -d\t\tdisable page-by-page traversal of B-tree\n" - " -i\t\tignore wrong order errors (for custom comparators case)\n" - " -s subdb\tprocess a specific subdatabase only\n" - " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n" - " -t\t\tturn to a specified meta-page on successful check\n" - " -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n", - prog); + fprintf( + stderr, + "usage: %s " + "[-V] [-v] [-q] [-c] [-0|1|2] [-w] [-d] [-i] [-s subdb] [-u|U] dbpath\n" + " -V\t\tprint version and exit\n" + " -v\t\tmore verbose, could be used multiple times\n" + " -q\t\tbe quiet\n" + " -c\t\tforce cooperative 
mode (don't try exclusive)\n" + " -w\t\twrite-mode checking\n" + " -d\t\tdisable page-by-page traversal of B-tree\n" + " -i\t\tignore wrong order errors (for custom comparators case)\n" + " -s subdb\tprocess a specific subdatabase only\n" + " -u\t\twarmup database before checking\n" + " -U\t\twarmup and try lock database pages in memory before checking\n" + " -0|1|2\tforce using specific meta-page 0, or 2 for checking\n" + " -t\t\tturn to a specified meta-page on successful check\n" + " -T\t\tturn to a specified meta-page EVEN ON UNSUCCESSFUL CHECK!\n", + prog); exit(EXIT_INTERRUPTED); } @@ -1075,6 +1166,8 @@ int main(int argc, char *argv[]) { bool write_locked = false; bool turn_meta = false; bool force_turn_meta = false; + bool warmup = false; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; double elapsed; #if defined(_WIN32) || defined(_WIN64) @@ -1089,15 +1182,16 @@ int main(int argc, char *argv[]) { } #endif - dbi_meta.name = "@META"; - dbi_free.name = "@GC"; - dbi_main.name = "@MAIN"; + dbi_meta.name.iov_base = MDBX_PGWALK_META; + dbi_free.name.iov_base = MDBX_PGWALK_GC; + dbi_main.name.iov_base = MDBX_PGWALK_MAIN; atexit(pagemap_cleanup); if (argc < 2) usage(prog); for (int i; (i = getopt(argc, argv, + "uU" "0" "1" "2" @@ -1168,13 +1262,22 @@ int main(int argc, char *argv[]) { dont_traversal = true; break; case 's': - if (only_subdb && strcmp(only_subdb, optarg)) + if (only_subdb.iov_base && strcmp(only_subdb.iov_base, optarg)) usage(prog); - only_subdb = optarg; + only_subdb.iov_base = optarg; + only_subdb.iov_len = strlen(optarg); break; case 'i': ignore_wrong_order = true; break; + case 'u': + warmup = true; + break; + case 'U': + warmup = true; + warmup_flags = + MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; + break; default: usage(prog); } @@ -1200,9 +1303,10 @@ int main(int argc, char *argv[]) { error("write-mode must be enabled to turn to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } - if (only_subdb || 
dont_traversal) { - error("whole database checking with tree-traversal are required to turn " - "to the specified meta-page.\n"); + if (only_subdb.iov_base || dont_traversal) { + error( + "whole database checking with b-tree traversal are required to turn " + "to the specified meta-page.\n"); rc = EXIT_INTERRUPTED; } } @@ -1231,7 +1335,9 @@ int main(int argc, char *argv[]) { mdbx_setup_debug((verbose < MDBX_LOG_TRACE - 1) ? (MDBX_log_level_t)(verbose + 1) : MDBX_LOG_TRACE, - MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, logger); + MDBX_DBG_DUMP | MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | + MDBX_DBG_LEGACY_OVERLAP | MDBX_DBG_DONT_UPGRADE, + logger); rc = mdbx_env_create(&env); if (rc) { @@ -1274,14 +1380,35 @@ int main(int argc, char *argv[]) { (envflags & MDBX_EXCLUSIVE) ? "monopolistic" : "cooperative"); if ((envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == 0) { + if (verbose) { + print(" - taking write lock..."); + fflush(nullptr); + } rc = mdbx_txn_lock(env, false); if (rc != MDBX_SUCCESS) { error("mdbx_txn_lock() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } + if (verbose) + print(" done\n"); write_locked = true; } + if (warmup) { + if (verbose) { + print(" - warming up..."); + fflush(nullptr); + } + rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); + if (MDBX_IS_ERROR(rc)) { + error("mdbx_env_warmup(flags %u) failed, error %d %s\n", warmup_flags, rc, + mdbx_strerror(rc)); + goto bailout; + } + if (verbose) + print(" %s\n", rc ? 
"timeout" : "done"); + } + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (rc) { error("mdbx_txn_begin() failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -1330,7 +1457,7 @@ int main(int argc, char *argv[]) { } #endif if (rc) { - error("mdbx_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); + error("osal_filesize() failed, error %d %s\n", rc, mdbx_strerror(rc)); goto bailout; } @@ -1494,7 +1621,7 @@ int main(int argc, char *argv[]) { print("Traversal b-tree by txn#%" PRIaTXN "...\n", txn->mt_txnid); fflush(nullptr); - walk.pagemap = mdbx_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); + walk.pagemap = osal_calloc((size_t)backed_pages, sizeof(*walk.pagemap)); if (!walk.pagemap) { rc = errno ? errno : MDBX_ENOMEM; error("calloc() failed, error %d %s\n", rc, mdbx_strerror(rc)); @@ -1518,8 +1645,8 @@ int main(int argc, char *argv[]) { unused_pages += 1; empty_pages = lost_bytes = 0; - for (walk_dbi_t *dbi = &dbi_main; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) { + for (walk_dbi_t *dbi = &dbi_main; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { empty_pages += dbi->pages.empty; lost_bytes += dbi->lost_bytes; } @@ -1529,9 +1656,10 @@ int main(int argc, char *argv[]) { print(" - pages: walked %" PRIu64 ", left/unused %" PRIu64 "\n", walk.pgcount, unused_pages); if (verbose > 1) { - for (walk_dbi_t *dbi = walk.dbi; dbi < ARRAY_END(walk.dbi) && dbi->name; - ++dbi) { - print(" %s: subtotal %" PRIu64, dbi->name, dbi->pages.total); + for (walk_dbi_t *dbi = walk.dbi; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) { + print(" %s: subtotal %" PRIu64, sdb_name(&dbi->name), + dbi->pages.total); if (dbi->pages.other && dbi->pages.other != dbi->pages.total) print(", other %" PRIu64, dbi->pages.other); if (dbi->pages.branch) @@ -1563,14 +1691,15 @@ int main(int argc, char *argv[]) { (total_page_bytes - walk.total_payload_bytes) * 100.0 / total_page_bytes); if (verbose > 2) { - for (walk_dbi_t *dbi = walk.dbi; dbi < 
ARRAY_END(walk.dbi) && dbi->name; - ++dbi) + for (walk_dbi_t *dbi = walk.dbi; + dbi < ARRAY_END(walk.dbi) && dbi->name.iov_base; ++dbi) if (dbi->pages.total) { uint64_t dbi_bytes = dbi->pages.total * envinfo.mi_dxb_pagesize; print(" %s: subtotal %" PRIu64 " bytes (%.1f%%)," " payload %" PRIu64 " (%.1f%%), unused %" PRIu64 " (%.1f%%)", - dbi->name, dbi_bytes, dbi_bytes * 100.0 / total_page_bytes, - dbi->payload_bytes, dbi->payload_bytes * 100.0 / dbi_bytes, + sdb_name(&dbi->name), dbi_bytes, + dbi_bytes * 100.0 / total_page_bytes, dbi->payload_bytes, + dbi->payload_bytes * 100.0 / dbi_bytes, dbi_bytes - dbi->payload_bytes, (dbi_bytes - dbi->payload_bytes) * 100.0 / dbi_bytes); if (dbi->pages.empty) @@ -1579,7 +1708,7 @@ int main(int argc, char *argv[]) { print(", %" PRIu64 " bytes lost", dbi->lost_bytes); print("\n"); } else - print(" %s: empty\n", dbi->name); + print(" %s: empty\n", sdb_name(&dbi->name)); } print(" - summary: average fill %.1f%%", walk.total_payload_bytes * 100.0 / total_page_bytes); @@ -1594,21 +1723,12 @@ int main(int argc, char *argv[]) { fflush(nullptr); } - if (!verbose) - print("Iterating DBIs...\n"); - if (data_tree_problems) { - print("Skip processing %s since tree is corrupted (%u problems)\n", "@MAIN", - data_tree_problems); - problems_maindb = data_tree_problems; - } else - problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr, false); - if (gc_tree_problems) { - print("Skip processing %s since tree is corrupted (%u problems)\n", "@GC", - gc_tree_problems); + print("Skip processing %s since %s is corrupted (%u problems)\n", "@GC", + "b-tree", gc_tree_problems); problems_freedb = gc_tree_problems; } else - problems_freedb = process_db(FREE_DBI, "@GC", handle_freedb, false); + problems_freedb = process_db(FREE_DBI, MDBX_PGWALK_GC, handle_freedb); if (verbose) { uint64_t value = envinfo.mi_mapsize / envinfo.mi_dxb_pagesize; @@ -1640,7 +1760,7 @@ int main(int argc, char *argv[]) { print(", available %" PRIu64 " (%.1f%%)\n", 
value, value / percent); } - if (problems_maindb == 0 && problems_freedb == 0) { + if ((problems_maindb = data_tree_problems) == 0 && problems_freedb == 0) { if (!dont_traversal && (envflags & (MDBX_EXCLUSIVE | MDBX_RDONLY)) != MDBX_RDONLY) { if (walk.pgcount != alloc_pages - gc_pages) { @@ -1649,22 +1769,32 @@ int main(int argc, char *argv[]) { walk.pgcount, alloc_pages - gc_pages); } if (unused_pages != gc_pages) { - error("gc pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", + error("GC pages mismatch (%" PRIu64 "(expected) != %" PRIu64 "(GC))\n", unused_pages, gc_pages); } } else if (verbose) { - print(" - skip check used and gc pages (btree-traversal with " + print(" - skip check used and GC pages (btree-traversal with " "monopolistic or read-write mode only)\n"); } - if (!process_db(MAIN_DBI, nullptr, handle_maindb, true)) { - if (!userdb_count && verbose) - print(" - does not contain multiple databases\n"); + problems_maindb = process_db(~0u, /* MAIN_DBI */ nullptr, nullptr); + if (problems_maindb == 0) { + print("Scanning %s for %s...\n", "@MAIN", "sub-database(s)"); + if (!process_db(MAIN_DBI, nullptr, handle_maindb)) { + if (!userdb_count && verbose) + print(" - does not contain multiple databases\n"); + } + } else { + print("Skip processing %s since %s is corrupted (%u problems)\n", + "sub-database(s)", "@MAIN", problems_maindb); } + } else { + print("Skip processing %s since %s is corrupted (%u problems)\n", "@MAIN", + "b-tree", data_tree_problems); } if (rc == 0 && total_problems == 1 && problems_meta == 1 && !dont_traversal && - (envflags & MDBX_RDONLY) == 0 && !only_subdb && stuck_meta < 0 && + (envflags & MDBX_RDONLY) == 0 && !only_subdb.iov_base && stuck_meta < 0 && get_meta_txnid(meta_recent(true)) < envinfo.mi_recent_txnid) { print("Perform sync-to-disk for make steady checkpoint at txn-id #%" PRIi64 "\n", @@ -1683,7 +1813,7 @@ int main(int argc, char *argv[]) { } } - if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb 
&& + if (turn_meta && stuck_meta >= 0 && !dont_traversal && !only_subdb.iov_base && (envflags & (MDBX_RDONLY | MDBX_EXCLUSIVE)) == MDBX_EXCLUSIVE) { const bool successful_check = (rc | total_problems | problems_meta) == 0; if (successful_check || force_turn_meta) { diff --git a/src/mdbx_copy.c b/src/mdbx_copy.c index 4b40b558..52adc312 100644 --- a/src/mdbx_copy.c +++ b/src/mdbx_copy.c @@ -1,7 +1,7 @@ /* mdbx_copy.c - memory-mapped database backup tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #if defined(_WIN32) || defined(_WIN64) @@ -44,14 +44,17 @@ static void signal_handler(int sig) { #endif /* !WINDOWS */ static void usage(const char *prog) { - fprintf(stderr, - "usage: %s [-V] [-q] [-c] src_path [dest_path]\n" - " -V\t\tprint version and exit\n" - " -q\t\tbe quiet\n" - " -c\t\tenable compactification (skip unused pages)\n" - " src_path\tsource database\n" - " dest_path\tdestination (stdout if not specified)\n", - prog); + fprintf( + stderr, + "usage: %s [-V] [-q] [-c] [-u|U] src_path [dest_path]\n" + " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -c\t\tenable compactification (skip unused pages)\n" + " -u\t\twarmup database before copying\n" + " -U\t\twarmup and try lock database pages in memory before copying\n" + " src_path\tsource database\n" + " dest_path\tdestination (stdout if not specified)\n", + prog); exit(EXIT_FAILURE); } @@ -62,6 +65,8 @@ int main(int argc, char *argv[]) { unsigned flags = MDBX_RDONLY; unsigned cpflags = 0; bool quiet = false; + bool warmup = false; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; for (; argc > 1 && argv[1][0] == '-'; 
argc--, argv++) { if (argv[1][1] == 'n' && argv[1][2] == '\0') @@ -70,8 +75,14 @@ int main(int argc, char *argv[]) { cpflags |= MDBX_CP_COMPACT; else if (argv[1][1] == 'q' && argv[1][2] == '\0') quiet = true; - else if ((argv[1][1] == 'h' && argv[1][2] == '\0') || - strcmp(argv[1], "--help") == 0) + else if (argv[1][1] == 'u' && argv[1][2] == '\0') + warmup = true; + else if (argv[1][1] == 'U' && argv[1][2] == '\0') { + warmup = true; + warmup_flags = + MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; + } else if ((argv[1][1] == 'h' && argv[1][2] == '\0') || + strcmp(argv[1], "--help") == 0) usage(progname); else if (argv[1][1] == 'V' && argv[1][2] == '\0') { printf("mdbx_copy version %d.%d.%d.%d\n" @@ -120,7 +131,12 @@ int main(int argc, char *argv[]) { if (rc == MDBX_SUCCESS) rc = mdbx_env_open(env, argv[1], flags, 0); - if (rc == MDBX_SUCCESS) { + if (rc == MDBX_SUCCESS && warmup) { + act = "warming up"; + rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); + } + + if (!MDBX_IS_ERROR(rc)) { act = "copying"; if (argc == 2) { mdbx_filehandle_t fd; diff --git a/src/mdbx_drop.c b/src/mdbx_drop.c index 9b0a18b5..859710a6 100644 --- a/src/mdbx_drop.c +++ b/src/mdbx_drop.c @@ -1,10 +1,10 @@ /* mdbx_drop.c - memory-mapped database delete tool */ /* - * Copyright 2021 Leonid Yuriev + * Copyright 2021-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * - * Copyright 2016-2022 Howard Chu, Symas Corp. + * Copyright 2016-2021 Howard Chu, Symas Corp. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -22,7 +22,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include diff --git a/src/mdbx_dump.c b/src/mdbx_dump.c index 6eec6fc3..21a695e2 100644 --- a/src/mdbx_dump.c +++ b/src/mdbx_dump.c @@ -1,7 +1,7 @@ /* mdbx_dump.c - memory-mapped database dump tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... */ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include @@ -66,7 +66,7 @@ static const char hexc[] = "0123456789abcdef"; static void dumpbyte(unsigned char c) { putchar(hexc[c >> 4]); - putchar(hexc[c & 0xf]); + putchar(hexc[c & 15]); } static void text(MDBX_val *v) { @@ -186,10 +186,10 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { error("mdbx_cursor_open", rc); return rc; } - if (MDBX_DEBUG > 0 && rescue) { - cursor->mc_flags |= C_SKIPORD; + if (rescue) { + cursor->mc_checking |= CC_SKIPORD; if (cursor->mc_xcursor) - cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } while ((rc = mdbx_cursor_get(cursor, &key, &data, MDBX_NEXT)) == @@ -217,19 +217,23 @@ static int dump_sdb(MDBX_txn *txn, MDBX_dbi dbi, char *name) { } static void usage(void) { - fprintf(stderr, - "usage: %s [-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] " - "dbpath\n" - " -V\t\tprint version and exit\n" - " -q\t\tbe quiet\n" - " -f\t\twrite to file instead of stdout\n" - " -l\t\tlist subDBs and exit\n" - " -p\t\tuse printable characters\n" - " -r\t\trescue mode (ignore errors to dump corrupted DB)\n" - " -a\t\tdump main DB and all subDBs\n" - " -s name\tdump only the 
specified named subDB\n" - " \t\tby default dump only the main DB\n", - prog); + fprintf( + stderr, + "usage: %s " + "[-V] [-q] [-f file] [-l] [-p] [-r] [-a|-s subdb] [-u|U] " + "dbpath\n" + " -V\t\tprint version and exit\n" + " -q\t\tbe quiet\n" + " -f\t\twrite to file instead of stdout\n" + " -l\t\tlist subDBs and exit\n" + " -p\t\tuse printable characters\n" + " -r\t\trescue mode (ignore errors to dump corrupted DB)\n" + " -a\t\tdump main DB and all subDBs\n" + " -s name\tdump only the specified named subDB\n" + " -u\t\twarmup database before dumping\n" + " -U\t\twarmup and try lock database pages in memory before dumping\n" + " \t\tby default dump only the main DB\n", + prog); exit(EXIT_FAILURE); } @@ -250,11 +254,14 @@ int main(int argc, char *argv[]) { char *subname = nullptr, *buf4free = nullptr; unsigned envflags = 0; bool alldbs = false, list = false; + bool warmup = false; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_default; if (argc < 2) usage(); while ((i = getopt(argc, argv, + "uU" "a" "f:" "l" @@ -311,6 +318,14 @@ int main(int argc, char *argv[]) { case 'r': rescue = true; break; + case 'u': + warmup = true; + break; + case 'U': + warmup = true; + warmup_flags = + MDBX_warmup_force | MDBX_warmup_touchlimit | MDBX_warmup_lock; + break; default: usage(); } @@ -356,12 +371,22 @@ int main(int argc, char *argv[]) { rc = mdbx_env_open( env, envname, - envflags | (rescue ? MDBX_RDONLY | MDBX_EXCLUSIVE : MDBX_RDONLY), 0); + envflags | (rescue ? 
MDBX_RDONLY | MDBX_EXCLUSIVE | MDBX_VALIDATION + : MDBX_RDONLY), + 0); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_env_open", rc); goto env_close; } + if (warmup) { + rc = mdbx_env_warmup(env, nullptr, warmup_flags, 3600 * 65536); + if (MDBX_IS_ERROR(rc)) { + error("mdbx_env_warmup", rc); + goto env_close; + } + } + rc = mdbx_txn_begin(env, nullptr, MDBX_TXN_RDONLY, &txn); if (unlikely(rc != MDBX_SUCCESS)) { error("mdbx_txn_begin", rc); @@ -383,10 +408,10 @@ int main(int argc, char *argv[]) { error("mdbx_cursor_open", rc); goto txn_abort; } - if (MDBX_DEBUG > 0 && rescue) { - cursor->mc_flags |= C_SKIPORD; + if (rescue) { + cursor->mc_checking |= CC_SKIPORD; if (cursor->mc_xcursor) - cursor->mc_xcursor->mx_cursor.mc_flags |= C_SKIPORD; + cursor->mc_xcursor->mx_cursor.mc_checking |= CC_SKIPORD; } bool have_raw = false; @@ -401,7 +426,7 @@ int main(int argc, char *argv[]) { if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_realloc(buf4free, key.iov_len + 1); + subname = osal_realloc(buf4free, key.iov_len + 1); if (!subname) { rc = MDBX_ENOMEM; break; diff --git a/src/mdbx_load.c b/src/mdbx_load.c index 10e54750..552fedc8 100644 --- a/src/mdbx_load.c +++ b/src/mdbx_load.c @@ -1,7 +1,7 @@ /* mdbx_load.c - memory-mapped database load tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #include @@ -213,7 +213,7 @@ static int readhdr(void) { if (str) { if (*str) { free(subname); - subname = mdbx_strdup(str); + subname = osal_strdup(str); if (!subname) { if (!quiet) perror("strdup()"); @@ -421,7 +421,7 @@ __hot static int readline(MDBX_val *out, MDBX_val *buf) { /* Is buffer too short? */ while (c1[len - 1] != '\n') { - buf->iov_base = mdbx_realloc(buf->iov_base, buf->iov_len * 2); + buf->iov_base = osal_realloc(buf->iov_base, buf->iov_len * 2); if (!buf->iov_base) { if (!quiet) fprintf(stderr, @@ -560,7 +560,7 @@ int main(int argc, char *argv[]) { envflags |= MDBX_NOSUBDIR; break; case 's': - subname = mdbx_strdup(optarg); + subname = osal_strdup(optarg); break; case 'N': putflags |= MDBX_NOOVERWRITE | MDBX_NODUPDATA; @@ -606,7 +606,7 @@ int main(int argc, char *argv[]) { fflush(nullptr); dbuf.iov_len = 4096; - dbuf.iov_base = mdbx_malloc(dbuf.iov_len); + dbuf.iov_base = osal_malloc(dbuf.iov_len); if (!dbuf.iov_base) { rc = MDBX_ENOMEM; error("value-buffer", rc); @@ -673,7 +673,7 @@ int main(int argc, char *argv[]) { goto env_close; } - kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + 1; + kbuf.iov_len = mdbx_env_get_maxvalsize_ex(env, 0) + (size_t)1; if (kbuf.iov_len >= INTPTR_MAX / 2) { if (!quiet) fprintf(stderr, "mdbx_env_get_maxkeysize() failed, returns %zu\n", diff --git a/src/mdbx_stat.c b/src/mdbx_stat.c index fa229435..adedc13e 100644 --- a/src/mdbx_stat.c +++ b/src/mdbx_stat.c @@ -1,7 +1,7 @@ /* mdbx_stat.c - memory-mapped database status tool */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -20,7 +20,7 @@ #pragma warning(disable : 4996) /* The POSIX name is deprecated... 
*/ #endif /* _MSC_VER (warnings) */ -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "internals.h" #if defined(_WIN32) || defined(_WIN64) @@ -256,6 +256,17 @@ int main(int argc, char *argv[]) { printf(" WOP: %8" PRIu64 "\t// number of explicit write operations (not a pages) to a disk\n", mei.mi_pgop_stat.wops); + printf(" PreFault: %8" PRIu64 + "\t// number of prefault write operations (not a pages)\n", + mei.mi_pgop_stat.prefault); + printf(" mInCore: %8" PRIu64 "\t// number of mincore() calls\n", + mei.mi_pgop_stat.mincore); + printf(" mSync: %8" PRIu64 + "\t// number of explicit msync-to-disk operations (not a pages)\n", + mei.mi_pgop_stat.msync); + printf(" fSync: %8" PRIu64 + "\t// number of explicit fsync-to-disk operations (not a pages)\n", + mei.mi_pgop_stat.fsync); } if (envinfo) { @@ -469,13 +480,13 @@ int main(int argc, char *argv[]) { MDBX_dbi subdbi; if (memchr(key.iov_base, '\0', key.iov_len)) continue; - subname = mdbx_malloc(key.iov_len + 1); + subname = osal_malloc(key.iov_len + 1); memcpy(subname, key.iov_base, key.iov_len); subname[key.iov_len] = '\0'; rc = mdbx_dbi_open(txn, subname, MDBX_DB_ACCEDE, &subdbi); if (rc == MDBX_SUCCESS) printf("Status of %s\n", subname); - mdbx_free(subname); + osal_free(subname); if (unlikely(rc != MDBX_SUCCESS)) { if (rc == MDBX_INCOMPATIBLE) continue; diff --git a/src/options.h b/src/options.h index d362da41..0ef27e6f 100644 --- a/src/options.h +++ b/src/options.h @@ -40,6 +40,8 @@ #define MDBX_ENV_CHECKPID 1 #endif #define MDBX_ENV_CHECKPID_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_ENV_CHECKPID) +#elif !(MDBX_ENV_CHECKPID == 0 || MDBX_ENV_CHECKPID == 1) +#error MDBX_ENV_CHECKPID must be defined as 0 or 1 #else #define MDBX_ENV_CHECKPID_CONFIG MDBX_STRINGIFY(MDBX_ENV_CHECKPID) #endif /* MDBX_ENV_CHECKPID */ @@ -49,6 +51,8 @@ #ifndef MDBX_TXN_CHECKOWNER #define MDBX_TXN_CHECKOWNER 1 #define MDBX_TXN_CHECKOWNER_CONFIG "AUTO=" 
MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) +#elif !(MDBX_TXN_CHECKOWNER == 0 || MDBX_TXN_CHECKOWNER == 1) +#error MDBX_TXN_CHECKOWNER must be defined as 0 or 1 #else #define MDBX_TXN_CHECKOWNER_CONFIG MDBX_STRINGIFY(MDBX_TXN_CHECKOWNER) #endif /* MDBX_TXN_CHECKOWNER */ @@ -62,6 +66,8 @@ #define MDBX_TRUST_RTC 1 #endif #define MDBX_TRUST_RTC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_TRUST_RTC) +#elif !(MDBX_TRUST_RTC == 0 || MDBX_TRUST_RTC == 1) +#error MDBX_TRUST_RTC must be defined as 0 or 1 #else #define MDBX_TRUST_RTC_CONFIG MDBX_STRINGIFY(MDBX_TRUST_RTC) #endif /* MDBX_TRUST_RTC */ @@ -73,6 +79,13 @@ #error MDBX_ENABLE_REFUND must be defined as 0 or 1 #endif /* MDBX_ENABLE_REFUND */ +/** Controls profiling of GC search and updates. */ +#ifndef MDBX_ENABLE_PROFGC +#define MDBX_ENABLE_PROFGC 0 +#elif !(MDBX_ENABLE_PROFGC == 0 || MDBX_ENABLE_PROFGC == 1) +#error MDBX_ENABLE_PROFGC must be defined as 0 or 1 +#endif /* MDBX_ENABLE_PROFGC */ + /** Controls gathering statistics for page operations. */ #ifndef MDBX_ENABLE_PGOP_STAT #define MDBX_ENABLE_PGOP_STAT 1 @@ -80,7 +93,32 @@ #error MDBX_ENABLE_PGOP_STAT must be defined as 0 or 1 #endif /* MDBX_ENABLE_PGOP_STAT */ -/** Controls use of POSIX madvise() hints and friends. */ +/** Controls using Unix' mincore() to determine whether DB-pages + * are resident in memory. */ +#ifndef MDBX_ENABLE_MINCORE +#if MDBX_ENABLE_PREFAULT && \ + (defined(MINCORE_INCORE) || !(defined(_WIN32) || defined(_WIN64))) +#define MDBX_ENABLE_MINCORE 1 +#else +#define MDBX_ENABLE_MINCORE 0 +#endif +#elif !(MDBX_ENABLE_MINCORE == 0 || MDBX_ENABLE_MINCORE == 1) +#error MDBX_ENABLE_MINCORE must be defined as 0 or 1 +#endif /* MDBX_ENABLE_MINCORE */ + +/** Enables chunking long list of retired pages during huge transactions commit + * to avoid use sequences of pages. 
*/ +#ifndef MDBX_ENABLE_BIGFOOT +#if MDBX_WORDBITS >= 64 || defined(DOXYGEN) +#define MDBX_ENABLE_BIGFOOT 1 +#else +#define MDBX_ENABLE_BIGFOOT 0 +#endif +#elif !(MDBX_ENABLE_BIGFOOT == 0 || MDBX_ENABLE_BIGFOOT == 1) +#error MDBX_ENABLE_BIGFOOT must be defined as 0 or 1 +#endif /* MDBX_ENABLE_BIGFOOT */ + +/** Controls using of POSIX' madvise() and/or similar hints. */ #ifndef MDBX_ENABLE_MADVISE #define MDBX_ENABLE_MADVISE 1 #elif !(MDBX_ENABLE_MADVISE == 0 || MDBX_ENABLE_MADVISE == 1) @@ -89,11 +127,11 @@ /** Disable some checks to reduce an overhead and detection probability of * database corruption to a values closer to the LMDB. */ -#ifndef MDBX_DISABLE_PAGECHECKS -#define MDBX_DISABLE_PAGECHECKS 0 -#elif !(MDBX_DISABLE_PAGECHECKS == 0 || MDBX_DISABLE_PAGECHECKS == 1) -#error MDBX_DISABLE_PAGECHECKS must be defined as 0 or 1 -#endif /* MDBX_DISABLE_PAGECHECKS */ +#ifndef MDBX_DISABLE_VALIDATION +#define MDBX_DISABLE_VALIDATION 0 +#elif !(MDBX_DISABLE_VALIDATION == 0 || MDBX_DISABLE_VALIDATION == 1) +#error MDBX_DISABLE_VALIDATION must be defined as 0 or 1 +#endif /* MDBX_DISABLE_VALIDATION */ #ifndef MDBX_PNL_PREALLOC_FOR_RADIXSORT #define MDBX_PNL_PREALLOC_FOR_RADIXSORT 1 @@ -109,23 +147,22 @@ #error MDBX_DPL_PREALLOC_FOR_RADIXSORT must be defined as 0 or 1 #endif /* MDBX_DPL_PREALLOC_FOR_RADIXSORT */ -/** Basically, this build-option is for TODO. Guess it should be replaced - * with MDBX_ENABLE_WRITEMAP_SPILLING with the three variants: - * 0/OFF = Don't track dirty pages at all and don't spilling ones. - * This should be by-default on Linux and may-be other systems - * (not sure: Darwin/OSX, FreeBSD, Windows 10) where kernel provides - * properly LRU tracking and async writing on-demand. - * 1/ON = Lite tracking of dirty pages but with LRU labels and explicit - * spilling with msync(MS_ASYNC). 
*/ -#ifndef MDBX_FAKE_SPILL_WRITEMAP -#if defined(__linux__) || defined(__gnu_linux__) -#define MDBX_FAKE_SPILL_WRITEMAP 1 /* msync(MS_ASYNC) is no-op on Linux */ +/** Controls dirty pages tracking, spilling and persisting in MDBX_WRITEMAP + * mode. 0/OFF = Don't track dirty pages at all, don't spill ones, and use + * msync() to persist data. This is by-default on Linux and other systems where + * kernel provides properly LRU tracking and effective flushing on-demand. 1/ON + * = Tracking of dirty pages but with LRU labels for spilling and explicit + * persist ones by write(). This may be reasonable for systems which low + * performance of msync() and/or LRU tracking. */ +#ifndef MDBX_AVOID_MSYNC +#if defined(_WIN32) || defined(_WIN64) +#define MDBX_AVOID_MSYNC 1 #else -#define MDBX_FAKE_SPILL_WRITEMAP 0 +#define MDBX_AVOID_MSYNC 0 #endif -#elif !(MDBX_FAKE_SPILL_WRITEMAP == 0 || MDBX_FAKE_SPILL_WRITEMAP == 1) -#error MDBX_FAKE_SPILL_WRITEMAP must be defined as 0 or 1 -#endif /* MDBX_FAKE_SPILL_WRITEMAP */ +#elif !(MDBX_AVOID_MSYNC == 0 || MDBX_AVOID_MSYNC == 1) +#error MDBX_AVOID_MSYNC must be defined as 0 or 1 +#endif /* MDBX_AVOID_MSYNC */ /** Controls sort order of internal page number lists. * This mostly experimental/advanced option with not for regular MDBX users. @@ -182,6 +219,31 @@ #ifndef MDBX_HAVE_C11ATOMICS #endif /* MDBX_HAVE_C11ATOMICS */ +/** If defined then enables use the GCC's `__builtin_cpu_supports()` + * for runtime dispatching depending on the CPU's capabilities. + * \note Defining `MDBX_HAVE_BUILTIN_CPU_SUPPORTS` to `0` should avoided unless + * build for particular single-target platform, since on AMD64/x86 this disables + * dynamic choice (at runtime) of SSE2 / AVX2 / AVX512 instructions + * with fallback to non-accelerated baseline code. 
*/ +#ifndef MDBX_HAVE_BUILTIN_CPU_SUPPORTS +#if defined(__APPLE__) || defined(BIONIC) +/* Never use any modern features on Apple's or Google's OSes + * since a lot of troubles with compatibility and/or performance */ +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 +#elif defined(__e2k__) +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 +#elif __has_builtin(__builtin_cpu_supports) || \ + defined(__BUILTIN_CPU_SUPPORTS__) || \ + (defined(__ia32__) && __GNUC_PREREQ(4, 8) && __GLIBC_PREREQ(2, 23)) +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 1 +#else +#define MDBX_HAVE_BUILTIN_CPU_SUPPORTS 0 +#endif +#elif !(MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 0 || \ + MDBX_HAVE_BUILTIN_CPU_SUPPORTS == 1) +#error MDBX_HAVE_BUILTIN_CPU_SUPPORTS must be defined as 0 or 1 +#endif /* MDBX_HAVE_BUILTIN_CPU_SUPPORTS */ + //------------------------------------------------------------------------------ /** Win32 File Locking API for \ref MDBX_LOCKING */ @@ -248,6 +310,8 @@ #define MDBX_USE_OFDLOCKS 0 #endif #define MDBX_USE_OFDLOCKS_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) +#elif !(MDBX_USE_OFDLOCKS == 0 || MDBX_USE_OFDLOCKS == 1) +#error MDBX_USE_OFDLOCKS must be defined as 0 or 1 #else #define MDBX_USE_OFDLOCKS_CONFIG MDBX_STRINGIFY(MDBX_USE_OFDLOCKS) #endif /* MDBX_USE_OFDLOCKS */ @@ -261,6 +325,8 @@ #else #define MDBX_USE_SENDFILE 0 #endif +#elif !(MDBX_USE_SENDFILE == 0 || MDBX_USE_SENDFILE == 1) +#error MDBX_USE_SENDFILE must be defined as 0 or 1 #endif /* MDBX_USE_SENDFILE */ /** Advanced: Using copy_file_range() syscall (autodetection by default). */ @@ -270,6 +336,8 @@ #else #define MDBX_USE_COPYFILERANGE 0 #endif +#elif !(MDBX_USE_COPYFILERANGE == 0 || MDBX_USE_COPYFILERANGE == 1) +#error MDBX_USE_COPYFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_COPYFILERANGE */ /** Advanced: Using sync_file_range() syscall (autodetection by default). 
*/ @@ -281,6 +349,8 @@ #else #define MDBX_USE_SYNCFILERANGE 0 #endif +#elif !(MDBX_USE_SYNCFILERANGE == 0 || MDBX_USE_SYNCFILERANGE == 1) +#error MDBX_USE_SYNCFILERANGE must be defined as 0 or 1 #endif /* MDBX_USE_SYNCFILERANGE */ //------------------------------------------------------------------------------ @@ -292,6 +362,9 @@ #else #define MDBX_CPU_WRITEBACK_INCOHERENT 1 #endif +#elif !(MDBX_CPU_WRITEBACK_INCOHERENT == 0 || \ + MDBX_CPU_WRITEBACK_INCOHERENT == 1) +#error MDBX_CPU_WRITEBACK_INCOHERENT must be defined as 0 or 1 #endif /* MDBX_CPU_WRITEBACK_INCOHERENT */ #ifndef MDBX_MMAP_INCOHERENT_FILE_WRITE @@ -300,6 +373,9 @@ #else #define MDBX_MMAP_INCOHERENT_FILE_WRITE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_FILE_WRITE == 0 || \ + MDBX_MMAP_INCOHERENT_FILE_WRITE == 1) +#error MDBX_MMAP_INCOHERENT_FILE_WRITE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_FILE_WRITE */ #ifndef MDBX_MMAP_INCOHERENT_CPU_CACHE @@ -312,8 +388,21 @@ /* LY: assume no relevant mmap/dcache issues. 
*/ #define MDBX_MMAP_INCOHERENT_CPU_CACHE 0 #endif +#elif !(MDBX_MMAP_INCOHERENT_CPU_CACHE == 0 || \ + MDBX_MMAP_INCOHERENT_CPU_CACHE == 1) +#error MDBX_MMAP_INCOHERENT_CPU_CACHE must be defined as 0 or 1 #endif /* MDBX_MMAP_INCOHERENT_CPU_CACHE */ +#ifndef MDBX_MMAP_USE_MS_ASYNC +#if MDBX_MMAP_INCOHERENT_FILE_WRITE || MDBX_MMAP_INCOHERENT_CPU_CACHE +#define MDBX_MMAP_USE_MS_ASYNC 1 +#else +#define MDBX_MMAP_USE_MS_ASYNC 0 +#endif +#elif !(MDBX_MMAP_USE_MS_ASYNC == 0 || MDBX_MMAP_USE_MS_ASYNC == 1) +#error MDBX_MMAP_USE_MS_ASYNC must be defined as 0 or 1 +#endif /* MDBX_MMAP_USE_MS_ASYNC */ + #ifndef MDBX_64BIT_ATOMIC #if MDBX_WORDBITS >= 64 || defined(DOXYGEN) #define MDBX_64BIT_ATOMIC 1 @@ -321,6 +410,8 @@ #define MDBX_64BIT_ATOMIC 0 #endif #define MDBX_64BIT_ATOMIC_CONFIG "AUTO=" MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) +#elif !(MDBX_64BIT_ATOMIC == 0 || MDBX_64BIT_ATOMIC == 1) +#error MDBX_64BIT_ATOMIC must be defined as 0 or 1 #else #define MDBX_64BIT_ATOMIC_CONFIG MDBX_STRINGIFY(MDBX_64BIT_ATOMIC) #endif /* MDBX_64BIT_ATOMIC */ @@ -346,6 +437,8 @@ #endif #elif defined(_MSC_VER) || defined(__APPLE__) || defined(DOXYGEN) #define MDBX_64BIT_CAS 1 +#elif !(MDBX_64BIT_CAS == 0 || MDBX_64BIT_CAS == 1) +#error MDBX_64BIT_CAS must be defined as 0 or 1 #else #define MDBX_64BIT_CAS MDBX_64BIT_ATOMIC #endif @@ -355,14 +448,11 @@ #endif /* MDBX_64BIT_CAS */ #ifndef MDBX_UNALIGNED_OK -#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) +#if defined(__ALIGNED__) || defined(__SANITIZE_UNDEFINED__) || \ + defined(ENABLE_UBSAN) #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #elif defined(__ARM_FEATURE_UNALIGNED) #define MDBX_UNALIGNED_OK 4 /* ok unaligned for 32-bit words */ -#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) -/* expecting an optimization will well done, also this - * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ -#define MDBX_UNALIGNED_OK 0 #elif defined(__e2k__) || defined(__elbrus__) #if __iset__ > 4 #define 
MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ @@ -371,6 +461,10 @@ #endif #elif defined(__ia32__) #define MDBX_UNALIGNED_OK 8 /* ok unaligned for 64-bit words */ +#elif __CLANG_PREREQ(5, 0) || __GNUC_PREREQ(5, 0) +/* expecting an optimization will well done, also this + * hushes false-positives from UBSAN (undefined behaviour sanitizer) */ +#define MDBX_UNALIGNED_OK 0 #else #define MDBX_UNALIGNED_OK 0 /* no unaligned access allowed */ #endif diff --git a/src/osal.c b/src/osal.c index 2ce3102f..db70dc0b 100644 --- a/src/osal.c +++ b/src/osal.c @@ -1,7 +1,7 @@ /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -18,8 +18,13 @@ #if defined(_WIN32) || defined(_WIN64) +#include #include +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) +#include +#endif + static int waitstatus2errcode(DWORD result) { switch (result) { case WAIT_OBJECT_0: @@ -43,6 +48,8 @@ static int ntstatus2errcode(NTSTATUS status) { OVERLAPPED ov; memset(&ov, 0, sizeof(ov)); ov.Internal = status; + /* Zap: '_Param_(1)' could be '0' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6387); return GetOverlappedResult(NULL, &ov, &dummy, FALSE) ? MDBX_SUCCESS : (int)GetLastError(); } @@ -77,6 +84,8 @@ extern NTSTATUS NTAPI NtMapViewOfSection( extern NTSTATUS NTAPI NtUnmapViewOfSection(IN HANDLE ProcessHandle, IN OPTIONAL PVOID BaseAddress); +/* Zap: Inconsistent annotation for 'NtClose'... 
*/ +MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(28251) extern NTSTATUS NTAPI NtClose(HANDLE Handle); extern NTSTATUS NTAPI NtAllocateVirtualMemory( @@ -224,36 +233,46 @@ __extern_C void __assert(const char *function, const char *file, int line, __cold void mdbx_assert_fail(const MDBX_env *env, const char *msg, const char *func, unsigned line) { #if MDBX_DEBUG - if (env && env->me_assert_func) { + if (env && env->me_assert_func) env->me_assert_func(env, msg, func, line); - return; - } #else (void)env; + assert_fail(msg, func, line); +} + +MDBX_NORETURN __cold void assert_fail(const char *msg, const char *func, + unsigned line) { #endif /* MDBX_DEBUG */ - if (mdbx_debug_logger) - mdbx_debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); + if (debug_logger) + debug_log(MDBX_LOG_FATAL, func, line, "assert: %s\n", msg); else { #if defined(_WIN32) || defined(_WIN64) char *message = nullptr; - const int num = mdbx_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", + const int num = osal_asprintf(&message, "\r\nMDBX-ASSERTION: %s, %s:%u", msg, func ? func : "unknown", line); if (num < 1 || !message) message = ""; OutputDebugStringA(message); - if (IsDebuggerPresent()) - DebugBreak(); #else __assert_fail(msg, "mdbx", line, func); #endif } + while (1) { #if defined(_WIN32) || defined(_WIN64) - FatalExit(ERROR_UNHANDLED_ERROR); +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) + _CrtDbgReport(_CRT_ASSERT, func ? func : "unknown", line, "libmdbx", + "assertion failed: %s", msg); #else - abort(); + if (IsDebuggerPresent()) + DebugBreak(); #endif + FatalExit(STATUS_ASSERTION_FAILURE); +#else + abort(); +#endif + } } __cold void mdbx_panic(const char *fmt, ...) { @@ -261,28 +280,39 @@ __cold void mdbx_panic(const char *fmt, ...) { va_start(ap, fmt); char *message = nullptr; - const int num = mdbx_vasprintf(&message, fmt, ap); + const int num = osal_vasprintf(&message, fmt, ap); va_end(ap); const char *const const_message = - (num < 1 || !message) ? 
"" - : message; + unlikely(num < 1 || !message) + ? "" + : message; + if (debug_logger) + debug_log(MDBX_LOG_FATAL, "panic", 0, "%s", const_message); + + while (1) { #if defined(_WIN32) || defined(_WIN64) - OutputDebugStringA("\r\nMDBX-PANIC: "); - OutputDebugStringA(const_message); - if (IsDebuggerPresent()) - DebugBreak(); - FatalExit(ERROR_UNHANDLED_ERROR); +#if !MDBX_WITHOUT_MSVC_CRT && defined(_DEBUG) + _CrtDbgReport(_CRT_ASSERT, "mdbx.c", 0, "libmdbx", "panic: %s", + const_message); #else - __assert_fail(const_message, "mdbx", 0, "panic"); - abort(); + OutputDebugStringA("\r\nMDBX-PANIC: "); + OutputDebugStringA(const_message); + if (IsDebuggerPresent()) + DebugBreak(); #endif + FatalExit(ERROR_UNHANDLED_ERROR); +#else + __assert_fail(const_message, "mdbx", 0, "panic"); + abort(); +#endif + } } /*----------------------------------------------------------------------------*/ -#ifndef mdbx_vasprintf -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, +#ifndef osal_vasprintf +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap) { va_list ones; va_copy(ones, ap); @@ -294,7 +324,7 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, return needed; } - *strp = mdbx_malloc(needed + 1); + *strp = osal_malloc(needed + (size_t)1); if (unlikely(*strp == nullptr)) { va_end(ones); #if defined(_WIN32) || defined(_WIN64) @@ -305,30 +335,30 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, return -1; } - int actual = vsnprintf(*strp, needed + 1, fmt, ones); + int actual = vsnprintf(*strp, needed + (size_t)1, fmt, ones); va_end(ones); assert(actual == needed); if (unlikely(actual < 0)) { - mdbx_free(*strp); + osal_free(*strp); *strp = nullptr; } return actual; } -#endif /* mdbx_vasprintf */ +#endif /* osal_vasprintf */ -#ifndef mdbx_asprintf -MDBX_INTERNAL_FUNC int mdbx_asprintf(char **strp, const char *fmt, ...) 
{ +#ifndef osal_asprintf +MDBX_INTERNAL_FUNC int osal_asprintf(char **strp, const char *fmt, ...) { va_list ap; va_start(ap, fmt); - int rc = mdbx_vasprintf(strp, fmt, ap); + int rc = osal_vasprintf(strp, fmt, ap); va_end(ap); return rc; } -#endif /* mdbx_asprintf */ +#endif /* osal_asprintf */ -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result) { assert(is_powerof2(alignment) && alignment >= sizeof(void *)); #if defined(_WIN32) || defined(_WIN64) @@ -349,35 +379,35 @@ MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, #error FIXME #endif } -#endif /* mdbx_memalign_alloc */ +#endif /* osal_memalign_alloc */ -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr) { +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr) { #if defined(_WIN32) || defined(_WIN64) VirtualFree(ptr, 0, MEM_RELEASE); #else - mdbx_free(ptr); + osal_free(ptr); #endif } -#endif /* mdbx_memalign_free */ +#endif /* osal_memalign_free */ -#ifndef mdbx_strdup -char *mdbx_strdup(const char *str) { +#ifndef osal_strdup +char *osal_strdup(const char *str) { if (!str) return NULL; size_t bytes = strlen(str) + 1; - char *dup = mdbx_malloc(bytes); + char *dup = osal_malloc(bytes); if (dup) memcpy(dup, str, bytes); return dup; } -#endif /* mdbx_strdup */ +#endif /* osal_strdup */ /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair) { int rc; - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); #if defined(_WIN32) || defined(_WIN64) if ((condpair->mutex = CreateMutexW(NULL, FALSE, NULL)) == NULL) { rc = (int)GetLastError(); @@ -410,11 +440,11 @@ 
bailout_cond: (void)pthread_mutex_destroy(&condpair->mutex); #endif bailout_mutex: - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); return rc; } -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) int rc = CloseHandle(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); rc = CloseHandle(condpair->event[0]) ? rc : (int)GetLastError(); @@ -424,20 +454,20 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair) { rc = (err = pthread_cond_destroy(&condpair->cond[0])) ? err : rc; rc = (err = pthread_cond_destroy(&condpair->cond[1])) ? err : rc; #endif - memset(condpair, 0, sizeof(mdbx_condpair_t)); + memset(condpair, 0, sizeof(osal_condpair_t)); return rc; } -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(condpair->mutex, INFINITE); return waitstatus2errcode(code); #else - return mdbx_pthread_mutex_lock(&condpair->mutex); + return osal_pthread_mutex_lock(&condpair->mutex); #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair) { #if defined(_WIN32) || defined(_WIN64) return ReleaseMutex(condpair->mutex) ? MDBX_SUCCESS : (int)GetLastError(); #else @@ -445,7 +475,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair) { #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part) { #if defined(_WIN32) || defined(_WIN64) return SetEvent(condpair->event[part]) ? 
MDBX_SUCCESS : (int)GetLastError(); @@ -454,7 +484,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, #endif } -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part) { #if defined(_WIN32) || defined(_WIN64) DWORD code = SignalObjectAndWait(condpair->mutex, condpair->event[part], @@ -472,7 +502,7 @@ MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) InitializeCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -481,7 +511,7 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) DeleteCriticalSection(fastmutex); return MDBX_SUCCESS; @@ -490,7 +520,7 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) __try { EnterCriticalSection(fastmutex); @@ -503,11 +533,11 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex) { } return MDBX_SUCCESS; #else - return mdbx_pthread_mutex_lock(fastmutex); + return osal_pthread_mutex_lock(fastmutex); #endif } -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) LeaveCriticalSection(fastmutex); return MDBX_SUCCESS; @@ 
-520,32 +550,653 @@ MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex) { #if defined(_WIN32) || defined(_WIN64) -#ifndef WC_ERR_INVALID_CHARS -static const DWORD WC_ERR_INVALID_CHARS = - (6 /* Windows Vista */ <= /* MajorVersion */ LOBYTE(LOWORD(GetVersion()))) - ? 0x00000080 - : 0; -#endif /* WC_ERR_INVALID_CHARS */ +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst) { + const size_t dst_wlen = MultiByteToWideChar( + CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, -1, nullptr, 0); + wchar_t *dst = *pdst; + int rc = ERROR_INVALID_NAME; + if (unlikely(dst_wlen < 2 || dst_wlen > /* MAX_PATH */ INT16_MAX)) + goto bailout; -size_t mdbx_mb2w(wchar_t *dst, size_t dst_n, const char *src, size_t src_n) { - return MultiByteToWideChar(CP_THREAD_ACP, MB_ERR_INVALID_CHARS, src, - (int)src_n, dst, (int)dst_n); -} + dst = osal_realloc(dst, dst_wlen * sizeof(wchar_t)); + rc = MDBX_ENOMEM; + if (unlikely(!dst)) + goto bailout; -size_t mdbx_w2mb(char *dst, size_t dst_n, const wchar_t *src, size_t src_n) { - return WideCharToMultiByte(CP_THREAD_ACP, WC_ERR_INVALID_CHARS, src, - (int)src_n, dst, (int)dst_n, nullptr, nullptr); + *pdst = dst; + if (likely(dst_wlen == (size_t)MultiByteToWideChar(CP_THREAD_ACP, + MB_ERR_INVALID_CHARS, src, + -1, dst, (int)dst_wlen))) + return MDBX_SUCCESS; + + rc = ERROR_INVALID_NAME; +bailout: + if (*pdst) { + osal_free(*pdst); + *pdst = nullptr; + } + return rc; } #endif /* Windows */ /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); - return DeleteFileW(pathnameW) ? 
MDBX_SUCCESS : (int)GetLastError(); +#define ior_alignment_mask (ior->pagesize - 1) +#define ior_WriteFile_flag 1 +#define OSAL_IOV_MAX (4096 / sizeof(ior_sgv_element)) + +static void ior_put_event(osal_ioring_t *ior, HANDLE event) { + assert(event && event != INVALID_HANDLE_VALUE && event != ior); + assert(ior->event_stack < ior->allocated); + ior->event_pool[ior->event_stack] = event; + ior->event_stack += 1; +} + +static HANDLE ior_get_event(osal_ioring_t *ior) { + assert(ior->event_stack <= ior->allocated); + if (ior->event_stack > 0) { + ior->event_stack -= 1; + assert(ior->event_pool[ior->event_stack] != 0); + return ior->event_pool[ior->event_stack]; + } + return CreateEventW(nullptr, true, false, nullptr); +} + +static void WINAPI ior_wocr(DWORD err, DWORD bytes, OVERLAPPED *ov) { + osal_ioring_t *ior = ov->hEvent; + ov->Internal = err; + ov->InternalHigh = bytes; + if (++ior->async_completed >= ior->async_waiting) + SetEvent(ior->async_done); +} + +#elif MDBX_HAVE_PWRITEV +#if defined(_SC_IOV_MAX) +static size_t osal_iov_max; +#define OSAL_IOV_MAX osal_iov_max +#else +#define OSAL_IOV_MAX IOV_MAX +#endif +#else +#undef OSAL_IOV_MAX +#endif /* OSAL_IOV_MAX */ + +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t *ior +#if defined(_WIN32) || defined(_WIN64) + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd +#endif /* Windows */ +) { + memset(ior, 0, sizeof(osal_ioring_t)); + +#if defined(_WIN32) || defined(_WIN64) + ior->overlapped_fd = overlapped_fd; + ior->direct = enable_direct && overlapped_fd; + const unsigned pagesize = (unsigned)osal_syspagesize(); + ior->pagesize = pagesize; + ior->pagesize_ln2 = (uint8_t)log2n_powerof2(pagesize); + ior->async_done = ior_get_event(ior); + if (!ior->async_done) + return GetLastError(); +#endif /* !Windows */ + +#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) + assert(osal_iov_max > 0); +#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ + + ior->boundary = ptr_disp(ior->pool, ior->allocated); + return 
MDBX_SUCCESS; +} + +static __inline size_t ior_offset(const ior_item_t *item) { +#if defined(_WIN32) || defined(_WIN64) + return item->ov.Offset | (size_t)((sizeof(size_t) > sizeof(item->ov.Offset)) + ? (uint64_t)item->ov.OffsetHigh << 32 + : 0); +#else + return item->offset; +#endif /* !Windows */ +} + +static __inline ior_item_t *ior_next(ior_item_t *item, size_t sgvcnt) { +#if defined(ior_sgv_element) + assert(sgvcnt > 0); + return (ior_item_t *)ptr_disp(item, sizeof(ior_item_t) - + sizeof(ior_sgv_element) + + sizeof(ior_sgv_element) * sgvcnt); +#else + assert(sgvcnt == 1); + (void)sgvcnt; + return item + 1; +#endif +} + +MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ior, const size_t offset, + void *data, const size_t bytes) { + assert(bytes && data); + assert(bytes % MIN_PAGESIZE == 0 && bytes <= MAX_WRITE); + assert(offset % MIN_PAGESIZE == 0 && offset + (uint64_t)bytes <= MAX_MAPSIZE); + +#if defined(_WIN32) || defined(_WIN64) + const unsigned segments = (unsigned)(bytes >> ior->pagesize_ln2); + const bool use_gather = + ior->direct && ior->overlapped_fd && ior->slots_left >= segments; +#endif /* Windows */ + + ior_item_t *item = ior->pool; + if (likely(ior->last)) { + item = ior->last; + if (unlikely(ior_offset(item) + ior_last_bytes(ior, item) == offset) && + likely(ior_last_bytes(ior, item) + bytes <= MAX_WRITE)) { +#if defined(_WIN32) || defined(_WIN64) + if (use_gather && + ((bytes | (uintptr_t)data | ior->last_bytes | + (uintptr_t)(uint64_t)item->sgv[0].Buffer) & + ior_alignment_mask) == 0 && + ior->last_sgvcnt + (size_t)segments < OSAL_IOV_MAX) { + assert(ior->overlapped_fd); + assert((item->single.iov_len & ior_WriteFile_flag) == 0); + assert(item->sgv[ior->last_sgvcnt].Buffer == 0); + ior->last_bytes += bytes; + size_t i = 0; + do { + item->sgv[ior->last_sgvcnt + i].Buffer = PtrToPtr64(data); + data = ptr_disp(data, ior->pagesize); + } while (++i < segments); + ior->slots_left -= segments; + item->sgv[ior->last_sgvcnt += segments].Buffer = 
0; + assert((item->single.iov_len & ior_WriteFile_flag) == 0); + return MDBX_SUCCESS; + } + const void *end = ptr_disp(item->single.iov_base, + item->single.iov_len - ior_WriteFile_flag); + if (unlikely(end == data)) { + assert((item->single.iov_len & ior_WriteFile_flag) != 0); + item->single.iov_len += bytes; + return MDBX_SUCCESS; + } +#elif MDBX_HAVE_PWRITEV + assert((int)item->sgvcnt > 0); + const void *end = ptr_disp(item->sgv[item->sgvcnt - 1].iov_base, + item->sgv[item->sgvcnt - 1].iov_len); + if (unlikely(end == data)) { + item->sgv[item->sgvcnt - 1].iov_len += bytes; + ior->last_bytes += bytes; + return MDBX_SUCCESS; + } + if (likely(item->sgvcnt < OSAL_IOV_MAX)) { + if (unlikely(ior->slots_left < 1)) + return MDBX_RESULT_TRUE; + item->sgv[item->sgvcnt].iov_base = data; + item->sgv[item->sgvcnt].iov_len = bytes; + ior->last_bytes += bytes; + item->sgvcnt += 1; + ior->slots_left -= 1; + return MDBX_SUCCESS; + } +#else + const void *end = ptr_disp(item->single.iov_base, item->single.iov_len); + if (unlikely(end == data)) { + item->single.iov_len += bytes; + return MDBX_SUCCESS; + } +#endif + } + item = ior_next(item, ior_last_sgvcnt(ior, item)); + } + + if (unlikely(ior->slots_left < 1)) + return MDBX_RESULT_TRUE; + + unsigned slots_used = 1; +#if defined(_WIN32) || defined(_WIN64) + item->ov.Internal = item->ov.InternalHigh = 0; + item->ov.Offset = (DWORD)offset; + item->ov.OffsetHigh = HIGH_DWORD(offset); + item->ov.hEvent = 0; + if (!use_gather || ((bytes | (uintptr_t)(data)) & ior_alignment_mask) != 0 || + segments > OSAL_IOV_MAX) { + /* WriteFile() */ + item->single.iov_base = data; + item->single.iov_len = bytes + ior_WriteFile_flag; + assert((item->single.iov_len & ior_WriteFile_flag) != 0); + } else { + /* WriteFileGather() */ + assert(ior->overlapped_fd); + item->sgv[0].Buffer = PtrToPtr64(data); + for (size_t i = 1; i < segments; ++i) { + data = ptr_disp(data, ior->pagesize); + item->sgv[slots_used].Buffer = PtrToPtr64(data); + } + 
item->sgv[slots_used].Buffer = 0; + assert((item->single.iov_len & ior_WriteFile_flag) == 0); + slots_used = segments; + } + ior->last_bytes = bytes; + ior_last_sgvcnt(ior, item) = slots_used; +#elif MDBX_HAVE_PWRITEV + item->offset = offset; + item->sgv[0].iov_base = data; + item->sgv[0].iov_len = bytes; + ior->last_bytes = bytes; + ior_last_sgvcnt(ior, item) = slots_used; +#else + item->offset = offset; + item->single.iov_base = data; + item->single.iov_len = bytes; +#endif /* !Windows */ + ior->slots_left -= slots_used; + ior->last = item; + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC void osal_ioring_walk( + osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)) { + for (ior_item_t *item = ior->pool; item <= ior->last;) { +#if defined(_WIN32) || defined(_WIN64) + size_t offset = ior_offset(item); + char *data = item->single.iov_base; + size_t bytes = item->single.iov_len - ior_WriteFile_flag; + size_t i = 1; + if (bytes & ior_WriteFile_flag) { + data = Ptr64ToPtr(item->sgv[0].Buffer); + bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) { + if (data + ior->pagesize != item->sgv[i].Buffer) { + callback(ctx, offset, data, bytes); + offset += bytes; + data = Ptr64ToPtr(item->sgv[i].Buffer); + bytes = 0; + } + bytes += ior->pagesize; + ++i; + } + } + assert(bytes < MAX_WRITE); + callback(ctx, offset, data, bytes); +#elif MDBX_HAVE_PWRITEV + assert(item->sgvcnt > 0); + size_t offset = item->offset; + size_t i = 0; + do { + callback(ctx, offset, item->sgv[i].iov_base, item->sgv[i].iov_len); + offset += item->sgv[i].iov_len; + } while (++i != item->sgvcnt); +#else + const size_t i = 1; + callback(ctx, item->offset, item->single.iov_base, item->single.iov_len); +#endif + item = ior_next(item, i); + } +} + +MDBX_INTERNAL_FUNC osal_ioring_write_result_t +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd) { + 
osal_ioring_write_result_t r = {MDBX_SUCCESS, 0}; + +#if defined(_WIN32) || defined(_WIN64) + HANDLE *const end_wait_for = + ior->event_pool + ior->allocated + + /* был выделен один дополнительный элемент для async_done */ 1; + HANDLE *wait_for = end_wait_for; + LONG async_started = 0; + for (ior_item_t *item = ior->pool; item <= ior->last;) { + item->ov.Internal = STATUS_PENDING; + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; + r.wops += 1; + if (bytes & ior_WriteFile_flag) { + assert(ior->overlapped_fd && fd == ior->overlapped_fd); + bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) { + bytes += ior->pagesize; + ++i; + } + assert(bytes < MAX_WRITE); + item->ov.hEvent = ior_get_event(ior); + if (unlikely(!item->ov.hEvent)) { + bailout_geterr: + r.err = GetLastError(); + bailout_rc: + assert(r.err != MDBX_SUCCESS); + CancelIo(fd); + return r; + } + if (WriteFileGather(fd, item->sgv, (DWORD)bytes, nullptr, &item->ov)) { + assert(item->ov.Internal == 0 && + WaitForSingleObject(item->ov.hEvent, 0) == WAIT_OBJECT_0); + ior_put_event(ior, item->ov.hEvent); + item->ov.hEvent = 0; + } else { + r.err = (int)GetLastError(); + if (unlikely(r.err != ERROR_IO_PENDING)) { + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileGather", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); + goto bailout_rc; + } + assert(wait_for > ior->event_pool + ior->event_stack); + *--wait_for = item->ov.hEvent; + } + } else if (fd == ior->overlapped_fd) { + assert(bytes < MAX_WRITE); + retry: + item->ov.hEvent = ior; + if (WriteFileEx(fd, item->single.iov_base, (DWORD)bytes, &item->ov, + ior_wocr)) { + async_started += 1; + } else { + r.err = (int)GetLastError(); + switch (r.err) { + default: + ERROR("%s: fd 
%p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); + goto bailout_rc; + case ERROR_NOT_FOUND: + case ERROR_USER_MAPPED_FILE: + case ERROR_LOCK_VIOLATION: + WARNING( + "%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFileEx", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); + SleepEx(0, true); + goto retry; + case ERROR_INVALID_USER_BUFFER: + case ERROR_NOT_ENOUGH_MEMORY: + if (SleepEx(0, true) == WAIT_IO_COMPLETION) + goto retry; + goto bailout_rc; + case ERROR_IO_PENDING: + async_started += 1; + } + } + } else { + assert(bytes < MAX_WRITE); + DWORD written = 0; + if (!WriteFile(fd, item->single.iov_base, (DWORD)bytes, &written, + &item->ov)) { + r.err = (int)GetLastError(); + ERROR("%s: fd %p, item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "WriteFile", fd, __Wpedantic_format_voidptr(item), + item - ior->pool, ((MDBX_page *)item->single.iov_base)->mp_pgno, + bytes, item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + r.err); + goto bailout_rc; + } else if (unlikely(written != bytes)) { + r.err = ERROR_WRITE_FAULT; + goto bailout_rc; + } + } + item = ior_next(item, i); + } + + assert(ior->async_waiting > ior->async_completed && + ior->async_waiting == INT_MAX); + ior->async_waiting = async_started; + if (async_started > ior->async_completed && end_wait_for == wait_for) { + assert(wait_for > ior->event_pool + ior->event_stack); + *--wait_for = ior->async_done; + } + + const size_t pending_count = end_wait_for - wait_for; + if (pending_count) { + /* Ждем до MAXIMUM_WAIT_OBJECTS (64) последних хендлов, а после избирательно + * ждем посредством 
GetOverlappedResult(), если какие-то более ранние + * элементы еще не завершены. В целом, так получается меньше системных + * вызовов, т.е. меньше накладных расходов. Однако, не факт что эта экономия + * не будет перекрыта неэффективностью реализации + * WaitForMultipleObjectsEx(), но тогда это проблемы на стороне M$. */ + DWORD madness; + do + madness = WaitForMultipleObjectsEx((pending_count < MAXIMUM_WAIT_OBJECTS) + ? (DWORD)pending_count + : MAXIMUM_WAIT_OBJECTS, + wait_for, true, + /* сутки */ 86400000ul, true); + while (madness == WAIT_IO_COMPLETION); + STATIC_ASSERT(WAIT_OBJECT_0 == 0); + if (/* madness >= WAIT_OBJECT_0 && */ + madness < WAIT_OBJECT_0 + MAXIMUM_WAIT_OBJECTS) + r.err = MDBX_SUCCESS; + else if (madness >= WAIT_ABANDONED_0 && + madness < WAIT_ABANDONED_0 + MAXIMUM_WAIT_OBJECTS) { + r.err = ERROR_ABANDONED_WAIT_0; + goto bailout_rc; + } else if (madness == WAIT_TIMEOUT) { + r.err = WAIT_TIMEOUT; + goto bailout_rc; + } else { + r.err = /* madness == WAIT_FAILED */ MDBX_PROBLEM; + goto bailout_rc; + } + + assert(ior->async_waiting == ior->async_completed); + for (ior_item_t *item = ior->pool; item <= ior->last;) { + size_t i = 1, bytes = item->single.iov_len - ior_WriteFile_flag; + if (bytes & ior_WriteFile_flag) { + bytes = ior->pagesize; + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) { + bytes += ior->pagesize; + ++i; + } + if (!HasOverlappedIoCompleted(&item->ov)) { + DWORD written = 0; + if (unlikely(!GetOverlappedResult(fd, &item->ov, &written, true))) { + ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "GetOverlappedResult", __Wpedantic_format_voidptr(item), + item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + (int)GetLastError()); + goto bailout_geterr; + } + assert(MDBX_SUCCESS == item->ov.Internal); + assert(written == item->ov.InternalHigh); + } + } 
else { + assert(HasOverlappedIoCompleted(&item->ov)); + } + assert(item->ov.Internal != ERROR_IO_PENDING); + if (unlikely(item->ov.Internal != MDBX_SUCCESS)) { + DWORD written = 0; + r.err = (int)item->ov.Internal; + if ((r.err & 0x80000000) && + GetOverlappedResult(NULL, &item->ov, &written, true)) + r.err = (int)GetLastError(); + ERROR("%s: item %p (%zu), pgno %u, bytes %zu, offset %" PRId64 + ", err %d", + "Result", __Wpedantic_format_voidptr(item), item - ior->pool, + ((MDBX_page *)item->single.iov_base)->mp_pgno, bytes, + item->ov.Offset + ((uint64_t)item->ov.OffsetHigh << 32), + (int)GetLastError()); + goto bailout_rc; + } + if (unlikely(item->ov.InternalHigh != bytes)) { + r.err = ERROR_WRITE_FAULT; + goto bailout_rc; + } + item = ior_next(item, i); + } + assert(ior->async_waiting == ior->async_completed); + } else { + assert(r.err == MDBX_SUCCESS); + } + assert(ior->async_waiting == ior->async_completed); + +#else + STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), + "libmdbx requires 64-bit file I/O on 64-bit systems"); + for (ior_item_t *item = ior->pool; item <= ior->last;) { +#if MDBX_HAVE_PWRITEV + assert(item->sgvcnt > 0); + if (item->sgvcnt == 1) + r.err = osal_pwrite(fd, item->sgv[0].iov_base, item->sgv[0].iov_len, + item->offset); + else + r.err = osal_pwritev(fd, item->sgv, item->sgvcnt, item->offset); + + // TODO: io_uring_prep_write(sqe, fd, ...); + + item = ior_next(item, item->sgvcnt); +#else + r.err = osal_pwrite(fd, item->single.iov_base, item->single.iov_len, + item->offset); + item = ior_next(item, 1); +#endif + r.wops += 1; + if (unlikely(r.err != MDBX_SUCCESS)) + break; + } + + // TODO: io_uring_submit(&ring) + // TODO: err = io_uring_wait_cqe(&ring, &cqe); + // TODO: io_uring_cqe_seen(&ring, cqe); + +#endif /* !Windows */ + return r; +} + +MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *ior) { +#if defined(_WIN32) || defined(_WIN64) + if (ior->last) { + for (ior_item_t *item = ior->pool; item <= ior->last;) { + if 
(!HasOverlappedIoCompleted(&item->ov)) { + assert(ior->overlapped_fd); + CancelIoEx(ior->overlapped_fd, &item->ov); + } + if (item->ov.hEvent && item->ov.hEvent != ior) + ior_put_event(ior, item->ov.hEvent); + size_t i = 1; + if ((item->single.iov_len & ior_WriteFile_flag) == 0) { + /* Zap: Reading invalid data from 'item->sgv' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6385); + while (item->sgv[i].Buffer) + ++i; + } + item = ior_next(item, i); + } + } + ior->async_waiting = INT_MAX; + ior->async_completed = 0; + ResetEvent(ior->async_done); +#endif /* !Windows */ + ior->slots_left = ior->allocated; + ior->last = nullptr; +} + +static void ior_cleanup(osal_ioring_t *ior, const size_t since) { + osal_ioring_reset(ior); +#if defined(_WIN32) || defined(_WIN64) + for (size_t i = since; i < ior->event_stack; ++i) { + /* Zap: Using uninitialized memory '**ior.event_pool' */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6001); + CloseHandle(ior->event_pool[i]); + } + ior->event_stack = 0; +#else + (void)since; +#endif /* Windows */ +} + +MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *ior, size_t items) { + assert(items > 0 && items < INT_MAX / sizeof(ior_item_t)); +#if defined(_WIN32) || defined(_WIN64) + if (ior->state & IOR_STATE_LOCKED) + return MDBX_SUCCESS; + const bool useSetFileIoOverlappedRange = + ior->overlapped_fd && mdbx_SetFileIoOverlappedRange && items > 42; + const size_t ceiling = + useSetFileIoOverlappedRange + ? ((items < 65536 / 2 / sizeof(ior_item_t)) ? 
65536 : 65536 * 4) + : 1024; + const size_t bytes = ceil_powerof2(sizeof(ior_item_t) * items, ceiling); + items = bytes / sizeof(ior_item_t); +#endif /* Windows */ + + if (items != ior->allocated) { + assert(items >= osal_ioring_used(ior)); + if (items < ior->allocated) + ior_cleanup(ior, items); +#if defined(_WIN32) || defined(_WIN64) + void *ptr = osal_realloc( + ior->event_pool, + (items + /* extra for waiting the async_done */ 1) * sizeof(HANDLE)); + if (unlikely(!ptr)) + return MDBX_ENOMEM; + ior->event_pool = ptr; + + int err = osal_memalign_alloc(ceiling, bytes, &ptr); + if (unlikely(err != MDBX_SUCCESS)) + return err; + if (ior->pool) { + memcpy(ptr, ior->pool, ior->allocated * sizeof(ior_item_t)); + osal_memalign_free(ior->pool); + } +#else + void *ptr = osal_realloc(ior->pool, sizeof(ior_item_t) * items); + if (unlikely(!ptr)) + return MDBX_ENOMEM; +#endif + ior->pool = ptr; + + if (items > ior->allocated) + memset(ior->pool + ior->allocated, 0, + sizeof(ior_item_t) * (items - ior->allocated)); + ior->allocated = (unsigned)items; + ior->boundary = ptr_disp(ior->pool, ior->allocated); +#if defined(_WIN32) || defined(_WIN64) + if (useSetFileIoOverlappedRange) { + if (mdbx_SetFileIoOverlappedRange(ior->overlapped_fd, ptr, (ULONG)bytes)) + ior->state += IOR_STATE_LOCKED; + else + return GetLastError(); + } +#endif /* Windows */ + } + return MDBX_SUCCESS; +} + +MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *ior) { + if (ior->allocated) + ior_cleanup(ior, 0); +#if defined(_WIN32) || defined(_WIN64) + osal_memalign_free(ior->pool); + osal_free(ior->event_pool); + CloseHandle(ior->async_done); + if (ior->overlapped_fd) + CloseHandle(ior->overlapped_fd); +#else + osal_free(ior->pool); +#endif + memset(ior, 0, sizeof(osal_ioring_t)); +} + +/*----------------------------------------------------------------------------*/ + +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + return 
DeleteFileW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else return unlink(pathname) ? errno : MDBX_SUCCESS; #endif @@ -555,26 +1206,66 @@ MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname) { static bool is_valid_fd(int fd) { return !(isatty(fd) < 0 && errno == EBADF); } #endif /*! Windows */ -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname) { +MDBX_INTERNAL_FUNC int osal_removedirectory(const pathchar_t *pathname) { #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); - return RemoveDirectoryW(pathnameW) ? MDBX_SUCCESS : (int)GetLastError(); + return RemoveDirectoryW(pathname) ? MDBX_SUCCESS : (int)GetLastError(); #else return rmdir(pathname) ? errno : MDBX_SUCCESS; #endif } -MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname) { +#if defined(_WIN32) || defined(_WIN64) + if (GetFileAttributesW(pathname) != INVALID_FILE_ATTRIBUTES) + return MDBX_RESULT_TRUE; + int err = GetLastError(); + return (err == ERROR_FILE_NOT_FOUND || err == ERROR_PATH_NOT_FOUND) + ? MDBX_RESULT_FALSE + : err; +#else + if (access(pathname, F_OK) == 0) + return MDBX_RESULT_TRUE; + int err = errno; + return (err == ENOENT || err == ENOTDIR) ? MDBX_RESULT_FALSE : err; +#endif +} + +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len) { + const pathchar_t *ext = nullptr; + for (size_t i = 0; i < len && pathname[i]; i++) + if (pathname[i] == '.') + ext = pathname + i; + else if (osal_isdirsep(pathname[i])) + ext = nullptr; + return (pathchar_t *)ext; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len) { +#if defined(_WIN32) || defined(_WIN64) + for (size_t i = 0; i < len; ++i) { + pathchar_t a = l[i]; + pathchar_t b = r[i]; + a = (a == '\\') ? '/' : a; + b = (b == '\\') ? 
'/' : b; + if (a != b) + return false; + } + return true; +#else + return memcmp(l, r, len * sizeof(pathchar_t)) == 0; +#endif +} + +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits) { *fd = INVALID_HANDLE_VALUE; #if defined(_WIN32) || defined(_WIN64) - const wchar_t *pathnameW = nullptr; - MUSTDIE_MB2WIDE(pathname, pathnameW); - DWORD CreationDisposition = unix_mode_bits ? OPEN_ALWAYS : OPEN_EXISTING; DWORD FlagsAndAttributes = FILE_FLAG_POSIX_SEMANTICS | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED; @@ -599,17 +1290,25 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, case MDBX_OPEN_DXB_LAZY: DesiredAccess |= GENERIC_READ | GENERIC_WRITE; break; + case MDBX_OPEN_DXB_OVERLAPPED_DIRECT: + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; + /* fall through */ + __fallthrough; + case MDBX_OPEN_DXB_OVERLAPPED: + FlagsAndAttributes |= FILE_FLAG_OVERLAPPED; + /* fall through */ + __fallthrough; case MDBX_OPEN_DXB_DSYNC: CreationDisposition = OPEN_EXISTING; - DesiredAccess |= GENERIC_WRITE; + DesiredAccess |= GENERIC_WRITE | GENERIC_READ; FlagsAndAttributes |= FILE_FLAG_WRITE_THROUGH; break; case MDBX_OPEN_COPY: CreationDisposition = CREATE_NEW; ShareMode = 0; DesiredAccess |= GENERIC_WRITE; - FlagsAndAttributes |= - (env->me_psize < env->me_os_psize) ? 
0 : FILE_FLAG_NO_BUFFERING; + if (env->me_psize >= env->me_os_psize) + FlagsAndAttributes |= FILE_FLAG_NO_BUFFERING; break; case MDBX_OPEN_DELETE: CreationDisposition = OPEN_EXISTING; @@ -619,12 +1318,12 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, break; } - *fd = CreateFileW(pathnameW, DesiredAccess, ShareMode, NULL, + *fd = CreateFileW(pathname, DesiredAccess, ShareMode, NULL, CreationDisposition, FlagsAndAttributes, NULL); if (*fd == INVALID_HANDLE_VALUE) { int err = (int)GetLastError(); if (err == ERROR_ACCESS_DENIED && purpose == MDBX_OPEN_LCK) { - if (GetFileAttributesW(pathnameW) == INVALID_FILE_ATTRIBUTES && + if (GetFileAttributesW(pathname) == INVALID_FILE_ATTRIBUTES && GetLastError() == ERROR_FILE_NOT_FOUND) err = ERROR_FILE_NOT_FOUND; } @@ -643,7 +1342,7 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, (FILE_ATTRIBUTE_HIDDEN | FILE_ATTRIBUTE_NOT_CONTENT_INDEXED | FILE_ATTRIBUTE_TEMPORARY | FILE_ATTRIBUTE_COMPRESSED); if (AttributesDiff) - (void)SetFileAttributesW(pathnameW, info.dwFileAttributes ^ AttributesDiff); + (void)SetFileAttributesW(pathname, info.dwFileAttributes ^ AttributesDiff); #else int flags = unix_mode_bits ? 
O_CREAT : 0; @@ -697,18 +1396,18 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, int stub_fd0 = -1, stub_fd1 = -1, stub_fd2 = -1; static const char dev_null[] = "/dev/null"; if (!is_valid_fd(STDIN_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", - STDIN_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "IN", + STDIN_FILENO, dev_null); stub_fd0 = open(dev_null, O_RDONLY | O_NOCTTY); } if (!is_valid_fd(STDOUT_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", - "OUT", STDOUT_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "OUT", + STDOUT_FILENO, dev_null); stub_fd1 = open(dev_null, O_WRONLY | O_NOCTTY); } if (!is_valid_fd(STDERR_FILENO)) { - mdbx_warning("STD%s_FILENO/%d is invalid, open %s for temporary stub", - "ERR", STDERR_FILENO, dev_null); + WARNING("STD%s_FILENO/%d is invalid, open %s for temporary stub", "ERR", + STDERR_FILENO, dev_null); stub_fd2 = open(dev_null, O_WRONLY | O_NOCTTY); } #else @@ -733,20 +1432,20 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, /* Safeguard for https://libmdbx.dqdkfa.ru/dead-github/issues/144 */ #if STDIN_FILENO == 0 && STDOUT_FILENO == 1 && STDERR_FILENO == 2 if (*fd == STDIN_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", - STDIN_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "IN", + STDIN_FILENO); assert(stub_fd0 == -1); *fd = dup(stub_fd0 = *fd); } if (*fd == STDOUT_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", - STDOUT_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "OUT", + STDOUT_FILENO); assert(stub_fd1 == -1); *fd = dup(stub_fd1 = *fd); } if (*fd == STDERR_FILENO) { - mdbx_warning("Got STD%s_FILENO/%d, avoid using it by dup(fd)", "ERR", - STDERR_FILENO); + WARNING("Got STD%s_FILENO/%d, avoid 
using it by dup(fd)", "ERR", + STDERR_FILENO); assert(stub_fd2 == -1); *fd = dup(stub_fd2 = *fd); } @@ -757,10 +1456,9 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, if (stub_fd2 != -1) close(stub_fd2); if (*fd >= STDIN_FILENO && *fd <= STDERR_FILENO) { - mdbx_error( - "Rejecting the use of a FD in the range " - "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", - STDIN_FILENO, STDERR_FILENO); + ERROR("Rejecting the use of a FD in the range " + "STDIN_FILENO/%d..STDERR_FILENO/%d to prevent database corruption", + STDIN_FILENO, STDERR_FILENO); close(*fd); return EBADF; } @@ -787,7 +1485,7 @@ MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) return CloseHandle(fd) ? MDBX_SUCCESS : (int)GetLastError(); #else @@ -796,7 +1494,7 @@ MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd) { #endif } -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, uint64_t offset) { if (bytes > MAX_WRITE) return MDBX_EINVAL; @@ -823,7 +1521,7 @@ MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t bytes, return (bytes == (size_t)read) ? 
MDBX_SUCCESS : MDBX_ENODATA; } -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t bytes, uint64_t offset) { while (true) { #if defined(_WIN32) || defined(_WIN64) @@ -855,11 +1553,11 @@ MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, #endif bytes -= written; offset += written; - buf = (char *)buf + written; + buf = ptr_disp(buf, written); } } -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t bytes) { while (true) { #if defined(_WIN32) || defined(_WIN64) @@ -885,32 +1583,34 @@ MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, } #endif bytes -= written; - buf = (char *)buf + written; + buf = ptr_disp(buf, written); } } -int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, - uint64_t offset, size_t expected_written) { -#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || \ - (defined(__ANDROID_API__) && __ANDROID_API__ < 24) +int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, size_t sgvcnt, + uint64_t offset) { + size_t expected = 0; + for (size_t i = 0; i < sgvcnt; ++i) + expected += iov[i].iov_len; +#if !MDBX_HAVE_PWRITEV size_t written = 0; - for (int i = 0; i < iovcnt; ++i) { - int rc = mdbx_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); + for (size_t i = 0; i < sgvcnt; ++i) { + int rc = osal_pwrite(fd, iov[i].iov_base, iov[i].iov_len, offset); if (unlikely(rc != MDBX_SUCCESS)) return rc; written += iov[i].iov_len; offset += iov[i].iov_len; } - return (expected_written == written) ? MDBX_SUCCESS - : MDBX_EIO /* ERROR_WRITE_FAULT */; + return (expected == written) ? 
MDBX_SUCCESS + : MDBX_EIO /* ERROR_WRITE_FAULT */; #else int rc; intptr_t written; do { STATIC_ASSERT_MSG(sizeof(off_t) >= sizeof(size_t), "libmdbx requires 64-bit file I/O on 64-bit systems"); - written = pwritev(fd, iov, iovcnt, offset); - if (likely(expected_written == (size_t)written)) + written = pwritev(fd, iov, sgvcnt, offset); + if (likely(expected == (size_t)written)) return MDBX_SUCCESS; rc = errno; } while (rc == EINTR); @@ -918,8 +1618,8 @@ int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, int iovcnt, #endif } -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - enum mdbx_syncmode_bits mode_bits) { +MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + enum osal_syncmode_bits mode_bits) { #if defined(_WIN32) || defined(_WIN64) if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && !FlushFileBuffers(fd)) return (int)GetLastError(); @@ -940,21 +1640,21 @@ MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, while (1) { switch (mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_SIZE)) { case MDBX_SYNC_NONE: + case MDBX_SYNC_KICK: return MDBX_SUCCESS /* nothing to do */; #if defined(_POSIX_SYNCHRONIZED_IO) && _POSIX_SYNCHRONIZED_IO > 0 case MDBX_SYNC_DATA: - if (fdatasync(fd) == 0) + if (likely(fdatasync(fd) == 0)) return MDBX_SUCCESS; break /* error */; #if defined(__linux__) || defined(__gnu_linux__) case MDBX_SYNC_SIZE: - if (mdbx_linux_kernel_version >= 0x03060000) - return MDBX_SUCCESS; - __fallthrough /* fall through */; + assert(linux_kernel_version >= 0x03060000); + return MDBX_SUCCESS; #endif /* Linux */ #endif /* _POSIX_SYNCHRONIZED_IO > 0 */ default: - if (fsync(fd) == 0) + if (likely(fsync(fd) == 0)) return MDBX_SUCCESS; } @@ -965,7 +1665,7 @@ MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, #endif } -int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { +int osal_filesize(mdbx_filehandle_t fd, uint64_t *length) { #if defined(_WIN32) || defined(_WIN64) BY_HANDLE_FILE_INFORMATION info; if (!GetFileInformationByHandle(fd, 
&info)) @@ -984,7 +1684,7 @@ int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd) { #if defined(_WIN32) || defined(_WIN64) switch (GetFileType(fd)) { case FILE_TYPE_DISK: @@ -1015,7 +1715,7 @@ MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd) { #endif } -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_SetFileInformationByHandle) { FILE_END_OF_FILE_INFO EndOfFileInfo; @@ -1039,7 +1739,7 @@ MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length) { #endif } -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER li; li.QuadPart = pos; @@ -1055,7 +1755,7 @@ MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos) { /*----------------------------------------------------------------------------*/ MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg) { #if defined(_WIN32) || defined(_WIN64) @@ -1066,7 +1766,7 @@ mdbx_thread_create(mdbx_thread_t *thread, #endif } -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread) { #if defined(_WIN32) || defined(_WIN64) DWORD code = WaitForSingleObject(thread, INFINITE); return waitstatus2errcode(code); @@ -1078,29 +1778,43 @@ MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread) { /*----------------------------------------------------------------------------*/ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int 
osal_msync(const osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits) { - uint8_t *ptr = (uint8_t *)map->address + offset; + enum osal_syncmode_bits mode_bits) { + if (!MDBX_MMAP_USE_MS_ASYNC && mode_bits == MDBX_SYNC_NONE) + return MDBX_SUCCESS; + + void *ptr = ptr_disp(map->base, offset); #if defined(_WIN32) || defined(_WIN64) if (!FlushViewOfFile(ptr, length)) return (int)GetLastError(); + if ((mode_bits & (MDBX_SYNC_DATA | MDBX_SYNC_IODQ)) && + !FlushFileBuffers(map->fd)) + return (int)GetLastError(); #else #if defined(__linux__) || defined(__gnu_linux__) - if (mode_bits == MDBX_SYNC_NONE && mdbx_linux_kernel_version > 0x02061300) - /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly - * tracks dirty pages and flushes them to storage as necessary. */ - return MDBX_SUCCESS; + /* Since Linux 2.6.19, MS_ASYNC is in fact a no-op. The kernel properly + * tracks dirty pages and flushes ones as necessary. */ + // + // However, this behavior may be changed in custom kernels, + // so just leave such optimization to the libc discretion. + // NOTE: The MDBX_MMAP_USE_MS_ASYNC must be defined to 1 for such cases. + // + // assert(linux_kernel_version > 0x02061300); + // if (mode_bits <= MDBX_SYNC_KICK) + // return MDBX_SUCCESS; #endif /* Linux */ if (msync(ptr, length, (mode_bits & MDBX_SYNC_DATA) ? 
MS_SYNC : MS_ASYNC)) return errno; - mode_bits &= ~MDBX_SYNC_DATA; + if ((mode_bits & MDBX_SYNC_SIZE) && fsync(map->fd)) + return errno; #endif - return mdbx_fsync(map->fd, mode_bits); + return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err) { +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err) { #if defined(_WIN32) || defined(_WIN64) (void)pathname; (void)err; @@ -1128,7 +1842,51 @@ MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, return MDBX_SUCCESS; } -static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle) { +#if defined(_WIN32) || defined(_WIN64) + (void)handle; +#else + struct statfs statfs_info; + if (fstatfs(handle, &statfs_info)) + return errno; + +#if defined(__OpenBSD__) + const unsigned type = 0; +#else + const unsigned type = statfs_info.f_type; +#endif + switch (type) { + case 0x28cd3d45 /* CRAMFS_MAGIC */: + case 0x858458f6 /* RAMFS_MAGIC */: + case 0x01021994 /* TMPFS_MAGIC */: + case 0x73717368 /* SQUASHFS_MAGIC */: + case 0x7275 /* ROMFS_MAGIC */: + return MDBX_RESULT_TRUE; + } + +#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__BSD__) || defined(__bsdi__) || defined(__DragonFly__) || \ + defined(__APPLE__) || defined(__MACH__) || defined(MFSNAMELEN) || \ + defined(MFSTYPENAMELEN) || defined(VFS_NAMELEN) + const char *const name = statfs_info.f_fstypename; + const size_t name_len = sizeof(statfs_info.f_fstypename); +#else + const char *const name = ""; + const size_t name_len = 0; +#endif + if (name_len) { + if (strncasecmp("tmpfs", name, 6) == 0 || + strncasecmp("mfs", name, 4) == 0 || + strncasecmp("ramfs", name, 6) == 0 || + strncasecmp("romfs", name, 6) == 0) + return MDBX_RESULT_TRUE; + } +#endif /* !Windows */ + + return MDBX_RESULT_FALSE; +} + +static int 
osal_check_fs_local(mdbx_filehandle_t handle, int flags) { #if defined(_WIN32) || defined(_WIN64) if (mdbx_RunningUnderWine() && !(flags & MDBX_EXCLUSIVE)) return ERROR_NOT_CAPABLE /* workaround for Wine */; @@ -1175,7 +1933,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { } if (mdbx_GetVolumeInformationByHandleW && mdbx_GetFinalPathNameByHandleW) { - WCHAR *PathBuffer = mdbx_malloc(sizeof(WCHAR) * INT16_MAX); + WCHAR *PathBuffer = osal_malloc(sizeof(WCHAR) * INT16_MAX); if (!PathBuffer) return MDBX_ENOMEM; @@ -1243,7 +2001,7 @@ static int mdbx_check_fs_local(mdbx_filehandle_t handle, int flags) { } bailout: - mdbx_free(PathBuffer); + osal_free(PathBuffer); return rc; } @@ -1420,11 +2178,10 @@ static int check_mmap_limit(const size_t limit) { const int log2page = log2n_powerof2(pagesize); if ((limit >> (log2page + 7)) > (size_t)total_ram_pages || (limit >> (log2page + 6)) > (size_t)avail_ram_pages) { - mdbx_error( - "%s (%zu pages) is too large for available (%zu pages) or total " - "(%zu pages) system RAM", - "database upper size limit", limit >> log2page, avail_ram_pages, - total_ram_pages); + ERROR("%s (%zu pages) is too large for available (%zu pages) or total " + "(%zu pages) system RAM", + "database upper size limit", limit >> log2page, avail_ram_pages, + total_ram_pages); return MDBX_TOO_LARGE; } } @@ -1432,19 +2189,18 @@ static int check_mmap_limit(const size_t limit) { return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, - const size_t size, const size_t limit, - const unsigned options) { +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options) { assert(size <= limit); map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; map->filesize = 0; #if defined(_WIN32) || defined(_WIN64) map->section = NULL; #endif /* Windows */ - int err = mdbx_check_fs_local(map->fd, flags); + int err = 
osal_check_fs_local(map->fd, flags); if (unlikely(err != MDBX_SUCCESS)) return err; @@ -1453,7 +2209,8 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, return err; if ((flags & MDBX_RDONLY) == 0 && (options & MMAP_OPTION_TRUNCATE) != 0) { - err = mdbx_ftruncate(map->fd, size); + err = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, err); if (err != MDBX_SUCCESS) return err; map->filesize = size; @@ -1461,10 +2218,17 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, map->current = size; #endif /* !Windows */ } else { - err = mdbx_filesize(map->fd, &map->filesize); + err = osal_filesize(map->fd, &map->filesize); + VERBOSE("filesize %" PRIu64 ", err %d", map->filesize, err); if (err != MDBX_SUCCESS) return err; -#if !(defined(_WIN32) || defined(_WIN64)) +#if defined(_WIN32) || defined(_WIN64) + if (map->filesize < size) { + WARNING("file size (%zu) less than requested for mapping (%zu)", + (size_t)map->filesize, size); + size = (size_t)map->filesize; + } +#else map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; #endif /* !Windows */ } @@ -1490,7 +2254,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, : mdbx_RunningUnderWine() ? 
size : limit; err = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->address, + map->section, GetCurrentProcess(), &map->base, /* ZeroBits */ 0, /* CommitSize */ 0, /* SectionOffset */ NULL, &ViewSize, @@ -1501,10 +2265,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, if (!NT_SUCCESS(err)) { NtClose(map->section); map->section = 0; - map->address = nullptr; + map->base = nullptr; return ntstatus2errcode(err); } - assert(map->address != MAP_FAILED); + assert(map->base != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; @@ -1535,7 +2299,7 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, #define MAP_NORESERVE 0 #endif - map->address = mmap( + map->base = mmap( NULL, limit, (flags & MDBX_WRITEMAP) ? PROT_READ | PROT_WRITE : PROT_READ, MAP_SHARED | MAP_FILE | MAP_NORESERVE | (F_ISSET(flags, MDBX_UTTERLY_NOSYNC) ? MAP_NOSYNC : 0) | @@ -1543,10 +2307,10 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, : MAP_CONCEAL), map->fd, 0); - if (unlikely(map->address == MAP_FAILED)) { + if (unlikely(map->base == MAP_FAILED)) { map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; assert(errno != 0); return errno; } @@ -1554,38 +2318,37 @@ MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) return errno; #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE - (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ #endif /* MDBX_ENABLE_MADVISE */ #endif /* ! 
Windows */ - VALGRIND_MAKE_MEM_DEFINED(map->address, map->current); - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->current); + VALGRIND_MAKE_MEM_DEFINED(map->base, map->current); + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->current); return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, - (map->filesize && map->filesize < map->limit) - ? map->filesize - : map->limit); + MDBX_ASAN_UNPOISON_MEMORY_REGION( + map->base, (map->filesize && map->filesize < map->limit) ? map->filesize + : map->limit); #if defined(_WIN32) || defined(_WIN64) if (map->section) NtClose(map->section); - NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + NTSTATUS rc = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(rc)) ntstatus2errcode(rc); #else - if (unlikely(munmap(map->address, map->limit))) { + if (unlikely(munmap(map->base, map->limit))) { assert(errno != 0); return errno; } @@ -1593,31 +2356,44 @@ MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map) { map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; return MDBX_SUCCESS; } -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit) { + int rc = osal_filesize(map->fd, &map->filesize); + VERBOSE("flags 0x%x, size %zu, limit %zu, filesize %" PRIu64, flags, size, + limit, map->filesize); assert(size <= limit); + if (rc != MDBX_SUCCESS) { + map->filesize = 0; + return rc; + } + #if defined(_WIN32) 
|| defined(_WIN64) assert(size != map->current || limit != map->limit || size < map->filesize); NTSTATUS status; LARGE_INTEGER SectionSize; - int err, rc = MDBX_SUCCESS; + int err; - if (!(flags & MDBX_RDONLY) && limit == map->limit && size > map->current && - /* workaround for Wine */ mdbx_NtExtendSection) { - /* growth rw-section */ - SectionSize.QuadPart = size; - status = mdbx_NtExtendSection(map->section, &SectionSize); - if (!NT_SUCCESS(status)) - return ntstatus2errcode(status); - map->current = size; - if (map->filesize < size) - map->filesize = size; - return MDBX_SUCCESS; + if (limit == map->limit && size > map->current) { + if ((flags & MDBX_RDONLY) && map->filesize >= size) { + map->current = size; + return MDBX_SUCCESS; + } else if (!(flags & MDBX_RDONLY) && + /* workaround for Wine */ mdbx_NtExtendSection) { + /* growth rw-section */ + SectionSize.QuadPart = size; + status = mdbx_NtExtendSection(map->section, &SectionSize); + if (!NT_SUCCESS(status)) + return ntstatus2errcode(status); + map->current = size; + if (map->filesize < size) + map->filesize = size; + return MDBX_SUCCESS; + } } if (limit > map->limit) { @@ -1626,7 +2402,7 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, return err; /* check ability of address space for growth before unmap */ - PVOID BaseAddress = (PBYTE)map->address + map->limit; + PVOID BaseAddress = (PBYTE)map->base + map->limit; SIZE_T RegionSize = limit - map->limit; status = NtAllocateVirtualMemory(GetCurrentProcess(), &BaseAddress, 0, &RegionSize, MEM_RESERVE, PAGE_NOACCESS); @@ -1646,14 +2422,17 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, * - change size of mapped view; * - extend read-only mapping; * Therefore we should unmap/map entire section. 
*/ - if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) + if ((flags & MDBX_MRESIZE_MAY_UNMAP) == 0) { + if (size <= map->current && limit == map->limit) + return MDBX_SUCCESS; return MDBX_EPERM; + } /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ - MDBX_ASAN_UNPOISON_MEMORY_REGION(map->address, map->limit); - status = NtUnmapViewOfSection(GetCurrentProcess(), map->address); + MDBX_ASAN_UNPOISON_MEMORY_REGION(map->base, map->limit); + status = NtUnmapViewOfSection(GetCurrentProcess(), map->base); if (!NT_SUCCESS(status)) return ntstatus2errcode(status); status = NtClose(map->section); @@ -1664,8 +2443,7 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, if (!NT_SUCCESS(status)) { bailout_ntstatus: err = ntstatus2errcode(status); - bailout: - map->address = NULL; + map->base = NULL; map->current = map->limit = 0; if (ReservedAddress) { ReservedSize = 0; @@ -1680,7 +2458,7 @@ MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, retry_file_and_section: /* resizing of the file may take a while, * therefore we reserve address space to avoid occupy it by other threads */ - ReservedAddress = map->address; + ReservedAddress = map->base; status = NtAllocateVirtualMemory(GetCurrentProcess(), &ReservedAddress, 0, &ReservedSize, MEM_RESERVE, PAGE_NOACCESS); if (!NT_SUCCESS(status)) { @@ -1690,15 +2468,11 @@ retry_file_and_section: if (flags & MDBX_MRESIZE_MAY_MOVE) /* the base address could be changed */ - map->address = NULL; + map->base = NULL; } - err = mdbx_filesize(map->fd, &map->filesize); - if (err != MDBX_SUCCESS) - goto bailout; - if ((flags & MDBX_RDONLY) == 0 && map->filesize != size) { - err = mdbx_ftruncate(map->fd, size); + err = osal_ftruncate(map->fd, size); if (err == MDBX_SUCCESS) map->filesize = size; /* ignore error, because Windows unable shrink file @@ -1735,7 
+2509,7 @@ retry_file_and_section: retry_mapview:; SIZE_T ViewSize = (flags & MDBX_RDONLY) ? size : limit; status = NtMapViewOfSection( - map->section, GetCurrentProcess(), &map->address, + map->section, GetCurrentProcess(), &map->base, /* ZeroBits */ 0, /* CommitSize */ 0, /* SectionOffset */ NULL, &ViewSize, @@ -1746,15 +2520,15 @@ retry_mapview:; if (!NT_SUCCESS(status)) { if (status == (NTSTATUS) /* STATUS_CONFLICTING_ADDRESSES */ 0xC0000018 && - map->address && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { + map->base && (flags & MDBX_MRESIZE_MAY_MOVE) != 0) { /* try remap at another base address */ - map->address = NULL; + map->base = NULL; goto retry_mapview; } NtClose(map->section); map->section = NULL; - if (map->address && (size != map->current || limit != map->limit)) { + if (map->base && (size != map->current || limit != map->limit)) { /* try remap with previously size and limit, * but will return MDBX_UNABLE_EXTEND_MAPSIZE on success */ rc = (limit > map->limit) ? MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; @@ -1766,25 +2540,24 @@ retry_mapview:; /* no way to recovery */ goto bailout_ntstatus; } - assert(map->address != MAP_FAILED); + assert(map->base != MAP_FAILED); map->current = (size_t)SectionSize.QuadPart; map->limit = ViewSize; #else /* Windows */ - map->filesize = 0; - int rc = mdbx_filesize(map->fd, &map->filesize); - if (rc != MDBX_SUCCESS) - return rc; - if (flags & MDBX_RDONLY) { + if (size > map->filesize) + rc = MDBX_UNABLE_EXTEND_MAPSIZE; + else if (size < map->filesize && map->filesize > limit) + rc = MDBX_EPERM; map->current = (map->filesize > limit) ? limit : (size_t)map->filesize; - if (map->current != size) - rc = (size > map->current) ? 
MDBX_UNABLE_EXTEND_MAPSIZE : MDBX_EPERM; } else { - if (map->filesize != size) { - rc = mdbx_ftruncate(map->fd, size); + if (size > map->filesize || + (size < map->filesize && (flags & MDBX_SHRINK_ALLOWED))) { + rc = osal_ftruncate(map->fd, size); + VERBOSE("ftruncate %zu, err %d", size, rc); if (rc != MDBX_SUCCESS) return rc; map->filesize = size; @@ -1798,7 +2571,7 @@ retry_mapview:; * - this allows us to clear the mask only within the file size * when closing the mapping. */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - (char *)map->address + size, + ptr_disp(map->base, size), ((map->current < map->limit) ? map->current : map->limit) - size); } map->current = size; @@ -1810,7 +2583,7 @@ retry_mapview:; if (limit < map->limit) { /* unmap an excess at end of mapping. */ // coverity[offset_free : FALSE] - if (unlikely(munmap(map->dxb + limit, map->limit - limit))) { + if (unlikely(munmap(ptr_disp(map->base, limit), map->limit - limit))) { assert(errno != 0); return errno; } @@ -1823,10 +2596,10 @@ retry_mapview:; return err; assert(limit > map->limit); - uint8_t *ptr = MAP_FAILED; + void *ptr = MAP_FAILED; #if (defined(__linux__) || defined(__gnu_linux__)) && defined(_GNU_SOURCE) - ptr = mremap(map->address, map->limit, limit, + ptr = mremap(map->base, map->limit, limit, #if defined(MREMAP_MAYMOVE) (flags & MDBX_MRESIZE_MAY_MOVE) ? MREMAP_MAYMOVE : #endif /* MREMAP_MAYMOVE */ @@ -1855,11 +2628,11 @@ retry_mapview:; if (ptr == MAP_FAILED) { /* Try to mmap additional space beyond the end of mapping. 
*/ - ptr = mmap(map->dxb + map->limit, limit - map->limit, mmap_prot, + ptr = mmap(ptr_disp(map->base, map->limit), limit - map->limit, mmap_prot, mmap_flags | MAP_FIXED_NOREPLACE, map->fd, map->limit); - if (ptr == map->dxb + map->limit) + if (ptr == ptr_disp(map->base, map->limit)) /* успешно прилепили отображение в конец */ - ptr = map->dxb; + ptr = map->base; else if (ptr != MAP_FAILED) { /* the desired address is busy, unmap unsuitable one */ if (unlikely(munmap(ptr, limit - map->limit))) { @@ -1892,13 +2665,13 @@ retry_mapview:; return MDBX_UNABLE_EXTEND_MAPSIZE; } - if (unlikely(munmap(map->address, map->limit))) { + if (unlikely(munmap(map->base, map->limit))) { assert(errno != 0); return errno; } // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, limit, mmap_prot, + ptr = mmap(map->base, limit, mmap_prot, (flags & MDBX_MRESIZE_MAY_MOVE) ? mmap_flags : mmap_flags | (MAP_FIXED_NOREPLACE ? MAP_FIXED_NOREPLACE @@ -1908,13 +2681,13 @@ retry_mapview:; unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, limit, mmap_prot, mmap_flags | MAP_FIXED, - map->fd, 0); + ptr = + mmap(map->base, limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { /* try to restore prev mapping */ // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, map->limit, mmap_prot, + ptr = mmap(map->base, map->limit, mmap_prot, (flags & MDBX_MRESIZE_MAY_MOVE) ? mmap_flags : mmap_flags | (MAP_FIXED_NOREPLACE ? 
MAP_FIXED_NOREPLACE @@ -1924,19 +2697,20 @@ retry_mapview:; unlikely(ptr == MAP_FAILED) && !(flags & MDBX_MRESIZE_MAY_MOVE) && errno == /* kernel don't support MAP_FIXED_NOREPLACE */ EINVAL) // coverity[pass_freed_arg : FALSE] - ptr = mmap(map->address, map->limit, mmap_prot, mmap_flags | MAP_FIXED, + ptr = mmap(map->base, map->limit, mmap_prot, mmap_flags | MAP_FIXED, map->fd, 0); if (unlikely(ptr == MAP_FAILED)) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ + * See + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 + */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->address, - (map->current < map->limit) ? map->current : map->limit); + map->base, (map->current < map->limit) ? map->current : map->limit); map->limit = 0; map->current = 0; - map->address = nullptr; + map->base = nullptr; assert(errno != 0); return errno; } @@ -1946,43 +2720,48 @@ retry_mapview:; } assert(ptr && ptr != MAP_FAILED); - if (map->address != ptr) { - VALGRIND_MAKE_MEM_NOACCESS(map->address, map->current); + if (map->base != ptr) { + VALGRIND_MAKE_MEM_NOACCESS(map->base, map->current); /* Unpoisoning is required for ASAN to avoid false-positive diagnostic * when this memory will re-used by malloc or another mmapping. - * See https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 */ + * See + * https://libmdbx.dqdkfa.ru/dead-github/pull/93#issuecomment-613687203 + */ MDBX_ASAN_UNPOISON_MEMORY_REGION( - map->address, (map->current < map->limit) ? map->current : map->limit); + map->base, (map->current < map->limit) ? 
map->current : map->limit); VALGRIND_MAKE_MEM_DEFINED(ptr, map->current); MDBX_ASAN_UNPOISON_MEMORY_REGION(ptr, map->current); - map->address = ptr; + map->base = ptr; } map->limit = limit; #if MDBX_ENABLE_MADVISE #ifdef MADV_DONTFORK - if (unlikely(madvise(map->address, map->limit, MADV_DONTFORK) != 0)) { + if (unlikely(madvise(map->base, map->limit, MADV_DONTFORK) != 0)) { assert(errno != 0); return errno; } #endif /* MADV_DONTFORK */ #ifdef MADV_NOHUGEPAGE - (void)madvise(map->address, map->limit, MADV_NOHUGEPAGE); + (void)madvise(map->base, map->limit, MADV_NOHUGEPAGE); #endif /* MADV_NOHUGEPAGE */ #endif /* MDBX_ENABLE_MADVISE */ #endif /* POSIX / Windows */ + /* Zap: Redundant code */ + MDBX_SUPPRESS_GOOFY_MSVC_ANALYZER(6287); assert(rc != MDBX_SUCCESS || - (map->address != nullptr && map->address != MAP_FAILED && - map->current == size && map->limit == limit)); + (map->base != nullptr && map->base != MAP_FAILED && + map->current == size && map->limit == limit && + map->filesize >= size)); return rc; } /*----------------------------------------------------------------------------*/ -__cold MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny) { +__cold MDBX_INTERNAL_FUNC void osal_jitter(bool tiny) { for (;;) { #if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ defined(__x86_64__) @@ -2009,10 +2788,15 @@ __cold MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny) { } } +/*----------------------------------------------------------------------------*/ + #if defined(_WIN32) || defined(_WIN64) +static LARGE_INTEGER performance_frequency; #elif defined(__APPLE__) || defined(__MACH__) #include +static uint64_t ratio_16dot16_to_monotine; #elif defined(__linux__) || defined(__gnu_linux__) +static clockid_t posix_clockid; __cold static clockid_t choice_monoclock(void) { struct timespec probe; #if defined(CLOCK_BOOTTIME) @@ -2027,28 +2811,16 @@ __cold static clockid_t choice_monoclock(void) { #endif return CLOCK_MONOTONIC; } +#elif defined(CLOCK_MONOTONIC) 
+#define posix_clockid CLOCK_MONOTONIC +#else +#define posix_clockid CLOCK_REALTIME #endif -/*----------------------------------------------------------------------------*/ - +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16) { #if defined(_WIN32) || defined(_WIN64) -static LARGE_INTEGER performance_frequency; -#elif defined(__APPLE__) || defined(__MACH__) -static uint64_t ratio_16dot16_to_monotine; -#endif - -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { -#if defined(_WIN32) || defined(_WIN64) - if (unlikely(performance_frequency.QuadPart == 0)) - QueryPerformanceFrequency(&performance_frequency); const uint64_t ratio = performance_frequency.QuadPart; #elif defined(__APPLE__) || defined(__MACH__) - if (unlikely(ratio_16dot16_to_monotine == 0)) { - mach_timebase_info_data_t ti; - mach_timebase_info(&ti); - ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; - } const uint64_t ratio = ratio_16dot16_to_monotine; #else const uint64_t ratio = UINT64_C(1000000000); @@ -2057,53 +2829,89 @@ mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16) { return likely(ret || seconds_16dot16 == 0) ? 
ret : /* fix underflow */ 1; } -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime) { - static uint64_t limit; - if (unlikely(monotime > limit)) { - if (limit != 0) - return UINT32_MAX; - limit = mdbx_osal_16dot16_to_monotime(UINT32_MAX - 1); - if (monotime > limit) - return UINT32_MAX; - } +static uint64_t monotime_limit; +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime) { + if (unlikely(monotime > monotime_limit)) + return UINT32_MAX; + const uint32_t ret = #if defined(_WIN32) || defined(_WIN64) (uint32_t)((monotime << 16) / performance_frequency.QuadPart); #elif defined(__APPLE__) || defined(__MACH__) (uint32_t)((monotime << 16) / ratio_16dot16_to_monotine); #else - (uint32_t)(monotime * 128 / 1953125); + (uint32_t)((monotime << 7) / 1953125); #endif - return likely(ret || monotime == 0) ? ret : /* fix underflow */ 1; + return ret; } -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void) { +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void) { #if defined(_WIN32) || defined(_WIN64) LARGE_INTEGER counter; - counter.QuadPart = 0; - QueryPerformanceCounter(&counter); - return counter.QuadPart; + if (QueryPerformanceCounter(&counter)) + return counter.QuadPart; #elif defined(__APPLE__) || defined(__MACH__) return mach_absolute_time(); #else - -#if defined(__linux__) || defined(__gnu_linux__) - static clockid_t posix_clockid = -1; - if (unlikely(posix_clockid < 0)) - posix_clockid = choice_monoclock(); -#elif defined(CLOCK_MONOTONIC) -#define posix_clockid CLOCK_MONOTONIC -#else -#define posix_clockid CLOCK_REALTIME -#endif - struct timespec ts; - if (unlikely(clock_gettime(posix_clockid, &ts) != 0)) { - ts.tv_nsec = 0; - ts.tv_sec = 0; - } - return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; + if (likely(clock_gettime(posix_clockid, &ts) == 0)) + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; #endif + return 0; +} + +MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults) { +#if defined(_WIN32) || 
defined(_WIN64) + if (optional_page_faults) { + PROCESS_MEMORY_COUNTERS pmc; + *optional_page_faults = + GetProcessMemoryInfo(GetCurrentProcess(), &pmc, sizeof(pmc)) + ? pmc.PageFaultCount + : 0; + } + FILETIME unused, usermode; + if (GetThreadTimes(GetCurrentThread(), + /* CreationTime */ &unused, + /* ExitTime */ &unused, + /* KernelTime */ &unused, + /* UserTime */ &usermode)) { + /* one second = 10_000_000 * 100ns = 78125 * (1 << 7) * 100ns; + * result = (h * f / 10_000_000) << 32) + l * f / 10_000_000 = + * = ((h * f) >> 7) / 78125) << 32) + ((l * f) >> 7) / 78125; + * 1) {h, l} *= f; + * 2) {h, l} >>= 7; + * 3) result = ((h / 78125) << 32) + l / 78125; */ + uint64_t l = usermode.dwLowDateTime * performance_frequency.QuadPart; + uint64_t h = usermode.dwHighDateTime * performance_frequency.QuadPart; + l = h << (64 - 7) | l >> 7; + h = h >> 7; + return ((h / 78125) << 32) + l / 78125; + } +#elif defined(RUSAGE_THREAD) || defined(RUSAGE_LWP) +#ifndef RUSAGE_THREAD +#define RUSAGE_THREAD RUSAGE_LWP /* Solaris */ +#endif + struct rusage usage; + if (getrusage(RUSAGE_THREAD, &usage) == 0) { + if (optional_page_faults) + *optional_page_faults = usage.ru_majflt; + return usage.ru_utime.tv_sec * UINT64_C(1000000000) + + usage.ru_utime.tv_usec * 1000u; + } + if (optional_page_faults) + *optional_page_faults = 0; +#elif defined(CLOCK_THREAD_CPUTIME_ID) + if (optional_page_faults) + *optional_page_faults = 0; + struct timespec ts; + if (likely(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts) == 0)) + return ts.tv_sec * UINT64_C(1000000000) + ts.tv_nsec; +#else + /* FIXME */ + if (optional_page_faults) + *optional_page_faults = 0; +#endif + return 0; } /*----------------------------------------------------------------------------*/ @@ -2117,7 +2925,7 @@ static void bootid_shake(bin128_t *p) { p->d = e + p->a; } -static void bootid_collect(bin128_t *p, const void *s, size_t n) { +__cold static void bootid_collect(bin128_t *p, const void *s, size_t n) { p->y += 
UINT64_C(64526882297375213); bootid_shake(p); for (size_t i = 0; i < n; ++i) { @@ -2142,13 +2950,13 @@ static void bootid_collect(bin128_t *p, const void *s, size_t n) { #if defined(_WIN32) || defined(_WIN64) -static uint64_t windows_systemtime_ms() { +__cold static uint64_t windows_systemtime_ms() { FILETIME ft; GetSystemTimeAsFileTime(&ft); return ((uint64_t)ft.dwHighDateTime << 32 | ft.dwLowDateTime) / 10000ul; } -static uint64_t windows_bootime(void) { +__cold static uint64_t windows_bootime(void) { unsigned confirmed = 0; uint64_t boottime = 0; uint64_t up0 = mdbx_GetTickCount64(); @@ -2175,8 +2983,9 @@ static uint64_t windows_bootime(void) { return 0; } -static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, LPCSTR lpValue, - PVOID pvData, LPDWORD pcbData) { +__cold static LSTATUS mdbx_RegGetValue(HKEY hKey, LPCSTR lpSubKey, + LPCSTR lpValue, PVOID pvData, + LPDWORD pcbData) { LSTATUS rc; if (!mdbx_RegGetValueA) { /* an old Windows 2000/XP */ @@ -2246,7 +3055,7 @@ bootid_parse_uuid(bin128_t *s, const void *p, const size_t n) { return false; } -__cold MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void) { +__cold MDBX_INTERNAL_FUNC bin128_t osal_bootid(void) { bin128_t bin = {{0, 0}}; bool got_machineid = false, got_boottime = false, got_bootseq = false; @@ -2559,7 +3368,7 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, if (avail_pages) *avail_pages = -1; - const intptr_t pagesize = mdbx_syspagesize(); + const intptr_t pagesize = osal_syspagesize(); if (page_size) *page_size = pagesize; if (unlikely(pagesize < MIN_PAGESIZE || !is_powerof2(pagesize))) @@ -2670,3 +3479,49 @@ __cold int mdbx_get_sysraminfo(intptr_t *page_size, intptr_t *total_pages, return MDBX_SUCCESS; } + +#ifndef xMDBX_ALLOY +unsigned sys_pagesize; +MDBX_MAYBE_UNUSED unsigned sys_pagesize_ln2, sys_allocation_granularity; +#endif /* xMDBX_ALLOY */ + +void osal_ctor(void) { +#if MDBX_HAVE_PWRITEV && defined(_SC_IOV_MAX) + osal_iov_max = sysconf(_SC_IOV_MAX); + if 
(RUNNING_ON_VALGRIND && osal_iov_max > 64) + /* чтобы не описывать все 1024 исключения в valgrind_suppress.txt */ + osal_iov_max = 64; +#endif /* MDBX_HAVE_PWRITEV && _SC_IOV_MAX */ + +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + sys_pagesize = si.dwPageSize; + sys_allocation_granularity = si.dwAllocationGranularity; +#else + sys_pagesize = sysconf(_SC_PAGE_SIZE); + sys_allocation_granularity = (MDBX_WORDBITS > 32) ? 65536 : 4096; + sys_allocation_granularity = (sys_allocation_granularity > sys_pagesize) + ? sys_allocation_granularity + : sys_pagesize; +#endif + assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); + assert(sys_allocation_granularity >= sys_pagesize && + sys_allocation_granularity % sys_pagesize == 0); + sys_pagesize_ln2 = log2n_powerof2(sys_pagesize); + +#if defined(__linux__) || defined(__gnu_linux__) + posix_clockid = choice_monoclock(); +#endif + +#if defined(_WIN32) || defined(_WIN64) + QueryPerformanceFrequency(&performance_frequency); +#elif defined(__APPLE__) || defined(__MACH__) + mach_timebase_info_data_t ti; + mach_timebase_info(&ti); + ratio_16dot16_to_monotine = UINT64_C(1000000000) * ti.denom / ti.numer; +#endif + monotime_limit = osal_16dot16_to_monotime(UINT32_MAX - 1); +} + +void osal_dtor(void) {} diff --git a/src/osal.h b/src/osal.h index 564e3b6d..4e228ed7 100644 --- a/src/osal.h +++ b/src/osal.h @@ -1,7 +1,7 @@ /* https://en.wikipedia.org/wiki/Operating_system_abstraction_layer */ /* - * Copyright 2015-2022 Leonid Yuriev + * Copyright 2015-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -58,7 +58,7 @@ #include #endif -MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_compiler_barrier(void) { #if defined(__clang__) || defined(__GNUC__) __asm__ __volatile__("" ::: "memory"); #elif defined(_MSC_VER) @@ -78,7 +78,7 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_compiler_barrier(void) { #endif } -MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { +MDBX_MAYBE_UNUSED static __inline void osal_memory_barrier(void) { #ifdef MDBX_HAVE_C11ATOMICS atomic_thread_fence(memory_order_seq_cst); #elif defined(__ATOMIC_SEQ_CST) @@ -116,8 +116,8 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_memory_barrier(void) { #if defined(_WIN32) || defined(_WIN64) #define HAVE_SYS_STAT_H #define HAVE_SYS_TYPES_H -typedef HANDLE mdbx_thread_t; -typedef unsigned mdbx_thread_key_t; +typedef HANDLE osal_thread_t; +typedef unsigned osal_thread_key_t; #define MAP_FAILED NULL #define HIGH_DWORD(v) ((DWORD)((sizeof(v) > 4) ? 
((uint64_t)(v) >> 32) : 0)) #define THREAD_CALL WINAPI @@ -125,8 +125,8 @@ typedef unsigned mdbx_thread_key_t; typedef struct { HANDLE mutex; HANDLE event[2]; -} mdbx_condpair_t; -typedef CRITICAL_SECTION mdbx_fastmutex_t; +} osal_condpair_t; +typedef CRITICAL_SECTION osal_fastmutex_t; #if !defined(_MSC_VER) && !defined(__try) /* *INDENT-OFF* */ @@ -139,36 +139,36 @@ typedef CRITICAL_SECTION mdbx_fastmutex_t; #if MDBX_WITHOUT_MSVC_CRT -#ifndef mdbx_malloc -static inline void *mdbx_malloc(size_t bytes) { +#ifndef osal_malloc +static inline void *osal_malloc(size_t bytes) { return HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_malloc */ +#endif /* osal_malloc */ -#ifndef mdbx_calloc -static inline void *mdbx_calloc(size_t nelem, size_t size) { +#ifndef osal_calloc +static inline void *osal_calloc(size_t nelem, size_t size) { return HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, nelem * size); } -#endif /* mdbx_calloc */ +#endif /* osal_calloc */ -#ifndef mdbx_realloc -static inline void *mdbx_realloc(void *ptr, size_t bytes) { +#ifndef osal_realloc +static inline void *osal_realloc(void *ptr, size_t bytes) { return ptr ? 
HeapReAlloc(GetProcessHeap(), 0, ptr, bytes) : HeapAlloc(GetProcessHeap(), 0, bytes); } -#endif /* mdbx_realloc */ +#endif /* osal_realloc */ -#ifndef mdbx_free -static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } -#endif /* mdbx_free */ +#ifndef osal_free +static inline void osal_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } +#endif /* osal_free */ #else /* MDBX_WITHOUT_MSVC_CRT */ -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup _strdup +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup _strdup #endif /* MDBX_WITHOUT_MSVC_CRT */ @@ -180,26 +180,23 @@ static inline void mdbx_free(void *ptr) { HeapFree(GetProcessHeap(), 0, ptr); } #define vsnprintf _vsnprintf /* ntdll */ #endif -size_t mdbx_mb2w(wchar_t *dst, size_t dst_n, const char *src, size_t src_n); -size_t mdbx_w2mb(char *dst, size_t dst_n, const wchar_t *src, size_t src_n); - #else /*----------------------------------------------------------------------*/ -typedef pthread_t mdbx_thread_t; -typedef pthread_key_t mdbx_thread_key_t; +typedef pthread_t osal_thread_t; +typedef pthread_key_t osal_thread_key_t; #define INVALID_HANDLE_VALUE (-1) #define THREAD_CALL #define THREAD_RESULT void * typedef struct { pthread_mutex_t mutex; pthread_cond_t cond[2]; -} mdbx_condpair_t; -typedef pthread_mutex_t mdbx_fastmutex_t; -#define mdbx_malloc malloc -#define mdbx_calloc calloc -#define mdbx_realloc realloc -#define mdbx_free free -#define mdbx_strdup strdup +} osal_condpair_t; +typedef pthread_mutex_t osal_fastmutex_t; +#define osal_malloc malloc +#define osal_calloc calloc +#define osal_realloc realloc +#define osal_free free +#define osal_strdup strdup #endif /* Platform */ #if __GLIBC_PREREQ(2, 12) || defined(__FreeBSD__) || defined(malloc_usable_size) @@ -213,24 +210,30 @@ typedef pthread_mutex_t mdbx_fastmutex_t; 
/*----------------------------------------------------------------------------*/ /* OS abstraction layer stuff */ +MDBX_INTERNAL_VAR unsigned sys_pagesize; +MDBX_MAYBE_UNUSED MDBX_INTERNAL_VAR unsigned sys_pagesize_ln2, + sys_allocation_granularity; + /* Get the size of a memory page for the system. * This is the basic size that the platform's memory manager uses, and is * fundamental to the use of memory-mapped files. */ MDBX_MAYBE_UNUSED MDBX_NOTHROW_CONST_FUNCTION static __inline size_t -mdbx_syspagesize(void) { -#if defined(_WIN32) || defined(_WIN64) - SYSTEM_INFO si; - GetSystemInfo(&si); - return si.dwPageSize; -#else - return sysconf(_SC_PAGE_SIZE); -#endif +osal_syspagesize(void) { + assert(sys_pagesize > 0 && (sys_pagesize & (sys_pagesize - 1)) == 0); + return sys_pagesize; } -typedef struct mdbx_mmap_param { +#if defined(_WIN32) || defined(_WIN64) +typedef wchar_t pathchar_t; +#define MDBX_PRIsPATH "ls" +#else +typedef char pathchar_t; +#define MDBX_PRIsPATH "s" +#endif + +typedef struct osal_mmap { union { - void *address; - uint8_t *dxb; + void *base; struct MDBX_lockinfo *lck; }; mdbx_filehandle_t fd; @@ -240,7 +243,7 @@ typedef struct mdbx_mmap_param { #if defined(_WIN32) || defined(_WIN64) HANDLE section; /* memory-mapped section handle */ #endif -} mdbx_mmap_t; +} osal_mmap_t; typedef union bin128 { __anonymous_struct_extension__ struct { uint64_t x, y; }; @@ -248,28 +251,162 @@ typedef union bin128 { } bin128_t; #if defined(_WIN32) || defined(_WIN64) -typedef union MDBX_srwlock { +typedef union osal_srwlock { __anonymous_struct_extension__ struct { long volatile readerCount; long volatile writerCount; }; RTL_SRWLOCK native; -} MDBX_srwlock; +} osal_srwlock_t; #endif /* Windows */ +#ifndef MDBX_HAVE_PWRITEV +#if defined(_WIN32) || defined(_WIN64) + +#define MDBX_HAVE_PWRITEV 0 + +#elif defined(__ANDROID_API__) + +#if __ANDROID_API__ < 24 +#define MDBX_HAVE_PWRITEV 0 +#else +#define MDBX_HAVE_PWRITEV 1 +#endif + +#elif defined(__APPLE__) || 
defined(__MACH__) || defined(_DARWIN_C_SOURCE) + +#if defined(MAC_OS_X_VERSION_MIN_REQUIRED) && defined(MAC_OS_VERSION_11_0) && \ + MAC_OS_X_VERSION_MIN_REQUIRED >= MAC_OS_VERSION_11_0 +/* FIXME: add checks for IOS versions, etc */ +#define MDBX_HAVE_PWRITEV 1 +#else +#define MDBX_HAVE_PWRITEV 0 +#endif + +#elif defined(_SC_IOV_MAX) || (defined(IOV_MAX) && IOV_MAX > 1) +#define MDBX_HAVE_PWRITEV 1 +#else +#define MDBX_HAVE_PWRITEV 0 +#endif +#endif /* MDBX_HAVE_PWRITEV */ + +typedef struct ior_item { +#if defined(_WIN32) || defined(_WIN64) + OVERLAPPED ov; +#define ior_svg_gap4terminator 1 +#define ior_sgv_element FILE_SEGMENT_ELEMENT +#else + size_t offset; +#if MDBX_HAVE_PWRITEV + size_t sgvcnt; +#define ior_svg_gap4terminator 0 +#define ior_sgv_element struct iovec +#endif /* MDBX_HAVE_PWRITEV */ +#endif /* !Windows */ + union { + MDBX_val single; +#if defined(ior_sgv_element) + ior_sgv_element sgv[1 + ior_svg_gap4terminator]; +#endif /* ior_sgv_element */ + }; +} ior_item_t; + +typedef struct osal_ioring { + unsigned slots_left; + unsigned allocated; +#if defined(_WIN32) || defined(_WIN64) +#define IOR_STATE_LOCKED 1 + HANDLE overlapped_fd; + unsigned pagesize; + unsigned last_sgvcnt; + size_t last_bytes; + uint8_t direct, state, pagesize_ln2; + unsigned event_stack; + HANDLE *event_pool; + volatile LONG async_waiting; + volatile LONG async_completed; + HANDLE async_done; + +#define ior_last_sgvcnt(ior, item) (ior)->last_sgvcnt +#define ior_last_bytes(ior, item) (ior)->last_bytes +#elif MDBX_HAVE_PWRITEV + unsigned last_bytes; +#define ior_last_sgvcnt(ior, item) (item)->sgvcnt +#define ior_last_bytes(ior, item) (ior)->last_bytes +#else +#define ior_last_sgvcnt(ior, item) (1) +#define ior_last_bytes(ior, item) (item)->single.iov_len +#endif /* !Windows */ + ior_item_t *last; + ior_item_t *pool; + char *boundary; +} osal_ioring_t; + #ifndef __cplusplus +/* Actually this is not ioring for now, but on the way. 
*/ +MDBX_INTERNAL_FUNC int osal_ioring_create(osal_ioring_t * +#if defined(_WIN32) || defined(_WIN64) + , + bool enable_direct, + mdbx_filehandle_t overlapped_fd +#endif /* Windows */ +); +MDBX_INTERNAL_FUNC int osal_ioring_resize(osal_ioring_t *, size_t items); +MDBX_INTERNAL_FUNC void osal_ioring_destroy(osal_ioring_t *); +MDBX_INTERNAL_FUNC void osal_ioring_reset(osal_ioring_t *); +MDBX_INTERNAL_FUNC int osal_ioring_add(osal_ioring_t *ctx, const size_t offset, + void *data, const size_t bytes); +typedef struct osal_ioring_write_result { + int err; + unsigned wops; +} osal_ioring_write_result_t; +MDBX_INTERNAL_FUNC osal_ioring_write_result_t +osal_ioring_write(osal_ioring_t *ior, mdbx_filehandle_t fd); + +typedef struct iov_ctx iov_ctx_t; +MDBX_INTERNAL_FUNC void osal_ioring_walk( + osal_ioring_t *ior, iov_ctx_t *ctx, + void (*callback)(iov_ctx_t *ctx, size_t offset, void *data, size_t bytes)); + +MDBX_MAYBE_UNUSED static inline unsigned +osal_ioring_left(const osal_ioring_t *ior) { + return ior->slots_left; +} + +MDBX_MAYBE_UNUSED static inline unsigned +osal_ioring_used(const osal_ioring_t *ior) { + return ior->allocated - ior->slots_left; +} + +MDBX_MAYBE_UNUSED static inline int +osal_ioring_prepare(osal_ioring_t *ior, size_t items, size_t bytes) { + items = (items > 32) ? items : 32; +#if defined(_WIN32) || defined(_WIN64) + if (ior->direct) { + const size_t npages = bytes >> ior->pagesize_ln2; + items = (items > npages) ? items : npages; + } +#else + (void)bytes; +#endif + items = (items < 65536) ? 
items : 65536; + if (likely(ior->allocated >= items)) + return MDBX_SUCCESS; + return osal_ioring_resize(ior, items); +} + /*----------------------------------------------------------------------------*/ /* libc compatibility stuff */ #if (!defined(__GLIBC__) && __GLIBC_PREREQ(2, 1)) && \ (defined(_GNU_SOURCE) || defined(_BSD_SOURCE)) -#define mdbx_asprintf asprintf -#define mdbx_vasprintf vasprintf +#define osal_asprintf asprintf +#define osal_vasprintf vasprintf #else MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC - MDBX_PRINTF_ARGS(2, 3) int mdbx_asprintf(char **strp, const char *fmt, ...); -MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); + MDBX_PRINTF_ARGS(2, 3) int osal_asprintf(char **strp, const char *fmt, ...); +MDBX_INTERNAL_FUNC int osal_vasprintf(char **strp, const char *fmt, va_list ap); #endif #if !defined(MADV_DODUMP) && defined(MADV_CORE) @@ -280,12 +417,14 @@ MDBX_INTERNAL_FUNC int mdbx_vasprintf(char **strp, const char *fmt, va_list ap); #define MADV_DONTDUMP MADV_NOCORE #endif /* MADV_NOCORE -> MADV_DONTDUMP */ -MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void mdbx_osal_jitter(bool tiny); -MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); +MDBX_MAYBE_UNUSED MDBX_INTERNAL_FUNC void osal_jitter(bool tiny); +MDBX_MAYBE_UNUSED static __inline void jitter4testing(bool tiny); /* max bytes to write in one call */ -#if defined(_WIN32) || defined(_WIN64) -#define MAX_WRITE UINT32_C(0x01000000) +#if defined(_WIN64) +#define MAX_WRITE UINT32_C(0x10000000) +#elif defined(_WIN32) +#define MAX_WRITE UINT32_C(0x04000000) #else #define MAX_WRITE UINT32_C(0x3f000000) @@ -332,15 +471,15 @@ MDBX_MAYBE_UNUSED static __inline void mdbx_jitter4testing(bool tiny); #endif #if defined(__linux__) || defined(__gnu_linux__) -MDBX_INTERNAL_VAR uint32_t mdbx_linux_kernel_version; +MDBX_INTERNAL_VAR uint32_t linux_kernel_version; MDBX_INTERNAL_VAR bool mdbx_RunningOnWSL1 /* Windows Subsystem 1 for Linux */; #endif /* Linux */ -#ifndef 
mdbx_strdup -LIBMDBX_API char *mdbx_strdup(const char *str); +#ifndef osal_strdup +LIBMDBX_API char *osal_strdup(const char *str); #endif -MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { +MDBX_MAYBE_UNUSED static __inline int osal_get_errno(void) { #if defined(_WIN32) || defined(_WIN64) DWORD rc = GetLastError(); #else @@ -349,84 +488,101 @@ MDBX_MAYBE_UNUSED static __inline int mdbx_get_errno(void) { return rc; } -#ifndef mdbx_memalign_alloc -MDBX_INTERNAL_FUNC int mdbx_memalign_alloc(size_t alignment, size_t bytes, +#ifndef osal_memalign_alloc +MDBX_INTERNAL_FUNC int osal_memalign_alloc(size_t alignment, size_t bytes, void **result); #endif -#ifndef mdbx_memalign_free -MDBX_INTERNAL_FUNC void mdbx_memalign_free(void *ptr); +#ifndef osal_memalign_free +MDBX_INTERNAL_FUNC void osal_memalign_free(void *ptr); #endif -MDBX_INTERNAL_FUNC int mdbx_condpair_init(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_lock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_unlock(mdbx_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_condpair_signal(mdbx_condpair_t *condpair, +MDBX_INTERNAL_FUNC int osal_condpair_init(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_lock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_unlock(osal_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_signal(osal_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_wait(mdbx_condpair_t *condpair, bool part); -MDBX_INTERNAL_FUNC int mdbx_condpair_destroy(mdbx_condpair_t *condpair); +MDBX_INTERNAL_FUNC int osal_condpair_wait(osal_condpair_t *condpair, bool part); +MDBX_INTERNAL_FUNC int osal_condpair_destroy(osal_condpair_t *condpair); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_init(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_acquire(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_fastmutex_release(mdbx_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int 
mdbx_fastmutex_destroy(mdbx_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_init(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_acquire(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_release(osal_fastmutex_t *fastmutex); +MDBX_INTERNAL_FUNC int osal_fastmutex_destroy(osal_fastmutex_t *fastmutex); -MDBX_INTERNAL_FUNC int mdbx_pwritev(mdbx_filehandle_t fd, struct iovec *iov, - int iovcnt, uint64_t offset, - size_t expected_written); -MDBX_INTERNAL_FUNC int mdbx_pread(mdbx_filehandle_t fd, void *buf, size_t count, +MDBX_INTERNAL_FUNC int osal_pwritev(mdbx_filehandle_t fd, struct iovec *iov, + size_t sgvcnt, uint64_t offset); +MDBX_INTERNAL_FUNC int osal_pread(mdbx_filehandle_t fd, void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_pwrite(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_pwrite(mdbx_filehandle_t fd, const void *buf, size_t count, uint64_t offset); -MDBX_INTERNAL_FUNC int mdbx_write(mdbx_filehandle_t fd, const void *buf, +MDBX_INTERNAL_FUNC int osal_write(mdbx_filehandle_t fd, const void *buf, size_t count); MDBX_INTERNAL_FUNC int -mdbx_thread_create(mdbx_thread_t *thread, +osal_thread_create(osal_thread_t *thread, THREAD_RESULT(THREAD_CALL *start_routine)(void *), void *arg); -MDBX_INTERNAL_FUNC int mdbx_thread_join(mdbx_thread_t thread); +MDBX_INTERNAL_FUNC int osal_thread_join(osal_thread_t thread); -enum mdbx_syncmode_bits { +enum osal_syncmode_bits { MDBX_SYNC_NONE = 0, - MDBX_SYNC_DATA = 1, - MDBX_SYNC_SIZE = 2, - MDBX_SYNC_IODQ = 4 + MDBX_SYNC_KICK = 1, + MDBX_SYNC_DATA = 2, + MDBX_SYNC_SIZE = 4, + MDBX_SYNC_IODQ = 8 }; -MDBX_INTERNAL_FUNC int mdbx_fsync(mdbx_filehandle_t fd, - const enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_ftruncate(mdbx_filehandle_t fd, uint64_t length); -MDBX_INTERNAL_FUNC int mdbx_fseek(mdbx_filehandle_t fd, uint64_t pos); -MDBX_INTERNAL_FUNC int mdbx_filesize(mdbx_filehandle_t fd, uint64_t *length); 
+MDBX_INTERNAL_FUNC int osal_fsync(mdbx_filehandle_t fd, + const enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_ftruncate(mdbx_filehandle_t fd, uint64_t length); +MDBX_INTERNAL_FUNC int osal_fseek(mdbx_filehandle_t fd, uint64_t pos); +MDBX_INTERNAL_FUNC int osal_filesize(mdbx_filehandle_t fd, uint64_t *length); -enum mdbx_openfile_purpose { - MDBX_OPEN_DXB_READ = 0, - MDBX_OPEN_DXB_LAZY = 1, - MDBX_OPEN_DXB_DSYNC = 2, - MDBX_OPEN_LCK = 3, - MDBX_OPEN_COPY = 4, - MDBX_OPEN_DELETE = 5 +enum osal_openfile_purpose { + MDBX_OPEN_DXB_READ, + MDBX_OPEN_DXB_LAZY, + MDBX_OPEN_DXB_DSYNC, +#if defined(_WIN32) || defined(_WIN64) + MDBX_OPEN_DXB_OVERLAPPED, + MDBX_OPEN_DXB_OVERLAPPED_DIRECT, +#endif /* Windows */ + MDBX_OPEN_LCK, + MDBX_OPEN_COPY, + MDBX_OPEN_DELETE }; -MDBX_INTERNAL_FUNC int mdbx_openfile(const enum mdbx_openfile_purpose purpose, - const MDBX_env *env, const char *pathname, +MDBX_MAYBE_UNUSED static __inline bool osal_isdirsep(pathchar_t c) { + return +#if defined(_WIN32) || defined(_WIN64) + c == '\\' || +#endif + c == '/'; +} + +MDBX_INTERNAL_FUNC bool osal_pathequal(const pathchar_t *l, const pathchar_t *r, + size_t len); +MDBX_INTERNAL_FUNC pathchar_t *osal_fileext(const pathchar_t *pathname, + size_t len); +MDBX_INTERNAL_FUNC int osal_fileexists(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_openfile(const enum osal_openfile_purpose purpose, + const MDBX_env *env, + const pathchar_t *pathname, mdbx_filehandle_t *fd, mdbx_mode_t unix_mode_bits); -MDBX_INTERNAL_FUNC int mdbx_closefile(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_removefile(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_removedirectory(const char *pathname); -MDBX_INTERNAL_FUNC int mdbx_is_pipe(mdbx_filehandle_t fd); -MDBX_INTERNAL_FUNC int mdbx_lockfile(mdbx_filehandle_t fd, bool wait); +MDBX_INTERNAL_FUNC int osal_closefile(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_removefile(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int 
osal_removedirectory(const pathchar_t *pathname); +MDBX_INTERNAL_FUNC int osal_is_pipe(mdbx_filehandle_t fd); +MDBX_INTERNAL_FUNC int osal_lockfile(mdbx_filehandle_t fd, bool wait); #define MMAP_OPTION_TRUNCATE 1 #define MMAP_OPTION_SEMAPHORE 2 -MDBX_INTERNAL_FUNC int mdbx_mmap(const int flags, mdbx_mmap_t *map, - const size_t must, const size_t limit, - const unsigned options); -MDBX_INTERNAL_FUNC int mdbx_munmap(mdbx_mmap_t *map); +MDBX_INTERNAL_FUNC int osal_mmap(const int flags, osal_mmap_t *map, size_t size, + const size_t limit, const unsigned options); +MDBX_INTERNAL_FUNC int osal_munmap(osal_mmap_t *map); #define MDBX_MRESIZE_MAY_MOVE 0x00000100 #define MDBX_MRESIZE_MAY_UNMAP 0x00000200 -MDBX_INTERNAL_FUNC int mdbx_mresize(const int flags, mdbx_mmap_t *map, +MDBX_INTERNAL_FUNC int osal_mresize(const int flags, osal_mmap_t *map, size_t size, size_t limit); #if defined(_WIN32) || defined(_WIN64) typedef struct { @@ -434,17 +590,19 @@ typedef struct { HANDLE handles[31]; } mdbx_handle_array_t; MDBX_INTERNAL_FUNC int -mdbx_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); +osal_suspend_threads_before_remap(MDBX_env *env, mdbx_handle_array_t **array); MDBX_INTERNAL_FUNC int -mdbx_resume_threads_after_remap(mdbx_handle_array_t *array); +osal_resume_threads_after_remap(mdbx_handle_array_t *array); #endif /* Windows */ -MDBX_INTERNAL_FUNC int mdbx_msync(mdbx_mmap_t *map, size_t offset, +MDBX_INTERNAL_FUNC int osal_msync(const osal_mmap_t *map, size_t offset, size_t length, - enum mdbx_syncmode_bits mode_bits); -MDBX_INTERNAL_FUNC int mdbx_check_fs_rdonly(mdbx_filehandle_t handle, - const char *pathname, int err); + enum osal_syncmode_bits mode_bits); +MDBX_INTERNAL_FUNC int osal_check_fs_rdonly(mdbx_filehandle_t handle, + const pathchar_t *pathname, + int err); +MDBX_INTERNAL_FUNC int osal_check_fs_incore(mdbx_filehandle_t handle); -MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { +MDBX_MAYBE_UNUSED static __inline uint32_t 
osal_getpid(void) { STATIC_ASSERT(sizeof(mdbx_pid_t) <= sizeof(uint32_t)); #if defined(_WIN32) || defined(_WIN64) return GetCurrentProcessId(); @@ -454,7 +612,7 @@ MDBX_MAYBE_UNUSED static __inline uint32_t mdbx_getpid(void) { #endif } -MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { +MDBX_MAYBE_UNUSED static __inline uintptr_t osal_thread_self(void) { mdbx_tid_t thunk; STATIC_ASSERT(sizeof(uintptr_t) >= sizeof(thunk)); #if defined(_WIN32) || defined(_WIN64) @@ -467,24 +625,30 @@ MDBX_MAYBE_UNUSED static __inline uintptr_t mdbx_thread_self(void) { #if !defined(_WIN32) && !defined(_WIN64) #if defined(__ANDROID_API__) || defined(ANDROID) || defined(BIONIC) -MDBX_INTERNAL_FUNC int mdbx_check_tid4bionic(void); +MDBX_INTERNAL_FUNC int osal_check_tid4bionic(void); #else -static __inline int mdbx_check_tid4bionic(void) { return 0; } +static __inline int osal_check_tid4bionic(void) { return 0; } #endif /* __ANDROID_API__ || ANDROID) || BIONIC */ MDBX_MAYBE_UNUSED static __inline int -mdbx_pthread_mutex_lock(pthread_mutex_t *mutex) { - int err = mdbx_check_tid4bionic(); +osal_pthread_mutex_lock(pthread_mutex_t *mutex) { + int err = osal_check_tid4bionic(); return unlikely(err) ? 
err : pthread_mutex_lock(mutex); } #endif /* !Windows */ -MDBX_INTERNAL_FUNC uint64_t mdbx_osal_monotime(void); -MDBX_INTERNAL_FUNC uint64_t -mdbx_osal_16dot16_to_monotime(uint32_t seconds_16dot16); -MDBX_INTERNAL_FUNC uint32_t mdbx_osal_monotime_to_16dot16(uint64_t monotime); +MDBX_INTERNAL_FUNC uint64_t osal_monotime(void); +MDBX_INTERNAL_FUNC uint64_t osal_cputime(size_t *optional_page_faults); +MDBX_INTERNAL_FUNC uint64_t osal_16dot16_to_monotime(uint32_t seconds_16dot16); +MDBX_INTERNAL_FUNC uint32_t osal_monotime_to_16dot16(uint64_t monotime); -MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); +MDBX_MAYBE_UNUSED static inline uint32_t +osal_monotime_to_16dot16_noUnderflow(uint64_t monotime) { + uint32_t seconds_16dot16 = osal_monotime_to_16dot16(monotime); + return seconds_16dot16 ? seconds_16dot16 : /* fix underflow */ (monotime > 0); +} + +MDBX_INTERNAL_FUNC bin128_t osal_bootid(void); /*----------------------------------------------------------------------------*/ /* lck stuff */ @@ -500,7 +664,7 @@ MDBX_INTERNAL_FUNC bin128_t mdbx_osal_bootid(void); /// MUST NOT initialize shared synchronization objects in memory-mapped /// LCK-file that are already in use. /// \return Error code or zero on success. -MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_init(MDBX_env *env, MDBX_env *inprocess_neighbor, int global_uniqueness_flag); @@ -521,7 +685,7 @@ MDBX_INTERNAL_FUNC int mdbx_lck_init(MDBX_env *env, /// of other instances of MDBX_env within the current process, e.g. /// restore POSIX-fcntl locks after the closing of file descriptors. /// \return Error code (MDBX_PANIC) or zero on success. 
-MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, +MDBX_INTERNAL_FUNC int osal_lck_destroy(MDBX_env *env, MDBX_env *inprocess_neighbor); /// \brief Connects to shared interprocess locking objects and tries to acquire @@ -529,14 +693,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_destroy(MDBX_env *env, /// Depending on implementation or/and platform (Windows) this function may /// acquire the non-OS super-level lock (e.g. for shared synchronization /// objects initialization), which will be downgraded to OS-exclusive or -/// shared via explicit calling of mdbx_lck_downgrade(). +/// shared via explicit calling of osal_lck_downgrade(). /// \return /// MDBX_RESULT_TRUE (-1) - if an exclusive lock was acquired and thus /// the current process is the first and only after the last use of DB. /// MDBX_RESULT_FALSE (0) - if a shared lock was acquired and thus /// DB has already been opened and now is used by other processes. /// Otherwise (not 0 and not -1) - error code. -MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_seize(MDBX_env *env); /// \brief Downgrades the level of initially acquired lock to /// operational level specified by argument. The reason for such downgrade: @@ -549,14 +713,14 @@ MDBX_INTERNAL_FUNC int mdbx_lck_seize(MDBX_env *env); /// (env->me_flags & MDBX_EXCLUSIVE) != 0 - downgrade to exclusive /// operational lock. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_lck_downgrade(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_lck_downgrade(MDBX_env *env); /// \brief Locks LCK-file or/and table of readers for (de)registering. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rdt_lock(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rdt_lock(MDBX_env *env); /// \brief Unlocks LCK-file or/and table of readers after (de)registering. 
-MDBX_INTERNAL_FUNC void mdbx_rdt_unlock(MDBX_env *env); +MDBX_INTERNAL_FUNC void osal_rdt_unlock(MDBX_env *env); /// \brief Acquires lock for DB change (on writing transaction start) /// Reading transactions will not be blocked. @@ -571,15 +735,15 @@ LIBMDBX_API void mdbx_txn_unlock(MDBX_env *env); /// \brief Sets alive-flag of reader presence (indicative lock) for PID of /// the current process. The function does no more than needed for -/// the correct working of mdbx_rpid_check() in other processes. +/// the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_set(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_set(MDBX_env *env); /// \brief Resets alive-flag of reader presence (indicative lock) /// for PID of the current process. The function does no more than needed -/// for the correct working of mdbx_rpid_check() in other processes. +/// for the correct working of osal_rpid_check() in other processes. /// \return Error code or zero on success -MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); +MDBX_INTERNAL_FUNC int osal_rpid_clear(MDBX_env *env); /// \brief Checks for reading process status with the given pid with help of /// alive-flag of presence (indicative lock) or using another way. @@ -589,28 +753,16 @@ MDBX_INTERNAL_FUNC int mdbx_rpid_clear(MDBX_env *env); /// MDBX_RESULT_FALSE (0) - if the reader process with the given PID is absent /// or not working with DB (indicative lock is not present). /// Otherwise (not 0 and not -1) - error code. 
-MDBX_INTERNAL_FUNC int mdbx_rpid_check(MDBX_env *env, uint32_t pid); +MDBX_INTERNAL_FUNC int osal_rpid_check(MDBX_env *env, uint32_t pid); #if defined(_WIN32) || defined(_WIN64) -#define MUSTDIE_MB2WIDE(FROM, TO) \ - do { \ - const char *const from_tmp = (FROM); \ - const size_t from_mblen = strlen(from_tmp); \ - const size_t to_wlen = mdbx_mb2w(nullptr, 0, from_tmp, from_mblen); \ - if (to_wlen < 1 || to_wlen > /* MAX_PATH */ INT16_MAX) \ - return ERROR_INVALID_NAME; \ - wchar_t *const to_tmp = _alloca((to_wlen + 1) * sizeof(wchar_t)); \ - if (to_wlen + 1 != \ - mdbx_mb2w(to_tmp, to_wlen + 1, from_tmp, from_mblen + 1)) \ - return ERROR_INVALID_NAME; \ - (TO) = to_tmp; \ - } while (0) +MDBX_INTERNAL_FUNC int osal_mb2w(const char *const src, wchar_t **const pdst); -typedef void(WINAPI *MDBX_srwlock_function)(MDBX_srwlock *); -MDBX_INTERNAL_VAR MDBX_srwlock_function mdbx_srwlock_Init, - mdbx_srwlock_AcquireShared, mdbx_srwlock_ReleaseShared, - mdbx_srwlock_AcquireExclusive, mdbx_srwlock_ReleaseExclusive; +typedef void(WINAPI *osal_srwlock_t_function)(osal_srwlock_t *); +MDBX_INTERNAL_VAR osal_srwlock_t_function osal_srwlock_Init, + osal_srwlock_AcquireShared, osal_srwlock_ReleaseShared, + osal_srwlock_AcquireExclusive, osal_srwlock_ReleaseExclusive; #if _WIN32_WINNT < 0x0600 /* prior to Windows Vista */ typedef enum _FILE_INFO_BY_HANDLE_CLASS { @@ -726,12 +878,57 @@ MDBX_INTERNAL_VAR MDBX_RegGetValueA mdbx_RegGetValueA; NTSYSAPI ULONG RtlRandomEx(PULONG Seed); +typedef BOOL(WINAPI *MDBX_SetFileIoOverlappedRange)(HANDLE FileHandle, + PUCHAR OverlappedRangeStart, + ULONG Length); +MDBX_INTERNAL_VAR MDBX_SetFileIoOverlappedRange mdbx_SetFileIoOverlappedRange; + #endif /* Windows */ #endif /* !__cplusplus */ /*----------------------------------------------------------------------------*/ +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint64_t +osal_bswap64(uint64_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + 
__has_builtin(__builtin_bswap64) + return __builtin_bswap64(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_uint64(v); +#elif defined(__bswap_64) + return __bswap_64(v); +#elif defined(bswap_64) + return bswap_64(v); +#else + return v << 56 | v >> 56 | ((v << 40) & UINT64_C(0x00ff000000000000)) | + ((v << 24) & UINT64_C(0x0000ff0000000000)) | + ((v << 8) & UINT64_C(0x000000ff00000000)) | + ((v >> 8) & UINT64_C(0x00000000ff000000)) | + ((v >> 24) & UINT64_C(0x0000000000ff0000)) | + ((v >> 40) & UINT64_C(0x000000000000ff00)); +#endif +} + +MDBX_MAYBE_UNUSED MDBX_NOTHROW_PURE_FUNCTION static __always_inline uint32_t +osal_bswap32(uint32_t v) { +#if __GNUC_PREREQ(4, 4) || __CLANG_PREREQ(4, 0) || \ + __has_builtin(__builtin_bswap32) + return __builtin_bswap32(v); +#elif defined(_MSC_VER) && !defined(__clang__) + return _byteswap_ulong(v); +#elif defined(__bswap_32) + return __bswap_32(v); +#elif defined(bswap_32) + return bswap_32(v); +#else + return v << 24 | v >> 24 | ((v << 8) & UINT32_C(0x00ff0000)) | + ((v >> 8) & UINT32_C(0x0000ff00)); +#endif +} + +/*----------------------------------------------------------------------------*/ + #if defined(_MSC_VER) && _MSC_VER >= 1900 /* LY: MSVC 2015/2017/2019 has buggy/inconsistent PRIuPTR/PRIxPTR macros * for internal format-args checker. 
*/ diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f8fb1618..1889c8b8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -2,32 +2,32 @@ enable_language(CXX) include(../cmake/compiler.cmake) set(LIBMDBX_TEST_SOURCES - base.h - cases.cc - chrono.cc - chrono.h - config.cc - config.h - copy.cc - dead.cc - hill.cc - jitter.cc - keygen.cc - keygen.h - log.cc - log.h - main.cc - osal.h - osal-unix.cc - osal-windows.cc - test.cc - test.h - try.cc - utils.cc - utils.h - append.cc - ttl.cc - nested.cc + base.h++ + cases.c++ + chrono.c++ + chrono.h++ + config.c++ + config.h++ + copy.c++ + dead.c++ + hill.c++ + jitter.c++ + keygen.c++ + keygen.h++ + log.c++ + log.h++ + main.c++ + osal.h++ + osal-unix.c++ + osal-windows.c++ + test.c++ + test.h++ + try.c++ + utils.c++ + utils.h++ + append.c++ + ttl.c++ + nested.c++ ) if(NOT MDBX_BUILD_CXX) @@ -92,11 +92,13 @@ else() set_tests_properties(smoke_chk PROPERTIES DEPENDS smoke TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES smoke.db) add_test(NAME smoke_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv smoke.db-copy) set_tests_properties(smoke_chk_copy PROPERTIES DEPENDS smoke TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES smoke.db-copy) endif() @@ -109,15 +111,16 @@ else() TIMEOUT 600 RUN_SERIAL OFF) if(MDBX_BUILD_TOOLS) - add_test(NAME dupsort_writemap_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv dupsort_writemap.db) + add_test(NAME dupsort_writemap_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvvwc dupsort_writemap.db) set_tests_properties(dupsort_writemap_chk PROPERTIES DEPENDS dupsort_writemap TIMEOUT 60 REQUIRED_FILES dupsort_writemap.db) - add_test(NAME dupsort_writemap_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv dupsort_writemap.db-copy) + add_test(NAME dupsort_writemap_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvvc dupsort_writemap.db-copy) set_tests_properties(dupsort_writemap_chk_copy PROPERTIES DEPENDS dupsort_writemap TIMEOUT 60 + 
FAIL_REGULAR_EXPRESSION "monopolistic mode" REQUIRED_FILES dupsort_writemap.db-copy) endif() @@ -128,15 +131,17 @@ else() TIMEOUT 1800 RUN_SERIAL OFF) if(MDBX_BUILD_TOOLS) - add_test(NAME uniq_nested_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv uniq_nested.db) + add_test(NAME uniq_nested_chk COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvvw uniq_nested.db) set_tests_properties(uniq_nested_chk PROPERTIES DEPENDS uniq_nested TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES uniq_nested.db) add_test(NAME uniq_nested_chk_copy COMMAND ${MDBX_OUTPUT_DIR}/mdbx_chk -nvv uniq_nested.db-copy) set_tests_properties(uniq_nested_chk_copy PROPERTIES DEPENDS uniq_nested TIMEOUT 60 + FAIL_REGULAR_EXPRESSION "cooperative mode" REQUIRED_FILES uniq_nested.db-copy) endif() diff --git a/test/append.cc b/test/append.c++ similarity index 85% rename from test/append.cc rename to test/append.c++ index d75e0231..d2486001 100644 --- a/test/append.cc +++ b/test/append.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" class testcase_append : public testcase { public: @@ -21,7 +21,14 @@ public: bool run() override; static bool review_params(actor_params ¶ms) { - return testcase::review_params(params) && params.make_keygen_linear(); + if (!testcase::review_params(params)) + return false; + const bool ordered = !flipcoin_x3(); + log_notice("the '%s' key-generation mode is selected", + ordered ? 
"ordered/linear" : "unordered/non-linear"); + if (ordered && !params.make_keygen_linear()) + return false; + return true; } }; REGISTER_TESTCASE(append); @@ -133,8 +140,6 @@ bool testcase_append::run() { } } else failure_perror("mdbx_get_equal_or_great()", err); - - assert(!expect_key_mismatch); } err = mdbx_cursor_put(cursor_guard.get(), &key->value, &data->value, flags); @@ -148,12 +153,25 @@ bool testcase_append::run() { if (!expect_key_mismatch) { if (unlikely(err != MDBX_SUCCESS)) - failure_perror("mdbx_cursor_put(insert-a)", err); + failure_perror("mdbx_cursor_put(append)", err); ++inserted_number; inserted_checksum.push((uint32_t)inserted_number, key->value); inserted_checksum.push(10639, data->value); + + if (config.params.speculum) { + Item item(iov2dataview(key), iov2dataview(data)); + const auto insertion_result = speculum.insert(item); + if (!insertion_result.second) { + char dump_key[32], dump_value[32]; + log_error( + "speculum.append: unexpected %s {%s, %s}", "MDBX_SUCCESS", + mdbx_dump_val(&key->value, dump_key, sizeof(dump_key)), + mdbx_dump_val(&data->value, dump_value, sizeof(dump_value))); + return false; + } + } } else if (unlikely(err != MDBX_EKEYMISMATCH)) - failure_perror("mdbx_cursor_put(insert-a) != MDBX_EKEYMISMATCH", err); + failure_perror("mdbx_cursor_put(append) != MDBX_EKEYMISMATCH", err); if (++txn_nops >= config.params.batch_write) { err = breakable_restart(); @@ -166,6 +184,10 @@ bool testcase_append::run() { committed_inserted_number = inserted_number; committed_inserted_checksum = inserted_checksum; txn_nops = 0; + if (!speculum_verify()) { + log_notice("append: bailout breakable_restart"); + return false; + } } report(1); @@ -181,6 +203,10 @@ bool testcase_append::run() { } //---------------------------------------------------------------------------- txn_begin(true); + if (!speculum_verify()) { + log_notice("append: bailout verify"); + return false; + } cursor_renew(); MDBX_val check_key, check_data; @@ -209,7 +235,8 @@ bool 
testcase_append::run() { failure("read_count(%" PRIu64 ") != inserted_number(%" PRIu64 ")", read_count, inserted_number); - if (unlikely(read_checksum.value != inserted_checksum.value)) + if (unlikely(read_checksum.value != inserted_checksum.value) && + !keyvalue_maker.is_unordered()) failure("read_checksum(0x%016" PRIu64 ") " "!= inserted_checksum(0x%016" PRIu64 ")", read_checksum.value, inserted_checksum.value); diff --git a/test/base.h b/test/base.h++ similarity index 92% rename from test/base.h rename to test/base.h++ index 4c113f72..f3a7701e 100644 --- a/test/base.h +++ b/test/base.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -30,6 +30,10 @@ #define _WIN32_WINNT 0x0601 /* Windows 7 */ #endif #ifdef _MSC_VER +/* Workaround for MSVC' header `extern "C"` vs `std::` redefinition bug */ +#if defined(__SANITIZE_ADDRESS__) && !defined(_DISABLE_VECTOR_ANNOTATION) +#define _DISABLE_VECTOR_ANNOTATION +#endif /* _DISABLE_VECTOR_ANNOTATION */ #ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS #endif /* _CRT_SECURE_NO_WARNINGS */ @@ -94,7 +98,7 @@ #define MDBX_INTERNAL_FUNC #define MDBX_INTERNAL_VAR extern -#define xMDBX_TOOLS /* Avoid using internal mdbx_assert() */ +#define xMDBX_TOOLS /* Avoid using internal eASSERT() */ #include "../mdbx.h++" #include "../src/base.h" #include "../src/osal.h" diff --git a/test/cases.cc b/test/cases.c++ similarity index 98% rename from test/cases.cc rename to test/cases.c++ index 75432e5c..97421e7d 100644 --- a/test/cases.cc +++ b/test/cases.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" registry *registry::instance() { static registry *singleton; diff --git a/test/chrono.cc b/test/chrono.c++ similarity index 93% rename from test/chrono.cc rename to test/chrono.c++ index ec22b39b..4d53b60d 100644 --- a/test/chrono.cc +++ b/test/chrono.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" namespace chrono { @@ -87,10 +87,11 @@ time from_ms(uint64_t ms) { time now_realtime() { #if defined(_WIN32) || defined(_WIN64) || defined(_WINDOWS) static void(WINAPI * query_time)(LPFILETIME); - if (!query_time) { - query_time = (void(WINAPI *)(LPFILETIME))GetProcAddress( - GetModuleHandle(TEXT("kernel32.dll")), - "GetSystemTimePreciseAsFileTime"); + if (unlikely(!query_time)) { + HMODULE hModule = GetModuleHandle(TEXT("kernel32.dll")); + if (hModule) + query_time = (void(WINAPI *)(LPFILETIME))GetProcAddress( + hModule, "GetSystemTimePreciseAsFileTime"); if (!query_time) query_time = GetSystemTimeAsFileTime; } diff --git a/test/chrono.h b/test/chrono.h++ similarity index 95% rename from test/chrono.h rename to test/chrono.h++ index 5ee08856..5d29b1c2 100644 --- a/test/chrono.h +++ b/test/chrono.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -14,8 +14,8 @@ #pragma once -#include "base.h" -#include "utils.h" +#include "base.h++" +#include "utils.h++" namespace chrono { diff --git a/test/config.cc b/test/config.c++ similarity index 99% rename from test/config.cc rename to test/config.c++ index 61b299b4..31cf9395 100644 --- a/test/config.cc +++ b/test/config.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #if defined(_MSC_VER) && !defined(strcasecmp) #define strcasecmp(str, len) _stricmp(str, len) @@ -369,7 +369,6 @@ const struct option_verb mode_bits[] = { {"notls", unsigned(MDBX_NOTLS)}, {"nordahead", unsigned(MDBX_NORDAHEAD)}, {"nomeminit", unsigned(MDBX_NOMEMINIT)}, - {"coalesce", unsigned(MDBX_COALESCE)}, {"lifo", unsigned(MDBX_LIFORECLAIM)}, {"perturb", unsigned(MDBX_PAGEPERTURB)}, {"accede", unsigned(MDBX_ACCEDE)}, diff --git a/test/config.h b/test/config.h++ similarity index 99% rename from test/config.h rename to test/config.h++ index 8c93981e..f57dce7c 100644 --- a/test/config.h +++ b/test/config.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -14,9 +14,9 @@ #pragma once -#include "base.h" -#include "log.h" -#include "utils.h" +#include "base.h++" +#include "log.h++" +#include "utils.h++" #define ACTOR_ID_MAX INT16_MAX diff --git a/test/copy.cc b/test/copy.c++ similarity index 94% rename from test/copy.cc rename to test/copy.c++ index 37c58a24..93ae77c8 100644 --- a/test/copy.cc +++ b/test/copy.c++ @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h++" class testcase_copy : public testcase { const std::string copy_pathname; @@ -15,7 +15,7 @@ REGISTER_TESTCASE(copy); void testcase_copy::copy_db(const bool with_compaction) { int err = mdbx_env_delete(copy_pathname.c_str(), MDBX_ENV_JUST_DELETE); if (err != MDBX_SUCCESS && err != MDBX_RESULT_TRUE) - failure_perror("mdbx_removefile()", err); + failure_perror("osal_removefile()", err); err = mdbx_env_copy(db_guard.get(), copy_pathname.c_str(), with_compaction ? 
MDBX_CP_COMPACT : MDBX_CP_DEFAULTS); diff --git a/test/dead.cc b/test/dead.c++ similarity index 94% rename from test/dead.cc rename to test/dead.c++ index 05304bc2..d0f8cb09 100644 --- a/test/dead.cc +++ b/test/dead.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" class testcase_deadread : public testcase { public: diff --git a/test/hill.cc b/test/hill.c++ similarity index 99% rename from test/hill.cc rename to test/hill.c++ index 5aea4d71..79234b7d 100644 --- a/test/hill.cc +++ b/test/hill.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" /* LY: тест "холмиком": * - сначала наполняем таблицу циклическими CRUD-манипуляциями, diff --git a/test/jitter.cc b/test/jitter.c++ similarity index 98% rename from test/jitter.cc rename to test/jitter.c++ index 2c781d7f..b25599b0 100644 --- a/test/jitter.cc +++ b/test/jitter.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" class testcase_jitter : public testcase { protected: diff --git a/test/keygen.cc b/test/keygen.c++ similarity index 97% rename from test/keygen.cc rename to test/keygen.c++ index 807954f3..a6d20f33 100644 --- a/test/keygen.cc +++ b/test/keygen.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" namespace keygen { @@ -227,7 +227,8 @@ void maker::setup(const config::actor_params_pod &actor, unsigned actor_id, (void)thread_number; mapping = actor.keygen; - salt = (actor.keygen.seed + actor_id) * UINT64_C(14653293970879851569); + salt = + (actor.keygen.seed + uint64_t(actor_id)) * UINT64_C(14653293970879851569); base = actor.serial_base(); } @@ -315,11 +316,12 @@ void __hot maker::mk_begin(const serial_t serial, const essentials ¶ms, out.value.iov_len = std::max(unsigned(params.minlen), length(serial)); const auto variation = params.maxlen - params.minlen; if (variation) { - if (serial % (variation + 1)) { + if (serial % (variation + serial_t(1))) { auto refix = serial * UINT64_C(48835288005252737); refix ^= refix >> 32; - out.value.iov_len = std::max( - out.value.iov_len, params.minlen + 1 + size_t(refix) % variation); + out.value.iov_len = + std::max(out.value.iov_len, + params.minlen + size_t(1) + size_t(refix) % variation); } } diff --git a/test/keygen.h b/test/keygen.h++ similarity index 98% rename from test/keygen.h rename to test/keygen.h++ index 53219f5d..9e2410fd 100644 --- a/test/keygen.h +++ b/test/keygen.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -14,10 +14,10 @@ #pragma once -#include "base.h" -#include "config.h" -#include "log.h" -#include "utils.h" +#include "base.h++" +#include "config.h++" +#include "log.h++" +#include "utils.h++" namespace keygen { diff --git a/test/log.cc b/test/log.c++ similarity index 96% rename from test/log.cc rename to test/log.c++ index beb96203..04dad84d 100644 --- a/test/log.cc +++ b/test/log.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" static void fflushall() { fflush(nullptr); } @@ -61,7 +61,8 @@ static FILE *last; void setlevel(loglevel priority) { level = priority; int rc = mdbx_setup_debug(MDBX_log_level_t(priority), - MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER, + MDBX_DBG_ASSERT | MDBX_DBG_AUDIT | MDBX_DBG_JITTER | + MDBX_DBG_DUMP, mdbx_logger); log_trace("set mdbx debug-opts: 0x%02x", rc); } @@ -141,7 +142,7 @@ void output_nocheckloglevel_ap(const logging::loglevel priority, prefix.c_str(), level2str(priority), suffix.c_str()); va_list ones; - memset(&ones, 0, sizeof(ones)) /* zap MSVC and other stupid compilers */; + memset(&ones, 0, sizeof(ones)) /* zap MSVC and other goofy compilers */; if (same_or_higher(priority, error)) va_copy(ones, ap); vfprintf(last, format, ap); @@ -152,11 +153,11 @@ void output_nocheckloglevel_ap(const logging::loglevel priority, switch (end) { default: putc('\n', last); - // fall through + MDBX_CXX17_FALLTHROUGH; // fall through case '\n': fflush(last); last = nullptr; - // fall through + MDBX_CXX17_FALLTHROUGH; // fall through case ' ': case '_': case ':': diff --git a/test/log.h b/test/log.h++ similarity index 96% rename from test/log.h rename to test/log.h++ index bc9f4579..aa111ac9 100644 --- a/test/log.h +++ b/test/log.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. 
* @@ -14,8 +14,8 @@ #pragma once -#include "base.h" -#include "chrono.h" +#include "base.h++" +#include "chrono.h++" MDBX_NORETURN void usage(void); MDBX_NORETURN void MDBX_PRINTF_ARGS(1, 2) failure(const char *fmt, ...); diff --git a/test/long_stochastic.sh b/test/long_stochastic.sh index 906bacee..900c1319 100755 --- a/test/long_stochastic.sh +++ b/test/long_stochastic.sh @@ -12,6 +12,7 @@ UNAME="$(uname -s 2>/dev/null || echo Unknown)" DB_UPTO_MB=17408 PAGESIZE=min DONT_CHECK_RAM=no +EXTRA=no while [ -n "$1" ] do @@ -31,8 +32,9 @@ do echo "--dir PATH Specifies directory for test DB and other files (it will be cleared)" echo "--db-upto-mb NN Limits upper size of test DB to the NN megabytes" echo "--no-geometry-jitter Disable jitter for geometry upper-size" - echo "--pagesize NN Use specified page size (256 is minimal and used by default) " - echo "--dont-check-ram-size Don't check available RAM " + echo "--pagesize NN Use specified page size (256 is minimal and used by default)" + echo "--dont-check-ram-size Don't check available RAM" + echo "--extra Iterate extra modes/flags" echo "--help Print this usage help and exit" exit -2 ;; @@ -136,7 +138,7 @@ do PAGESIZE=$((1024*64)) ;; *) - echo "Invalig page size '$2'" + echo "Invalid page size '$2'" exit -2 ;; esac @@ -145,6 +147,9 @@ do --dont-check-ram-size) DONT_CHECK_RAM=yes ;; + --extra) + EXTRA=yes + ;; *) echo "Unknown option '$1'" exit -2 @@ -350,9 +355,12 @@ else } fi -syncmodes=("" ,+nosync-safe ,+nosync-utterly) -options=(writemap lifo notls perturb) - +if [ "$EXTRA" != "no" ]; then + options=(writemap lifo notls perturb nomeminit nordahead) +else + options=(writemap lifo notls) +fi +syncmodes=("" ,+nosync-safe ,+nosync-utterly ,+nometasync) function join { local IFS="$1"; shift; echo "$*"; } function bits2options { @@ -414,65 +422,89 @@ for nops in 10 33 100 333 1000 3333 10000 33333 100000 333333 1000000 3333333 10 split=30 caption="Probe #$((++count)) int-key,with-dups, split=${split}, case 
$((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} split=24 caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE 
--size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} split=16 caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M 
--table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} + if [ "$EXTRA" != "no" ]; then + split=10 + caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) int-key,with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max 
\ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + caption="Probe #$((++count)) with-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ + --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ + --keygen.seed=${seed} + fi + split=4 caption="Probe #$((++count)) int-key,w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) int-key,int-data, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=+key.integer,+data.integer 
--keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=max \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} caption="Probe #$((++count)) w/o-dups, split=${split}, case $((++subcase)) of ${cases}" probe \ --pagesize=$PAGESIZE --size-upper-upto=${db_size_mb}M --table=-data.dups --keygen.split=${split} --keylen.min=min --keylen.max=max --datalen.min=min --datalen.max=1111 \ - --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%3]} \ + --nops=$nops --batch.write=$wbatch --mode=$(bits2options $bits)${syncmodes[count%4]} \ --keygen.seed=${seed} done # options loop=$((loop + 1)) diff --git a/test/main.cc b/test/main.c++ similarity index 98% rename from test/main.cc rename to test/main.c++ index 27858e52..2b8ff655 100644 --- a/test/main.cc +++ b/test/main.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" #if !(defined(_WIN32) || defined(_WIN64)) #include @@ -98,7 +98,6 @@ MDBX_NORETURN void usage(void) { " accede == MDBX_ACCEDE\n" " nometasync == MDBX_NOMETASYNC\n" " lifo == MDBX_LIFORECLAIM\n" - " coalesce == MDBX_COALESCE\n" " nosync-safe == MDBX_SAFE_NOSYNC\n" " writemap == MDBX_WRITEMAP\n" " nosync-utterly == MDBX_UTTERLY_NOSYNC\n" @@ -130,8 +129,7 @@ void actor_params::set_defaults(const std::string &tmpdir) { #endif pathname_db = tmpdir + "mdbx-test.db"; - mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_SAFE_NOSYNC | - MDBX_NOMEMINIT | MDBX_COALESCE | MDBX_LIFORECLAIM | MDBX_ACCEDE; + mode_flags = MDBX_NOSUBDIR | MDBX_WRITEMAP | MDBX_SYNC_DURABLE | MDBX_ACCEDE; table_flags = MDBX_DUPSORT; size_lower = -1; @@ -682,9 +680,9 @@ int main(int argc, char *const argv[]) { if (!actor) continue; - log_verbose("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, - actor->space_id, (long)pid, status2str(status)); if (status > as_running) { + log_notice("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, + actor->space_id, (long)pid, status2str(status)); left -= 1; if (status != as_successful) { if (global::config::failfast && !failed) { @@ -694,6 +692,9 @@ int main(int argc, char *const argv[]) { } failed = true; } + } else { + log_verbose("actor #%u, id %d, pid %ld: %s\n", actor->actor_id, + actor->space_id, (long)pid, status2str(status)); } } else { if (timeout_seconds_left == 0) diff --git a/test/nested.cc b/test/nested.c++ similarity index 99% rename from test/nested.cc rename to test/nested.c++ index 098eada0..48299c79 100644 --- a/test/nested.cc +++ b/test/nested.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" #include /* LY: тест "эмуляцией time-to-live" с вложенными транзакциями: diff --git a/test/osal-unix.cc b/test/osal-unix.c++ similarity index 99% rename from test/osal-unix.cc rename to test/osal-unix.c++ index 320ebad3..094d6769 100644 --- a/test/osal-unix.cc +++ b/test/osal-unix.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #if !(defined(_WIN32) || defined(_WIN64)) diff --git a/test/osal-windows.cc b/test/osal-windows.c++ similarity index 93% rename from test/osal-windows.cc rename to test/osal-windows.c++ index fc19315c..24cde253 100644 --- a/test/osal-windows.cc +++ b/test/osal-windows.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . 
*/ -#include "test.h" +#include "test.h++" #if defined(_WIN32) || defined(_WIN64) @@ -71,7 +71,7 @@ void osal_setup(const std::vector &actors) { events.reserve(n); for (unsigned i = 0; i < n; ++i) { - HANDLE hEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + HANDLE hEvent = CreateEventW(NULL, TRUE, FALSE, NULL); if (!hEvent) failure_perror("CreateEvent()", GetLastError()); hEvent = make_inheritable(hEvent); @@ -79,22 +79,22 @@ void osal_setup(const std::vector &actors) { events[i] = hEvent; } - hBarrierSemaphore = CreateSemaphore(NULL, 0, (LONG)actors.size(), NULL); + hBarrierSemaphore = CreateSemaphoreW(NULL, 0, (LONG)actors.size(), NULL); if (!hBarrierSemaphore) failure_perror("CreateSemaphore(BarrierSemaphore)", GetLastError()); hBarrierSemaphore = make_inheritable(hBarrierSemaphore); - hBarrierEvent = CreateEvent(NULL, TRUE, FALSE, NULL); + hBarrierEvent = CreateEventW(NULL, TRUE, FALSE, NULL); if (!hBarrierEvent) failure_perror("CreateEvent(BarrierEvent)", GetLastError()); hBarrierEvent = make_inheritable(hBarrierEvent); - hProgressActiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + hProgressActiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL); if (!hProgressActiveEvent) failure_perror("CreateEvent(ProgressActiveEvent)", GetLastError()); hProgressActiveEvent = make_inheritable(hProgressActiveEvent); - hProgressPassiveEvent = CreateEvent(NULL, FALSE, FALSE, NULL); + hProgressPassiveEvent = CreateEventW(NULL, FALSE, FALSE, NULL); if (!hProgressPassiveEvent) failure_perror("CreateEvent(ProgressPassiveEvent)", GetLastError()); hProgressPassiveEvent = make_inheritable(hProgressPassiveEvent); @@ -248,7 +248,7 @@ Environment: CommandLine.push_back('"'); for (auto It = Argument.begin();; ++It) { - unsigned NumberBackslashes = 0; + size_t NumberBackslashes = 0; while (It != Argument.end() && *It == '\\') { ++It; @@ -348,6 +348,7 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { status = as_debugging; break; case STATUS_CONTROL_C_EXIT: + case /* 
STATUS_INTERRUPTED */ 0xC0000515L: status = as_killed; break; case EXCEPTION_ACCESS_VIOLATION: @@ -357,6 +358,12 @@ actor_status osal_actor_info(const mdbx_pid_t pid) { case EXCEPTION_INVALID_DISPOSITION: case EXCEPTION_ILLEGAL_INSTRUCTION: case EXCEPTION_NONCONTINUABLE_EXCEPTION: + case /* STATUS_STACK_BUFFER_OVERRUN, STATUS_BUFFER_OVERFLOW_PREVENTED */ + 0xC0000409L: + case /* STATUS_ASSERTION_FAILURE */ 0xC0000420L: + case /* STATUS_HEAP_CORRUPTION */ 0xC0000374L: + case /* STATUS_CONTROL_STACK_VIOLATION */ 0xC00001B2L: + log_error("pid %zu, exception 0x%x", (intptr_t)pid, (unsigned)ExitCode); status = as_coredump; break; default: @@ -428,7 +435,7 @@ void osal_udelay(size_t us) { unsigned timeslice_ms = 1; while (timeBeginPeriod(timeslice_ms) == TIMERR_NOCANDO) ++timeslice_ms; - threshold_us = timeslice_ms * 1500u; + threshold_us = timeslice_ms * size_t(1500); assert(threshold_us > 0); } diff --git a/test/osal.h b/test/osal.h++ similarity index 94% rename from test/osal.h rename to test/osal.h++ index a893ddb4..ef3b5562 100644 --- a/test/osal.h +++ b/test/osal.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -14,7 +14,7 @@ #pragma once -#include "base.h" +#include "base.h++" void osal_setup(const std::vector &actors); void osal_broadcast(unsigned id); diff --git a/test/pcrf/pcrf_test.c b/test/pcrf/pcrf_test.c index 96bb631d..1d1f1e7e 100644 --- a/test/pcrf/pcrf_test.c +++ b/test/pcrf/pcrf_test.c @@ -1,5 +1,5 @@ /* - * Copyright 2016-2022 Leonid Yuriev . + * Copyright 2016-2023 Leonid Yuriev . * Copyright 2015 Vladimir Romanov * , Yota Lab. 
* diff --git a/test/test.cc b/test/test.c++ similarity index 98% rename from test/test.cc rename to test/test.c++ index 6f620223..77c90c0a 100644 --- a/test/test.cc +++ b/test/test.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" const char *testcase2str(const actor_testcase testcase) { switch (testcase) { @@ -100,7 +100,7 @@ int testcase::hsr_callback(const MDBX_env *env, const MDBX_txn *txn, info.mi_geo.current >= info.mi_geo.upper)) { osal_yield(); if (retry > 0) - osal_udelay(retry * 100); + osal_udelay(retry * size_t(100)); return MDBX_RESULT_FALSE /* retry / wait until reader done */; } @@ -158,12 +158,17 @@ void testcase::db_open() { if (config.params.random_writemap && flipcoin()) mode ^= MDBX_WRITEMAP; - actual_env_mode = mode; int rc = mdbx_env_open(db_guard.get(), config.params.pathname_db.c_str(), mode, 0640); if (unlikely(rc != MDBX_SUCCESS)) failure_perror("mdbx_env_open()", rc); + unsigned env_flags_proxy; + rc = mdbx_env_get_flags(db_guard.get(), &env_flags_proxy); + if (unlikely(rc != MDBX_SUCCESS)) + failure_perror("mdbx_env_get_flags()", rc); + actual_env_mode = MDBX_env_flags_t(env_flags_proxy); + rc = mdbx_env_set_syncperiod(db_guard.get(), unsigned(0.042 * 65536)); if (unlikely(rc != MDBX_SUCCESS) && rc != MDBX_BUSY) failure_perror("mdbx_env_set_syncperiod()", rc); @@ -199,6 +204,19 @@ void testcase::txn_begin(bool readonly, MDBX_txn_flags_t flags) { log_trace("<< txn_begin(%s, 0x%04X)", readonly ? "read-only" : "read-write", flags); + + if (flipcoin_n(5)) { + const unsigned mask = + unsigned(MDBX_warmup_default | MDBX_warmup_force | MDBX_warmup_oomsafe | + MDBX_warmup_lock | MDBX_warmup_touchlimit); + static unsigned counter; + MDBX_warmup_flags_t warmup_flags = MDBX_warmup_flags_t( + (counter > MDBX_warmup_release) ? 
prng64() & mask : counter); + counter += 1; + int err = mdbx_env_warmup(db_guard.get(), txn, warmup_flags, 0); + log_trace("== counter %u, env_warmup(flags %u), rc %d", counter, + warmup_flags, err); + } } int testcase::breakable_commit() { diff --git a/test/test.h b/test/test.h++ similarity index 97% rename from test/test.h rename to test/test.h++ index 40bb01ac..6158ba66 100644 --- a/test/test.h +++ b/test/test.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -14,13 +14,13 @@ #pragma once -#include "base.h" -#include "chrono.h" -#include "config.h" -#include "keygen.h" -#include "log.h" -#include "osal.h" -#include "utils.h" +#include "base.h++" +#include "chrono.h++" +#include "config.h++" +#include "keygen.h++" +#include "log.h++" +#include "osal.h++" +#include "utils.h++" #include #include @@ -101,10 +101,10 @@ class testcase; class registry { struct record { - actor_testcase id; + actor_testcase id = ac_none; std::string name; - bool (*review_params)(actor_params &); - testcase *(*constructor)(const actor_config &, const mdbx_pid_t); + bool (*review_params)(actor_params &) = nullptr; + testcase *(*constructor)(const actor_config &, const mdbx_pid_t) = nullptr; }; std::unordered_map name2id; std::unordered_map id2record; diff --git a/test/try.cc b/test/try.c++ similarity index 97% rename from test/try.cc rename to test/try.c++ index da81e631..50c959c9 100644 --- a/test/try.cc +++ b/test/try.c++ @@ -1,4 +1,4 @@ -#include "test.h" +#include "test.h++" class testcase_try : public testcase { public: diff --git a/test/ttl.cc b/test/ttl.c++ similarity index 99% rename from test/ttl.cc rename to test/ttl.c++ index de3c9f42..a7049022 100644 --- a/test/ttl.cc +++ b/test/ttl.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. 
* All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #include #include diff --git a/test/utils.cc b/test/utils.c++ similarity index 98% rename from test/utils.cc rename to test/utils.c++ index 33420cc0..71d56eb8 100644 --- a/test/utils.cc +++ b/test/utils.c++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -12,7 +12,7 @@ * . */ -#include "test.h" +#include "test.h++" #include #if defined(HAVE_IEEE754_H) || __has_include() #include diff --git a/test/utils.h b/test/utils.h++ similarity index 99% rename from test/utils.h rename to test/utils.h++ index dd52dc00..98763536 100644 --- a/test/utils.h +++ b/test/utils.h++ @@ -1,5 +1,5 @@ /* - * Copyright 2017-2022 Leonid Yuriev + * Copyright 2017-2023 Leonid Yuriev * and other libmdbx authors: please see AUTHORS file. * All rights reserved. * @@ -13,7 +13,7 @@ */ #pragma once -#include "base.h" +#include "base.h++" #if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || \ !defined(__ORDER_BIG_ENDIAN__) diff --git a/test/valgrind_suppress.txt b/test/valgrind_suppress.txt index 01952901..5bc50077 100644 --- a/test/valgrind_suppress.txt +++ b/test/valgrind_suppress.txt @@ -4,7 +4,7 @@ msync(start) fun:msync ... - fun:mdbx_sync_locked* + fun:sync_locked* } { msync-whole-mmap-2 @@ -12,7 +12,7 @@ msync(start) fun:msync ... - fun:mdbx_env_sync_internal* + fun:env_sync* } { msync-whole-mmap-3 @@ -20,7 +20,7 @@ msync(start) fun:msync ... - fun:mdbx_mapresize* + fun:map_resize* } { msync-wipe-steady @@ -28,21 +28,43 @@ msync(start) fun:msync ... - fun:mdbx_wipe_steady* + fun:wipe_steady* +} +{ + msync-meta + Memcheck:Param + msync(start) + fun:msync + ... + fun:meta_sync* +} +{ + msync-spill + Memcheck:Param + msync(start) + fun:msync + ... 
+ fun:txn_spill* } -# memcmp() inside mdbx_iov_write() as workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269 +# memcmp() inside iov_write() as workaround for https://libmdbx.dqdkfa.ru/dead-github/issues/269 { - write-page-check-bcmp + iov-pagecheck-1 Memcheck:Cond fun:bcmp - fun:mdbx_iov_write* + fun:iov_callback4dirtypages + fun:osal_ioring_walk + fun:iov_complete + fun:iov_write } { - write-page-check-memcmp + iov-pagecheck-2 Memcheck:Cond fun:memcmp* - fun:mdbx_iov_write* + fun:iov_callback4dirtypages + fun:osal_ioring_walk + fun:iov_complete + fun:iov_write } # single-page flush by pwrite() @@ -52,7 +74,7 @@ pwrite(buf) fun:pwrite ... - fun:mdbx_iov_write* + fun:iov_write* } { pwrite64-page-flush @@ -60,7 +82,7 @@ pwrite64(buf) fun:pwrite ... - fun:mdbx_iov_write* + fun:iov_write* } # modern Valgrind don't support the `vector[...]` pattern @@ -70,16 +92,16 @@ # pwritev(vector[...]) # fun:pwritev # ... -# fun:mdbx_iov_write* +# fun:iov_write* #} -# for((i=0;i<64;++i)); do echo -e "{\n pwritev-page-flush-$i\n Memcheck:Param\n pwritev(vector[$i])\n fun:pwritev\n ...\n fun:mdbx_iov_write*\n}"; done >> valgrind_suppress.txt +# for((i=0;i<64;++i)); do echo -e "{\n pwritev-page-flush-$i\n Memcheck:Param\n pwritev(vector[$i])\n fun:pwritev\n ...\n fun:iov_write*\n}"; done >> valgrind_suppress.txt { pwritev-page-flush-0 Memcheck:Param pwritev(vector[0]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-1 @@ -87,7 +109,7 @@ pwritev(vector[1]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-2 @@ -95,7 +117,7 @@ pwritev(vector[2]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-3 @@ -103,7 +125,7 @@ pwritev(vector[3]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-4 @@ -111,7 +133,7 @@ pwritev(vector[4]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-5 @@ -119,7 +141,7 @@ pwritev(vector[5]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-6 @@ -127,7 +149,7 @@ pwritev(vector[6]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-7 @@ -135,7 +157,7 @@ pwritev(vector[7]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-8 @@ -143,7 +165,7 @@ pwritev(vector[8]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-9 @@ -151,7 +173,7 @@ pwritev(vector[9]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-10 @@ -159,7 +181,7 @@ pwritev(vector[10]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-11 @@ -167,7 +189,7 @@ pwritev(vector[11]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-12 @@ -175,7 +197,7 @@ pwritev(vector[12]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-13 @@ -183,7 +205,7 @@ pwritev(vector[13]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-14 @@ -191,7 +213,7 @@ pwritev(vector[14]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-15 @@ -199,7 +221,7 @@ pwritev(vector[15]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-16 @@ -207,7 +229,7 @@ pwritev(vector[16]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-17 @@ -215,7 +237,7 @@ pwritev(vector[17]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-18 @@ -223,7 +245,7 @@ pwritev(vector[18]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-19 @@ -231,7 +253,7 @@ pwritev(vector[19]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-20 @@ -239,7 +261,7 @@ pwritev(vector[20]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-21 @@ -247,7 +269,7 @@ pwritev(vector[21]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-22 @@ -255,7 +277,7 @@ pwritev(vector[22]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-23 @@ -263,7 +285,7 @@ pwritev(vector[23]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-24 @@ -271,7 +293,7 @@ pwritev(vector[24]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-25 @@ -279,7 +301,7 @@ pwritev(vector[25]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-26 @@ -287,7 +309,7 @@ pwritev(vector[26]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-27 @@ -295,7 +317,7 @@ pwritev(vector[27]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-28 @@ -303,7 +325,7 @@ pwritev(vector[28]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-29 @@ -311,7 +333,7 @@ pwritev(vector[29]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-30 @@ -319,7 +341,7 @@ pwritev(vector[30]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-31 @@ -327,7 +349,7 @@ pwritev(vector[31]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-32 @@ -335,7 +357,7 @@ pwritev(vector[32]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-33 @@ -343,7 +365,7 @@ pwritev(vector[33]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-34 @@ -351,7 +373,7 @@ pwritev(vector[34]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-35 @@ -359,7 +381,7 @@ pwritev(vector[35]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-36 @@ -367,7 +389,7 @@ pwritev(vector[36]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-37 @@ -375,7 +397,7 @@ pwritev(vector[37]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-38 @@ -383,7 +405,7 @@ pwritev(vector[38]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-39 @@ -391,7 +413,7 @@ pwritev(vector[39]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-40 @@ -399,7 +421,7 @@ pwritev(vector[40]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-41 @@ -407,7 +429,7 @@ pwritev(vector[41]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-42 @@ -415,7 +437,7 @@ pwritev(vector[42]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-43 @@ -423,7 +445,7 @@ pwritev(vector[43]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-44 @@ -431,7 +453,7 @@ pwritev(vector[44]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-45 @@ -439,7 +461,7 @@ pwritev(vector[45]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-46 @@ -447,7 +469,7 @@ pwritev(vector[46]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-47 @@ -455,7 +477,7 @@ pwritev(vector[47]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-48 @@ -463,7 +485,7 @@ pwritev(vector[48]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-49 @@ -471,7 +493,7 @@ pwritev(vector[49]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-50 @@ -479,7 +501,7 @@ pwritev(vector[50]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-51 @@ -487,7 +509,7 @@ pwritev(vector[51]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-52 @@ -495,7 +517,7 @@ pwritev(vector[52]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-53 @@ -503,7 +525,7 @@ pwritev(vector[53]) fun:pwritev ... 
- fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-54 @@ -511,7 +533,7 @@ pwritev(vector[54]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-55 @@ -519,7 +541,7 @@ pwritev(vector[55]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-56 @@ -527,7 +549,7 @@ pwritev(vector[56]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-57 @@ -535,7 +557,7 @@ pwritev(vector[57]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-58 @@ -543,7 +565,7 @@ pwritev(vector[58]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-59 @@ -551,7 +573,7 @@ pwritev(vector[59]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-60 @@ -559,7 +581,7 @@ pwritev(vector[60]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-61 @@ -567,7 +589,7 @@ pwritev(vector[61]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-62 @@ -575,7 +597,7 @@ pwritev(vector[62]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* } { pwritev-page-flush-63 @@ -583,5 +605,5 @@ pwritev(vector[63]) fun:pwritev ... - fun:mdbx_iov_write* + fun:iov_write* }